In [68]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Send an HTTP request to the web page and get the HTML content.
# A timeout keeps the cell from hanging forever on a stalled connection.
url = 'https://en.wikipedia.org/wiki/List_of_best-selling_manga'
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
html = response.content

# Parse the HTML using Beautiful Soup and find all the tables with a given class
soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all('table', {'class': 'wikitable'})

# Convert each HTML table into a pandas DataFrame and store them in a list
dfs = []
for table in tables:
    # read_html expects a file-like object; passing the markup as a plain
    # string is deprecated in recent pandas, so wrap it in StringIO.
    df = pd.read_html(StringIO(str(table)))[0]
    # Give the 8th column the same label in every table.
    # NOTE(review): presumably the raw header differs between tables (e.g.
    # footnote markers) and this makes them line up when concatenated - confirm.
    # A fresh label list is assigned because writing into `df.columns.values`
    # mutates the backing array of an immutable Index, which is unsupported.
    column_names = list(df.columns)
    column_names[7] = 'Average sales per volume'
    df.columns = column_names
    dfs.append(df)



In [69]:
# Load the anime catalogue and append two float columns pre-filled with 0.0.
anime_df = pd.read_csv('anime_filtered.csv').assign(
    **{"Revenue (Millions)": 0.0, "Sales per volume": 0.0}
)

# Preview the first five rows.
anime_df.head()

Unnamed: 0,anime_id,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,...,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme,Revenue (Millions),Sales per volume
0,11013,Inu x Boku SS,Inu X Boku Secret Service,妖狐×僕SS,Youko x Boku SS,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,12,Finished Airing,...,Fridays at Unknown,"{'Adaptation': [{'mal_id': 17207, 'type': 'man...","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Sentai Filmworks,David Production,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ...",0.0,0.0
1,2104,Seto no Hanayome,My Bride is a Mermaid,瀬戸の花嫁,The Inland Sea Bride,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,26,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 759, 'type': 'manga...","TV Tokyo, AIC, Square Enix, Sotsu",Funimation,Gonzo,"Comedy, Parody, Romance, School, Shounen","['""Romantic summer"" by SUN&LUNAR']","['#1: ""Ashita e no Hikari (明日への光)"" by Asuka Hi...",0.0,0.0
2,5262,Shugo Chara!! Doki,Shugo Chara!! Doki,しゅごキャラ！！どきっ,"Shugo Chara Ninenme, Shugo Chara! Second Year",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,51,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 101, 'type': 'manga...","TV Tokyo, Sotsu",,Satelight,"Comedy, Magic, School, Shoujo","['#1: ""Minna no Tamago (みんなのたまご)"" by Shugo Cha...","['#1: ""Rottara Rottara (ロッタラ ロッタラ)"" by Buono! ...",0.0,0.0
3,721,Princess Tutu,Princess Tutu,プリンセスチュチュ,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Original,38,Finished Airing,...,Fridays at Unknown,"{'Adaptation': [{'mal_id': 1581, 'type': 'mang...","Memory-Tech, GANSIS, Marvelous AQL",ADV Films,Hal Film Maker,"Comedy, Drama, Magic, Romance, Fantasy","['""Morning Grace"" by Ritsuko Okazaki']","['""Watashi No Ai Wa Chiisaikeredo"" by Ritsuko ...",0.0,0.0
4,12365,Bakuman. 3rd Season,Bakuman.,バクマン。,Bakuman Season 3,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 9711, 'type': 'mang...","NHK, Shueisha",,J.C.Staff,"Comedy, Drama, Romance, Shounen","['#1: ""Moshimo no Hanashi (もしもの話)"" by nano.RIP...","['#1: ""Pride on Everyday"" by Sphere (eps 1-13)...",0.0,0.0


In [70]:
# Display the column labels of anime_df.
anime_df.columns

Index(['anime_id', 'title', 'title_english', 'title_japanese',
       'title_synonyms', 'image_url', 'type', 'source', 'episodes', 'status',
       'airing', 'aired_string', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'background',
       'premiered', 'broadcast', 'related', 'producer', 'licensor', 'studio',
       'genre', 'opening_theme', 'ending_theme', 'Revenue (Millions)',
       'Sales per volume'],
      dtype='object')

In [71]:
# One row per anime type: the mean `score` plus a single `rank` picked by
# pandas' 'first' aggregation.
# NOTE(review): 'first' depends on row order within each group, so it is not
# the best or average rank - confirm this is the intended statistic.
df_grouped = (
    anime_df
    .groupby('type')
    .agg(score=('score', 'mean'), rank=('rank', 'first'))
    .reset_index()
)

# Show up to seven of the grouped rows.
df_grouped.head(7)

Unnamed: 0,type,score,rank
0,Movie,6.023003,11408.0
1,Music,5.177479,7814.0
2,ONA,5.415927,7654.0
3,OVA,6.193354,6435.0
4,Special,6.308249,4041.0
5,TV,6.532639,1274.0
6,Unknown,0.0,


In [72]:
# Count rows for every (type, source) pair: one row per type, one column per
# source, with 0 where a combination never occurs.
df_counts = anime_df.pivot_table(
    index='type',
    columns='source',
    aggfunc='size',
    fill_value=0,
).reset_index()

# Print the table's dimensions and column labels, then preview it.
print(df_counts.shape, df_counts.columns, sep='\n')
df_counts.head(7)

(7, 17)
Index(['type', '4-koma manga', 'Book', 'Card game', 'Digital manga', 'Game',
       'Light novel', 'Manga', 'Music', 'Novel', 'Original', 'Other',
       'Picture book', 'Radio', 'Unknown', 'Visual novel', 'Web manga'],
      dtype='object', name='source')


source,type,4-koma manga,Book,Card game,Digital manga,Game,Light novel,Manga,Music,Novel,Original,Other,Picture book,Radio,Unknown,Visual novel,Web manga
0,Movie,12,41,8,0,97,52,389,4,98,763,105,24,1,1005,18,7
1,Music,0,0,0,0,19,0,25,290,1,296,9,1,0,204,3,1
2,ONA,16,3,4,0,99,16,134,3,47,423,72,8,2,251,17,49
3,OVA,27,13,1,3,93,102,871,2,44,493,67,16,1,1244,624,7
4,Special,44,6,7,0,87,104,430,1,39,419,47,2,1,668,72,21
5,TV,123,30,38,7,210,285,1263,14,129,963,107,45,4,835,153,65
6,Unknown,2,0,0,0,2,4,7,0,1,10,1,0,0,3,0,0


In [73]:
# Stack the per-table frames into a single frame.
# ignore_index=True renumbers the rows 0..n-1; without it each table keeps its
# own row labels, leaving duplicate index labels in the combined frame.
final_scrapp = pd.concat(dfs, ignore_index=True)
final_scrapp.columns

Index(['Manga series', 'Author(s)', 'Publisher', 'Demographic',
       'No. of collected volumes', 'Serialized', 'Approximate sales',
       'Average sales per volume'],
      dtype='object')

In [74]:
# Build a lower-cased join key on both frames so titles match regardless of case.
anime_df['title_lower'] = anime_df['title'].str.lower()
final_scrapp['title_lower'] = final_scrapp['Manga series'].str.lower()

# Keep only the anime whose lower-cased title equals a lower-cased manga series name.
final_anime = anime_df.merge(final_scrapp, on='title_lower', how='inner')

# Extract the leading number from the sales text (e.g. "130 million[17]" -> 130.0)
# and convert it to a float. str.extract alone returns strings, which would
# sort lexicographically and break arithmetic; unparseable values become NaN.
# NOTE(review): assumes every figure is expressed in millions - verify.
final_anime['Sales (Million)'] = pd.to_numeric(
    final_anime['Approximate sales'].str.extract(r'^([\d\.]+)', expand=False),
    errors='coerce',
)
final_anime['Average Sales Per Volume (Million)'] = pd.to_numeric(
    final_anime['Average sales per volume'].str.extract(r'^([\d\.]+)', expand=False),
    errors='coerce',
)

final_anime.head(70)



Unnamed: 0,anime_id,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,...,Manga series,Author(s),Publisher,Demographic,No. of collected volumes,Serialized,Approximate sales,Average sales per volume,Sales (Million),Average Sales Per Volume (Million)
0,269,Bleach,Bleach,BLEACH - ブリーチ -,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,366,Finished Airing,...,Bleach,Tite Kubo,Shueisha,Shōnen,74,2001–2016,130 million[17],1.75 million,130,1.75
1,6045,Kimi ni Todoke,Kimi ni Todoke: From Me to You,君に届け,Reaching You,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Kimi ni Todoke,Karuho Shiina,Shueisha,Shōjo,30,2005–2017,33 million[90],1.10 million,33,1.10
2,210,Ranma ½,Ranma ½,らんま1/2,"Ranma 1/2, Ranma ½ Nettou Hen",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,161,Finished Airing,...,Ranma ½,Rumiko Takahashi,Shogakukan,Shōnen,38,1987–1996,55 million[52],1.44 million,55,1.44
3,10800,Chihayafuru,Chihayafuru,ちはやふる,Chihayafull,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Chihayafuru,Yuki Suetsugu,Kodansha,Josei,50,2007–2022,28 million[111],0.56 million,28,0.56
4,3731,Itazura na Kiss,ItaKiss,イタズラなKiss,"Naughty Kiss, Teasing Kiss, Mischievous Kiss, ...",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Itazura na Kiss,Kaoru Tada,Shueisha,Shōjo,23,1990–1999,35 million[84],1.52 million,35,1.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,18179,Yowamushi Pedal,Yowamushi Pedal,弱虫ペダル,Yowapeda,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,38,Finished Airing,...,Yowamushi Pedal,Wataru Watanabe,Akita Shoten,Shōnen,80,2008–present,28 million[113],0.35 million,28,0.35
66,1093,Oishinbo,,美味しんぼ,The Gourmet,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,136,Finished Airing,...,Oishinbo,"Tetsu Kariya, Akira Hanasaki",Shogakukan,Seinen,111,1983–2014 (on hiatus),135 million[16],1.22 million,135,1.22
67,14513,Magi: The Labyrinth of Magic,Magi: The Labyrinth of Magic,マギ The labyrinth of magic,Magi Season 1,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Magi: The Labyrinth of Magic,Shinobu Ohtaka,Shogakukan,Shōnen,37,2009–2017,25 million[127],0.67 million,25,0.67
68,35974,Swan,Swan,Swan,"Nabowa, Drawings",https://myanimelist.cdn-dena.com/images/anime/...,Music,Original,1,Finished Airing,...,Swan,Kyoko Ariyoshi,Shueisha,Shōjo,21,1976–1981,20 million[177],0.95 million,20,0.95


In [76]:
# Bin the score column into unit-wide intervals (0, 1], (1, 2], ..., (9, 10].
# NOTE(review): pd.cut bins are right-closed by default, so a score of exactly
# 0 falls outside every bin and is not counted - confirm this is intended.
score_edges = list(range(11))
anime_df['score_bin'] = pd.cut(anime_df['score'], bins=score_edges)

# Count titles per rating and score interval.
# observed=False keeps every score bin as a column even when no row falls in
# it, and fill_value=0 guarantees zeros rather than NaN; being explicit avoids
# depending on the pivot_table default for categorical columns.
rating_grading_df = anime_df.pivot_table(
    index='rating',
    columns='score_bin',
    values='title',
    aggfunc='count',
    observed=False,
    fill_value=0,
).reset_index()

# Replace the Interval column labels with readable "low-high" strings derived
# from the same bin edges, so labels and bins cannot drift apart.
score_labels = [f'{low}-{high}' for low, high in zip(score_edges[:-1], score_edges[1:])]
rating_grading_df.columns = ['rating', *score_labels]

print(rating_grading_df.columns)
# Preview the pivot table.
rating_grading_df.head(7)

Index(['rating', '0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-7', '7-8', '8-9',
       '9-10'],
      dtype='object')


Unnamed: 0,rating,0-1,1-2,2-3,3-4,4-5,5-6,6-7,7-8,8-9,9-10
0,G - All Ages,11,10,35,291,854,1453,1354,452,52,7
1,,0,0,1,17,52,137,122,28,0,0
2,PG - Children,0,1,8,23,122,316,548,241,9,0
3,PG-13 - Teens 13 or older,0,2,5,28,168,657,1912,1793,351,12
4,R - 17+ (violence & profanity),0,0,4,12,20,113,280,422,109,4
5,R+ - Mild Nudity,0,0,4,22,46,202,354,217,31,0
6,Rx - Hentai,0,0,2,8,56,401,600,144,0,1
