In [6]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Download the Wikipedia page listing the best-selling manga series.
url = 'https://en.wikipedia.org/wiki/List_of_best-selling_manga'
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# Parse the HTML and collect every <table> carrying the "wikitable" class.
soup = BeautifulSoup(html, 'html.parser')
tables = soup.find_all('table', {'class': 'wikitable'})

# Convert each HTML table into a pandas DataFrame and store them in a list.
# The markup is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated; read_html returns a list, of which the first
# (and here only) parsed table is kept.
dfs = []
for table in tables:
    df = pd.read_html(StringIO(str(table)))[0]
    dfs.append(df)



In [7]:
# Load the anime dataset and attach two float columns, both initialised to 0.0.
# NOTE(review): presumably placeholders that are filled in further down — confirm.
anime_df = pd.read_csv('anime_filtered.csv').assign(
    **{
        "Revenue (Millions)": 0.0,
        "Sales per volume": 0.0,
    }
)

anime_df.head()

Unnamed: 0,anime_id,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,...,broadcast,related,producer,licensor,studio,genre,opening_theme,ending_theme,Revenue (Millions),Sales per volume
0,11013,Inu x Boku SS,Inu X Boku Secret Service,妖狐×僕SS,Youko x Boku SS,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,12,Finished Airing,...,Fridays at Unknown,"{'Adaptation': [{'mal_id': 17207, 'type': 'man...","Aniplex, Square Enix, Mainichi Broadcasting Sy...",Sentai Filmworks,David Production,"Comedy, Supernatural, Romance, Shounen","['""Nirvana"" by MUCC']","['#1: ""Nirvana"" by MUCC (eps 1, 11-12)', '#2: ...",0.0,0.0
1,2104,Seto no Hanayome,My Bride is a Mermaid,瀬戸の花嫁,The Inland Sea Bride,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,26,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 759, 'type': 'manga...","TV Tokyo, AIC, Square Enix, Sotsu",Funimation,Gonzo,"Comedy, Parody, Romance, School, Shounen","['""Romantic summer"" by SUN&LUNAR']","['#1: ""Ashita e no Hikari (明日への光)"" by Asuka Hi...",0.0,0.0
2,5262,Shugo Chara!! Doki,Shugo Chara!! Doki,しゅごキャラ！！どきっ,"Shugo Chara Ninenme, Shugo Chara! Second Year",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,51,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 101, 'type': 'manga...","TV Tokyo, Sotsu",,Satelight,"Comedy, Magic, School, Shoujo","['#1: ""Minna no Tamago (みんなのたまご)"" by Shugo Cha...","['#1: ""Rottara Rottara (ロッタラ ロッタラ)"" by Buono! ...",0.0,0.0
3,721,Princess Tutu,Princess Tutu,プリンセスチュチュ,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Original,38,Finished Airing,...,Fridays at Unknown,"{'Adaptation': [{'mal_id': 1581, 'type': 'mang...","Memory-Tech, GANSIS, Marvelous AQL",ADV Films,Hal Film Maker,"Comedy, Drama, Magic, Romance, Fantasy","['""Morning Grace"" by Ritsuko Okazaki']","['""Watashi No Ai Wa Chiisaikeredo"" by Ritsuko ...",0.0,0.0
4,12365,Bakuman. 3rd Season,Bakuman.,バクマン。,Bakuman Season 3,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Unknown,"{'Adaptation': [{'mal_id': 9711, 'type': 'mang...","NHK, Shueisha",,J.C.Staff,"Comedy, Drama, Romance, Shounen","['#1: ""Moshimo no Hanashi (もしもの話)"" by nano.RIP...","['#1: ""Pride on Everyday"" by Sphere (eps 1-13)...",0.0,0.0


In [8]:
# Give the eighth column (position 7) the same label in every scraped table.
# NOTE(review): presumably the scraped header text differs between tables and
# must be unified before concatenation — confirm against the raw headers.
# The label list is rebuilt and assigned back instead of writing into
# `df.columns.values`: an Index is meant to be immutable, and editing its
# backing array in place can leave label lookups stale.
for df in dfs:
    column_labels = df.columns.tolist()
    column_labels[7] = 'Average sales per volume'
    df.columns = column_labels

dfs[0].columns

Index(['Manga series', 'Author(s)', 'Publisher', 'Demographic',
       'No. of collected volumes', 'Serialized', 'Approximate sales',
       'Average sales per volume'],
      dtype='object')

In [9]:
# Stack all scraped tables into a single frame. ignore_index=True renumbers the
# rows 0..n-1; without it each table's own row labels would be repeated in the
# combined index, making label-based selection ambiguous.
final_scrapp = pd.concat(dfs, ignore_index=True)

final_scrapp.shape

(187, 8)

In [10]:
def _leading_number(text_column):
    """Parse the number at the start of each string into a float.

    Parameters
    ----------
    text_column : pandas.Series
        String values such as ``"130 million[17]"``.

    Returns
    -------
    pandas.Series
        Float values (``130.0`` for the example above); NaN where the string
        does not start with digits/dots or the captured text is not a number.
    """
    captured = text_column.str.extract(r'^([\d\.]+)', expand=False)
    # str.extract yields strings; convert so the result can be used in arithmetic.
    return pd.to_numeric(captured, errors='coerce')


# Lower-cased titles act as a case-insensitive join key on both frames.
anime_df['title_lower'] = anime_df['title'].str.lower()
final_scrapp['title_lower'] = final_scrapp['Manga series'].str.lower()

# Inner join: keep only anime whose title equals a manga series name (ignoring case).
final_anime = anime_df.merge(final_scrapp, on='title_lower', how='inner')

# Numeric versions of the two sales columns, taken from the leading number of each cell.
final_anime['Sales (Million)'] = _leading_number(final_anime['Approximate sales'])
final_anime['Average Sales Per Volume (Million)'] = _leading_number(final_anime['Average sales per volume'])

final_anime.head()

Unnamed: 0,anime_id,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,...,Manga series,Author(s),Publisher,Demographic,No. of collected volumes,Serialized,Approximate sales,Average sales per volume,Sales (Million),Average Sales Per Volume (Million)
0,269,Bleach,Bleach,BLEACH - ブリーチ -,,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,366,Finished Airing,...,Bleach,Tite Kubo,Shueisha,Shōnen,74,2001–2016,130 million[17],1.75 million,130,1.75
1,6045,Kimi ni Todoke,Kimi ni Todoke: From Me to You,君に届け,Reaching You,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Kimi ni Todoke,Karuho Shiina,Shueisha,Shōjo,30,2005–2017,33 million[90],1.10 million,33,1.1
2,210,Ranma ½,Ranma ½,らんま1/2,"Ranma 1/2, Ranma ½ Nettou Hen",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,161,Finished Airing,...,Ranma ½,Rumiko Takahashi,Shogakukan,Shōnen,38,1987–1996,55 million[52],1.44 million,55,1.44
3,10800,Chihayafuru,Chihayafuru,ちはやふる,Chihayafull,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Chihayafuru,Yuki Suetsugu,Kodansha,Josei,50,2007–2022,28 million[111],0.56 million,28,0.56
4,3731,Itazura na Kiss,ItaKiss,イタズラなKiss,"Naughty Kiss, Teasing Kiss, Mischievous Kiss, ...",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,25,Finished Airing,...,Itazura na Kiss,Kaoru Tada,Shueisha,Shōjo,23,1990–1999,35 million[84],1.52 million,35,1.52
