In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
from tqdm import tqdm
from jikanpy import Jikan

    Based on our notes.md  
    We want to test all the anime ids from 1 to 50000  
    Let's see which ids are valid ids  
    and what anime names we end up with

In [2]:
class ScrapeAnimeList:
    """Probe a range of anime ids through the Jikan client and record which ones resolve.

    Attributes:
        jikan_obj: Jikan client used for every lookup.
        all_ids: iterable of candidate ids to probe.
        invalid_ids: ids whose lookup or field extraction raised an exception.
        valid_ids, rankings, valid_anime_names: parallel lists with one entry per
            successfully fetched id; index i in each list refers to the same anime.
    """

    def __init__(self, RANGE=range(1,50001)):
        """Create the client and empty result lists for the ids in ``RANGE``."""
        self.jikan_obj = Jikan()
        self.all_ids = RANGE
        self.invalid_ids = []
        self.valid_ids = []
        self.valid_anime_names = []
        self.rankings = []

    def scrape(self):
        """Look up every id in ``self.all_ids`` and sort it into valid or invalid.

        Returns:
            tuple: ``(valid_ids, invalid_ids, valid_anime_names)`` -- the same list
            objects stored on the instance.
        """
        for anime_id in tqdm(self.all_ids):
            try:
                data = self.jikan_obj.anime(anime_id)["data"]
                rank = data["rank"]
                # Prefer the English title; only a minority of entries have one,
                # so fall back to the default title (may be romanised Japanese).
                name = data["title_english"]
                if name is None:
                    name = data["title"]
            except Exception:
                # `Exception` (not a bare except) so Ctrl-C / SystemExit still stop a
                # multi-hour run.
                # NOTE(review): transient failures (e.g. rate limiting or network
                # errors) presumably end up here too and are indistinguishable from
                # ids that do not exist -- confirm against the client's exception types.
                self.invalid_ids.append(anime_id)
                continue
            # Append only after every field was read successfully, so the three
            # parallel lists always stay the same length and an id can never be
            # recorded as both valid and invalid.
            self.valid_ids.append(anime_id)
            self.rankings.append(rank)
            self.valid_anime_names.append(name)
        return self.valid_ids, self.invalid_ids, self.valid_anime_names

    def save_df(self, csv_name):
        """Write the scraped (id, rank, name) rows to ``./data/scraped_anime_list/<csv_name>``.

        The target directory is created if it does not exist yet.
        """
        import os  # local import: keeps this class self-contained within its cell

        csv_path = f"./data/scraped_anime_list/{csv_name}"
        # Make sure hours of scraping are not lost to a missing output folder.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        df = pd.DataFrame(
            list(zip(self.valid_ids, self.rankings, self.valid_anime_names)),
            columns=['valid_id', 'rank', 'anime_name'],
        )
        df.to_csv(csv_path, index=False)

In [13]:
# Batch 1 of 5: probe ids 1..10000 and persist the hits (the recorded run took ~1h40m).
scraper = ScrapeAnimeList(range(1, 10001))
(valid_ids, invalid_ids, valid_anime_names) = scraper.scrape()
scraper.save_df(csv_name="AnimeList_0-10k.csv")

100%|██████████| 10000/10000 [1:39:42<00:00,  1.67it/s] 


In [14]:
# Batch 2 of 5: probe ids 10001..20000 and persist the hits (the recorded run took ~3h47m).
scraper = ScrapeAnimeList(range(10001, 20001))
(valid_ids, invalid_ids, valid_anime_names) = scraper.scrape()
scraper.save_df(csv_name="AnimeList_10-20k.csv")

100%|██████████| 10000/10000 [3:47:04<00:00,  1.36s/it]  


In [3]:
# Batch 3 of 5: probe ids 20001..30000 and persist the hits (the recorded run took ~10h46m).
scraper = ScrapeAnimeList(range(20001, 30001))
(valid_ids, invalid_ids, valid_anime_names) = scraper.scrape()
scraper.save_df(csv_name="AnimeList_20-30k.csv")

100%|██████████| 10000/10000 [10:45:48<00:00,  3.87s/it] 


In [4]:
# Batch 4 of 5: probe ids 30001..40000 and persist the hits (the recorded run took ~2h59m).
scraper = ScrapeAnimeList(range(30001, 40001))
(valid_ids, invalid_ids, valid_anime_names) = scraper.scrape()
scraper.save_df(csv_name="AnimeList_30-40k.csv")

100%|██████████| 10000/10000 [2:58:35<00:00,  1.07s/it]  


In [5]:
# Batch 5 of 5: probe ids 40001..50000 and persist the hits (the recorded run took ~1h15m).
scraper = ScrapeAnimeList(range(40001, 50001))
(valid_ids, invalid_ids, valid_anime_names) = scraper.scrape()
scraper.save_df(csv_name="AnimeList_40-50k.csv")

100%|██████████| 10000/10000 [1:15:04<00:00,  2.22it/s] 


In [16]:
# Stitch the five per-batch CSVs back together into one table and save it.
anime_df1 = pd.read_csv("./data/scraped_anime_list/AnimeList_0-10k.csv")
anime_df2 = pd.read_csv("./data/scraped_anime_list/AnimeList_10-20k.csv")
anime_df3 = pd.read_csv("./data/scraped_anime_list/AnimeList_20-30k.csv")
anime_df4 = pd.read_csv("./data/scraped_anime_list/AnimeList_30-40k.csv")
anime_df5 = pd.read_csv("./data/scraped_anime_list/AnimeList_40-50k.csv")
anime_dfs = [anime_df1, anime_df2, anime_df3, anime_df4, anime_df5]
# ignore_index=True: every batch is read with its own 0..n-1 index, so a plain concat
# would leave duplicate index labels in the combined frame. The CSV written below is
# unaffected because it is saved without the index.
anime_df = pd.concat(anime_dfs, ignore_index=True)
anime_df.to_csv("./data/scraped_anime_list/AnimeList_0-50k.csv", index=False)

In [19]:
# df = pd.DataFrame(list(zip(valid_ids, valid_anime_names)), columns =['valid_id', 'anime_name'])
# df.to_csv("./data/scraped_anime_list/AnimeList_3k.csv", index=False)

In [56]:
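# NOTE: disabled draft of an extended scraper, kept for reference only.
# Known issue if it is ever revived: the title_english append inside scrape() is itself
# commented out, so that list stays empty and the zip() in save_df() would yield zero rows.
# class ScrapeAnimeListExtended: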
#     def __init__(self, ID_RANGE=50001):
#         self.jikan_obj = Jikan()
#         self.all_ids = range(1,ID_RANGE)
#         # scraped attributes, there has to be a better way to do this, I should ask someone
#         # I did this in build using Excel text-to-columns, but there has to be a more pythonic way
#         self.mal_id = []
#         self.url = []
#         self.images = []
#         self.trailer = []
#         self.approved = []
#         self.titles = []
#         self.title = []
#         self.title_english = []
#         self.title_japanese = []
#         self.title_synonyms = []
#         self.type = []
#         self.source = []
#         self.episodes = []
#         self.status = []
#         self.airing = []
#         self.aired = []
#         self.duration = []
#         self.rating = []
#         self.score = []
#         self.scored_by = []
#         self.rank = []
#         self.popularity = []
#         self.members = []
#         self.favorites = []
#         self.synopsis = []
#         self.background = []
#         self.season = []
#         self.year = []
#         self.broadcast = []
#         self.producers = []
#         self.licensors = []
#         self.studios = []
#         self.genres = []
#         self.explicit_genres = []
#         self.themes = []
#         self.demographics = []

#     def scrape(self):
#         for id in tqdm(self.all_ids):
#             try:
#                 anime = self.jikan_obj.anime(id)
#                 self.mal_id.append(anime["data"]["mal_id"])
#                 self.url.append(anime["data"]["url"])
#                 self.images.append(anime["data"]["images"])
#                 self.trailer.append(anime["data"]["trailer"])
#                 self.approved.append(anime["data"]["approved"])
#                 self.titles.append(anime["data"]["titles"])
#                 self.title.append(anime["data"]["title"])
#                 # self.title_english.append(anime["data"]["title_english"])
#                 self.title_japanese.append(anime["data"]["title_japanese"])
#                 self.title_synonyms.append(anime["data"]["title_synonyms"])
#                 self.type.append(anime["data"]["type"])
#                 self.source.append(anime["data"]["source"])
#                 self.episodes.append(anime["data"]["episodes"])
#                 self.status.append(anime["data"]["status"])
#                 self.airing.append(anime["data"]["airing"])
#                 self.aired.append(anime["data"]["aired"])
#                 self.duration.append(anime["data"]["duration"])
#                 self.rating.append(anime["data"]["rating"])
#                 self.score.append(anime["data"]["score"])
#                 self.scored_by.append(anime["data"]["scored_by"])
#                 self.rank.append(anime["data"]["rank"])
#                 self.popularity.append(anime["data"]["popularity"])
#                 self.members.append(anime["data"]["members"])
#                 self.favorites.append(anime["data"]["favorites"])
#                 self.synopsis.append(anime["data"]["synopsis"])
#                 self.background.append(anime["data"]["background"])
#                 self.season.append(anime["data"]["season"])
#                 self.year.append(anime["data"]["year"])
#                 self.broadcast.append(anime["data"]["broadcast"])
#                 self.producers.append(anime["data"]["producers"])
#                 self.licensors.append(anime["data"]["licensors"])
#                 self.studios.append(anime["data"]["studios"])
#                 self.genres.append(anime["data"]["genres"])
#                 self.explicit_genres.append(anime["data"]["explicit_genres"])
#                 self.themes.append(anime["data"]["themes"])
#                 self.demographics.append(anime["data"]["demographics"])
#             except:
#                 pass
#         return self.mal_id, self.title

#     def save_df(self, csv_name):
#         df = pd.DataFrame(list(zip(self.mal_id,	self.url,	self.images,	self.trailer,	self.approved,	self.titles,\
#                                 self.title,	self.title_english,	self.title_japanese,	self.title_synonyms,	\
#                                 self.type,	self.source,	self.episodes,	self.status,	self.airing,	\
#                                 self.aired,	self.duration,	self.rating,	self.score,	self.scored_by,	self.rank,	\
#                                 self.popularity,	self.members,	self.favorites,	self.synopsis,	self.background, \
#                                 self.season,	self.year,	self.broadcast,	self.producers,	self.licensors,	self.studios,	
#                                 self.genres,	self.explicit_genres,	self.themes,	self.demographics,)),
#                           columns =["mal_id",	"url",	"images",	"trailer",	"approved",	"titles",	\
#                             "title",	"title_english",	"title_japanese",	"title_synonyms",	\
#                             "type",	"source",	"episodes",	"status",	"airing",	\
#                             "aired",	"duration",	"rating",	"score",	"scored_by",	"rank",	\
#                             "popularity",	"members",	"favorites",	"synopsis",	"background",	\
#                             "season",	"year",	"broadcast",	"producers",	"licensors",	"studios",	\
#                             "genres",	"explicit_genres",	"themes",	"demographics",])
#         df.to_csv(f"./data/scraped_anime_list/{csv_name}", index=False)