### Import Library

In [1]:
# ---- Imports ----
from bs4 import BeautifulSoup   # pip install bs4
import json                     # pip install json
import numpy as np              # pip install numpy
import pandas as pd             # pip install pandas
import re                       # pip install re
import requests                 # pip install requests
import time                     # pip install time

# ---- Settings ----
pd.set_option('display.max_columns', None)

# ---- Constants ----
DATASETS_PATH = ('./datasets')

TRIES_LIMIT = (3)

# ANIME_IDS = (np.arange(0, 60000)) # all animes
# USER_IDS = (np.arange(0, 1030000)) # all users

ANIME_IDS = (np.arange(0, 200)) # sample animes
USER_IDS = (np.arange(0, 200)) # sample users

STATUS_CODE = (7)
BATCH_SIZE = (250) # number of users to fetch in each batch
MIN_DELAY_SECONDS = (60)
MAX_DELAY_SECONDS = (90)

### Anime Data Fetcher

In [2]:
# ---- Dataset Structure ----
header = [
    'MAL_ID','Name', 'English_name', 'Japanese_name', 'Score',
    'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
    'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
    'Rank', 'Popularity', 'Favorites', 'Members', 'Watching',
    'Completed', 'On_hold', 'Dropped', 'Plan_to_watch', 'Scored_by',  'Score_1',
    'Score_2', 'Score_3', 'Score_4', 'Score_5', 'Score_6', 'Score_7', 'Score_8',
    'Score_9', 'Score_10'
]
full_anime_df = pd.DataFrame(columns=header)
full_anime_df

# ---- API Requisitions ----
for anime_id in ANIME_IDS:
    anime_api_url = f'https://api.jikan.moe/v4/anime/{anime_id}'
    statistics_api_url = f'https://api.jikan.moe/v4/anime/{anime_id}/statistics'
    anime_page = None
    statistics_page = None

    tries = 0
    while tries < TRIES_LIMIT:
        tries += 1
        anime_page = requests.get(anime_api_url)
        statistics_page = requests.get(statistics_api_url)
        
        if anime_page.status_code == 200 and statistics_page.status_code == 200: break
        time.sleep(2)
    
    # if the requisition successfully returned the anime data, it will be processed
    if anime_page.status_code == 200 and statistics_page.status_code == 200:
        anime_json_data = anime_page.json()
        statistics_json_data = statistics_page.json()
    
        # if 'data' property is present, it's formatted and inserted into the dataset
        if 'data' in anime_json_data and 'data' in statistics_json_data:
            anime = {}
        
            anime['MAL_ID'] = [anime_id]
            anime['Name'] = [anime_json_data['data'].get('title')]
            anime['English_name'] = [anime_json_data['data'].get('title_english')]
            anime['Japanese_name'] = [anime_json_data['data'].get('title_japanese')]
            anime['Score'] = [anime_json_data['data'].get('score')]
            anime['Type'] = [anime_json_data['data'].get('type')]
            anime['Episodes'] = [anime_json_data['data'].get('episodes')]
            anime['Aired'] = [anime_json_data['data'].get('aired', {}).get('string')]
        
            premiered = anime_json_data['data'].get('season')
            year = anime_json_data['data'].get('year')
            if year is not None: premiered += ' ' + str(year)
            anime['Premiered'] = [premiered]
        
            anime['Status'] = [anime_json_data['data'].get('status')]
            anime['Producers'] = [', '.join([producer['name'] for producer in anime_json_data['data'].get('producers', [])])]
            anime['Licensors'] = [', '.join([license['name'] for license in anime_json_data['data'].get('licensors', [])])]
            anime['Studios'] = [', '.join([studio['name'] for studio in anime_json_data['data'].get('studios', [])])]
            anime['Source'] = [anime_json_data['data'].get('source')]
            anime['Duration'] = [anime_json_data['data'].get('duration')]
            anime['Rating'] = [anime_json_data['data'].get('rating')]
            anime['Rank'] = [anime_json_data['data'].get('rank')]
            anime['Popularity'] = [anime_json_data['data'].get('popularity')]
            anime['Favorites'] = [anime_json_data['data'].get('favorites')]
            anime['Members'] = [anime_json_data['data'].get('members')]
            anime['Watching'] = [statistics_json_data['data'].get('watching')]
            anime['Completed'] = [statistics_json_data['data'].get('completed')]
            anime['On_hold'] = [statistics_json_data['data'].get('on_hold')]
            anime['Dropped'] = [statistics_json_data['data'].get('dropped')]
            anime['Plan_to_watch'] = [statistics_json_data['data'].get('plan_to_watch')]
            
            anime['Scored_by'] = [anime_json_data['data'].get('scored_by')]
            anime['Score_1'] = [statistics_json_data['data'].get('scores')[0].get('votes')]
            anime['Score_2'] = [statistics_json_data['data'].get('scores')[1].get('votes')]
            anime['Score_3'] = [statistics_json_data['data'].get('scores')[2].get('votes')]
            anime['Score_4'] = [statistics_json_data['data'].get('scores')[3].get('votes')]
            anime['Score_5'] = [statistics_json_data['data'].get('scores')[4].get('votes')]
            anime['Score_6'] = [statistics_json_data['data'].get('scores')[5].get('votes')]
            anime['Score_7'] = [statistics_json_data['data'].get('scores')[6].get('votes')]
            anime['Score_8'] = [statistics_json_data['data'].get('scores')[7].get('votes')]
            anime['Score_9'] = [statistics_json_data['data'].get('scores')[8].get('votes')]
            anime['Score_10'] = [statistics_json_data['data'].get('scores')[9].get('votes')]
        
            anime_df = pd.DataFrame.from_dict(anime)
            full_anime_df = pd.concat([full_anime_df, anime_df])
            
        # if 'data' property is not present, the anime id is skipped
        else: print('Skipping anime {}: Invalid data'.format(anime_id))
        
    # if the requisition fails 'TRIES_LIMIT' times to fetch the data, the anime id is skipped
    else: print('Skipping anime {}: Not existent'.format(anime_id))

print('Finished fetching animes.')

Skipping anime 0: Invalid data


  full_anime_df = pd.concat([full_anime_df, anime_df])


Skipping anime 2: Not existent
Skipping anime 3: Not existent
Skipping anime 4: Not existent
Skipping anime 9: Not existent
Skipping anime 10: Not existent
Skipping anime 11: Not existent
Skipping anime 12: Not existent
Skipping anime 13: Not existent
Skipping anime 14: Not existent
Skipping anime 34: Not existent
Skipping anime 35: Not existent
Skipping anime 36: Not existent
Skipping anime 37: Not existent
Skipping anime 38: Not existent
Skipping anime 39: Not existent
Skipping anime 40: Not existent
Skipping anime 41: Not existent
Skipping anime 42: Not existent
Skipping anime 70: Not existent
Skipping anime 78: Not existent
Skipping anime 140: Not existent
Skipping anime 172: Not existent
Finished fetching animes.


In [3]:
# ---- Transforming Datas for Better CSV Exportation ----
full_anime_df.set_index('MAL_ID', inplace=True)
full_anime_df['Name'] = full_anime_df['Name'].str.replace(';', ' ')
full_anime_df['English_name'] = full_anime_df['English_name'].str.replace(';', ' ')
full_anime_df['Japanese_name'] = full_anime_df['Japanese_name'].str.replace(';', ' ')
full_anime_df

Unnamed: 0_level_0,Name,English_name,Japanese_name,Score,Type,Episodes,Aired,Premiered,Status,Producers,Licensors,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Members,Watching,Completed,On_hold,Dropped,Plan_to_watch,Scored_by,Score_1,Score_2,Score_3,Score_4,Score_5,Score_6,Score_7,Score_8,Score_9,Score_10
MAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,TV,26,"Apr 3, 1998 to Apr 24, 1999",spring 1998,Finished Airing,Bandai Visual,Funimation,Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),46,43,82686,1871594,173523,1081861,107126,43450,465634,968293,2304,1128,2017,5024,13706,32556,97619,204868,273401,335670
5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,Movie,1,"Sep 1, 2001",,Finished Airing,"Sunrise, Bandai Visual","Sony Pictures Entertainment, Funimation",Bones,Original,1 hr 55 min,R - 17+ (violence & profanity),198,619,1591,379634,7105,280971,3006,1214,87399,216227,455,148,301,770,2553,7917,30890,67993,64569,40659
6,Trigun,Trigun,トライガン,8.22,TV,26,"Apr 1, 1998 to Sep 30, 1998",spring 1998,Finished Airing,Victor Entertainment,Funimation,Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,344,253,16069,765871,46623,449235,33671,18867,217475,374435,645,415,925,2560,7686,20453,65951,114649,96382,64769
7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.24,TV,26,"Jul 3, 2002 to Dec 25, 2002",summer 2002,Finished Airing,"Bandai Visual, Dentsu, Victor Entertainment, T...","Funimation, Bandai Entertainment",Sunrise,Original,25 min per ep,PG-13 - Teens 13 or older,3022,1862,649,117272,5738,52819,5970,6252,46501,44216,176,196,412,1233,3319,6628,13188,11106,5210,2748
8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.93,TV,52,"Sep 30, 2004 to Sep 29, 2005",fall 2004,Finished Airing,"TV Tokyo, Dentsu",Illumitoon Entertainment,Toei Animation,Manga,23 min per ep,PG - Children,4506,5350,15,15774,836,8561,838,1242,4298,6843,57,52,95,288,718,1208,1884,1326,808,407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Onegai☆Teacher,Please Teacher!,おねがい☆ティーチャー,7.10,TV,12,"Jan 10, 2002 to Mar 28, 2002",winter 2002,Finished Airing,"Bandai Visual, Genco","Nozomi Entertainment, Bandai Entertainment",Daume,Original,22 min per ep,PG-13 - Teens 13 or older,3749,1349,974,177889,5499,127105,3305,5091,36897,95886,579,821,1432,3523,8411,15545,26430,20583,10702,7864
196,Onegai☆Twins,Please Twins!,おねがい☆ツインズ,6.81,TV,12,"Jul 15, 2003 to Oct 14, 2003",summer 2003,Finished Airing,"Bandai Visual, Genco, Lantis","Nozomi Entertainment, Bandai Entertainment",Daume,Original,23 min per ep,PG-13 - Teens 13 or older,5066,2237,189,90194,2481,67650,1722,2488,15853,50281,346,521,951,2341,5320,9692,14339,9413,4518,2840
197,Rizelmine,,りぜるまいん,6.50,TV,24,"Apr 3, 2002 to Dec 22, 2002",spring 2002,Finished Airing,m.o.e.,,"Madhouse, Imagin",Manga,15 min per ep,R+ - Mild Nudity,6801,4387,49,25718,1233,13902,1098,1874,7613,10944,172,231,339,706,1508,2269,2713,1636,829,541
198,Speed Grapher,Speed Grapher,スピードグラファー,7.30,TV,24,"Apr 8, 2005 to Sep 30, 2005",spring 2005,Finished Airing,"TV Asahi, WAO World, TAP",Funimation,Gonzo,Original,23 min per ep,R+ - Mild Nudity,2714,1894,602,115021,6296,47946,5529,5763,49480,41560,230,325,474,1278,2909,5642,11275,10813,5437,3175


In [4]:
# ---- Storing Dataset into Disk ----
full_anime_df.to_csv(f'{DATASETS_PATH}/anime.csv')

### Anime With Synopsis

In [5]:
# ---- Dataset Structure ----
header = [
    'MAL_ID', 'Name', 'Score', 'Genres', 'synopsis', 'Rank'
]
full_anime_df = pd.DataFrame(columns=header)
full_anime_df

# ---- API Requisitions ----
for anime_id in ANIME_IDS:
    anime_api_url = f'https://api.jikan.moe/v4/anime/{anime_id}'
    statistics_api_url = f'https://api.jikan.moe/v4/anime/{anime_id}/statistics'
    anime_page = None
    statistics_page = None

    tries = 0
    while tries < TRIES_LIMIT:
        tries += 1
        anime_page = requests.get(anime_api_url)
        statistics_page = requests.get(statistics_api_url)
        
        if anime_page.status_code == 200 and statistics_page.status_code == 200: break
        time.sleep(2)
    
    # if the requisition successfully returned the anime data, it will be processed
    if anime_page.status_code == 200 and statistics_page.status_code == 200:
        anime_json_data = anime_page.json()
        statistics_json_data = statistics_page.json()
    
        # if 'data' property is present, it's formatted and inserted into the dataset
        if 'data' in anime_json_data and 'data' in statistics_json_data:
            anime = {}
        
            anime['MAL_ID'] = [anime_id]
            anime['Name'] = [anime_json_data['data'].get('title')]
            anime['Score'] = [anime_json_data['data'].get('score')]
            anime['Rank'] = [anime_json_data['data'].get('rank')]
            anime['Genres']  = [', '.join([genre['name'] for genre in anime_json_data['data'].get('genres', [])])]
        
            synopsis =  anime_json_data['data'].get('synopsis')
            if synopsis is not None:
                cleared_synopsis = re.sub(r'\[.*?\]', '', synopsis).strip() # removing all text into brackets and the brackets itself
                anime['synopsis'] = [cleared_synopsis]
            else:
                anime['synopsis'] = ['']
                
            anime_df = pd.DataFrame.from_dict(anime)
            full_anime_df = pd.concat([full_anime_df, anime_df])
            
        # if 'data' property is not present, the anime id is skipped
        else: print('Skipping anime {}: Invalid data'.format(anime_id))
        
    # if the requisition fails 'TRIES_LIMIT' times to fetch the data, the anime id is skipped
    else: print('Skipping anime {}: Not existent'.format(anime_id))

print('Finished fetching animes.')

Skipping anime 0: Invalid data


  full_anime_df = pd.concat([full_anime_df, anime_df])


Skipping anime 2: Not existent
Skipping anime 3: Not existent
Skipping anime 4: Not existent
Skipping anime 9: Not existent
Skipping anime 10: Not existent
Skipping anime 11: Not existent
Skipping anime 12: Not existent
Skipping anime 13: Not existent
Skipping anime 14: Not existent
Skipping anime 34: Not existent
Skipping anime 35: Not existent
Skipping anime 36: Not existent
Skipping anime 37: Not existent
Skipping anime 38: Not existent
Skipping anime 39: Not existent
Skipping anime 40: Not existent
Skipping anime 41: Not existent
Skipping anime 42: Not existent
Skipping anime 70: Not existent
Skipping anime 78: Not existent
Skipping anime 140: Not existent
Skipping anime 172: Not existent
Finished fetching animes.


In [6]:
# ---- Transforming Datas for Better CSV Exportation ----
full_anime_df.set_index('MAL_ID', inplace=True)
full_anime_df['Name'] = full_anime_df['Name'].str.replace(';', ' ')
full_anime_df

Unnamed: 0_level_0,Name,Score,Genres,synopsis,Rank
MAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",46
5,Cowboy Bebop: Tengoku no Tobira,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",198
6,Trigun,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",344
7,Witch Hunter Robin,7.24,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,3022
8,Bouken Ou Beet,6.93,"Action, Adventure, Fantasy",It is the dark century and the people are suff...,4506
...,...,...,...,...,...
195,Onegai☆Teacher,7.10,"Drama, Romance, Sci-Fi","One day, Kei Kusanagi notices a strange glowin...",3749
196,Onegai☆Twins,6.81,"Comedy, Drama, Sci-Fi",Maiku Kamishiro's past has always been somewha...,5066
197,Rizelmine,6.50,"Comedy, Romance, Sci-Fi, Ecchi",Iwaki Tomonori is an average 15-year-old boy w...,6801
198,Speed Grapher,7.30,"Action, Mystery, Supernatural","Ten years after the Bubble War, the dichotomy ...",2714


In [7]:
# ---- Storing Dataset into Disk ----
full_anime_df.to_csv(f'{DATASETS_PATH}/synopsis.csv')

### User Data Fetcher

In [9]:
# ---- Dataset Structure ----
header = ['user_id', 'username', 'url']
full_users_df = pd.DataFrame(columns=header)

# ---- API Requisitions ----
for user_id in USER_IDS:
    user_api_url = f'https://api.jikan.moe/v4/users/userbyid/{user_id}'
    user_page = None
    
    tries = 0
    while tries < TRIES_LIMIT:
        tries += 1
        user_page = requests.get(user_api_url)
        if user_page.status_code == 200: break
        time.sleep(1)
    
    # if the requisition successfully returned the user data, it's processed
    if user_page.status_code == 200:
        user_json_data = user_page.json()
        
        # if 'data' property is present, it's formatted and inserted into the dataset
        if 'data' in user_json_data:
            user = {}
            
            user['user_id'] = [user_id]
            user['username'] = [user_json_data['data'].get('username')]
            user['url'] = [user_json_data['data'].get('url')]
            
            user_df = pd.DataFrame.from_dict(user)
            full_users_df = pd.concat([full_users_df, user_df])
        
        # if 'data' property is not present, the user id is skipped
        else: print('Skipping user {}: Invalid data.'.format(user_id))
    
    # if the requisition fails 'TRIES_LIMIT' times to fetch the data, the user id is skipped
    else: print('Skipping user {}: Not existent.'.format(user_id))

print('Finished fetching users.')

Skipping user 0: Invalid data.
Skipping user 2: Not existent.
Skipping user 5: Not existent.
Skipping user 6: Not existent.
Skipping user 7: Not existent.
Skipping user 8: Not existent.
Skipping user 10: Not existent.
Skipping user 11: Not existent.
Skipping user 12: Not existent.
Skipping user 13: Not existent.
Skipping user 14: Not existent.
Skipping user 15: Not existent.
Skipping user 16: Not existent.
Skipping user 17: Not existent.
Skipping user 19: Not existent.
Skipping user 21: Not existent.
Skipping user 22: Not existent.
Skipping user 24: Not existent.
Skipping user 25: Not existent.
Skipping user 26: Not existent.
Skipping user 27: Not existent.
Skipping user 28: Not existent.
Skipping user 29: Not existent.
Skipping user 30: Not existent.
Skipping user 31: Not existent.
Skipping user 32: Not existent.
Skipping user 33: Not existent.
Skipping user 34: Not existent.
Skipping user 35: Not existent.
Skipping user 38: Not existent.
Skipping user 39: Not existent.
Skipping user 

In [10]:
# ---- Transforming Datas for Better CSV Exportation ----
full_users_df.set_index('user_id', inplace=True)
full_users_df

Unnamed: 0_level_0,username,url
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Xinil,https://myanimelist.net/profile/Xinil
3,Aokaado,https://myanimelist.net/profile/Aokaado
4,Crystal,https://myanimelist.net/profile/Crystal
9,Arcane,https://myanimelist.net/profile/Arcane
18,Mad,https://myanimelist.net/profile/Mad
20,vondur,https://myanimelist.net/profile/vondur
23,Amuro,https://myanimelist.net/profile/Amuro
36,Baman,https://myanimelist.net/profile/Baman
37,megan,https://myanimelist.net/profile/megan
44,beddan,https://myanimelist.net/profile/beddan


In [11]:
# ---- Storing Dataset into Disk ----
full_users_df.to_csv(f'{DATASETS_PATH}/users.csv')