# **Extract all anime from MyAnimeList**
6/7/2022
## The goal was to create a simple script that would extract and clean data from the MyAnimeList database and store it in a CSV files.

## Acknowledgments

References:

* [MyAnimeList API (beta ver.)](https://myanimelist.net/apiconfig/references/api/v2)

## Usage

To generate .md file from Jupyter Notebook use the following command:
* jupyter nbconvert --to markdown anime_extract.ipynb --output README.md


## Import

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import ast

CLIENT_ID = 'YOUR_CLIENT_ID'  # Client ID from MyAnimeList API
DATA_FOLDER = 'data/' # Folder to save data to

In [2]:
#data = requests.get('https://api.myanimelist.net/v2/anime/16498?fields=id,title,main_picture,alternative_titles,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,num_episodes,start_season,broadcast,source,average_episode_duration,rating,pictures,background,related_anime,related_manga,recommendations,studios,statistics&limit=4', headers={'X-MAL-CLIENT-ID': CLIENT_ID}) 
data = requests.get('https://api.myanimelist.net/v2/anime/ranking?ranking_type=all&limit=500', headers={'X-MAL-CLIENT-ID': CLIENT_ID}) 
print(data.json().keys()) 
data.json() 
print(data.json()['paging']['next']) # next page

dict_keys(['data', 'paging'])
https://api.myanimelist.net/v2/anime/ranking?offset=500&ranking_type=all&limit=500


## Loop through all ranked anime and add anime id and title to a pandas dataframe

In [2]:
%%time
data = requests.get('https://api.myanimelist.net/v2/anime/ranking?ranking_type=all&limit=500', headers={'X-MAL-CLIENT-ID': CLIENT_ID}) # get all anime from ranking
df_anime_ids = pd.json_normalize(data.json()['data']).drop(['node.main_picture.medium', 'node.main_picture.large','ranking.rank'], axis=1) # get only anime ids and convert to dataframe
next = data.json()['paging']['next'] # get next page url
while next != None: # while there is a next page
    data = requests.get(next, headers={'X-MAL-CLIENT-ID': CLIENT_ID}) # get next page
    df_anime_ids = pd.concat([df_anime_ids, pd.json_normalize(data.json()['data']).drop(['node.main_picture.medium', 'node.main_picture.large','ranking.rank'], axis=1)], ignore_index=True) # concatenate dataframe and drop unnecessary columns
    try:
        next = data.json()['paging']['next'] # get next page url
    except:
        next = None # no more pages       
df_anime_ids.head()

CPU times: total: 14.8 s
Wall time: 3min 17s


Unnamed: 0,node.id,node.title
0,5114,Fullmetal Alchemist: Brotherhood
1,28977,Gintama°
2,9253,Steins;Gate
3,38524,Shingeki no Kyojin Season 3 Part 2
4,9969,Gintama'


In [4]:
df_anime_ids.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20521 entries, 0 to 20520
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   node.id     20521 non-null  int64 
 1   node.title  20521 non-null  object
dtypes: int64(1), object(1)
memory usage: 320.8+ KB


In [52]:
data = requests.get('https://api.myanimelist.net/v2/anime/' + str(5114) + '?fields=' + 'id,title,main_picture,alternative_titles,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,my_list_status,num_episodes,start_season,source,average_episode_duration,rating,pictures,background,related_anime,related_manga,recommendations,studios,statistics', headers={'X-MAL-CLIENT-ID': CLIENT_ID})
data.json()

{'id': 3287,
 'title': 'Tenkuu Danzai Skelter+Heaven',
 'main_picture': {'medium': 'https://api-cdn.myanimelist.net/images/anime/9/62115.jpg',
  'large': 'https://api-cdn.myanimelist.net/images/anime/9/62115l.jpg'},
 'alternative_titles': {'synonyms': [],
  'en': 'Skelter Heaven',
  'ja': '天空断罪スケルターヘブン'},
 'start_date': '2004-12-08',
 'end_date': '2004-12-08',
 'synopsis': 'When a mysterious entity suddenly appears in the center of Tokyo, the Alta Mira Agency is tasked to repel the extraterrestrial threat. Otsuya Funagai must guide his all-female unit of "Battle Sole" pilots to take down the strange being no matter the cost. However, his intimate relationship with one of the pilots, Rin Ichikawa, may spell trouble for the team and the mission itself.\n\n[Written by MAL Rewrite]',
 'mean': 1.85,
 'rank': 12692,
 'popularity': 3278,
 'num_list_users': 34266,
 'num_scoring_users': 24204,
 'nsfw': 'white',
 'created_at': '2007-10-23T23:11:35+00:00',
 'updated_at': '2022-04-12T23:58:49+00:0

## Loop through all anime ids from previous dataframe and extract anime information for each anime
### - Filter out anime without mean score and rank (adult anime)
### - Pause for 5 minutes every 500 requests to avoid being blocked

In [None]:
%%time
print("Starting data aquasition for every anime ID in the list")
rq_limit = 500
print("Limited to " + str(rq_limit) + " sequential requests, otherwise server denies requests")

fields = 'id,title,main_picture,alternative_titles,start_date,end_date,synopsis,mean,rank,popularity,num_list_users,num_scoring_users,nsfw,created_at,updated_at,media_type,status,genres,my_list_status,num_episodes,start_season,source,average_episode_duration,rating,pictures,background,related_anime,related_manga,recommendations,studios,statistics'

data = requests.get('https://api.myanimelist.net/v2/anime/' + str(df_anime_ids['node.id'][0]) + '?fields=' + fields, headers={'X-MAL-CLIENT-ID': CLIENT_ID}) # get json data for anime with myanimelist api
df_anime = pd.json_normalize(data.json()) # convert json to pandas dataframe

for cnt in range(1, len(df_anime_ids['node.id'])): # loop through all anime IDs
    data = requests.get('https://api.myanimelist.net/v2/anime/' + str(df_anime_ids['node.id'][cnt]) + '?fields=' + fields, headers={'X-MAL-CLIENT-ID': CLIENT_ID})
    try: # if the anime is found
        anim_json = data.json()  # get the json
        if(not np.isnan(anim_json['mean']) and not np.isnan(anim_json['rank'])): # if mean and rank are not nan
            #anim_json.pop('background', None) # remove background          
            df_anime = pd.concat([df_anime, pd.json_normalize(anim_json)], ignore_index=True)#.drop(['node.main_picture.medium', 'node.main_picture.large','ranking.rank'], axis=1)
    except:
        None
    if (cnt > 1 and cnt % 100 == 0): # print progress every 100 requests
        print(str(cnt) + " requests")
    if (cnt % rq_limit == 0):                           
        print("Waiting for 5 minutes before continuing") # pause the requests for 5 minutes, otherwise, the server will deny requests
        minutes = 6
        while (minutes > 1): # wait for 5 minutes
            print(str(minutes-1) + " minutes left")
            time.sleep(60)
            minutes -= 1
        print("Starting up again")
df_anime.head(5)

In [6]:
df_anime.columns 

Index(['mal_id', 'title', 'start_date', 'end_date', 'synopsis', 'mean', 'rank',
       'popularity', 'num_list_users', 'num_scoring_users', 'nsfw',
       'created_at', 'updated_at', 'media_type', 'status', 'genres',
       'num_episodes', 'source', 'average_episode_duration', 'rating',
       'pictures', 'background', 'related_anime', 'related_manga',
       'recommendations', 'studios', 'main_picture.medium',
       'main_picture.large', 'alternative_titles.synonyms',
       'alternative_titles.en', 'alternative_titles.ja', 'start_season.year',
       'start_season.season', 'statistics.status.watching',
       'statistics.status.completed', 'statistics.status.on_hold',
       'statistics.status.dropped', 'statistics.status.plan_to_watch',
       'statistics.num_list_users'],
      dtype='object')

### Save extracted anime information to a csv file

In [24]:
df_anime.index.name = 'Index' # rename the index
df_anime.rename(columns={"id": "mal_id"}, inplace=True) # rename the id column to mal_id
df_anime = df_anime[df_anime['media_type'].isin(['tv', 'movie', 'ova', 'special', 'ona'])] # only include TV, Movie, OVA, Special, ONA
df_anime.reset_index(drop=True, inplace=True) # reset the index
df_anime.to_csv(DATA_FOLDER + 'anime_extract.csv', sep=';', encoding='utf-8') # save the dataframe as a csv file
df_anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11613 entries, 0 to 11612
Data columns (total 39 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   mal_id                           11613 non-null  int64  
 1   title                            11613 non-null  object 
 2   start_date                       11603 non-null  object 
 3   end_date                         11501 non-null  object 
 4   synopsis                         11410 non-null  object 
 5   mean                             11613 non-null  float64
 6   rank                             11613 non-null  int64  
 7   popularity                       11613 non-null  int64  
 8   num_list_users                   11613 non-null  int64  
 9   num_scoring_users                11613 non-null  int64  
 10  nsfw                             11613 non-null  object 
 11  created_at                       11613 non-null  object 
 12  updated_at        

df_anime = pd.read_csv('data/anime_extract.csv', sep=';', encoding='utf-8')
df_anime.index.name = 'Index'
df_anime.drop(df_anime.columns[0], axis=1, inplace=True)
df_anime.rename(columns={"id": "mal_id"}, inplace=True)
df_anime = df_anime[df_anime['media_type'].isin(['tv', 'movie', 'ova', 'special', 'ona'])]
df_anime.reset_index(drop=True, inplace=True)
df_anime.info()

### Save anime ids and titles to a csv file

In [25]:
df_anime_titles = df_anime[['mal_id', 'title']].copy() # copy the dataframe columns to a new dataframe
df_anime_titles.sort_values(by=['mal_id'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id
df_anime_titles.index.name = "Index" # rename the index
df_anime_titles.to_csv(DATA_FOLDER + 'anime_titles.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_titles.head()

Unnamed: 0_level_0,mal_id,title
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,Cowboy Bebop
1,5,Cowboy Bebop: Tengoku no Tobira
2,6,Trigun
3,7,Witch Hunter Robin
4,8,Bouken Ou Beet


### Save anime rankings to csv file

In [26]:
df_anime_ranking = df_anime[['mal_id', 'mean', 'rank', 'popularity', 'rating', 'num_scoring_users']].copy() # copy the dataframe columns to a new dataframe
df_anime_ranking.sort_values(by=['mal_id'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id
df_anime_ranking.index.name = "Index" # set the index name to "Index"
df_anime_ranking.to_csv(DATA_FOLDER + 'anime_ranking.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_ranking.head()

Unnamed: 0_level_0,mal_id,mean,rank,popularity,rating,num_list_users,num_scoring_users
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,8.76,37,42,r,1617259,832701
1,5,8.38,175,566,r,334185,192661
2,6,8.22,310,242,pg_13,659514,328258
3,7,7.26,2708,1678,pg_13,105582,41521
4,8,6.96,4073,4843,pg,14304,6239


### Save anime ratings to csv file

In [None]:
df_anime_rating = df_anime[['mal_id', 'rating']].copy() # copy the dataframe columns to a new dataframe
df_anime_rating.sort_values(by=['mal_id'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id
df_anime_rating.index.name = "Index" # set the index name to "Index"
df_anime_rating.to_csv(DATA_FOLDER + 'anime_rating.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_rating.head()

### Save anime dates to csv file

In [27]:
df_anime_dates = df_anime[['mal_id', 'start_date', 'end_date', 'start_season.year', 'start_season.season']].copy() # copy the dataframe columns to a new dataframe
df_anime_dates.sort_values(by=['mal_id'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id
df_anime_dates.index.name = "Index" # set the index name to "Index"

df_anime_dates.rename(columns={"start_season.year": "anime_season_year"}, inplace=True) # rename the column
df_anime_dates.rename(columns={"start_season_season": "anime_season"}, inplace=True) # rename the column
df_anime_dates["start_season_year"] = df_anime_dates["start_season_year"].fillna(0).astype(np.int64) # fill the NaN with 0 and convert to int64

df_anime_dates.to_csv(DATA_FOLDER + 'anime_dates.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_dates.head()

Unnamed: 0_level_0,mal_id,start_date,end_date,start_season_year,start_season.season
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,1998-04-03,1999-04-24,1998,spring
1,5,2001-09-01,2001-09-01,2001,summer
2,6,1998-04-01,1998-09-30,1998,spring
3,7,2002-07-03,2002-12-25,2002,summer
4,8,2004-09-30,2005-09-29,2004,fall


### Save anime genres and demographics to csv files

In [50]:
df_anime_genres = pd.DataFrame(columns=['mal_id', 'genre_id']) # create a dataframe with the columns
df_genres_d = pd.DataFrame(columns=['genre_id', 'genre_de']) # create a dataframe with the columns
df_anime_demographic = pd.DataFrame(columns=['mal_id', 'demo_id']) # create a dataframe with the columns
df_demographic_d = pd.DataFrame(columns=['demo_id', 'demo_de']) # create a dataframe with the columns

for row in df_anime.iterrows(): # iterate through the dataframe
    genres_str = row[1]['genres'] 
    if(pd.isna(genres_str)): # if the genres_str is NaN continue
        continue
    genres_str = '{"genres": ' + genres_str.replace("'id'", "'genre_id'").replace("name", "genre_de") + '}'
    genre_d = ast.literal_eval(genres_str) # convert the string to a dictionary
    genres_d = pd.json_normalize(genre_d['genres']) # normalize the json
    
    for genre in genre_d['genres']: # iterate through the genres and demographics
        if(genre['genre_id'] in [15, 25, 27, 42, 43]): # if the genre is in the list of demographics
            df_anime_demographic.loc[df_anime_demographic.shape[0]] = [row[1]['mal_id'], genre['genre_id']] # add the demographic to the dataframe
            if(genre['genre_id'] not in df_demographic_d['demo_id'].values): # if the demographic is not in the dataframe
                df_demographic_d.loc[df_demographic_d.shape[0]] = [genre['genre_id'], genre['genre_de']] # if the demographic not already in add it to the dataframe
        else: # if the genre is not in the list of demographics
            df_anime_genres.loc[df_anime_genres.shape[0]] = [row[1]['mal_id'], genre['genre_id']] # add the genre to the dataframe
            if(genre['genre_id'] not in df_genres_d['genre_id'].values): # if the genre is not in the dataframe
                df_genres_d.loc[df_genres_d.shape[0]] = [genre['genre_id'], genre['genre_de']] # if the genre not already in add it to the dataframe

In [42]:
df_anime_demographic.sort_values(by=['mal_id', 'demo_id'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id and demo_id
df_anime_demographic.index.name = "Index" # set the index name to "Index"
df_anime_demographic.to_csv(DATA_FOLDER + 'anime_demographics.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_demographic.head()

Unnamed: 0_level_0,mal_id,demo_id
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6,27
1,8,27
2,15,27
3,16,43
4,17,27


In [51]:
df_demographic_d.sort_values(by=['demo_id'], inplace=True, ignore_index=True) # sort the dataframe by the demo_id
df_demographic_d.index.name = "Index" # set the index name to "Index"
df_demographic_d.to_csv(DATA_FOLDER + 'demographics_d.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_demographic_d.head()

Unnamed: 0_level_0,demo_id,demo_de
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,15,Kids
1,25,Shoujo
2,27,Shounen
3,42,Seinen
4,43,Josei


In [46]:
df_genres_d.sort_values(by=['genre_id'], inplace=True, ignore_index=True) # sort the dataframe by the genre_id
df_genres_d.index.name = "Index" # set the index name to "Index"
df_genres_d.to_csv(DATA_FOLDER + 'genres_d.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_genres_d.head()

Unnamed: 0_level_0,genre_id,genre_de
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,Action
1,2,Adventure
2,3,Racing
3,4,Comedy
4,5,Avant Garde


In [30]:
df_anime_genres.sort_values(by=['mal_id', 'genre_id'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id and by genre_id
df_anime_genres.index.name = "Index" # set the index name to "Index"
df_anime_genres.to_csv(DATA_FOLDER + 'anime_genres.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_genres.head()

Unnamed: 0_level_0,mal_id,genre_id
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,1
1,1,24
2,1,29
3,1,50
4,5,1


### Save anime media types and nsfw to csv file

In [31]:
df_anime_media_type = df_anime[['mal_id', 'media_type', 'nsfw']].copy() # copy the dataframe columns to a new dataframe
df_anime_media_type.sort_values(by=['mal_id'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id
df_anime_media_type.index.name = "Index" # set the index name to "Index"
df_anime_media_type.to_csv(DATA_FOLDER + 'media_type_d.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_media_type.head()

Unnamed: 0_level_0,mal_id,media_type,source,nsfw
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,tv,original,white
1,5,movie,original,white
2,6,tv,manga,white
3,7,tv,original,white
4,8,tv,manga,white


### Save anime sources to csv file

In [None]:
df_anime_source = df_anime[['mal_id', 'source']].copy() # copy the dataframe columns to a new dataframe
df_anime_source.sort_values(by=['mal_id'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id
df_anime_source.index.name = "Index" # set the index name to "Index"
df_anime_source.to_csv(DATA_FOLDER + 'anime_source.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_source.head()

### Save anime synopses to csv file

In [32]:
df_synopsis_d = df_anime[['mal_id', 'synopsis']].copy() # copy the dataframe columns to a new dataframe
df_synopsis_d.sort_values(by=['mal_id'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id
df_synopsis_d.index.name = "Index" # set the index name to "Index"
df_synopsis_d.to_csv(DATA_FOLDER + 'synopsis_d.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_synopsis_d.head()

Unnamed: 0_level_0,mal_id,synopsis
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,"Crime is timeless. By the year 2071, humanity ..."
1,5,"Another day, another bounty—such is the life o..."
2,6,"Vash the Stampede is the man with a $$60,000,0..."
3,7,Witches are individuals with special powers li...
4,8,It is the dark century and the people are suff...


### Save anime studios to csv file

In [34]:
df_anime_studios = pd.DataFrame(columns=['mal_id', 'studio_id']) # create a dataframe with the columns
df_studios_d = pd.DataFrame(columns=['studio_id', 'studio_de']) # create a dataframe with the columns

cnt = 0
for row in df_anime.iterrows(): # iterate through the dataframe
    studios_str = row[1]['studios']
    if(pd.isna(genres_str)):
        continue
    studios_str = '{"studios": ' + studios_str.replace("id", 'studio_id').replace("name", 'studio_de') + '}'
    studio_d = ast.literal_eval(studios_str) # convert the string to a dictionary
    studios_d = pd.json_normalize(studio_d['studios']) # convert the dictionary to a dataframe
    
    df_studios_d = pd.concat([df_studios_d, studios_d], ignore_index=True).drop_duplicates() # concatenate the dataframes
    if(studios_d.empty): # if the dataframe is empty
        df_anime_studios.loc[df_anime_studios.shape[0]] = [int(row[1]['mal_id']), np.nan] # add the row to the dataframe and set it to NaN
        continue
    for studio in studio_d['studios']: # iterate through the studios
        df_anime_studios.loc[df_anime_studios.shape[0]] = [int(row[1]['mal_id']), int(studio['studio_id'])] # add the row to the dataframe    

In [35]:
df_studios_d.sort_values(by=['studio_id'], inplace=True, ignore_index=True) # sort the dataframe by the genre_id
df_studios_d.index.name = "Index" # set the index name to "Index"
df_studios_d.to_csv(DATA_FOLDER + 'studios_d.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_studios_d.head()

Unnamed: 0_level_0,studio_id,studio_de
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,Pierrot
1,2,Kyoto Animation
2,3,Gonzo
3,4,Bones
4,5,Bee Train


In [36]:
df_anime_studios.sort_values(by=['mal_id', 'studio_id'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id and by genre_id
df_anime_studios.index.name = "Index" # set the index name to "Index"
df_anime_studios["mal_id"] = df_anime_studios["mal_id"].astype(np.int64) # convert the mal_id to an integer
df_anime_studios["studio_id"] = df_anime_studios["studio_id"].fillna(0).astype(np.int64) # convert the studio_id to an integer
df_anime_studios.to_csv(DATA_FOLDER + 'anime_studios.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_studios.head()

Unnamed: 0_level_0,mal_id,studio_id
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,14
1,5,4
2,6,11
3,7,14
4,8,18


### Save anime recommendations to csv file

In [38]:
df_anime_recommendations = pd.DataFrame(columns=['mal_id', 'mal_id_rd', 'num_recommendations']) # create a dataframe with the columns
for row in df_anime.iterrows(): # iterate through the dataframe
    recommended_str = row[1]['recommendations']
    if(pd.isna(recommended_str)): # if the recommended_str is NaN
        continue
    recommended_str = '{"recommendations": ' + str(recommended_str) + '}'
    recom_d = ast.literal_eval(recommended_str) # convert the string to a dictionary
    for recommendation in recom_d['recommendations']: # iterate through the recommendations
        df_anime_recommendations.loc[df_anime_recommendations.shape[0]] = [row[1]['mal_id'], recommendation['node']['id'], recommendation['num_recommendations']] # add the row to the dataframe

Unnamed: 0,mal_id,mal_id_rd,num_recommendations
0,5114,11061,101
1,5114,16498,39
2,5114,1482,23
3,5114,9919,17
4,5114,1575,15


In [39]:
df_anime_recommendations.sort_values(by=['mal_id', 'num_recommendations'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id and by num_recommendations
df_anime_recommendations.index.name = "Index" # set the index name to "Index"
df_anime_recommendations.to_csv(DATA_FOLDER + 'anime_recommendations.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_recommendations.head()

Unnamed: 0_level_0,mal_id,mal_id_rd,num_recommendations
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,13601,13
1,1,918,14
2,1,2025,16
3,1,4087,18
4,1,2251,30


### Save anime relations to csv file

In [40]:
df_anime_relations = pd.DataFrame(columns=['mal_id', 'mal_id_re', 'relation_type']) # create a dataframe with the columns
for row in df_anime.iterrows(): # iterate through the dataframe
    related_str = row[1]['related_anime']
    if(pd.isna(related_str)): # if the related_str is NaN
        continue
    related_d = ast.literal_eval(related_str) # convert the string to a dictionary
    for relation in related_d: # iterate through the relations
        df_anime_relations.loc[df_anime_relations.shape[0]] = [row[1]['mal_id'], relation['node']['id'], relation['relation_type']] # add the row to the dataframe

In [41]:
df_anime_relations.sort_values(by=['mal_id', 'mal_id_re'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id and by mal_id_re
df_anime_relations.index.name = "Index" # set the index name to "Index"
df_anime_relations.to_csv(DATA_FOLDER + 'anime_relations.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_relations.head()

Unnamed: 0_level_0,mal_id,mal_id_re,relation_type
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,5,side_story
1,1,4037,summary
2,1,17205,side_story
3,5,1,parent_story
4,6,4106,side_story


### Save anime cover images to csv file (urls)

In [42]:
df_anime_covers = df_anime[['mal_id', 'main_picture.medium', 'main_picture.large']].copy() # copy the dataframe columns to a new dataframe #'pictures' 
df_anime_covers.sort_values(by=['mal_id'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id
df_anime_covers.rename(columns={"main_picture.medium": "main_picture_medium"}, inplace=True) # rename the column
df_anime_covers.rename(columns={"main_picture.large": "main_picture_large"}, inplace=True)
df_anime_covers.index.name = "Index" # set the index name to "Index"
df_anime_covers.to_csv(DATA_FOLDER + 'anime_covers.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_covers.head()

Unnamed: 0_level_0,mal_id,main_picture.medium,main_picture.large,pictures
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,https://api-cdn.myanimelist.net/images/anime/4...,https://api-cdn.myanimelist.net/images/anime/4...,[{'medium': 'https://api-cdn.myanimelist.net/i...
1,5,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,[{'medium': 'https://api-cdn.myanimelist.net/i...
2,6,https://api-cdn.myanimelist.net/images/anime/7...,https://api-cdn.myanimelist.net/images/anime/7...,[{'medium': 'https://api-cdn.myanimelist.net/i...
3,7,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,[{'medium': 'https://api-cdn.myanimelist.net/i...
4,8,https://api-cdn.myanimelist.net/images/anime/7...,https://api-cdn.myanimelist.net/images/anime/7...,[{'medium': 'https://api-cdn.myanimelist.net/i...


### Save anime statistics to csv file

In [43]:
df_anime_stats = df_anime[['mal_id', 'num_episodes', 'average_episode_duration', 
                           'statistics.status.watching', 'statistics.status.completed', 
                           'statistics.status.on_hold', 'statistics.status.dropped', 
                           'statistics.status.plan_to_watch', 'statistics.num_list_users']].copy() # copy the dataframe columns to a new dataframe
df_anime_stats.sort_values(by=['mal_id'], inplace=True, ignore_index=True) # sort the dataframe by the mal_id
df_anime_stats.rename(columns={"statistics.status.watching": "num_watching"}, inplace=True) # rename the columns
df_anime_stats.rename(columns={"statistics.status.completed": "num_completed"}, inplace=True) 
df_anime_stats.rename(columns={"statistics.status.on_hold": "num_on_hold"}, inplace=True)
df_anime_stats.rename(columns={"statistics.status.dropped": "num_dropped"}, inplace=True)
df_anime_stats.rename(columns={"statistics.status.plan_to_watch": "num_plan_to_watch"}, inplace=True)
df_anime_stats.rename(columns={"statistics.num_list_users": "num_list_users"}, inplace=True)
df_anime_stats.index.name = "Index" # set the index name to "Index"
df_anime_stats.to_csv(DATA_FOLDER + 'anime_stats.csv', sep=';', encoding='utf-8') # save the dataframe to a csv file
df_anime_stats.head()

Unnamed: 0_level_0,mal_id,num_episodes,average_episode_duration,statistics.status.watching,statistics.status.completed,statistics.status.on_hold,statistics.status.dropped,statistics.status.plan_to_watch,statistics.num_list_users
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1,26,1440,151257,921226,91835,35042,417977,1617337
1,5,1,6911,5606,249064,2472,974,76075,334191
2,6,26,1480,35889,393056,29722,16350,184521,659538
3,7,26,1500,4914,49236,5571,5785,40071,105577
4,8,52,1380,709,7754,801,1178,3861,14303
