In [None]:
import json
import requests 
import pandas as pd
import time
import numpy as np
import re

# Part 1: Data Collection from MyAnimeList API
This part collects anime data from the MyAnimeList API and performs the initial cleaning of the training/exploration dataset.

In [None]:
# obtain ids of all anime from past 4 seasons
# NOTE(review): the client ID below is a credential committed with the
# notebook; consider loading it from the environment and rotating it.
anime_ids = []

for season in ['2022/spring', '2022/winter', '2021/fall', '2021/summer']:
  url = 'https://api.myanimelist.net/v2/anime/season/' + season + '?limit=500'
  response = requests.get(
      url,
      headers={'X-MAL-CLIENT-ID': '45cc2a86444f783d1ebe6ac1c201e87a'},
      timeout=30)  # don't hang the notebook forever on a stalled connection
  # surface HTTP errors here instead of failing later while parsing the body
  response.raise_for_status()
  # each entry of 'data' wraps the anime under a 'node' key
  anime_ids.extend(entry['node']['id'] for entry in response.json()['data'])
  time.sleep(.5)  # pause half a second between requests

In [None]:
animes = []
# get data for all training anime
# NOTE(review): the client ID below is a credential committed with the
# notebook; consider loading it from the environment and rotating it.
for anime_id in anime_ids:
  url = ('https://api.myanimelist.net/v2/anime/' + str(anime_id)
    + '?fields=synopsis,'
    + 'genres,studios,rank,source,rating,popularity,related_anime,mean,'
    + 'num_list_users,num_scoring_users,average_episode_duration,media_type,'
    + 'num_episodes')
  response = requests.get(
      url,
      headers={'X-MAL-CLIENT-ID': '45cc2a86444f783d1ebe6ac1c201e87a'},
      timeout=30)  # don't hang the notebook forever on a stalled connection
  # without this check an error payload would be appended to `animes` as if
  # it were an anime record and end up as a bogus row in the DataFrame
  response.raise_for_status()
  animes.append(response.json())
  time.sleep(.5)  # pause half a second between requests

In [None]:
# flatten the nested API records into one row per fetched record
df_anime = pd.json_normalize(animes)

# remove unneeded columns
df_anime = df_anime.drop(
    columns=['main_picture.medium', 'main_picture.large', 'related_anime'])

# convert columns from json arrays to string arrays
# (values that are not lists, e.g. missing ones, are passed through unchanged)
for col in ['genres', 'studios']:
  df_anime[col] = df_anime[col].map(
      lambda entries: [entry['name'] for entry in entries]
      if type(entries) == list else entries)

# remove duplicate anime; only 'id' is compared, so the unhashable
# list-valued columns do not have to be dropped first
df_anime = df_anime.drop_duplicates(subset='id')

# remove anime without a mean
df_anime = df_anime.dropna(subset=['mean'])
df_anime = df_anime.set_index('id')

# get all unique genres
def traverse_list(list_x, list_y):
  """Append to list_y, in place, each item of list_x it does not contain yet.

  Items are visited in list_x order, so list_y keeps first-seen order.
  Returns None; the result is the mutation of list_y.
  """
  for item in list_x:
    if item in list_y:
      continue
    list_y.append(item)

# distinct genre names, in order of first appearance in the training frame
unique_genres = []
for genre_names in df_anime['genres']:
  # non-list cells (e.g. missing values) contribute nothing
  if type(genre_names) == list:
    traverse_list(genre_names, unique_genres)

# distinct studio names, collected the same way
unique_studios = []
for studio_names in df_anime['studios']:
  if type(studio_names) == list:
    traverse_list(studio_names, unique_studios)

# add columns for each distinct genre and studio
# A row whose genres/studios cell is not a list (e.g. missing) gets False in
# every indicator column. Passing the missing value through instead would put
# NaN there, and NaN is truthy, so a later `1 if a else 0` conversion would
# flag that anime with every genre/studio.
genres = pd.DataFrame(
    {genre: df_anime['genres'].map(
        lambda labels: type(labels) == list and genre in labels)
     for genre in unique_genres},
    index=df_anime.index)
df_anime = pd.concat((df_anime, genres), axis=1)

studios = pd.DataFrame(
    {studio: df_anime['studios'].map(
        lambda labels: type(labels) == list and studio in labels)
     for studio in unique_studios},
    index=df_anime.index)
df_anime = pd.concat((df_anime, studios), axis=1)

# the list-valued source columns are now fully encoded by the indicators
df_anime = df_anime.drop(['genres', 'studios'], axis=1)

# create csv file for later analysis/cleaning
df_anime.to_csv('anime_clean2.csv')
df_anime

Unnamed: 0_level_0,title,synopsis,rank,source,rating,popularity,mean,num_list_users,num_scoring_users,average_episode_duration,...,AXsiZ,Fanworks,Space Neko Company,Vega Entertainment,Bee Media,Shirogumi,DRAWIZ,Quebico,Studio Ponoc,DLE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21,One Piece,"Gol D. Roger was known as the ""Pirate King,"" t...",65.0,manga,pg_13,26,8.65,1882485,1070399,1440,...,False,False,False,False,False,False,False,False,False,False
50265,Spy x Family,"Corrupt politicians, frenzied nationalists, an...",10.0,manga,pg_13,197,9.04,743717,233149,1449,...,False,False,False,False,False,False,False,False,False,False
50160,Kingdom 4th Season,Following the conclusion of the large-scale co...,30.0,manga,r,3723,8.81,26245,3916,1495,...,False,False,False,False,False,False,False,False,False,False
48916,Love Live! Nijigasaki Gakuen School Idol Douko...,Second season of Love Live! Nijigasaki Gakuen ...,666.0,other,pg_13,3848,7.93,24677,3332,1440,...,False,False,False,False,False,False,False,False,False,False
49570,Wu Dong Qian Kun 3rd Season,Lin Dong continues his journey to find the anc...,3584.0,novel,pg_13,10768,7.08,1416,143,1440,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49848,Magia Record: Mahou Shoujo Madoka☆Magica Gaide...,Recap of the first 7 episodes of Magia Record:...,9375.0,game,pg_13,8341,5.93,3581,861,1420,...,False,False,False,False,False,False,False,False,False,False
49410,Shiroi Suna no Aquatope Mini,Short episodes with super-deformed characters ...,9408.0,original,pg_13,8702,5.91,3096,337,65,...,False,False,False,False,False,False,False,False,False,False
48678,Nomad: Megalo Box 2 Short Anime,New short anime included in the BD box set.,6173.0,original,pg_13,8404,6.54,3496,159,660,...,False,False,False,False,False,False,False,False,False,False
49396,Kyoto Animation: Souzou-hen,,5729.0,original,g,9101,6.62,2689,1228,30,...,False,False,False,False,False,False,False,False,False,False


# Part 2: Test Set and Cleaning for Machine Learning
This part creates the test set and cleans both the training and test sets for machine learning.

In [None]:
# obtain ids of all anime from summer 2022 to get test data for ML
# NOTE(review): the client ID below is a credential committed with the
# notebook; consider loading it from the environment and rotating it.

url = 'https://api.myanimelist.net/v2/anime/season/2022/summer?limit=500'
response = requests.get(
    url,
    headers={'X-MAL-CLIENT-ID': '45cc2a86444f783d1ebe6ac1c201e87a'},
    timeout=30)  # don't hang the notebook forever on a stalled connection
# surface HTTP errors here instead of failing later while parsing the body
response.raise_for_status()
# each entry of 'data' wraps the anime under a 'node' key
test_ids = [entry['node']['id'] for entry in response.json()['data']]

In [None]:
test_animes = []

# get data for all test anime
# NOTE(review): the client ID below is a credential committed with the
# notebook; consider loading it from the environment and rotating it.
for anime_id in test_ids:
  url = ('https://api.myanimelist.net/v2/anime/' + str(anime_id)
    + '?fields=synopsis,'
    + 'genres,studios,rank,source,rating,popularity,related_anime,mean,'
    + 'num_list_users,num_scoring_users,average_episode_duration,media_type,'
    + 'num_episodes')
  response = requests.get(
      url,
      headers={'X-MAL-CLIENT-ID': '45cc2a86444f783d1ebe6ac1c201e87a'},
      timeout=30)  # don't hang the notebook forever on a stalled connection
  # without this check an error payload would be appended to `test_animes`
  # as if it were an anime record and end up as a bogus row in the DataFrame
  response.raise_for_status()
  test_animes.append(response.json())
  time.sleep(.5)  # pause half a second between requests

In [None]:
# flatten the nested API records into one row per fetched record
df_test = pd.json_normalize(test_animes)

# drop unneeded columns
df_test = df_test.drop(
    columns=['main_picture.medium', 'main_picture.large', 'related_anime'])

# convert json array columns to string array columns
# (values that are not lists, e.g. missing ones, are passed through unchanged)
for col in ['genres', 'studios']:
  df_test[col] = df_test[col].map(
      lambda entries: [entry['name'] for entry in entries]
      if type(entries) == list else entries)

# remove duplicate anime; only 'id' is compared, so the unhashable
# list-valued columns do not have to be dropped first
df_test = df_test.drop_duplicates(subset='id')

# make a mean column with NaN for testing
# (any 'mean' values returned by the API are overwritten)
df_test = df_test.assign(mean=np.nan)

# get all unique genres
def traverse_list(list_x, list_y):
  """Append to list_y, in place, each item of list_x it does not contain yet.

  Items are visited in list_x order, so list_y keeps first-seen order.
  Returns None; the result is the mutation of list_y.
  """
  for item in list_x:
    if item in list_y:
      continue
    list_y.append(item)

# distinct genre names, in order of first appearance in the test frame
unique_genres = []
for genre_names in df_test['genres']:
  # non-list cells (e.g. missing values) contribute nothing
  if type(genre_names) == list:
    traverse_list(genre_names, unique_genres)

# distinct studio names, collected the same way
unique_studios = []
for studio_names in df_test['studios']:
  if type(studio_names) == list:
    traverse_list(studio_names, unique_studios)

# add distinct genres and studios to cols
# A row whose genres/studios cell is not a list (e.g. missing) gets False in
# every indicator column. Passing the missing value through instead would put
# NaN there, and NaN is truthy, so a later `1 if a else 0` conversion would
# flag that anime with every genre/studio.
genres = pd.DataFrame(
    {genre: df_test['genres'].map(
        lambda labels: type(labels) == list and genre in labels)
     for genre in unique_genres},
    index=df_test.index)
df_test = pd.concat((df_test, genres), axis=1)

studios = pd.DataFrame(
    {studio: df_test['studios'].map(
        lambda labels: type(labels) == list and studio in labels)
     for studio in unique_studios},
    index=df_test.index)
df_test = pd.concat((df_test, studios), axis=1)

# the list-valued source columns are now fully encoded by the indicators
df_test = df_test.drop(['genres', 'studios'], axis=1)

# create csv file for later analysis/cleaning
df_test = df_test.set_index('id')
df_test.to_csv('tanime_clean2.csv')
df_test

Unnamed: 0_level_0,title,synopsis,source,rating,popularity,num_list_users,num_scoring_users,average_episode_duration,media_type,num_episodes,...,Zexcs,A.C.G.T.,CLAP,Diomedéa,Yumeta Company,Graphinica,MAPPA,feel.,Science SARU,Studio 3Hz
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
51805,Ginga Eiyuu Densetsu: Die Neue These - Sakubou,Fourth season of Ginga Eiyuu Densetsu: Die Neu...,novel,r,8408,3493,0,0,movie,3,...,False,False,False,False,False,False,False,False,False,False
50379,Shoot! Goal to the Future,Atsushi Kamiya is a former captain at Kakegawa...,original,pg_13,8202,3776,0,0,tv,0,...,False,False,False,False,False,False,False,False,False,False
50470,Kami Kuzu☆Idol,"Yuuya, one half of the boy pop duo ZINGS, may ...",manga,pg_13,8443,3451,0,0,tv,0,...,False,False,False,False,False,False,False,False,False,False
48573,Uta no☆Prince-sama♪ Movie: Maji Love ST☆RISH T...,The first installment of the new Uta no☆Prince...,game,pg_13,8496,3356,0,0,movie,1,...,False,False,False,False,False,False,False,False,False,False
51251,D4DJ: Double Mix,A special episode focusing on the story of Mer...,mixed_media,pg_13,9600,2213,0,0,special,1,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50339,Kakegurui Twin,As a first-year student of the renowned Hyakka...,manga,pg_13,4429,17873,1,0,ona,0,...,False,False,False,False,False,False,True,False,False,False
49776,Kumichou Musume to Sewagakari,WHO'S YOUR NANNY?\n\nTooru Kirishima is the ri...,manga,pg_13,4450,17709,1,0,tv,0,...,False,False,False,False,False,False,False,True,False,False
49220,Isekai Ojisan,"Seventeen years ago, Takafumi's uncle fell int...",manga,pg_13,4823,14459,0,0,tv,0,...,False,False,False,False,False,False,False,False,False,False
49590,Yojouhan Time Machine Blues,Yojouhan Time Machine Blues takes place during...,novel,,4783,14709,0,0,unknown,0,...,False,False,False,False,False,False,False,False,True,False


In [None]:
# load csvs
df_anime = pd.read_csv('anime_clean2.csv').set_index('id')
df_test = pd.read_csv('tanime_clean2.csv').set_index('id')

# find common cols so we don't hit errors for nonexistent cols later!
common_cols = np.intersect1d(df_anime.columns, df_test.columns)
df_anime = df_anime[common_cols]
df_test = df_test[common_cols]

# drop generally unknown/impossible metrics columns for new anime
# (errors='ignore': a metric absent from either csv has already been removed
# by the intersection above, and dropping it again must not raise)
unknown_for_new = ['rank', 'num_scoring_users', 'average_episode_duration']
df_anime = df_anime.drop(columns=unknown_for_new, errors='ignore')
df_test = df_test.drop(columns=unknown_for_new, errors='ignore')

# reorder cols: descriptive columns first, then this fixed list of genre
# columns, then every remaining column in its current order
meta_cols = ['title', 'synopsis', 'source', 'rating', 'popularity',
             'num_list_users', 'num_episodes', 'media_type', 'mean']
genre_cols = ['Action', 'Adventure',
              'Comedy', 'Drama', 'Fantasy',
              'Mystery', 'Romance', 'Sci-Fi',
              'Slice of Life', 'Sports', 'Supernatural', 'Suspense',
              'Ecchi',
              'Josei', 'Kids', 'Shounen', 'Shoujo', 'Seinen', 'Adult Cast',
              'CGDCT', 'Childcare',
              'Detective', 'Gag Humor', 'Harem',
              'Historical', 'Idols (Female)',
              'Idols (Male)', 'Isekai', 'Iyashikei',
              'Mahou Shoujo', 'Mecha', 'Military', 'Music', 'Mythology',
              'Organized Crime', 'Parody',
              'Psychological',
              'Reverse Harem', 'Romantic Subtext',
              'Samurai', 'School', 'Space', 'Strategy Game',
              'Team Sports', 'Time Travel',
              'Vampire', 'Video Game']
# keep only the names that survived the intersection; a hard-coded name that
# is missing from either dataset would otherwise raise a KeyError below
meta_cols = [c for c in meta_cols if c in df_anime.columns]
ordering = meta_cols + [c for c in genre_cols if c in df_anime.columns]
ordering.extend(df_anime.columns.drop(ordering))

df_anime = df_anime[ordering]
df_test = df_test[ordering]

# everything after the descriptive columns is a genre/studio indicator
flag_cols = ordering[len(meta_cols):]


def _flags_to_int(flags):
  """Map truthy indicator cells to 1 and falsy or missing cells to 0.

  NaN is truthy in Python, so missing cells are masked out explicitly
  instead of being counted as 1.
  """
  return (flags.notna() & flags.astype(bool)).astype(int)


# make True/False into 1/0
df_anime = pd.concat(
    (df_anime[meta_cols], _flags_to_int(df_anime[flag_cols])), axis=1)
df_test = pd.concat(
    (df_test[meta_cols], _flags_to_int(df_test[flag_cols])), axis=1)

# take out unhelpful portion of synopsis strings
# (raw string: '\[' in a normal string literal is an invalid escape sequence)
credit_pattern = r'\[Written by MAL Rewrite\]'
df_anime['synopsis'] = df_anime['synopsis'].map(
    lambda syn: re.sub(credit_pattern, '', syn)
    if type(syn) == str else syn)
df_test['synopsis'] = df_test['synopsis'].map(
    lambda syn: re.sub(credit_pattern, '', syn)
    if type(syn) == str else syn)

# make csvs for machine learning portion
df_anime.to_csv('anime_ml.csv')
df_test.to_csv('tanime_ml.csv')

In [None]:
# preview the first rows of the training frame
df_anime.head()

Unnamed: 0_level_0,title,synopsis,source,rating,popularity,num_list_users,num_episodes,media_type,mean,Action,...,Studio 3Hz,Studio Colorido,Studio Gokumi,Studio Jemi,Studio Kai,Sunrise,TMS Entertainment,Yumeta Company,Zexcs,feel.
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21,One Piece,"Gol D. Roger was known as the ""Pirate King,"" t...",manga,pg_13,26,1882485,0,tv,8.65,1,...,0,0,0,0,0,0,0,0,0,0
50265,Spy x Family,"Corrupt politicians, frenzied nationalists, an...",manga,pg_13,197,743717,12,tv,9.04,1,...,0,0,0,0,0,0,0,0,0,0
50160,Kingdom 4th Season,Following the conclusion of the large-scale co...,manga,r,3723,26245,26,tv,8.81,1,...,0,0,0,0,0,0,0,0,0,0
48916,Love Live! Nijigasaki Gakuen School Idol Douko...,Second season of Love Live! Nijigasaki Gakuen ...,other,pg_13,3848,24677,13,tv,7.93,0,...,0,0,0,0,0,1,0,0,0,0
49570,Wu Dong Qian Kun 3rd Season,Lin Dong continues his journey to find the anc...,novel,pg_13,10768,1416,12,ona,7.08,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# preview the first rows of the test frame
df_test.head()

Unnamed: 0_level_0,title,synopsis,source,rating,popularity,num_list_users,num_episodes,media_type,mean,Action,...,Studio 3Hz,Studio Colorido,Studio Gokumi,Studio Jemi,Studio Kai,Sunrise,TMS Entertainment,Yumeta Company,Zexcs,feel.
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
51805,Ginga Eiyuu Densetsu: Die Neue These - Sakubou,Fourth season of Ginga Eiyuu Densetsu: Die Neu...,novel,r,8408,3493,3,movie,,0,...,0,0,0,0,0,0,0,0,0,0
50379,Shoot! Goal to the Future,Atsushi Kamiya is a former captain at Kakegawa...,original,pg_13,8202,3776,0,tv,,0,...,0,0,0,0,0,0,0,0,0,0
50470,Kami Kuzu☆Idol,"Yuuya, one half of the boy pop duo ZINGS, may ...",manga,pg_13,8443,3451,0,tv,,0,...,0,0,1,0,0,0,0,0,0,0
48573,Uta no☆Prince-sama♪ Movie: Maji Love ST☆RISH T...,The first installment of the new Uta no☆Prince...,game,pg_13,8496,3356,1,movie,,0,...,0,0,0,0,0,0,0,0,0,0
51251,D4DJ: Double Mix,A special episode focusing on the story of Mer...,mixed_media,pg_13,9600,2213,1,special,,0,...,0,0,0,0,0,0,0,0,0,0
