# Getting anime details
* We store anime metadata in `data/mal/anime_facts/anime.csv`
* This notebook should take 1-2 days to finish

In [1]:
import json
import os
import time

import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [2]:
data_path = "../../data/mal/anime_facts"
if not os.path.exists(data_path):
    os.mkdir(data_path)
os.chdir(data_path)

In [3]:
token = json.load(open("../mal_authentication/token.json", "r"))

In [None]:
@sleep_and_retry
@limits(calls=1, period=0.75)
def call_api(url):
    try:
        response = requests.get(
            url, headers={"Authorization": f'Bearer {token["access_token"]}'}
        )
        if response.status_code == 500:
            # This can occur if MAL servers go down
            raise Exception(f"{response.status_code}")
    except Exception as e:
        retry_timeout = 3600
        print(
            f"Received error {str(e)} while accessing {url}. Retrying in {retry_timeout} seconds"
        )
        time.sleep(retry_timeout)
        return call_api(url)
    return response

In [5]:
all_fields = [
    "id",
    "title",
    "main_picture",
    "alternative_titles",
    "start_date",
    "end_date",
    "synopsis",
    "mean",
    "rank",
    "popularity",
    "num_list_users",
    "num_scoring_users",
    "nsfw",
    "created_at",
    "updated_at",
    "media_type",
    "status",
    "genres",
    "my_list_status",
    "num_episodes",
    "start_season",
    "broadcast",
    "source",
    "average_episode_duration",
    "rating",
    "pictures",
    "background",
    "related_anime",
    "related_manga",
    "recommendations",
    "studios",
    "statistics",
]

In [6]:
relevant_fields = [
    "id",
    "title",
    "media_type",
    "status",
    "genres",
    "num_episodes",
    "source",
    "status",
    "related_anime",
]

In [7]:
def process_json(json):
    return pd.DataFrame.from_dict(
        {
            "anime_id": [json["id"]],
            "title": [json["title"]],
            "genres": [
                [x["name"] for x in json["genres"]] if "genres" in json else "Na"
            ],
            "source": [json["source"] if "source" in json else "NaN"],
            "related_anime": [
                [
                    {"anime_id": x["node"]["id"], "relation": x["relation_type"]}
                    for x in json["related_anime"]
                ]
            ],
        }
    )

In [8]:
def get_anime_details(anime_id):
    url = f'https://api.myanimelist.net/v2/anime/{anime_id}?fields={",".join(relevant_fields)}'
    response = call_api(url)

    if response.status_code == 500:
        # The MAL API can spuriously reject for rate limits
        print(f"Error code {response.status_code} for {anime_id}")
        time.sleep(60)
        response = call_api(url)

    if response.status_code == 404:
        return None, False

    response.raise_for_status()
    df = process_json(response.json())
    return df, True

In [9]:
anime_details = []
failed_ids = []
tentaive_max_anime_id = 10
anime_id = 0
continue_searching = True

In [10]:
# MAL anime id's are sequential, but we don't know what the largest anime id is
# Let's do a simple exponential search
while continue_searching:
    continue_searching = False
    for anime_id in tqdm(range(anime_id + 1, tentaive_max_anime_id)):
        details, ok = get_anime_details(anime_id)
        if ok:
            anime_details.append(details)
            continue_searching = True
        else:
            failed_ids.append(anime_id)
        anime_id += 1
    # we found more series; we probably haven't hit the max yet
    tentaive_max_anime_id = int(tentaive_max_anime_id * 2)

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:06<00:00,  1.46it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:06<00:00,  1.32it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:14<00:00,  1.32it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 39/39 [00:30<00:00,  1.29it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 79/79 [01:01<00:00,  1.28it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 159/159 [02:03<00:00,  1.29it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 319/319 [04:17<00:00,  1.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 639/639 [08:11<00:00,  1.30it/s]
100%|███████████████████████████████████

HTTPError: 500 Server Error: Internal Server Error for url: https://api.myanimelist.net/v2/anime/118488?fields=id,title,media_type,status,genres,num_episodes,source,status,related_anime

In [13]:
anime_list = pd.concat(anime_details, ignore_index=True)

In [16]:
# Sanity check the results of the exponential search
# If this fails then we have stopped the search too early
spring_2021_anime_id = 41587
assert anime_id > spring_2021_anime_id

In [15]:
anime_list.to_csv(os.path.join(data_path, "anime.csv"))