# Getting anime details
* We store anime metadata in `data/mal/anime_facts/anime.csv`

In [None]:
import json
import os
import time
import urllib.request

import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [None]:
# Switch the working directory to the anime_facts data folder so that the
# relative paths used later in this file ("../mal_authentication/token.json",
# "anime.csv") resolve against it.
# NOTE: data_path is relative to the current working directory, so this cell
# resolves to a different location if it is run again after the chdir below.
data_path = "../../data/mal/anime_facts"
if not os.path.exists(data_path):
    # os.mkdir creates only the final directory; its parents must already exist.
    os.mkdir(data_path)
os.chdir(data_path)

In [None]:
# Load the MAL API token from its JSON file (path relative to the working
# directory). call_api reads token["access_token"] for the Bearer header.
# The context manager closes the file handle as soon as parsing finishes.
with open("../mal_authentication/token.json", "r", encoding="utf-8") as token_file:
    token = json.load(token_file)

In [None]:
@sleep_and_retry
@limits(calls=1, period=0.75)
def call_api(url):
    """Send an authenticated GET request to ``url``, waiting out outages.

    Calls are throttled by the ``ratelimit`` decorators to one entry per 0.75
    seconds; ``sleep_and_retry`` makes an over-limit call wait instead of fail.

    Any exception raised while issuing the request, or an HTTP 500 reply, is
    logged and retried after one hour, indefinitely. Retries run in a loop
    rather than by calling ``call_api`` again, so a long outage does not grow
    the call stack. The one-hour pause is far longer than the throttle period,
    so skipping the rate limiter on retries does not increase the request rate.

    Args:
        url: Fully formed request URL.

    Returns:
        The ``requests`` response. Every status other than 500 (including 404
        and other error codes) is returned untouched for the caller to handle.
    """
    retry_timeout = 3600
    while True:
        try:
            response = requests.get(
                url, headers={"Authorization": f'Bearer {token["access_token"]}'}
            )
            if response.status_code != 500:
                return response
            # This can occur if MAL servers go down
            error = f"{response.status_code}"
        except Exception as e:
            # Deliberately broad: this is a long-running best-effort scraper,
            # and any request failure is treated as a temporary outage.
            error = str(e)
        print(
            f"Received error {error} while accessing {url}. Retrying in {retry_timeout} seconds"
        )
        time.sleep(retry_timeout)

In [None]:
# Fields requested for every anime. The order matters: get_anime_details joins
# the list into the `fields` query parameter, and process_json walks it to lay
# out the regular DataFrame columns.
relevant_fields = [
    "id", "title", "main_picture", "alternative_titles",
    "start_date", "end_date", "synopsis",
    "num_list_users", "num_scoring_users", "nsfw",
    "genres", "media_type", "status", "related_anime",
    "num_episodes", "start_season", "source",
    "average_episode_duration", "studios", "recommendations",
]

In [None]:
def process_json(json, fields=None):
    """Convert one anime-details payload into a single-row DataFrame.

    Args:
        json: Decoded response body as a dict. The parameter name shadows the
            ``json`` module inside this function; it is kept so existing
            callers are unaffected.
        fields: Names of the payload keys to copy into the row. Defaults to
            the module-level ``relevant_fields``.

    Returns:
        A one-row DataFrame. Plain fields are copied verbatim, with the string
        ``"NaN"`` standing in for any key missing from the payload.
        ``related_anime`` and ``recommendations`` are reduced to lists of small
        dicts keyed by ``anime_id``, and ``genres`` to a list of genre names;
        each of these three is an empty list when the payload lacks the key.
        The ``id`` and ``media_type`` columns are renamed to ``anime_id`` and
        ``medium``.
    """
    if fields is None:
        fields = relevant_fields

    # Every value is wrapped in a one-element list so the frame has one row.
    special_cols = {
        "related_anime": [
            [
                {"anime_id": x["node"]["id"], "relation": x["relation_type"]}
                for x in json.get("related_anime", [])
            ]
        ],
        "recommendations": [
            [
                {
                    "anime_id": x["node"]["id"],
                    "num_recommendations": x["num_recommendations"],
                }
                for x in json.get("recommendations", [])
            ]
        ],
        "genres": [[x["name"] for x in json.get("genres", [])]],
    }
    regular_cols = {
        key: [json.get(key, "NaN")] for key in fields if key not in special_cols
    }
    # Special columns are appended after the regular ones, in the order above.
    regular_cols.update(special_cols)
    df = pd.DataFrame.from_dict(regular_cols)
    return df.rename({"id": "anime_id", "media_type": "medium"}, axis=1)

In [None]:
def get_anime_details(anime_id):
    """Fetch and tabulate the details of a single anime.

    Args:
        anime_id: MAL id to look up.

    Returns:
        A ``(details, ok)`` pair: ``(None, False)`` when the API answers 404,
        otherwise the DataFrame built by ``process_json`` and ``True``.

    Raises:
        Whatever ``response.raise_for_status()`` raises for any other error
        status.
    """
    field_list = ",".join(relevant_fields)
    endpoint = f"https://api.myanimelist.net/v2/anime/{anime_id}?fields={field_list}&nsfw=true"
    reply = call_api(endpoint)

    # A 404 is an expected outcome (no such id), not an error.
    missing = reply.status_code == 404
    if missing:
        return None, False

    reply.raise_for_status()
    return process_json(reply.json()), True

In [None]:
    # Scratch request: fetch anime id 1 without a `fields` parameter and raise
    # on an HTTP error status. `url` and `response` stay in scope; the next
    # cell displays the response body.
    url = f'https://api.myanimelist.net/v2/anime/1'
    response = call_api(url)

    response.raise_for_status()

In [None]:
# Display the decoded JSON body of the scratch response fetched in the cell above.
response.json()

In [None]:
def get_anime_details_from_cache():
    """Fetch details for every anime id listed in the public mal-id-cache.

    Downloads the id cache, merges all of its id lists into one set, and
    requests the details of each id in ascending order. Ids the API no longer
    recognises are reported and skipped.

    Returns:
        A DataFrame with one row per anime that was found.

    Raises:
        ValueError: from ``pd.concat`` when no id yielded any details.
        Network and parsing errors from downloading the cache also propagate.
    """
    # uses https://github.com/seanbreckenridge/mal-id-cache to get an up-to-date list of MAL ids
    anime_id_cache_url = "https://raw.githubusercontent.com/seanbreckenridge/mal-id-cache/master/cache/anime_cache.json"
    # SECURITY: this used to eval() the downloaded bytes, which would execute
    # any code served from that URL. The cache is a JSON document, so it is
    # parsed strictly as data instead.
    with urllib.request.urlopen(anime_id_cache_url) as cache_response:
        anime_id_cache = json.loads(cache_response.read())

    # The cache maps category names to id lists; only the ids matter here.
    valid_anime_ids = set().union(*anime_id_cache.values())

    anime_details = []
    # Sorted so the request order (and row order of the result) is reproducible.
    for anime_id in tqdm(sorted(valid_anime_ids)):
        details, ok = get_anime_details(anime_id)
        if ok:
            anime_details.append(details)
        else:
            print(f"Stale cache value for anime_id {anime_id}.")
    return pd.concat(anime_details, ignore_index=True)

In [None]:
def get_anime_details_from_linear_search(start_id, step = 1000):
    """Discover anime by probing consecutive ids starting at ``start_id``.

    We provide this as an alternate method to get an up-to-date list of MAL
    ids, just in case https://github.com/seanbreckenridge/mal-id-cache stops
    being updated.

    Ids are probed in windows of ``step``. The search continues while a window
    contains at least one existing anime and stops after the first window in
    which none is found.

    Args:
        start_id: First id to probe.
        step: Number of consecutive ids per window.

    Returns:
        A DataFrame with one row per anime found, or an empty DataFrame when
        no probed id exists.
    """
    anime_details = []

    # MAL anime id's are monotonic, but we don't know what the largest anime id is
    # Let's do a simple linear search
    window_start = start_id
    found_in_window = True
    while found_in_window:
        found_in_window = False
        for anime_id in tqdm(range(window_start, window_start + step)):
            details, ok = get_anime_details(anime_id)
            if ok:
                # we found more series. We haven't hit the max yet
                anime_details.append(details)
                found_in_window = True
        window_start += step

    if not anime_details:
        # pd.concat rejects an empty list; an empty frame lets callers proceed.
        return pd.DataFrame()
    return pd.concat(anime_details, ignore_index=True)

In [None]:
def verify_completion(anime_list, num_extra_ids_to_check=1000):
    """Check that no indexable anime exists beyond the largest recorded id.

    Probes the ``num_extra_ids_to_check`` ids that follow the maximum
    ``anime_id`` in ``anime_list``. Ids that do not exist, have not aired yet,
    or have no scoring users are ignored.

    Args:
        anime_list: DataFrame with an ``anime_id`` column.
        num_extra_ids_to_check: How many ids past the maximum to probe.

    Raises:
        Exception: if a probed id exists, has aired, and has scoring users.
    """
    print("Verifying that all anime_ids have been indexed", flush=True)
    max_valid_id = anime_list["anime_id"].max()
    first_candidate = max_valid_id + 1
    candidates = range(first_candidate, first_candidate + num_extra_ids_to_check)
    for anime_id in tqdm(candidates):
        found, exists = get_anime_details(anime_id)
        if not exists:
            continue
        if found["status"].squeeze() == "not_yet_aired":
            continue
        # process_json stores the string "NaN" when the field is absent.
        scorers = found["num_scoring_users"].squeeze()
        if scorers == "NaN" or scorers == 0:
            continue
        raise Exception(
            f"Found a valid anime_id at {anime_id}, which is greater than the max recorded id of {max_valid_id}"
        )

In [None]:
# Build anime.csv: prefer the id cache, and fall back to (or top up with) a
# linear id search when the cache cannot be loaded or fails verification.
max_anime_id = 0
# Reset explicitly so a failed cache download neither raises NameError in the
# fallback below nor picks up a stale frame from an earlier run of this cell.
anime_list = None
try:
    anime_list = get_anime_details_from_cache()
    max_anime_id = anime_list["anime_id"].max()
    verify_completion(anime_list)
except Exception as e:
    print(f"Received error when loading anime_ids from cache: '{str(e)}'.")
    print("Regenerating anime_ids via linear search...")
    # Resume after the largest id already collected (id 1 if nothing was).
    extra_anime_list = get_anime_details_from_linear_search(max_anime_id + 1)
    collected = [df for df in (anime_list, extra_anime_list) if df is not None]
    anime_list = pd.concat(collected, ignore_index=True)
anime_list.to_csv("anime.csv", index=False)

In [None]:
# No-op cell: evaluates the literal 1; nothing visible in this file uses the result.
1

In [None]:
# No-op cell: evaluates the literal 1; nothing visible in this file uses the result.
1