# Getting anime details
* We store anime metadata in `data/mal/anime_facts/anime.csv`

In [1]:
import json
import os
import time
import urllib.request

import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [2]:
# Output directory for this notebook, given relative to the current working
# directory at the time this cell runs.
data_path = "../../data/mal/anime_facts"
# os.mkdir creates only the final path component; the parent directories must
# already exist or this raises.
if not os.path.exists(data_path):
    os.mkdir(data_path)
# Every relative path used later (the token file, anime.csv) is resolved from
# this directory.
# NOTE(review): because data_path is relative, re-running this cell after the
# chdir resolves it from the new location rather than the original one.
os.chdir(data_path)

In [3]:
# Load the API token; call_api reads token["access_token"] to build the
# Authorization header. The context manager closes the file handle instead of
# leaving it open until garbage collection.
with open("../mal_authentication/token.json", "r") as token_file:
    token = json.load(token_file)

In [4]:
@sleep_and_retry
@limits(calls=1, period=0.75)
def call_api(url, retry_timeout=3600, request_timeout=60):
    """Send an authenticated GET request to `url`, retrying until it succeeds.

    The `limits` / `sleep_and_retry` decorators throttle this function to one
    call per 0.75 seconds, sleeping when the budget is exhausted.

    Args:
        url: Full URL to request.
        retry_timeout: Seconds to wait before retrying after a failure.
        request_timeout: Seconds `requests` waits on the server before giving
            up. Without it a stalled connection would block forever.

    Returns:
        The `requests.Response`. Any status other than 500 (including 404) is
        returned to the caller unexamined.
    """
    try:
        response = requests.get(
            url,
            headers={"Authorization": f'Bearer {token["access_token"]}'},
            timeout=request_timeout,
        )
        if response.status_code == 500:
            # This can occur if MAL servers go down
            raise Exception(f"{response.status_code}")
    except Exception as e:
        # Deliberately broad: connection errors, timeouts and the HTTP 500
        # raised above are all treated as "wait and try again".
        print(
            f"Received error {str(e)} while accessing {url}. Retrying in {retry_timeout} seconds"
        )
        time.sleep(retry_timeout)
        # Retry through the decorated function so the retry is rate-limited too.
        return call_api(url, retry_timeout, request_timeout)
    return response

In [5]:
# Catalogue of anime field names.
# NOTE(review): presumably the full set accepted by the MAL API's `fields`
# query parameter - verify against the API reference. No other cell visible in
# this notebook reads this list; requests are built from `relevant_fields`.
all_fields = [
    "id",
    "title",
    "main_picture",
    "alternative_titles",
    "start_date",
    "end_date",
    "synopsis",
    "mean",
    "rank",
    "popularity",
    "num_list_users",
    "num_scoring_users",
    "nsfw",
    "created_at",
    "updated_at",
    "media_type",
    "status",
    "genres",
    "my_list_status",
    "num_episodes",
    "start_season",
    "broadcast",
    "source",
    "average_episode_duration",
    "rating",
    "pictures",
    "background",
    "related_anime",
    "related_manga",
    "recommendations",
    "studios",
    "statistics",
]

In [6]:
# Fields actually requested for each anime: get_anime_details joins this list
# into the `fields` query parameter, and process_json reads these same keys
# from the response.
relevant_fields = [
    "id",
    "title",
    "media_type",
    "related_anime",
    "recommendations",
    "genres",
    "source",
    "num_episodes",
    "status",
    "num_scoring_users",
]

In [7]:
def process_json(json):
    """Flatten one anime-details JSON object into a single-row DataFrame.

    Args:
        json: Decoded JSON response (a dict) for one anime. Note that this
            parameter name shadows the `json` module inside the function.

    Returns:
        A one-row DataFrame with columns anime_id, title, medium,
        related_anime, recommendations, genres, source, num_episodes, status
        and num_scoring_users. Missing scalar fields and a missing `genres`
        field become the string "NaN"; a missing `related_anime` or
        `recommendations` field becomes an empty list instead of raising
        KeyError.
    """

    def get_key(key):
        # Wrapped in a list so each column contributes exactly one row.
        return [json[key] if key in json else "NaN"]

    related_anime = [
        {"anime_id": x["node"]["id"], "relation": x["relation_type"]}
        for x in json.get("related_anime", [])
    ]
    recommendations = [
        {
            "anime_id": x["node"]["id"],
            "num_recommendations": x["num_recommendations"],
        }
        for x in json.get("recommendations", [])
    ]
    genres = [x["name"] for x in json["genres"]] if "genres" in json else "NaN"

    return pd.DataFrame.from_dict(
        {
            "anime_id": get_key("id"),
            "title": get_key("title"),
            "medium": get_key("media_type"),
            # The list-valued cells are wrapped once more so pandas stores the
            # whole list in a single cell rather than spreading it over rows.
            "related_anime": [related_anime],
            "recommendations": [recommendations],
            "genres": [genres],
            "source": get_key("source"),
            "num_episodes": get_key("num_episodes"),
            "status": get_key("status"),
            "num_scoring_users": get_key("num_scoring_users"),
        }
    )

In [8]:
def get_anime_details(anime_id):
    """Fetch and flatten the details of a single anime.

    Returns:
        A `(details, ok)` pair. On HTTP 404 this is `(None, False)`; otherwise
        `details` is the one-row DataFrame built by `process_json` and `ok` is
        True.

    Raises:
        requests.HTTPError: via `raise_for_status` for any other error status.
    """
    requested_fields = ",".join(relevant_fields)
    url = f"https://api.myanimelist.net/v2/anime/{anime_id}?fields={requested_fields}"
    response = call_api(url)

    if response.status_code != 404:
        response.raise_for_status()
        return process_json(response.json()), True

    # A 404 means there is no anime with this id.
    return None, False

In [9]:
def get_anime_ids_from_cache():
    """Fetch details for every anime id listed in the mal-id-cache project.

    Despite the name, this returns anime details, not just ids.

    Returns:
        A DataFrame with one row per anime id that could be fetched. Ids that
        come back as missing are reported on stdout and skipped.

    Raises:
        ValueError: from `pd.concat` if no anime could be fetched at all.
    """
    # uses https://github.com/seanbreckenridge/mal-id-cache to get an up-to-date list of MAL ids
    anime_id_cache_url = "https://raw.githubusercontent.com/seanbreckenridge/mal-id-cache/master/cache/anime_cache.json"
    # SECURITY: the payload comes from the network, so it is parsed as JSON.
    # Evaluating it with eval() would execute whatever code the remote file
    # contained.
    with urllib.request.urlopen(anime_id_cache_url) as cache_response:
        anime_id_cache = json.loads(cache_response.read())

    # The cache maps category names to id lists; merge them into one set.
    valid_anime_ids = set()
    for ids in anime_id_cache.values():
        valid_anime_ids.update(ids)

    anime_details = []
    # Sorted so the crawl order, and therefore the row order, is deterministic.
    for anime_id in tqdm(sorted(valid_anime_ids)):
        details, ok = get_anime_details(anime_id)
        if ok:
            anime_details.append(details)
        else:
            print(f"Stale cache value for anime_id {anime_id}.")
    return pd.concat(anime_details, ignore_index=True)

In [10]:
def get_anime_ids_from_exponential_search():
    """Discover anime by probing ids in windows that double in size.

    We provide this as an alternate method to get an up-to-date list of MAL
    ids, just in case https://github.com/seanbreckenridge/mal-id-cache stops
    being updated.

    Ids are probed in the half-open windows [1, 10), [10, 20), [20, 40), ...
    The search stops after the first window in which no id is valid, so it
    relies on valid ids never being separated by a gap as large as a whole
    window.

    Returns:
        A DataFrame with one row per valid anime id found.

    Raises:
        ValueError: from `pd.concat` if no valid id was found.
    """
    anime_details = []
    next_anime_id = 1
    tentative_max_anime_id = 10
    continue_searching = True

    # We don't know what the largest anime id is, so do a simple exponential
    # search.
    while continue_searching:
        continue_searching = False
        for anime_id in tqdm(range(next_anime_id, tentative_max_anime_id)):
            details, ok = get_anime_details(anime_id)
            if ok:
                anime_details.append(details)
                # we found more series; we probably haven't hit the max yet
                continue_searching = True
        # The next window starts exactly at the old upper bound, so the
        # boundary ids (10, 20, 40, ...) are probed rather than skipped.
        next_anime_id = tentative_max_anime_id
        tentative_max_anime_id *= 2
    return pd.concat(anime_details, ignore_index=True)

In [11]:
def verify_completion(anime_list, num_extra_ids_to_check=1000):
    """Check that no relevant anime exists beyond the largest collected id.

    Probes the `num_extra_ids_to_check` ids that follow the maximum
    `anime_id` in `anime_list`. An id that resolves is tolerated when its
    status is 'not_yet_aired' or when its num_scoring_users is "NaN" or 0.

    Raises:
        Exception: if any other valid anime is found past the recorded maximum.
    """
    print("Verifying that all anime_ids have been indexed", flush=True)
    max_valid_id = anime_list["anime_id"].max()
    first_unseen_id = max_valid_id + 1
    candidate_ids = range(first_unseen_id, first_unseen_id + num_extra_ids_to_check)

    for anime_id in tqdm(candidate_ids):
        details, ok = get_anime_details(anime_id)
        if not ok:
            continue

        # Each details frame has one row, so squeeze() yields a scalar.
        if details["status"].squeeze() == 'not_yet_aired':
            continue

        scoring_users = details["num_scoring_users"].squeeze()
        if scoring_users == "NaN" or scoring_users == 0:
            continue

        raise Exception(
            f"Found a valid anime_id at {anime_id}, which is greater than the max recorded id of {max_valid_id}"
        )

In [12]:
# Build the anime table. The id cache is tried first; if fetching from it or
# verifying its result raises, the table is rebuilt from scratch with the
# exponential search and verified again.
try:
    anime_list = get_anime_ids_from_cache()
    verify_completion(anime_list)
except Exception as e:
    # Deliberately broad: any failure on the cache path, including a failed
    # verification, triggers the fallback.
    print(f"Received error when loading anime_ids from cache: '{str(e)}'.")
    print("Regenerating anime_ids via exponential search...")
    anime_list = get_anime_ids_from_exponential_search()
    # An exception raised here is not caught and aborts the cell before the
    # CSV is written.
    verify_completion(anime_list)  
# Written relative to the current working directory.
anime_list.to_csv("anime.csv", index=False)

 29%|█████████████████████▍                                                   | 5251/17875 [1:08:28<2:37:50,  1.33it/s]

Stale cache value for anime_id 8149.


 73%|████████████████████████████████████████████████████▎                   | 12981/17875 [2:49:22<1:01:31,  1.33it/s]

Stale cache value for anime_id 35322.


 78%|█████████████████████████████████████████████████████████▋                | 13946/17875 [3:02:08<47:43,  1.37it/s]

Stale cache value for anime_id 36925.


 86%|███████████████████████████████████████████████████████████████▌          | 15367/17875 [3:20:38<33:18,  1.26it/s]

Stale cache value for anime_id 39143.


100%|██████████████████████████████████████████████████████████████████████████| 17875/17875 [3:53:15<00:00,  1.28it/s]


Verifying that all anime_ids have been indexed


100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [13:03<00:00,  1.28it/s]
