# Getting anime details
* We store anime metadata in `data/mal/anime_facts/anime.csv`

In [None]:
import json
import logging
import os
import time
import urllib.request

import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm
import glob

In [None]:
# Working directory for this notebook: every relative path used below
# (log file, token file, input globs, anime.csv) is resolved from here.
data_path = "../../data/mal/anime_facts"
# makedirs also creates missing parent directories and, with exist_ok=True,
# does not fail when the directory is already present.
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [None]:
# logging
# Messages from the "GetAnime" logger go both to a log file in the current
# working directory and to the console, using the same format.
logger = logging.getLogger("GetAnime")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
# logging.getLogger returns the same object for the same name, so executing
# this cell again would otherwise stack a second pair of handlers and emit
# every message twice. Only attach handlers when none are present yet.
if not logger.handlers:
    for stream in [
        logging.FileHandler("get_user_anime_lists.log"),
        logging.StreamHandler(),
    ]:
        stream.setFormatter(formatter)
        logger.addHandler(stream)

In [None]:
# Load the MAL API token; its "access_token" entry is sent as the Bearer
# credential by call_api. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open("../mal_authentication/token.json", "r") as token_file:
    token = json.load(token_file)

In [None]:
# apply rate limiting with exponential backoff for unexpected errors
@sleep_and_retry
@limits(calls=1, period=0.75)
def call_api(url, retry_timeout=1):
    """GET `url` with the stored bearer token, retrying on transient failures.

    Args:
        url: Fully-formed request URL.
        retry_timeout: Seconds to sleep before the next retry. Doubled after
            each failed attempt and capped at 3600.

    Returns:
        The `requests.Response`. Responses with status 500 or 504 are retried
        until `retry_timeout` has reached 3600, after which the response is
        returned as-is. Any other status code is returned immediately, so the
        caller is responsible for checking it.

    Network-level failures (`requests.RequestException`, including timeouts)
    are retried indefinitely. Other exceptions, such as a missing
    "access_token" key, propagate instead of being retried forever.
    """
    try:
        response = requests.get(
            url,
            headers={"Authorization": f'Bearer {token["access_token"]}'},
            # Without a timeout a stalled connection would block forever.
            timeout=60,
        )
    except requests.RequestException as e:
        error = str(e)
    else:
        if response.status_code not in (500, 504) or retry_timeout >= 3600:
            return response
        # This can occur if MAL servers go down or if the page doesnt exist
        error = str(response.status_code)

    logger.warning(
        f"Received error {error} while accessing {url}. Retrying in {retry_timeout} seconds"
    )
    time.sleep(retry_timeout)
    # Retry through the decorated function so each attempt is still rate limited.
    return call_api(url, min(retry_timeout * 2, 3600))

In [None]:
# Field names requested from the MAL anime endpoint: get_anime_details joins
# them into the `fields` query parameter, and process_json iterates over them
# to build the columns of the output row.
relevant_fields = [
    "id",
    "title",
    "main_picture",
    "alternative_titles",
    "start_date",
    "end_date",
    "synopsis",
    "num_list_users",
    "num_scoring_users",
    "nsfw",
    "genres",
    "media_type",
    "status",
    "related_anime",
    "num_episodes",
    "start_season",
    "source",
    "average_episode_duration",
    "studios",
    "recommendations",
]

In [None]:
def process_json(json):
    """Flatten one anime-details payload into a single-row DataFrame.

    Args:
        json: Decoded JSON object (a dict) for one anime. The parameter name
            is kept for backward compatibility; note that it shadows the
            `json` module inside this function.

    Returns:
        A one-row `pandas.DataFrame` with one column per entry of
        `relevant_fields`. Fields missing from the payload are stored as the
        string "NaN". `related_anime`, `recommendations` and `genres` are
        reduced to compact lists and default to an empty list when absent.
        The `id` column is renamed to `anime_id` and `media_type` to `medium`.
    """

    def get_key(key):
        # Every column is a one-element list so the frame has exactly one row.
        return [json.get(key, "NaN")]

    # Nested fields are reduced to the parts that are kept. Missing keys are
    # treated as empty lists so a sparse payload does not raise KeyError.
    special_cols = {
        "related_anime": [
            [
                {"anime_id": x["node"]["id"], "relation": x["relation_type"]}
                for x in json.get("related_anime", [])
            ]
        ],
        "recommendations": [
            [
                {
                    "anime_id": x["node"]["id"],
                    "num_recommendations": x["num_recommendations"],
                }
                for x in json.get("recommendations", [])
            ]
        ],
        "genres": [[x["name"] for x in json.get("genres", [])]],
    }
    regular_cols = {x: get_key(x) for x in relevant_fields if x not in special_cols}
    regular_cols.update(special_cols)
    df = pd.DataFrame.from_dict(regular_cols)
    df = df.rename({"id": "anime_id", "media_type": "medium"}, axis=1)
    return df

In [None]:
def get_anime_details(anime_id):
    """Fetch the details of one anime and convert them to a DataFrame row.

    Returns a `(details, ok)` pair: `(DataFrame, True)` on success, or
    `(None, False)` when the API answers 404 for this id. Any other HTTP
    error status is raised by `response.raise_for_status()`.
    """
    field_list = ",".join(relevant_fields)
    url = (
        f"https://api.myanimelist.net/v2/anime/{anime_id}"
        f"?fields={field_list}&nsfw=true"
    )
    response = call_api(url)
    if response.status_code != 404:
        response.raise_for_status()
        return process_json(response.json()), True
    return None, False

In [None]:
def get_all_details(anime_ids):
    """Fetch details for every id in `anime_ids` and stack them into one frame.

    Args:
        anime_ids: Iterable of anime ids to look up.

    Returns:
        A `pandas.DataFrame` with one row per id that was found. Ids for which
        `get_anime_details` reports failure are logged and skipped. When no id
        yields a row, an empty DataFrame is returned.
    """
    anime_details = []
    for anime_id in tqdm(anime_ids):
        details, ok = get_anime_details(anime_id)
        if ok:
            anime_details.append(details)
        else:
            logger.info(f"Stale cache value for anime_id {anime_id}.")
    if not anime_details:
        # pd.concat raises ValueError when given an empty list.
        return pd.DataFrame()
    return pd.concat(anime_details, ignore_index=True)

In [None]:
def get_item_ids(input_fn):
    """Collect the distinct integer values of the `uid` column of a CSV file.

    Args:
        input_fn: Path to a CSV file whose first row is a header containing a
            `uid` column.

    Returns:
        A set of the `uid` values converted to `int`. An empty file yields an
        empty set.

    Raises:
        ValueError: If the header has no `uid` column or a value is not an
            integer.
    """
    # Local import keeps this function self-contained; the csv module handles
    # quoted fields that contain commas, which a plain split(",") mis-indexes.
    import csv

    item_ids = set()
    with open(input_fn, "r", newline="") as in_file:
        reader = csv.reader(in_file)
        header = next(reader, None)
        if header is None:
            return item_ids
        idx = [name.strip() for name in header].index("uid")
        for row in tqdm(reader):
            if not row:
                # Blank lines carry no id.
                continue
            item_ids.add(int(row[idx]))
    return item_ids

In [None]:
# Gather the distinct ids found across all user-list shards, fetch the details
# for each of them in ascending id order, and write the result to anime.csv.
shard_paths = glob.glob("../user_anime_facts/user_anime_list.*.csv")
anime_ids = sorted(set().union(*(get_item_ids(path) for path in shard_paths)))
anime_list = get_all_details(anime_ids)
anime_list.to_csv("anime.csv", index=False)