In [1]:
import json
import os

import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [2]:
data_path = "../../data/anime_facts"
if not os.path.exists(data_path):
    os.mkdir(data_path)
os.chdir(data_path)

In [3]:
token = json.load(open("../mal_authentication/token.json", "r"))

In [4]:
# Limit api requests to 2 calls per second
@sleep_and_retry
@limits(calls=2, period=1)
def call_api(url):
    response = requests.get(
        url, headers={"Authorization": f'Bearer {token["access_token"]}'}
    )
    return response

In [5]:
all_fields = [
    "id",
    "title",
    "main_picture",
    "alternative_titles",
    "start_date",
    "end_date",
    "synopsis",
    "mean",
    "rank",
    "popularity",
    "num_list_users",
    "num_scoring_users",
    "nsfw",
    "created_at",
    "updated_at",
    "media_type",
    "status",
    "genres",
    "my_list_status",
    "num_episodes",
    "start_season",
    "broadcast",
    "source",
    "average_episode_duration",
    "rating",
    "pictures",
    "background",
    "related_anime",
    "related_manga",
    "recommendations",
    "studios",
    "statistics",
]

In [6]:
relevant_fields = [
    "id",
    "title",
    "media_type",
    "status",
    "genres",
    "num_episodes",
    "source",
    "status",
    "related_anime",
]

In [7]:
def process_json(json):
    return pd.DataFrame.from_dict(
        {
            "anime_id": [json["id"]],
            "title": [json["title"]],
            "genres": [[x["name"] for x in json["genres"]]],
            "source": [json["source"] if "source" in json else "NaN"],
            "related_anime": [
                [
                    {"anime_id": x["node"]["id"], "relation": x["relation_type"]}
                    for x in json["related_anime"]
                ]
            ],
        }
    )

In [8]:
def get_anime_details(anime_id):
    url = f'https://api.myanimelist.net/v2/anime/{anime_id}?fields={",".join(relevant_fields)}'
    response = call_api(url)

    if response.status_code == 404:
        return None, False

    response.raise_for_status()
    df = process_json(response.json())
    return df, True

In [9]:
# MAL anime id's are sequential, but we don't know what the largest anime id is
# Let's do a simple exponential search
anime_details = []
failed_ids = []
tentaive_max_anime_id = 10
anime_id = 0
continue_searching = True
while continue_searching:
    continue_searching = False
    for anime_id in tqdm(range(anime_id + 1, tentaive_max_anime_id)):
        details, ok = get_anime_details(anime_id)
        if ok:
            anime_details.append(details)
            continue_searching = True
        else:
            failed_ids.append(anime_id)
        anime_id += 1
    # we found more series; we probably haven't hit the max yet
    tentaive_max_anime_id = int(tentaive_max_anime_id * 2)

100%|██████████| 9/9 [00:04<00:00,  2.08it/s]
100%|██████████| 9/9 [00:12<00:00,  1.40s/it]
100%|██████████| 19/19 [00:10<00:00,  1.86it/s]
100%|██████████| 39/39 [00:20<00:00,  1.89it/s]
100%|██████████| 79/79 [01:22<00:00,  1.05s/it]
100%|██████████| 159/159 [01:40<00:00,  1.58it/s]
100%|██████████| 319/319 [03:00<00:00,  1.76it/s]
100%|██████████| 639/639 [06:13<00:00,  1.71it/s]
100%|██████████| 1279/1279 [12:41<00:00,  1.68it/s]
 43%|████▎     | 1088/2559 [28:01<37:53,  1.55s/it] 


ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))