# Getting MAL anime-lists
* We collect the anime-list for each username in `data/mal/user_facts/usernames.txt`
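* You can terminate or restart the notebook at any point without losing progress. All anime-lists found so far are stored in `data/mal/user_anime_facts/user_anime_list.csv`, and the usernames already processed are recorded in `user_status.csv` in the same directory, so a restart only fetches the remaining users.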
* This notebook may take a long time to finish. Feel free to manually terminate once an acceptable number of anime-lists have been found
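* TODO: make the CSV appends atomic. The status row and the anime-list rows are written by two separate appends, so an interruption between them can leave the two files out of sync.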

In [1]:
import datetime
import json
import os
import time

import numpy as np
import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [2]:
# All files this notebook reads and writes are addressed relative to this directory.
data_path = "../../data/mal/user_anime_facts"
try:
    os.mkdir(data_path)
except FileExistsError:
    # Already present from an earlier run; nothing to create.
    pass
os.chdir(data_path)

In [3]:
# Load the MAL API token; `call_api` sends token["access_token"] as a Bearer credential.
# The context manager closes the file handle instead of leaving it to the garbage collector.
with open("../mal_authentication/token.json", "r") as token_file:
    token = json.load(token_file)

In [4]:
@sleep_and_retry
@limits(calls=1, period=0.75)
def call_api(url):
    """Send an authenticated GET request to the MAL API, retrying on failures.

    Calls are throttled by the `limits(calls=1, period=0.75)` decorator. A
    request that raises, or that comes back with a 5xx status, is retried
    after a long pause until it succeeds.

    Args:
        url: Full URL of the API endpoint to fetch.

    Returns:
        The `requests` response of the first attempt that neither raised nor
        returned a 5xx status. Other statuses (for example 403 or 404) are
        returned unchanged for the caller to interpret.

    Raises:
        KeyError: If the loaded token has no "access_token" entry. This is a
            configuration error, so it is surfaced instead of being retried.
    """
    retry_timeout = 3600  # seconds to wait before the next attempt
    request_timeout = 60  # seconds; without it a stalled connection would hang forever
    headers = {"Authorization": f'Bearer {token["access_token"]}'}

    # Loop rather than recurse so that a long outage cannot grow the call stack.
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
            if response.status_code < 500:
                return response
            # 5xx: This can occur if MAL servers go down
            error = f"{response.status_code}"
        except Exception as e:
            error = str(e)
        print(
            f"Received error {error} while accessing {url}. Retrying in {retry_timeout} seconds"
        )
        time.sleep(retry_timeout)

In [5]:
def process_json(json):
    """Flatten one page of an anime-list API response into a two-column table.

    Args:
        json: Decoded response body. Each element of json["data"] is read for
            its ["node"]["id"] and ["list_status"]["score"] values.

    Returns:
        A DataFrame with columns "anime_id" and "my_score", one row per
        element of json["data"], in the same order.
    """
    records = []
    for entry in json["data"]:
        anime_id = entry["node"]["id"]
        score = entry["list_status"]["score"]
        records.append((anime_id, score))
    return pd.DataFrame.from_records(records, columns=["anime_id", "my_score"])

In [6]:
def get_user_anime_list(username):
    """Download every scored entry of a user's MAL anime list.

    Requests the list 1000 entries at a time and follows the "next" paging
    link of each response until there is none.

    Args:
        username: MAL username whose list should be fetched.

    Returns:
        A `(user_anime_list, ok)` tuple. On success `ok` is True and the frame
        has columns "anime_id", "my_score" and "username", with entries whose
        score is 0 removed. When the API answers 403 or 404, `ok` is False and
        the frame is empty but still carries those three columns, so it can be
        written to CSV with a well-formed header.

    Raises:
        Whatever `response.raise_for_status()` raises for other error statuses.
    """
    columns = ["anime_id", "my_score", "username"]
    pages = []
    url = f"https://api.myanimelist.net/v2/users/{username}/animelist?limit=1000&fields=list_status"
    while url:
        response = call_api(url)

        if response.status_code in (403, 404):
            # 403: This can occur if the user privated their list
            # 404: This can occur if the user deleted their account
            return pd.DataFrame(columns=columns), False

        response.raise_for_status()
        payload = response.json()
        pages.append(process_json(payload))

        # A missing or empty "next" link marks the last page.
        url = (payload.get("paging") or {}).get("next")

    user_anime_list = pd.concat(pages, ignore_index=True)
    user_anime_list["username"] = username
    # Entries with a score of 0 are dropped.
    user_anime_list = user_anime_list[user_anime_list["my_score"] != 0]
    return user_anime_list, True

In [7]:
def load_user_status(path):
    """Load the log of already-processed users, or start an empty one.

    Args:
        path: Location of the status CSV (columns "username",
            "access_timestamp" and "success").

    Returns:
        A `(user_status, first_run)` tuple. When `path` does not exist,
        `user_status` is an empty frame with the three columns and `first_run`
        is True; otherwise the file contents are returned with `first_run`
        False.
    """
    if not os.path.exists(path):
        empty_status = pd.DataFrame.from_dict(
            {"username": [], "access_timestamp": [], "success": []}
        )
        return empty_status, True

    # Usernames are arbitrary strings. With pandas' default NA handling, names
    # such as "null", "NA" or "nan" would be read back as NaN and never match
    # the username list, so those users would be re-fetched on every restart.
    status = pd.read_csv(path, dtype={"username": str}, keep_default_na=False)
    return status, False


user_status, first_run = load_user_status("user_status.csv")

In [8]:
with open("../user_facts/usernames.txt", "r") as f:
    # One username per line. Blank lines are skipped; otherwise they would
    # become an empty username that is sent to the API and logged as processed.
    usernames = [name for name in (line.strip() for line in f) if name]

In [9]:
# Keep only the usernames that have no row in user_status yet, in random order.
already_processed = set(user_status["username"])
usernames = list(set(usernames).difference(already_processed))
np.random.shuffle(usernames)
for message in (
    f"Found the anime lists of {len(user_status)} existing users!",
    f"Getting the anime lists of {len(usernames)} new users!",
):
    print(message)

Found the anime lists of 81104 existing users!
Getting the anime lists of 210920 new users!


In [None]:
def write_rows(frame, path, first_run):
    """Write `frame` to the CSV at `path`, emitting the header row when needed.

    On the first run the file is created or overwritten and gets a header. On
    later runs the rows are appended, and a header is written only if the file
    is missing or empty. The header decision is made per file, so it no longer
    depends on whether a different CSV happens to exist.
    """
    needs_header = (
        first_run or not os.path.exists(path) or os.path.getsize(path) == 0
    )
    frame.to_csv(
        path, index=False, mode="w" if first_run else "a", header=needs_header
    )


for username in tqdm(usernames):
    user_anime_list, ok = get_user_anime_list(username)
    if len(user_anime_list.columns) == 0:
        # A failed lookup can come back as a frame without columns. Written as
        # the very first frame it would produce a blank header line, so give it
        # the expected columns first.
        user_anime_list = user_anime_list.reindex(
            columns=["anime_id", "my_score", "username"]
        )
    status_row = pd.DataFrame.from_dict(
        {
            "username": [username],
            "access_timestamp": [int(datetime.datetime.now().timestamp())],
            "success": [ok],
        }
    )
    # NOTE: these two writes are not atomic. An interruption between them
    # leaves the user marked as processed without their list being stored.
    write_rows(status_row, "user_status.csv", first_run)
    write_rows(user_anime_list, "user_anime_list.csv", first_run)
    first_run = False

 54%|████████████████████████████████████▉                               | 114419/210920 [27:07:09<28:47:27,  1.07s/it]

Recevied error 500 while accessing https://api.myanimelist.net/v2/users/OwlBlue/animelist?offset=13000&limit=1000&fields=list_status. Retrying in 3600 seconds


 87%|████████████████████████████████████████████████████████████▎        | 184277/210920 [44:21:12<7:16:53,  1.02it/s]

In [None]:
# Manual spot check: fetches the list of whichever `username` the loop above processed last.
# NOTE(review): looks like leftover debugging. `username` is undefined if the loop
# never ran, and the call issues a live API request. Confirm whether this cell is still needed.
get_user_anime_list(username)