# Getting MAL anime-lists
* We collect the anime-list for each username in `data/mal/user_facts/usernames.txt`
* You can terminate or restart the notebook at any point without losing progress. All anime-lists found so far are stored at `data/mal/user_anime_facts/user_anime_list.csv`
* This notebook may take a long time to finish. Feel free to manually terminate once an acceptable number of anime-lists have been found
* TODO: don't discard unrated shows (entries with a score of 0 are currently dropped)

In [1]:
import datetime
import json
import logging
import os
import time

import numpy as np
import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [2]:
# All relative paths used later in the notebook are resolved from this
# directory, so create it (and any missing parents) and make it the cwd.
data_path = "../../data/mal/user_anime_facts"
# exist_ok avoids the check-then-create race of an explicit exists() test.
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [3]:
# Log both to a file in the working directory and to the notebook output.
logger = logging.getLogger("GetUserAnimeLists")
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object every time, so re-running this
# cell would stack a second pair of handlers and duplicate every message.
# Detach and close whatever a previous execution attached first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
formatter = logging.Formatter(
    "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
for stream in [
    logging.FileHandler("get_user_anime_lists.log"),
    logging.StreamHandler(),
]:
    stream.setFormatter(formatter)
    logger.addHandler(stream)

In [4]:
# Load the MAL API credentials; the context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open("../mal_authentication/token.json", "r") as token_file:
    token = json.load(token_file)

In [5]:
# Apply rate limiting, with exponential backoff for unexpected errors.
@sleep_and_retry
@limits(calls=1, period=0.75)
def call_api(url, retry_timeout=1, request_timeout=60):
    """GET `url` from the MAL API, retrying until a response is obtained.

    Any exception raised while performing the request, as well as an HTTP 500
    response, triggers a retry after sleeping `retry_timeout` seconds; the
    sleep doubles on every attempt up to a cap of one hour.

    Args:
        url: Fully qualified API URL to fetch.
        retry_timeout: Seconds to wait before the next retry.
        request_timeout: Seconds `requests` may wait on the server before
            giving up. Without it a stalled connection would block forever.

    Returns:
        The `requests` response. Its status may still be a non-500 error,
        which callers are expected to inspect.
    """
    try:
        response = requests.get(
            url,
            headers={"Authorization": f'Bearer {token["access_token"]}'},
            timeout=request_timeout,
        )
        if response.status_code in [500]:
            # This can occur if MAL servers go down
            raise Exception(f"{response.status_code}")
    except Exception as e:
        logger.warning(
            f"Received error {str(e)} while accessing {url}. Retrying in {retry_timeout} seconds"
        )
        time.sleep(retry_timeout)
        retry_timeout = min(retry_timeout * 2, 3600)
        # Recurse through the decorated function so the retry is itself
        # subject to the rate limit.
        return call_api(url, retry_timeout, request_timeout)
    return response

In [6]:
def process_json(json):
    """Flatten one page of an animelist response into a two-column table.

    `json` is the decoded response body. Every item under its "data" key
    yields one row holding the show id (`node.id`) and the score the user
    gave it (`list_status.score`).

    Returns:
        A DataFrame with columns `anime_id` and `my_score`.
    """
    rows = []
    for entry in json["data"]:
        anime_id = entry["node"]["id"]
        score = entry["list_status"]["score"]
        rows.append((anime_id, score))
    return pd.DataFrame.from_records(rows, columns=["anime_id", "my_score"])

In [7]:
def get_user_anime_list(username):
    """Download every rated entry of `username`'s anime list.

    Follows the API's pagination links until no further page is advertised.

    Args:
        username: MAL username whose list should be fetched.

    Returns:
        A `(frame, ok)` tuple. `frame` has the columns `anime_id`,
        `my_score` and `username`; entries with a score of 0 are dropped.
        `ok` is False when the list could not be retrieved, in which case
        `frame` is empty but still carries those columns so that it
        serialises with a correct CSV header.
    """

    def failure():
        # Keep the schema on failure: a column-less frame would write a
        # bogus header if it happened to be the first thing saved to the CSV.
        return pd.DataFrame(columns=["anime_id", "my_score", "username"]), False

    anime_lists = []
    url = f"https://api.myanimelist.net/v2/users/{username}/animelist?limit=1000&fields=list_status"
    while url:
        response = call_api(url)
        if response.status_code in [403, 404]:
            # 403: This can occur if the user privated their list
            # 404: This can occur if the user deleted their account
            return failure()
        if not response.ok:
            logger.warning(f"Error {response} received when handling {url}")
            return failure()

        # Named `payload` rather than `json` to avoid shadowing the module.
        payload = response.json()
        anime_lists.append(process_json(payload))
        # The last page carries no "next" link, which ends the loop.
        url = payload.get("paging", {}).get("next")
    user_anime_list = pd.concat(anime_lists, ignore_index=True)
    user_anime_list["username"] = username
    # A score of 0 is filtered out here (see the notebook TODO on unrated shows).
    user_anime_list = user_anime_list.loc[lambda x: x["my_score"] != 0]
    return user_anime_list, True

In [8]:
# Read the candidate usernames, one per line, with surrounding whitespace removed.
with open("../user_facts/usernames.txt", "r") as f:
    usernames = []
    for raw_line in f:
        usernames.append(raw_line.strip())

In [9]:
# Progress from earlier runs lives in user_status.csv; if that file is absent
# this is the first run and we start from an empty status table.
first_run = not os.path.exists("user_status.csv")
if not first_run:
    user_status = pd.read_csv("user_status.csv")
else:
    user_status = pd.DataFrame.from_dict(
        {column: [] for column in ("username", "access_timestamp", "success")}
    )

In [10]:
# Skip every user that already has a status row, then randomise the order
# in which the remaining users are visited.
already_processed = set(user_status["username"])
usernames = list(set(usernames).difference(already_processed))
np.random.shuffle(usernames)
logger.info(f"Found the anime lists of {len(user_status)} existing users!")
logger.info(f"Getting the anime lists of {len(usernames)} new users!")

GetUserAnimeLists:INFO:2021-11-06 22:16:07: Found the anime lists of 207287 existing users!
GetUserAnimeLists:INFO:2021-11-06 22:16:07: Getting the anime lists of 162768 new users!


In [11]:
# Entries in user_anime_list.csv can be malformed if the notebook crashes in the
# middle of saving a file. This function removes any malformed lines.
def verify_user_anime_list_consistency():
    """Rewrite user_anime_list.csv keeping only well-formed, recognised rows.

    The file is streamed line by line into a temporary sibling file which then
    atomically replaces the original. A row is kept when it has exactly three
    comma-separated fields and its username is recorded as successful in the
    module-level `user_status` table. The header line is always rewritten to
    the expected header. Does nothing when the CSV does not exist yet.
    """
    logger.info("Verifying consistency of existing entries in user_anime_list.csv")
    input_fn = "user_anime_list.csv"
    output_fn = input_fn + "~"
    if not os.path.exists(input_fn):
        # First run: nothing has been saved yet, so there is nothing to verify.
        return

    correct_header = "anime_id,my_score,username\n"
    successful_users = set(user_status.loc[lambda x: x["success"]]["username"])
    # Usernames already reported, so each one is only logged once.
    invalid_users = set()
    with open(input_fn, "r") as in_file, open(output_fn, "w") as out_file:
        for line_number, line in enumerate(tqdm(in_file)):
            if line_number == 0:
                if line.strip() != correct_header.strip():
                    logger.warning(
                        f"Replacing malformed header line {line.strip()} "
                        f"with correct header {correct_header.strip()}"
                    )
                out_file.write(correct_header)
                continue
            fields = line.strip().split(",")
            if len(fields) != 3:
                logger.warning(
                    f"Deleting malformed line in user_anime_list.csv {line} "
                )
                continue
            username = fields[2]
            if username not in successful_users:
                if username not in invalid_users:
                    invalid_users.add(username)
                    logger.warning(
                        f"Deleting entries in user_anime_list.csv for "
                        f" unrecognized username {username}"
                    )
                continue
            # A crash can leave the final row without its newline; restore it
            # so rows appended later do not get fused onto this one.
            if not line.endswith("\n"):
                line += "\n"
            out_file.write(line)
    os.replace(output_fn, input_fn)

In [12]:
# Remove malformed rows, and rows of users not marked successful, from
# user_anime_list.csv.
verify_user_anime_list_consistency()

GetUserAnimeLists:INFO:2021-11-06 22:16:07: Verifying consistency of existing entries in user_anime_list.csv
42060368it [00:36, 1164308.77it/s]


In [None]:
# Fetch each remaining user's list and persist it immediately, so the
# notebook can be interrupted at any point.
for username in tqdm(usernames):
    user_anime_list, ok = get_user_anime_list(username)
    # Only the very first write of a first run creates the files and emits
    # headers; every later write appends.
    write_mode = "w" if first_run else "a"
    # The list is saved before the status row is built and written.
    user_anime_list.to_csv(
        "user_anime_list.csv", mode=write_mode, header=first_run, index=False
    )
    status_row = pd.DataFrame.from_dict(
        {
            "username": [username],
            "access_timestamp": [int(datetime.datetime.now().timestamp())],
            "success": [ok],
        }
    )
    status_row.to_csv(
        "user_status.csv", mode=write_mode, header=first_run, index=False
    )
    first_run = False

  0%|                                   | 454/162768 [06:12<39:21:02,  1.15it/s]