# Getting MAL anime-lists
* We collect the anime-list for each username in `data/mal/user_facts/usernames.txt` and `data/mal/user_facts/recent_usernames.txt`
* You can terminate or restart the notebook at any point without losing progress. All anime-lists found so far will be stored at `data/mal/user_anime_facts/user_anime_list.csv` 
* This notebook will run indefinitely. You must terminate it manually once an acceptable number of anime-lists has been found

In [1]:
import datetime
import json
import logging
import os
import random
import time

import numpy as np
import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

## Basic setup

In [2]:
# outdir
# Create the output directory (including any missing parent directories) and
# make it the working directory. exist_ok=True makes this safe to re-run and
# avoids the check-then-create race of a separate os.path.exists test.
data_path = "../../data/mal/user_anime_facts"
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [3]:
# logging
# Messages go both to a log file in the current working directory and to the
# default stream of logging.StreamHandler.
logger = logging.getLogger("GetUserAnimeLists")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
# logging.getLogger returns the same logger object on every call, so running
# this cell a second time would otherwise attach a second pair of handlers and
# every message would be emitted twice.
if not logger.handlers:
    for stream in [
        logging.FileHandler("get_user_anime_lists.log"),
        logging.StreamHandler(),
    ]:
        stream.setFormatter(formatter)
        logger.addHandler(stream)

## Parse MAL API

In [4]:
# Load the MAL API token (its "access_token" entry is sent with every request).
# The context manager closes the file instead of leaking the handle.
with open("../mal_authentication/token.json", "r") as token_file:
    token = json.load(token_file)

In [5]:
# apply rate limiting with exponential backoff for unexpected errors
@sleep_and_retry
@limits(calls=1, period=0.75)
def call_api(url, retry_timeout=1, request_timeout=60):
    """GET ``url`` from the MAL API, retrying failures with exponential backoff.

    The request carries the bearer token from the module-level ``token``.
    Any exception raised while making the request, and any HTTP 500 response,
    is logged and retried after sleeping ``retry_timeout`` seconds; the delay
    doubles on every retry up to a cap of 3600 seconds. Once the delay has
    reached the cap, an HTTP 500 response is returned to the caller instead of
    being retried, while exceptions keep being retried.

    Args:
        url: Full URL to fetch.
        retry_timeout: Seconds to sleep before the next retry.
        request_timeout: Timeout in seconds handed to ``requests.get`` so that
            a stalled connection raises (and is retried) instead of blocking
            forever.

    Returns:
        The response object returned by ``requests.get``.
    """
    max_retry_timeout = 3600
    error = None
    try:
        response = requests.get(
            url,
            headers={"Authorization": f'Bearer {token["access_token"]}'},
            timeout=request_timeout,
        )
    except Exception as e:
        # Deliberately broad: this is best-effort scraping and any failure of
        # the request should back off and retry rather than stop the run.
        error = str(e)
    else:
        # This can occur if MAL servers go down or if the page doesn't exist
        if response.status_code == 500 and retry_timeout < max_retry_timeout:
            error = f"{response.status_code}"

    if error is None:
        return response

    logger.warning(
        f"Received error {error} while accessing {url}. Retrying in {retry_timeout} seconds"
    )
    time.sleep(retry_timeout)
    # Retrying through the decorated name keeps the retry under the rate limiter.
    return call_api(
        url, min(retry_timeout * 2, max_retry_timeout), request_timeout
    )

In [6]:
# utilities for extracting an anime list from the MAL API
def parse_json_node(x):
    """Return ``(anime_id, score, status)`` for one anime-list entry ``x``.

    ``status`` is the empty string when the entry's ``list_status`` has no
    ``status`` key.
    """
    anime_id = x["node"]["id"]
    list_status = x["list_status"]
    return anime_id, list_status["score"], list_status.get("status", "")


def process_json(json):
    """Turn one page of an anime-list response into a DataFrame.

    Every entry of ``json["data"]`` is parsed with ``parse_json_node``. Entries
    whose status is ``plan_to_watch`` are left out, and the result keeps only
    the ``anime_id`` and ``my_score`` columns.
    """
    records = [parse_json_node(entry) for entry in json["data"]]
    frame = pd.DataFrame.from_records(
        records, columns=["anime_id", "my_score", "status"]
    )
    not_planned = frame["status"] != "plan_to_watch"
    return frame.loc[not_planned].drop(columns="status")


def get_user_anime_list(username):
    """Download the complete anime list of ``username`` from the MAL API.

    Pages are fetched with ``call_api`` and parsed with ``process_json``,
    following the ``paging.next`` link of each response until there is none.

    Args:
        username: MAL username whose list is requested.

    Returns:
        A ``(anime_list, ok)`` tuple. On success ``anime_list`` has the columns
        ``anime_id``, ``my_score`` and ``username`` and ``ok`` is True. If any
        page comes back with a non-OK status, ``anime_list`` is an empty frame
        with the same columns and ``ok`` is False.
    """
    columns = ["anime_id", "my_score", "username"]

    def failure():
        # Keep the schema on failure so that writing this frame with a header
        # produces the real column names rather than an empty header line.
        return pd.DataFrame(columns=columns), False

    anime_lists = []
    url = f"https://api.myanimelist.net/v2/users/{username}/animelist?limit=1000&fields=list_status&nsfw=true"
    while url:
        response = call_api(url)
        if response.status_code in [403, 404, 500]:
            # 403: This can occur if the user privated their list
            # 404: This can occur if the user deleted their account
            # 500: This can occur if the user deleted their account
            return failure()
        if not response.ok:
            logger.warning(f"Error {response} received when handling {url}")
            return failure()

        payload = response.json()
        anime_lists.append(process_json(payload))
        # A response without a "next" link (or without "paging") is the last page.
        url = payload.get("paging", {}).get("next")

    user_anime_list = pd.concat(anime_lists, ignore_index=True)
    user_anime_list["username"] = username
    return user_anime_list, True

## Clean up previous run

In [7]:
# Entries in user_status.csv and user_anime_list.csv can be malformed if the
# notebook crashes in the middle of saving a file. This function removes any
# malformed lines from user_status.csv.
def verify_user_status_consistency():
    """Rewrite ``user_status.csv`` keeping only well-formed lines.

    The first line is replaced by the expected header when it differs from it
    (compared after stripping whitespace). Every later line that does not split
    into exactly three comma-separated fields is dropped with a warning. The
    cleaned copy is written to ``user_status.csv~`` and then moved over the
    original with ``os.replace``. Does nothing when the file does not exist.
    """
    logger.info("Verifying consistency of entries in user_status.csv")
    input_fn = "user_status.csv"
    if not os.path.exists(input_fn):
        return
    output_fn = input_fn + "~"
    correct_header = "username,access_timestamp,success\n"
    # The file is processed as raw text on purpose: parsing it with pandas
    # first could fail on exactly the malformed lines this function removes.
    with open(input_fn, "r") as in_file, open(output_fn, "w") as out_file:
        for line_number, line in enumerate(tqdm(in_file)):
            if line_number == 0:
                if line.strip() != correct_header.strip():
                    logger.warning(
                        f"Replacing malformed header line {line.strip()} "
                        f"with correct header {correct_header.strip()}"
                    )
                    line = correct_header
                out_file.write(line)
                continue
            if len(line.strip().split(",")) != 3:
                logger.warning(
                    f"Deleting malformed line in user_status.csv {line} "
                )
                continue
            out_file.write(line)
    os.replace(output_fn, input_fn)

## Sort users by recency

In [10]:
def read_usernames():
    """Return the stripped lines of both username files as one list.

    Entries of ``usernames.txt`` come first, followed by the entries of
    ``recent_usernames.txt``. Nothing is de-duplicated or filtered.
    """
    collected = []
    for path in (
        "../user_facts/usernames.txt",
        "../user_facts/recent_usernames.txt",
    ):
        with open(path, "r") as handle:
            collected.extend(line.strip() for line in handle)
    return collected

In [11]:
def read_user_status():
    """Load the per-user fetch log from ``user_status.csv``.

    Returns:
        A DataFrame with the columns ``username``, ``access_timestamp`` and
        ``success``. When the file does not exist, an empty frame with those
        columns is returned.
    """
    user_status_file = "user_status.csv"
    if not os.path.exists(user_status_file):
        return pd.DataFrame.from_dict(
            {
                "username": [],
                "access_timestamp": [],
                "success": [],
            }
        )
    # Usernames are arbitrary text: with pandas' default NA handling a user
    # called "null", "NA" or "nan" would be read as a missing value, and a
    # purely numeric name could be read as a number. Read the column as str and
    # treat only the empty string as missing.
    return pd.read_csv(
        user_status_file,
        dtype={"username": str},
        keep_default_na=False,
        na_values=[""],
    )

In [12]:
def prioritize_users():
    """Build the fetch queue for the next pass.

    Users that appear in the username files but not in the status log come
    first, in shuffled order. They are followed by every user of the status
    log, ordered by ascending ``access_timestamp``.
    """
    all_usernames = read_usernames()
    status_table = read_user_status()

    # never-fetched users, in random order
    fresh = list(set(all_usernames) - set(status_table["username"]))
    random.shuffle(fresh)

    # already-fetched users, least recently accessed first
    oldest_first = status_table.sort_values(by="access_timestamp")
    known = list(oldest_first["username"])

    logger.info(
        f"Getting the anime lists of {len(fresh)} new users and refreshing "
        f"the anime lists of {len(known)} existing users!"
    )
    return fresh + known

## Continuously refresh anime lists
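* Each pass of the loop below takes up to 100,000 users from `prioritize_users()` and writes every fetched anime-list and its fetch status to `user_anime_list.csv.block` and `user_status.csv.block`
* When the pass is finished, `merge_blocks()` folds both block files into `user_anime_list.csv` and `user_status.csv`, replacing the older rows of the same users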

In [13]:
def merge_block(file, user_field, users):
    """Merge ``file + ".block"`` into ``file``, replacing rows of refreshed users.

    The result is first written to ``file + "~"`` and then moved over ``file``
    with ``os.replace``.

    Args:
        file: Path of the main CSV file.
        user_field: Zero-based index of the comma-separated field that holds
            the username in each data line of ``file``.
        users: Usernames contained in the block; existing rows of ``file`` that
            belong to one of them are dropped.
    """
    outfile = file + "~"
    blockfile = file + ".block"
    header_written = False
    with open(outfile, "w") as out_file:
        # copy over all the unchanged users
        if os.path.exists(file):
            with open(file, "r") as in_file:
                for line in tqdm(in_file):
                    if not header_written:
                        header_written = True
                        out_file.write(line)
                        continue
                    fields = line.strip().split(",")
                    if fields[user_field] not in users:
                        out_file.write(line)

        # copy over the new block
        with open(blockfile, "r") as in_file:
            at_block_header = True
            for line in tqdm(in_file):
                if at_block_header:
                    at_block_header = False
                    # On the first merge (main file missing or empty) the
                    # block's header is the only header available; without it
                    # the merged file would start with a data row.
                    if not header_written:
                        header_written = True
                        out_file.write(line)
                    continue
                out_file.write(line)
    os.replace(outfile, file)

In [14]:
def merge_blocks():
    """Merge the pending ``*.block`` files into the main CSV files.

    The set of refreshed users is read from ``user_status.csv.block`` and then
    used to merge first ``user_anime_list.csv`` and then ``user_status.csv``.
    """
    # merge_block compares these names with raw text fields, so they must stay
    # plain strings: disable NA detection (users may be called "null", "NA",
    # ...) and numeric inference.
    users = set(
        pd.read_csv(
            "user_status.csv.block",
            usecols=["username"],
            dtype=str,
            keep_default_na=False,
        )["username"]
    )
    logger.info(f"Merging block of {len(users)} users into the main database")
    merge_block("user_anime_list.csv", 2, users)
    merge_block("user_status.csv", 0, users)

In [None]:
# get the anime list for each new user and write to disk
# Users are handled in blocks: results are appended to the *.block files one
# user at a time and merged into the main files once the block is complete.
BLOCK_SIZE = 100000
ANIME_LIST_COLUMNS = ["anime_id", "my_score", "username"]
while True:
    usernames = prioritize_users()[:BLOCK_SIZE]
    if not usernames:
        # Nothing was written this pass, so there is no block to merge; merging
        # anyway would read a missing or stale block file.
        logger.warning("No usernames to process, stopping")
        break
    block = set()
    for username in tqdm(usernames):
        user_anime_list, ok = get_user_anime_list(username)
        # The first write of a block truncates the file and emits the header.
        first_write = not block
        write_mode = "w" if first_write else "a"
        # reindex guarantees the expected columns even for an empty result, so
        # the header of the block file is always the real column list.
        user_anime_list.reindex(columns=ANIME_LIST_COLUMNS).to_csv(
            "user_anime_list.csv.block",
            index=False,
            mode=write_mode,
            header=first_write,
        )
        pd.DataFrame.from_dict(
            {
                "username": [username],
                "access_timestamp": [int(datetime.datetime.now().timestamp())],
                "success": [ok],
            }
        ).to_csv(
            "user_status.csv.block",
            index=False,
            mode=write_mode,
            header=first_write,
        )
        block.add(username)
    merge_blocks()

GetUserAnimeLists:INFO:2022-04-19 05:05:34: Getting the anime lists of 675237 new users and refreshing the anime lists of 1663752 existing users!
  4%|██████▏                                                                                                                                                       | 3881/100000 [49:49<20:13:47,  1.32it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 17%|███████████████████████████                                                                                                                                | 17451/100000 [3:44:52<16:42:45,  1.37it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it