# Getting user anime-lists
* You can terminate or restart the notebook at any point without losing progress. All anime-lists found so far will be stored at `data/{source}/user_anime_facts/user_anime_list.csv`, where source is either MAL or AniList
* This notebook will run indefinitely. You must terminate it manually once an acceptable number of anime-lists have been found

In [None]:
import csv
import datetime
import json
import logging
import os
import random
import time

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

## Basic setup

In [None]:
# outdir: create the per-source output directory and make it the working
# directory, so every relative filename used later resolves inside it.
# `name` is not defined in this cell -- presumably set earlier; confirm.
# NOTE: the path is relative, so re-running this cell after the chdir below
# resolves it against the new working directory.
data_path = f"../../data/{name}/user_anime_facts"
# exist_ok=True already makes this a no-op for an existing directory, so no
# separate (race-prone) existence check is needed
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [None]:
# logging: send DEBUG-and-above records both to a log file in the current
# working directory and to the notebook output
logger = logging.getLogger("GetUserAnimeLists")
logger.setLevel(logging.DEBUG)
# getLogger returns the same logger object on every call, so re-running this
# cell would stack a second pair of handlers and emit every message twice;
# detach and close whatever is already attached before adding fresh handlers
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
formatter = logging.Formatter(
    "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
for stream in [
    logging.FileHandler("get_user_anime_lists.log"),
    logging.StreamHandler(),
]:
    stream.setFormatter(formatter)
    logger.addHandler(stream)

## Sort users by recency

In [None]:
def read_user_status():
    """Load the per-user refresh log, or an empty table if none exists yet.

    Reads ``user_status.csv`` from the current working directory. pandas'
    default missing-value markers are disabled, so a cell such as "NA" is kept
    as literal text instead of being turned into NaN.

    Returns:
        A DataFrame. When the file is absent it has zero rows and the columns
        ``username``, ``access_timestamp`` and ``success``.
    """
    status_path = "user_status.csv"
    if not os.path.exists(status_path):
        # nothing recorded yet: hand back an empty frame with the expected columns
        empty_columns = {
            column: [] for column in ("username", "access_timestamp", "success")
        }
        return pd.DataFrame(empty_columns)
    return pd.read_csv(status_path, keep_default_na=False)

In [None]:
def prioritize_users():
    """Return the usernames to fetch, highest priority first.

    Usernames from ``read_usernames()`` that have no row in the status table
    come first, in random order. They are followed by every username in the
    status table, ordered by ascending ``access_timestamp``.
    """
    candidates = read_usernames()
    status = read_user_status()

    # users that have never been fetched; shuffled so no fixed subset is favored
    unseen = list(set(candidates) - set(status["username"]))
    random.shuffle(unseen)

    # users fetched before, smallest access_timestamp first
    by_last_access = status.sort_values(by="access_timestamp")
    seen = [*by_last_access["username"]]

    logger.info(
        f"Getting the anime lists of {len(unseen)} new users and refreshing "
        f"the anime lists of {len(seen)} existing users!"
    )
    return unseen + seen

## Continuously refresh anime lists
* We take the least recently refreshed users and refresh their anime lists
* These anime lists are stored in a temporary block
* Once the block is big enough, we atomically merge it with the existing anime lists

In [None]:
def _csv_field(line, index):
    """Return field ``index`` of one raw CSV line, honoring CSV quoting."""
    text = line.strip()
    if '"' not in text:
        # no quoting on this row, so a plain split is exact and much cheaper
        return text.split(",")[index]
    # quoted row: a naive split would cut a field such as "a,b" in two
    return next(csv.reader([text]))[index]


def merge_block(file, user_field, users):
    """Fold the rows of ``file + ".block"`` into ``file``.

    The result is first written to ``file + "~"`` and then moved over ``file``
    with a single ``os.replace``, so ``file`` is never left half-written. The
    output holds one header line, then every existing row of ``file`` whose
    user is not in ``users``, then every data row of the block file.

    Args:
        file: Path of the main CSV. It may not exist yet, in which case the
            block file (header included) becomes its entire content.
        user_field: Index of the column holding the username; negative values
            count from the last column.
        users: Collection of usernames (as strings) whose existing rows are
            superseded by the block.

    Raises:
        FileNotFoundError: If the block file does not exist.
    """
    outfile = file + "~"
    blockfile = file + ".block"
    first_run = not os.path.exists(file)
    header_written = False
    # The block files come from DataFrame.to_csv, which writes UTF-8 by default;
    # use the same encoding explicitly instead of the platform's default so
    # non-ASCII usernames compare equal to the values in `users`
    with open(outfile, "w", encoding="utf-8") as out_file:
        # copy over all the unchanged users
        if not first_run:
            with open(file, "r", encoding="utf-8") as in_file:
                header = next(in_file, None)
                if header is not None:
                    out_file.write(header)
                    header_written = True
                for line in tqdm(in_file):
                    if _csv_field(line, user_field) not in users:
                        out_file.write(line)

        # copy over the new block
        with open(blockfile, "r", encoding="utf-8") as in_file:
            header = next(in_file, None)
            # keep the block's header only if the main file did not supply one
            # (first run, or an existing but empty main file)
            if header is not None and not header_written:
                out_file.write(header)
            for line in tqdm(in_file):
                out_file.write(line)
    os.replace(outfile, file)

In [None]:
def merge_blocks():
    """Merge the freshly fetched block files into the main CSV files.

    The usernames listed in ``user_status.csv.block`` are the users whose old
    rows get replaced. ``user_anime_list.csv`` is merged first (username taken
    from its last column), then ``user_status.csv`` (username taken from its
    first column).
    """
    # Force usernames to str: merge_block compares them with raw text taken from
    # the CSV lines, and pandas would otherwise infer all-digit usernames as
    # integers, which never match and would leave stale rows behind
    block_status = pd.read_csv(
        "user_status.csv.block", keep_default_na=False, dtype={"username": str}
    )
    users = set(block_status["username"])
    merge_block("user_anime_list.csv", -1, users)
    merge_block("user_status.csv", 0, users)
    logger.info(f"Merging block of {len(users)} users into the main database")

In [None]:
# Refresh anime lists one block at a time until the notebook is interrupted:
# fetch up to 50000 of the highest-priority users into the *.block files, then
# merge the finished block into the main files
while True:
    usernames = prioritize_users()[:50000]
    if not usernames:
        # With nothing to fetch no block file gets written, so merge_blocks()
        # would fail on a missing file or re-merge a leftover block forever
        logger.warning("No users to refresh; stopping")
        break
    block = set()
    for username in tqdm(usernames):
        user_anime_list, ok = get_user_anime_list(username)
        # the first user of a block truncates the block files and writes the
        # header line; every later user only appends rows
        starts_block = not block
        block_mode = "w" if starts_block else "a+"
        user_anime_list.to_csv(
            "user_anime_list.csv.block",
            index=False,
            mode=block_mode,
            header=starts_block,
        )

        # one status row per attempt, whether or not the fetch succeeded
        user_status_entry = pd.DataFrame.from_dict(
            {
                "username": [username],
                "access_timestamp": [int(datetime.datetime.now().timestamp())],
                "success": [ok],
            }
        )
        user_status_entry.to_csv(
            "user_status.csv.block",
            index=False,
            mode=block_mode,
            header=starts_block,
        )
        block.add(username)
    merge_blocks()