# Getting user lists
* Stores lists at `data/{NAME}/user_media_facts/user_{MEDIATYPE}_list.{PARTITION}.csv`
* The notebook only collects data for users whose username hash satisfies `portable_hash(username) % NUM_PARTITIONS == PARTITION`

In [None]:
import datetime
import glob
import json
import logging
import os
import random
import sys
import time
import traceback
from hashlib import sha256

import pandas as pd
import requests
from filelock import FileLock
from tqdm import tqdm

## Basic setup

In [None]:
sys.path.append("..")
from API import anilist_api, animeplanet_api, api_setup, kitsu_api, mal_api

# NOTE(review): NAME, PROXY_NUMBER, NUM_PROXIES and TOKEN_NUMBER are not
# defined in the visible cells; presumably they are injected as notebook
# parameters before this cell runs -- confirm.
PROXIES = api_setup.load_proxies(PROXY_NUMBER, NUM_PROXIES)

# Select the API module and the batch size for the configured source.
if NAME == "mal":
    mal_api.load_token(TOKEN_NUMBER)
    api = mal_api
    users_per_batch = 2000
elif NAME == "anilist":
    api = anilist_api
    users_per_batch = 2000
elif NAME == "kitsu":
    api = kitsu_api
    users_per_batch = 4000
elif NAME == "animeplanet":
    api = animeplanet_api
    users_per_batch = 2000
else:
    # An explicit exception (rather than `assert False`) still fires when
    # Python runs with -O, so `api` can never be silently left undefined.
    raise ValueError(f"Unsupported NAME: {NAME!r}")

SESSION = api.make_session(PROXIES, 1)


def get_user_media_list(username, medium):
    """Fetch one user's list for `medium` through the selected API module.

    Forwards to ``api.get_user_media_list`` with the shared SESSION and
    returns its result unchanged (the fetch loop unpacks it as a
    ``(user_media_list, ok)`` pair).
    """
    return api.get_user_media_list(SESSION, username, medium)

In [None]:
# outdir: create the per-source output directory if it is missing and make it
# the working directory, so the relative file names used below resolve there.
data_path = f"../../../data/{NAME}/user_media_facts"
# exist_ok=True already tolerates an existing directory, so no separate
# (race-prone) existence check is needed.
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [None]:
# Per-partition bookkeeping files; the names are relative paths, so they
# resolve against the current working directory.
STATUS_FILE = f"user_status.{PARTITION}.csv"
LOG_FILE = f"get_user_media_lists.{PARTITION}.log"
# The lock file name depends on NAME but not on PARTITION, so every partition
# of the same source uses the same lock path.
LOCK_FILE = f"../../get_user_media_lists.{NAME}.lock"
# NOTE(review): timeout=-1 presumably means "wait for the lock indefinitely"
# -- confirm against the filelock documentation.
LOCK = FileLock(LOCK_FILE, timeout=-1)

In [None]:
def media_list_file(medium):
    """Return the CSV file name holding this partition's lists for `medium`.

    The name embeds both `medium` and the module-level PARTITION value.
    """
    filename = f"user_{medium}_list.{PARTITION}.csv"
    return filename

In [None]:
def configure_logging(logfile):
    """Send root-logger output to a single rotating log file.

    Removes every handler currently attached to the root logger, sets the
    root level to INFO, and attaches one ``RotatingFileHandler`` writing to
    `logfile` (maxBytes=1000000, one backup file).

    Args:
        logfile: Path of the log file to write.
    """
    # `import logging` alone does not guarantee that the `logging.handlers`
    # submodule is loaded, so import the handler class explicitly.
    from logging.handlers import RotatingFileHandler

    logger = logging.getLogger()
    logger.handlers.clear()
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )
    # NOTE(review): CPython's RotatingFileHandler appears to force append mode
    # whenever maxBytes > 0, so the "w+" mode may have no effect -- confirm.
    handler = RotatingFileHandler(logfile, "w+", maxBytes=1000000, backupCount=1)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

In [None]:
configure_logging(LOG_FILE)

## Sort users by recency

In [None]:
def portable_hash(username):
    """Return the SHA-256 digest of the UTF-8 encoded `username` as an int.

    The digest bytes are read as one big-endian unsigned integer, which is
    the same value as parsing the hex digest in base 16.
    """
    digest = sha256(username.encode("utf-8")).digest()
    return int.from_bytes(digest, "big")

In [None]:
# used for rebalancing when NUM_PARTITIONS changes
def repartition(fn, N, M):
    """Re-shard the partitioned CSV ``fn.{0..N-1}.csv`` into ``fn.{0..M-1}.csv``.

    Phase 1 concatenates the N input partitions into ``{fn}.unified.csv``,
    keeping only the header of partition 0 and deleting each input file once
    it has been copied. Phase 2 writes the header to each of the M output
    files and routes every data row to file
    ``portable_hash(username) % M``. The unified file is removed at the end.

    Rows are split on plain commas, so this assumes no field contains a
    comma or an embedded newline.

    Args:
        fn: Path prefix of the partition files (without ``.{t}.csv``).
        N: Number of existing partitions.
        M: Number of partitions to produce.

    Raises:
        ValueError: If the header has no ``username`` column.
    """
    from contextlib import ExitStack

    unified_path = f"{fn}.unified.csv"

    # Phase 1: merge all existing partitions into one file.
    with open(unified_path, "w") as f:
        for t in range(N):
            part_path = fn + f".{t}.csv"
            with open(part_path) as infile:
                header = infile.readline()
                if t == 0:
                    # only the first partition contributes the header
                    f.write(header)
                for line in tqdm(infile):
                    f.write(line)
            os.remove(part_path)

    # Phase 2: split the merged file by username hash. ExitStack guarantees
    # every output file is closed even if a row fails to parse.
    with open(unified_path) as infile, ExitStack() as stack:
        files = [
            stack.enter_context(open(fn + f".{t}.csv", "w")) for t in range(M)
        ]
        header = infile.readline()
        if header:
            usercol = header.strip().split(",").index("username")
            for f in files:
                f.write(header)
        for line in tqdm(infile):
            username = line.strip().split(",")[usercol]
            files[portable_hash(username) % M].write(line)
    os.remove(unified_path)


def repartition_all(N, M):
    """Repartition the status, manga-list and anime-list files from N to M shards.

    Calls `repartition` once per file family, in that order, using path
    prefixes under ``../../../data/{NAME}/user_media_facts``.
    """
    table_names = ("user_status", "user_manga_list", "user_anime_list")
    for table in table_names:
        repartition(f"../../../data/{NAME}/user_media_facts/{table}", N, M)

In [None]:
def read_status_file(fn):
    """Load a user-status CSV into a DataFrame.

    The ``username`` column is read as strings and pandas' default NA
    markers are disabled, so a username such as ``NA`` or ``123`` is kept
    verbatim as text.
    """
    read_options = {
        "keep_default_na": False,
        "dtype": {"username": str},
    }
    return pd.read_csv(fn, **read_options)

In [None]:
@LOCK
def prioritize_users(K):
    """Choose up to K usernames of this partition to fetch next.

    Candidates are the usernames from ``read_usernames()`` whose
    ``portable_hash`` falls into this PARTITION, plus the users recorded in
    STATUS_FILE. They are grouped as:

    * new: in this partition but without a row in STATUS_FILE
    * broken: ``failed_attempts >= 3``
    * stale monthly: gap between last access and last list update in
      [7 days, 30 days) and last accessed more than 7 days ago
    * stale yearly: gap in [30 days, 365 days) and last accessed more than
      30 days ago
    * refresh: last accessed more than 60 days ago
    * remaining: every other non-broken user

    Every group is shuffled. The batch is built from all new users, then
    roughly 33% of K monthly users, 33% of K yearly users (plus any unused
    monthly slots), 1% of K broken users, then refresh and remaining users.
    Repeated usernames are dropped before the result is cut to K entries and
    shuffled again.

    Args:
        K: Maximum number of users to return.

    Returns:
        dict mapping each selected username to its recorded
        ``failed_attempts`` count (0 when none is recorded).

    Raises:
        ValueError: If K is too small for any of the proportional tracks to
            receive at least one slot.
    """
    usernames = {
        x for x in read_usernames() if (portable_hash(x) % NUM_PARTITIONS) == PARTITION
    }
    if not os.path.exists(STATUS_FILE):
        broken_users = []
        monthly_users = []
        yearly_users = []
        refresh_users = []
        remaining_users = []
        new_users = list(usernames)
        failed_attempts = {}
    else:
        df = read_status_file(STATUS_FILE).sort_values(
            by=["access_timestamp", "last_update_timestamp"]
        )
        # Taken before broken users are filtered out below, so their counts
        # are included; avoids reading and sorting the file a second time.
        failed_attempts = df.set_index("username")["failed_attempts"].to_dict()
        new_users = list(usernames - set(df.username))
        q = "failed_attempts >= 3"
        broken_users = list(df.query(q).username)
        df = df.query(f"not {q}")
        secs_per_day = 24 * 60 * 60
        year = 365 * secs_per_day
        month = 30 * secs_per_day
        week = 7 * secs_per_day
        now = datetime.datetime.now().timestamp()
        monthly_users = list(
            df.query(
                f"{week} <= access_timestamp - last_update_timestamp  < {month} "
                + f"and access_timestamp < {now - week}"
            ).username
        )
        yearly_users = list(
            df.query(
                f"{month} <= access_timestamp - last_update_timestamp  < {year} "
                f"and access_timestamp < {now - month}"
            ).username
        )
        refresh_users = list(
            df.query(f" access_timestamp < {now - 2 * month}").username
        )
        recent_users = set(monthly_users) | set(yearly_users) | set(refresh_users)
        remaining_users = [x for x in df.username if x not in recent_users]
    logging.info(
        f"Getting the lists of {len(new_users)} new users, "
        + f"{len(monthly_users)} stale monthly users, "
        + f"{len(yearly_users)} stale yearly users, "
        + f"{len(refresh_users)} stale refresh users, "
        + f"{len(remaining_users)} remaining users, "
        + f"and skipping {len(broken_users)} broken users!"
    )
    # randomize order
    random.shuffle(broken_users)
    random.shuffle(monthly_users)
    random.shuffle(yearly_users)
    random.shuffle(refresh_users)
    random.shuffle(remaining_users)
    random.shuffle(new_users)
    # prioritize active users
    N_monthly = int(round(K * 0.33))
    N_yearly = int(round(K * 0.33))
    N_broken = int(round(K * 0.01))
    if not (N_monthly > 0 and N_yearly > 0 and N_broken > 0):
        # explicit check so the validation also runs under `python -O`
        raise ValueError(f"K={K} is too small to give every track a slot")
    monthly_track = monthly_users[:N_monthly]
    # unused monthly slots are handed to the yearly track
    yearly_track = yearly_users[: N_yearly + (N_monthly - len(monthly_track))]
    broken_track = broken_users[:N_broken]
    refresh_track = refresh_users[:K]
    remaining_track = remaining_users[:K]
    candidates = (
        new_users
        + monthly_track
        + yearly_track
        + broken_track
        + refresh_track
        + remaining_track
    )
    # refresh_users can overlap the monthly/yearly tracks; drop repeats
    # (keeping the first occurrence) before truncating so that duplicates do
    # not use up batch slots.
    order = list(dict.fromkeys(candidates))[:K]
    random.shuffle(order)
    return {x: failed_attempts.get(x, 0) for x in order}

In [None]:
# number of fetch attempts that failed in a row since the last success/pause
consecutive_failure_count = 0


def monitor_failures(ok, max_failures=20, timeout=3600):
    """Track consecutive failures and pause collection when there are too many.

    A successful attempt resets the module-level counter; a failed one
    increments it. Once the counter reaches `max_failures`, the pause is
    logged, the call sleeps for `timeout` seconds, and the counter is reset.

    Args:
        ok: Whether the latest attempt succeeded.
        max_failures: Consecutive failures that trigger a pause (expected to
            be at least 1).
        timeout: Length of the pause in seconds.
    """
    global consecutive_failure_count
    consecutive_failure_count = 0 if ok else consecutive_failure_count + 1
    if consecutive_failure_count >= max_failures:
        logging.info(
            f"The most recent {consecutive_failure_count} attempts failed, "
            f"pausing collection for {timeout} seconds"
        )
        time.sleep(timeout)
        consecutive_failure_count = 0

## Continuously refresh lists
* We take the least recently refreshed users and refresh their lists
* These lists are stored in a temporary block
* Once the block is big enough, we atomically merge it with the existing lists

In [None]:
@LOCK
def merge_block(file, users):
    """Fold the freshly fetched ``{file}.block`` rows into `file`.

    The merged result is first written to ``{file}~`` and then moved over
    `file` with ``os.replace``; the block file is deleted afterwards.

    When `file` already exists, its header is kept and a data row is carried
    over only if its username is not in `users` and still hashes to this
    PARTITION according to ``read_usernames()``. All data rows of the block
    are then appended; the block's header is written only when `file` did not
    exist yet.

    Rows are split on plain commas and every row must have as many fields as
    the header of the file it comes from.

    Args:
        file: Path of the CSV file to update.
        users: Usernames contained in the block; their old rows are dropped.
    """
    tmp_path = file + "~"
    block_path = file + ".block"
    has_existing = os.path.exists(file)
    with open(tmp_path, "w") as merged:
        # copy over all the unchanged users
        if has_existing:
            partition_users = {
                name
                for name in read_usernames()
                if (portable_hash(name) % NUM_PARTITIONS) == PARTITION
            }
            with open(file, "r") as existing:
                for lineno, line in enumerate(tqdm(existing)):
                    columns = line.strip().split(",")
                    if lineno == 0:
                        # header row: keep it and learn the layout from it
                        merged.write(line)
                        user_col = columns.index("username")
                        width = len(columns)
                        continue
                    assert len(columns) == width, line
                    owner = columns[user_col]
                    if owner in users or owner not in partition_users:
                        continue
                    merged.write(line)

        # copy over the new block
        with open(block_path, "r") as fresh:
            for lineno, line in enumerate(tqdm(fresh)):
                columns = line.strip().split(",")
                if lineno == 0:
                    # the block's header is only needed when starting a new file
                    if not has_existing:
                        merged.write(line)
                    width = len(columns)
                    continue
                assert len(columns) == width, line
                merged.write(line)
    os.replace(tmp_path, file)
    os.remove(block_path)

In [None]:
def merge_blocks():
    """Merge every pending block file into its main CSV file.

    The set of refreshed users is read from the status block; the anime and
    manga list files are merged first and the status file last.
    """
    logging.info(f"Merging block into the main database")
    status_block = read_status_file(f"{STATUS_FILE}.block")
    users = set(status_block["username"])
    targets = [media_list_file(medium) for medium in ["anime", "manga"]]
    targets.append(STATUS_FILE)
    for target in targets:
        merge_block(target, users)
    logging.info(f"Merged block of {len(users)} users into the main database")

In [None]:
# delete any corrupted files from the previous run: block files that still
# exist at this point were never merged
stale_block_files = [
    base + ".block"
    for base in (STATUS_FILE, media_list_file("anime"), media_list_file("manga"))
]
for file in filter(os.path.exists, stale_block_files):
    os.remove(file)

In [None]:
# Bound up front so the error handler below can always reference it, even when
# the failure happens before the first user is processed.
username = None
try:
    # get the list for each new user and write to disk
    prioritized_users = prioritize_users(users_per_batch)
    logging.info(f"Fetching lists")
    # usernames already written to the status block in this run
    block = set()
    for username in tqdm(prioritized_users):
        any_ok = False
        updated_at = 0
        for medium in ["anime", "manga"]:
            user_media_list, ok = get_user_media_list(username, medium)
            any_ok |= ok
            if len(user_media_list) > 0:
                # the first write creates the block file with a header,
                # later writes append rows only
                append = os.path.exists(media_list_file(medium) + ".block")
                user_media_list.to_csv(
                    f"{media_list_file(medium)}.block",
                    index=False,
                    mode="a+" if append else "w",
                    header=not append,
                )
                updated_at = max(updated_at, user_media_list["updated_at"].max())
        # one status row per user; the failure counter resets as soon as any
        # medium was fetched successfully
        user_status_entry = pd.DataFrame.from_dict(
            {
                "username": [username],
                "access_timestamp": [int(datetime.datetime.now().timestamp())],
                "last_update_timestamp": [updated_at],
                "failed_attempts": [0 if any_ok else prioritized_users[username] + 1],
            }
        )
        user_status_entry.to_csv(
            f"{STATUS_FILE}.block",
            index=False,
            mode="w" if not block else "a+",
            header=not block,
        )
        block.add(username)
        # NOTE(review): `ok` is the result of the last medium only ("manga"),
        # while the status row uses `any_ok` -- confirm which one is intended.
        monitor_failures(ok)
    merge_blocks()
except Exception as e:
    # errors require human supervision
    logging.error(traceback.format_exc())
    logging.error(str(e))
    logging.error(f"ERROR with {username}")
    # park the process instead of exiting so the failure gets noticed
    while True:
        time.sleep(3600)