# Getting user lists
* You can terminate or restart the notebook at any point without losing progress. All lists found so far will be stored at `data/{NAME}/user_media_facts/user_{MEDIATYPE}_list.{PARTITION}.csv`
* The notebook only collects data for users whose username hash modulo `NUM_PARTITIONS` equals `PARTITION`
* This notebook will run indefinitely. You must terminate it manually once an acceptable number of lists have been found

In [None]:
import datetime
import gc
import glob
import json
import logging
import logging.handlers
import math
import os
import random
import time
from hashlib import sha256

import numpy as np
import pandas as pd
import psutil
import requests
from tqdm import tqdm

## Basic setup

In [None]:
def import_notebook(nb):
    """Run the notebook at path `nb` via the IPython `%run` magic.

    The working directory is switched to the notebook's own directory for
    the duration of the run and is always restored afterwards, even if the
    run raises.
    """
    cwd = os.getcwd()
    try:
        # NOTE(review): os.path.dirname returns "" for a bare file name, and
        # os.chdir("") would fail - callers appear to always pass a path
        # with a directory component; confirm.
        os.chdir(os.path.dirname(nb))
        script = os.path.basename(nb)
        # IPython magic (not plain Python): executes the file named by `script`
        %run $script
    finally:
        os.chdir(cwd)

In [None]:
# Run the API notebook matching NAME (e.g. NAME "foo" -> "../API/FooApi.ipynb").
# NAME is not defined in this notebook and must be set before this cell runs.
# NOTE(review): presumably this is what provides read_usernames and
# get_user_media_list used further down - confirm.
import_notebook(f"../API/{NAME.capitalize()}Api.ipynb")

In [None]:
# outdir: create the per-source output directory if needed and make it the
# working directory, so the file names used below are relative to it.
data_path = f"../../../data/{NAME}/user_media_facts"
# exist_ok=True already makes this a no-op when the directory exists; a
# separate os.path.exists() check would only add a check-then-act race.
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [None]:
# Per-partition file names (PARTITION is embedded in each name).
# STATUS_FILE holds one row per user: username, access_timestamp,
# last_update_timestamp and failed_attempts.
STATUS_FILE = f"user_status.{PARTITION}.csv"
# LOG_FILE is the target of this notebook's rotating log handler.
LOG_FILE = f"get_user_media_lists.{PARTITION}.log"

In [None]:
def media_list_file(medium):
    """Return the name of this partition's list CSV for `medium`.

    `medium` is interpolated verbatim (the notebook passes "anime" or
    "manga"); the partition comes from the module-level PARTITION.
    """
    filename = f"user_{medium}_list.{PARTITION}.csv"
    return filename

In [None]:
# Number of users fetched per batch before the block is merged into the main
# files; "animeplanet" uses a smaller batch than every other source.
users_per_batch = 5000 if NAME == "animeplanet" else 25000

In [None]:
# logging: everything this notebook logs goes to a size-capped rotating file.
# Requires `import logging.handlers`; a plain `import logging` does not load
# the handlers submodule.
logger = logging.getLogger("get_user_media_lists")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
# getLogger returns the same object on every call, so re-running this cell
# would otherwise stack a second handler and write every message twice.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
# NOTE(review): RotatingFileHandler appears to force append mode whenever
# maxBytes > 0, so the "w+" below likely has no effect - confirm.
for stream in [
    logging.handlers.RotatingFileHandler(
        LOG_FILE, "w+", maxBytes=1000000, backupCount=1
    ),
]:
    stream.setFormatter(formatter)
    logger.addHandler(stream)

## Sort users by recency

In [None]:
def portable_hash(username):
    """Return the SHA-256 digest of `username` (UTF-8 encoded) as an integer.

    Unlike the built-in hash(), the result does not vary between
    interpreter runs, so it can be used to assign users to partitions.
    """
    digest = sha256(username.encode("utf-8")).digest()
    # big-endian interpretation matches parsing the hex digest as base 16
    return int.from_bytes(digest, "big")

In [None]:
# useful for rebalancing when NUM_PARTITIONS changes, but not necessary
def repartition(fn, N, M):
    """Redistribute the rows of N partition files into M partition files.

    Reads `{fn}.0.csv` .. `{fn}.{N-1}.csv`, concatenates them into a
    temporary `{fn}.unified.csv` (keeping only the first file's header),
    deletes all N inputs, then writes `{fn}.0.csv` .. `{fn}.{M-1}.csv`.
    Every output file gets the header; each data row goes to the file whose
    index is `portable_hash(username) % M`, where the username is read from
    the header's "username" column.

    Args:
        fn: common path prefix of the partition files.
        N: current number of partition files (at least 1).
        M: desired number of partition files (at least 1).

    Raises:
        ValueError: if N or M is smaller than 1, or the header has no
            "username" column.
    """
    from contextlib import ExitStack

    from tqdm import tqdm

    # validate before touching any file so a bad call cannot destroy data
    if N < 1 or M < 1:
        raise ValueError(f"N and M must both be at least 1, got N={N}, M={M}")

    unified = f"{fn}.unified.csv"

    # Phase 1: merge the old partitions, keeping a single header line.
    with open(unified, "w") as out:
        for t in range(N):
            with open(f"{fn}.{t}.csv") as infile:
                for lineno, line in enumerate(tqdm(infile)):
                    if lineno == 0 and t != 0:
                        continue
                    out.write(line)
    # Remove every old partition, not just the last one; otherwise files
    # with index >= M would survive as stale duplicates when M < N.
    for t in range(N):
        os.remove(f"{fn}.{t}.csv")

    # Phase 2: route each row to its new partition. ExitStack closes all
    # handles even when a row fails to parse.
    with ExitStack() as stack:
        infile = stack.enter_context(open(unified))
        files = [
            stack.enter_context(open(f"{fn}.{t}.csv", "w")) for t in range(M)
        ]
        header = infile.readline()
        if header:
            usercol = header.strip().split(",").index("username")
            for f in files:
                f.write(header)
            for line in tqdm(infile):
                username = line.strip().split(",")[usercol]
                files[portable_hash(username) % M].write(line)
    os.remove(unified)

In [None]:
# TODO generational mark-sweep prioritization
def prioritize_users(K):
    """Pick up to K users of this partition whose lists should be fetched next.

    Only usernames from read_usernames() whose portable_hash modulo
    NUM_PARTITIONS equals PARTITION are considered. Users missing from the
    status file count as new and come first, in random order. They are
    followed by known users in status-file order (merge_block appends
    refreshed users at the end of that file, so earlier rows were refreshed
    longer ago). Users with 3 or more failed attempts are skipped.

    Returns a dict mapping each selected username to its recorded number of
    failed attempts (0 for new users).
    """
    logger.info(
        f"Prioritizing users from partition {PARTITION+1} out of {NUM_PARTITIONS}"
    )

    def announce(num_new, num_existing, num_broken):
        logger.info(
            f"Getting the lists of {num_new} new users, "
            f"refreshing the lists of {num_existing} existing users, "
            f"and skipping the lists of {num_broken} broken users!"
        )

    # starts as every user of the partition; known users are removed below
    pending = {
        name
        for name in read_usernames()
        if (portable_hash(name) % NUM_PARTITIONS) == PARTITION
    }

    # no status file yet: everybody is new
    if not os.path.exists(STATUS_FILE):
        announce(len(pending), 0, 0)
        fresh = list(pending)
        random.shuffle(fresh)
        return {name: 0 for name in fresh[:K]}

    stale = {}
    existing_count = 0
    broken_count = 0
    with open(STATUS_FILE, "r") as status:
        for index, row in enumerate(status):
            cells = row.strip().split(",")
            if index == 0:
                # header row: locate the columns we need
                name_col = cells.index("username")
                attempts_col = cells.index("failed_attempts")
                continue
            name = cells[name_col]
            attempts = int(cells[attempts_col])
            if name not in pending:
                continue
            pending.remove(name)
            if attempts >= 3:
                broken_count += 1
                continue
            existing_count += 1
            # keep counting after K so the log line reports the full total
            if len(stale) < K:
                stale[name] = attempts

    fresh = list(pending)
    random.shuffle(fresh)
    announce(len(fresh), existing_count, broken_count)

    def candidates():
        # new users first, then known users in status-file order
        for name in fresh:
            yield name, 0
        yield from stale.items()

    chosen = {}
    for name, attempts in candidates():
        if len(chosen) >= K:
            break
        chosen[name] = attempts
    return chosen

In [None]:
# number of failed attempts seen in a row; maintained by monitor_failures
consecutive_failure_count = 0


def monitor_failures(ok):
    """Track consecutive failures and pause collection after 20 in a row.

    A truthy `ok` resets the streak; a falsy one extends it. Once the streak
    reaches 20 the function logs it, sleeps for one hour, and resets the
    streak to zero.
    """
    global consecutive_failure_count
    consecutive_failure_count = 0 if ok else consecutive_failure_count + 1
    if consecutive_failure_count < 20:
        return
    logger.info(
        f"The most recent {consecutive_failure_count} attempts failed, pausing collection"
    )
    time.sleep(3600)
    consecutive_failure_count = 0

## Continuously refresh lists
* We take the least recently refreshed users and refresh their lists
* These lists are stored in a temporary block
* Once the block is big enough, we atomically merge it with the existing lists

In [None]:
def merge_block(file, user_field, users):
    """Fold `{file}.block` into `file`, replacing the rows of refreshed users.

    The merged result is written to `{file}~` and then moved over `file`
    with os.replace, so `file` is only ever swapped as a whole.

    Args:
        file: path of the main CSV; its block lives at `{file}.block`.
        user_field: index of the username within a comma-split row.
        users: usernames contained in the block; their existing rows in
            `file` are dropped in favour of the block's rows.

    Existing rows are also dropped when their user is not among the
    usernames read_usernames() yields for this partition.
    """
    staging_path = file + "~"
    block_path = file + ".block"
    main_exists = os.path.exists(file)
    with open(staging_path, "w") as merged:
        if main_exists:
            # carry over every row whose user was not refreshed in this block
            partition_users = {
                name
                for name in read_usernames()
                if (portable_hash(name) % NUM_PARTITIONS) == PARTITION
            }
            with open(file, "r") as current:
                for lineno, row in enumerate(tqdm(current)):
                    if lineno == 0:
                        # header row
                        merged.write(row)
                        continue
                    owner = row.strip().split(",")[user_field]
                    if owner in users or owner not in partition_users:
                        continue
                    merged.write(row)

        # append the block; its header is only needed when there was no main
        # file to take one from
        with open(block_path, "r") as fresh:
            for lineno, row in enumerate(tqdm(fresh)):
                if lineno > 0 or not main_exists:
                    merged.write(row)
    os.replace(staging_path, file)

In [None]:
def get_lock_name():
    """Return the path of the lock file for this NAME.

    The path is relative to the current working directory.
    """
    # the file lock keeps the upload to aws from running in the middle of a
    # database update
    lock_path = f"../../get_user_media_lists.{NAME}.lock"
    return lock_path


def acquire_lock(fn):
    """Wait for the lock file to be free, create it, then call `fn()`.

    Polls with a back-off of 1s doubling up to 10s while the lock file
    exists, then creates it in exclusive mode. If another process wins that
    race, the wait starts over. The lock file is NOT removed here; that is
    release_lock's job.

    Args:
        fn: zero-argument callable to run once the lock file is created.

    Any exception raised by `fn` propagates to the caller.
    """
    logger.info(f"Acquiring lock")
    file = get_lock_name()
    while True:
        sleep_time = 1
        while os.path.exists(file):
            time.sleep(sleep_time)
            sleep_time = min(sleep_time * 2, 10)
        # Only the exclusive create is guarded. If fn() ran inside this try,
        # a FileExistsError raised by fn itself would be mistaken for a lost
        # race and we would then wait forever on our own lock file.
        try:
            lock = open(file, "x")
        except FileExistsError:
            # another process created the lock between the check and the open
            continue
        with lock:
            logger.info(f"Acquired lock")
            fn()
        return


def release_lock():
    """Remove the lock file, best-effort.

    A missing lock file is ignored. Any other OS-level failure is logged
    rather than raised, so releasing never aborts the caller. Unlike a bare
    `except`, this does not swallow KeyboardInterrupt or SystemExit.
    """
    logger.info(f"Releasing lock")
    file = get_lock_name()
    try:
        os.remove(file)
    except FileNotFoundError:
        # already gone: nothing to release
        pass
    except OSError as e:
        logger.warning(f"Could not remove lock file {file}: {e}")


def run_blocking_function(fn):
    """Run `fn()` while holding the file lock, always releasing it afterwards.

    The lock is released even when `fn` raises, so a failed run does not
    leave a stale lock file that blocks every later run. It is released only
    if it was actually acquired: an interruption while still waiting must
    not delete a lock file owned by another process.

    Args:
        fn: zero-argument callable to run under the lock.
    """
    acquired = False

    def tracked():
        # acquire_lock calls this only after it has created the lock file
        nonlocal acquired
        acquired = True
        fn()

    try:
        acquire_lock(tracked)
    finally:
        if acquired:
            release_lock()

In [None]:
def merge_blocks():
    """Merge the pending block files into the anime, manga and status files.

    The usernames in the status block define which users were refreshed;
    their old rows are replaced in all three files.
    """
    logger.info(f"Merging block into the main database")
    status_block = pd.read_csv(
        f"{STATUS_FILE}.block",
        keep_default_na=False,
        usecols=["username"],
        dtype={"username": str},
    )
    users = set(status_block["username"])
    # (file, index of the username column): last column in the media lists,
    # first column in the status file
    targets = [(media_list_file(medium), -1) for medium in ["anime", "manga"]]
    targets.append((STATUS_FILE, 0))
    for path, user_field in targets:
        merge_block(path, user_field, users)
    logger.info(f"Merged block of {len(users)} users into the main database")

In [None]:
# merge any user lists from the last run: a status block file still on disk
# is merged (under the file lock) before new fetching starts
if os.path.exists(f"{STATUS_FILE}.block"):
    run_blocking_function(merge_blocks)

In [None]:
# Bound up front so the error handler below can always report it, even when
# the failure happens before the first user of the first batch is reached.
username = None
try:
    # get the list for each new user and write to disk
    while True:
        gc.collect()
        prioritized_users = prioritize_users(users_per_batch)
        memory = psutil.Process().memory_info()
        logger.info(f"Memory usage: {memory.rss / 1e9} {memory.vms / 1e9}")
        logger.info(f"Fetching lists")
        # users already written to the current block files; while it is empty
        # the block files are (re)created with a header instead of appended to
        block = set()
        for username in tqdm(prioritized_users):
            any_ok = False
            updated_at = 0
            for medium in ["anime", "manga"]:
                user_media_list, ok = get_user_media_list(username, medium)
                user_media_list.to_csv(
                    f"{media_list_file(medium)}.block",
                    index=False,
                    mode="w" if not block else "a+",
                    header=not block,
                )
                any_ok |= ok
                if len(user_media_list) > 0:
                    updated_at = max(updated_at, user_media_list["updated_at"].max())
            user_status_entry = pd.DataFrame.from_dict(
                {
                    "username": [username],
                    "access_timestamp": [int(datetime.datetime.now().timestamp())],
                    "last_update_timestamp": [updated_at],
                    # a user only counts as failed when every medium failed
                    "failed_attempts": [
                        0 if any_ok else prioritized_users[username] + 1
                    ],
                }
            )
            user_status_entry.to_csv(
                f"{STATUS_FILE}.block",
                index=False,
                mode="w" if not block else "a+",
                header=not block,
            )
            block.add(username)
            # NOTE(review): `ok` is the result of the last medium only
            # ("manga"), while failed_attempts above uses any_ok - confirm
            # which of the two the failure monitor is meant to see.
            monitor_failures(ok)
        run_blocking_function(merge_blocks)
except Exception:
    # logger.exception records the message and the full traceback
    logger.exception(f"ERROR with {username}")