# Generating a list of MAL usernames
* The page https://myanimelist.net/users.php contains a list of recently active users
* We continually refresh that page and write down any new users
* You can terminate or restart the notebook at any point without losing progress. All users found so far will be stored at `data/mal/user_facts/recent_usernames.txt`.
* This notebook will run indefinitely. You must manually terminate it once an acceptable number of users has been found

In [1]:
import contextlib
import logging
import os
import re
import shutil
import time

import numpy as np
import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [2]:
# outdir: make sure the output directory exists, then work from inside it so
# that every later cell can use bare relative filenames
data_path = "../../data/mal/user_facts"
try:
    os.mkdir(data_path)
except FileExistsError:
    # already created by an earlier run
    pass
os.chdir(data_path)

In [3]:
# logging: send every record to both a log file and the notebook output
logger = logging.getLogger("GetRecentUsernames")
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object for the same name, so re-running
# this cell would otherwise stack another pair of handlers onto the logger and
# every message would be emitted multiple times. Drop the old ones first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
formatter = logging.Formatter(
    "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
for stream in [
    logging.FileHandler("get_recent_usernames.log"),
    logging.StreamHandler(),
]:
    stream.setFormatter(formatter)
    logger.addHandler(stream)

In [4]:
# if we are rerunning the notebook, then resume execution where we last left off:
# reload every non-blank line of the snapshot file as an already-known username
recent_usernames = set()
if os.path.exists("recent_usernames.txt"):
    with open("recent_usernames.txt") as f:
        stripped_lines = (line.strip() for line in f)
        recent_usernames = {name for name in stripped_lines if name}


logger.info(f"Starting with {len(recent_usernames)} stored usernames")

GetRecentUsernames:INFO:2022-04-22 14:18:40: Starting with 935047 stored usernames


In [5]:
# apply rate limiting with exponential backoff for unexpected errors
@sleep_and_retry
@limits(calls=1, period=3)
def call_api(url, retry_timeout=1, request_timeout=60):
    """Fetch `url`, retrying with exponential backoff until a usable response arrives.

    Args:
        url: address to GET.
        retry_timeout: seconds to sleep before the next retry; doubled after
            every failure and capped at 3600.
        request_timeout: seconds to wait on the server before giving up on a
            single request. Without it a stalled connection would block the
            notebook forever and the retry logic would never run.

    Returns:
        The `requests` response. Status codes other than 403/429/500/503 are
        returned to the caller as-is, including other error codes.
    """
    try:
        response = requests.get(url, timeout=request_timeout)
        if response.status_code in [403, 429, 500, 503]:
            # This can occur if MAL servers go down
            raise Exception(f"{response.status_code}")
    except Exception as e:
        # deliberately broad: any failure (network error, timeout, retryable
        # status) is handled the same way -- log, wait, try again
        logger.warning(
            f"Received error {str(e)} while accessing {url}. Retrying in {retry_timeout} seconds"
        )
        time.sleep(retry_timeout)
        retry_timeout = min(retry_timeout * 2, 3600)
        # recurse through the decorated function so retries are rate limited too
        return call_api(url, retry_timeout, request_timeout)
    return response

In [6]:
# parse the recently online users page on https://myanimelist.net/users.php
def get_users():
    """Return the set of usernames linked from the recently-online users page.

    Every `/profile/<name>"` link in the page body contributes `<name>`.
    Names containing a "%" are dropped. An empty set is returned when the
    page responds with 404 or any other non-OK status.
    """
    url = "https://myanimelist.net/users.php"
    response = call_api(url)
    if response.status_code == 404:
        # page not found: nothing to parse
        return set()
    if not response.ok:
        logger.warning(f"Error {response} received when handling {url}")
        return set()

    prefix_length = len("/profile/")
    profile_links = re.findall('''/profile/[^"/#]+"''', response.text)
    # strip the leading "/profile/" and the trailing quote from every match
    names = {link[prefix_length:-1] for link in profile_links}
    return {name for name in names if "%" not in name}

In [7]:
# atomic saving utilities
@contextlib.contextmanager
def atomic_overwrite(filename):
    """Context manager that replaces `filename` only after a complete write.

    Yields a text-mode file object backed by the temporary file
    `filename + "~"`. When the `with` body finishes normally the temporary
    file is moved over `filename` with `os.replace`. If the body raises
    (including a KeyboardInterrupt from stopping the notebook), `filename`
    is left untouched, the partial temporary file is deleted and the
    exception propagates.
    """
    temp = filename + "~"
    try:
        with open(temp, "w") as f:
            yield f
    except BaseException:
        # don't leave a half-written temp file lying around next to the data
        with contextlib.suppress(OSError):
            os.remove(temp)
        raise
    os.replace(temp, filename)


def atomic_to_csv(collection, filename):
    """Write `collection` to `filename`, one item per row, via `atomic_overwrite`.

    The items are wrapped in a pandas Series and written as CSV without a
    header row or an index column.
    """
    with atomic_overwrite(filename) as handle:
        rows = pd.Series(collection)
        rows.to_csv(handle, header=False, index=False)


@contextlib.contextmanager
def atomic_append(filename):
    """Context manager that appends to `filename` without ever exposing a partial file.

    Yields a text-mode file object backed by `filename + "~"`. When the
    `with` body finishes normally, the current contents of `filename` (if it
    exists yet) followed by the newly written data are copied into
    `filename + "~~"`, which is then moved over `filename` with `os.replace`.
    If anything fails, `filename` is left untouched, both temporary files are
    removed and the exception propagates.
    """
    temp = filename + "~"
    temp2 = temp + "~"
    try:
        with open(temp, "w") as f:
            yield f

        # on the very first append there is no existing file to copy from
        sources = [temp]
        if os.path.exists(filename):
            sources.insert(0, filename)

        with open(temp2, "wb") as wfd:
            for source in sources:
                with open(source, "rb") as fd:
                    shutil.copyfileobj(fd, wfd)
        os.replace(temp2, filename)
    except BaseException:
        # drop whatever partial temp files exist; the target was not modified
        for leftover in (temp, temp2):
            with contextlib.suppress(OSError):
                os.remove(leftover)
        raise
    os.remove(temp)


def atomic_append_dataframe_to_csv(df, filename):
    """Append the rows of `df` to the CSV file `filename` through `atomic_append`.

    The column header is written only when `filename` does not exist yet, so
    that later appends add rows only. The index is never written.
    """
    first_run = not os.path.exists(filename)
    with atomic_append(filename) as f:
        df.to_csv(f, index=False, header=first_run)

In [8]:
# snapshot hourly to amortize the cost of the disk I/O
# maps reason -> (iterations since the last write, iterations until the next write)
save_reasons = {}


def should_save(reason):
    """Decide whether the caller should write its data on this iteration.

    Each `reason` keeps its own counter in `save_reasons`. The first call for
    a reason returns True; after every write the gap to the next one doubles,
    up to a cap of 1200 iterations (presumably about an hour at one request
    every ~3 seconds, per the comment above -- verify against the rate limit).

    Returns:
        True when the caller should write now, False otherwise.
    """
    since_last_write, until_next_write = save_reasons.get(reason, (0, 1))
    since_last_write += 1

    write_now = since_last_write >= until_next_write
    if write_now:
        since_last_write = 0
        until_next_write = min(2 * until_next_write, 1200)
        logger.info(
            f"Writing data for {reason}. Will next write data "
            f"after {until_next_write} iterations"
        )

    save_reasons[reason] = (since_last_write, until_next_write)
    return write_now

In [None]:
# we use a generator for profiling with tqdm
def generator():
    """Yield None forever so tqdm can report the speed of an endless loop."""
    while True:
        yield


try:
    for _ in tqdm(generator()):
        users = get_users()
        recent_usernames |= users
        if should_save("users"):
            atomic_to_csv(sorted(recent_usernames), "recent_usernames.txt")
            logger.info(f"Stored {len(recent_usernames)} usernames in total")
finally:
    # Writes are throttled by should_save, so usernames collected since the
    # last write exist only in memory. Flush them when the loop is stopped
    # (e.g. by interrupting the kernel) so that terminating loses no progress.
    atomic_to_csv(sorted(recent_usernames), "recent_usernames.txt")
    logger.info(f"Stored {len(recent_usernames)} usernames in total before exiting")

0it [00:00, ?it/s]GetRecentUsernames:INFO:2022-04-22 14:18:41: Writing data for users. Will next write data after 2 iterations
GetRecentUsernames:INFO:2022-04-22 14:18:43: Successfully added 935050 new users 
2it [00:03,  1.68s/it]GetRecentUsernames:INFO:2022-04-22 14:18:47: Writing data for users. Will next write data after 4 iterations
GetRecentUsernames:INFO:2022-04-22 14:18:48: Successfully added 935056 new users 
6it [00:15,  2.60s/it]GetRecentUsernames:INFO:2022-04-22 14:18:58: Writing data for users. Will next write data after 8 iterations
GetRecentUsernames:INFO:2022-04-22 14:19:00: Successfully added 935064 new users 
14it [00:39,  2.86s/it]GetRecentUsernames:INFO:2022-04-22 14:19:22: Writing data for users. Will next write data after 16 iterations
GetRecentUsernames:INFO:2022-04-22 14:19:24: Successfully added 935080 new users 
30it [01:27,  3.02s/it]GetRecentUsernames:INFO:2022-04-22 14:20:10: Writing data for users. Will next write data after 32 iterations
GetRecentUsername