# Generating a list of MAL usernames
* We discover usernames by stepping through user ids and collecting every username that has commented on that id's profile
* You can terminate or restart the notebook at any point without losing progress. All users found so far will be stored at `data/mal/user_facts/usernames_from_id.txt`.
* This notebook will run indefinitely. You must terminate it manually once an acceptable number of users has been found.

In [1]:
import contextlib
import logging
import os
import re
import shutil
import time

import numpy as np
import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry

In [2]:
# outdir: create the output directory if needed and make it the working directory,
# so every relative filename used later in the notebook resolves inside it
data_path = "../../data/mal/user_facts"
try:
    os.mkdir(data_path)
except FileExistsError:
    # already present (e.g. from an earlier run) - nothing to create
    pass
os.chdir(data_path)

In [3]:
# logging: send every record both to get_user_from_id.log and to the notebook output
logger = logging.getLogger("GetUserFromID")
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object for the same name, so re-running
# this cell would stack a second pair of handlers on it and duplicate every log
# line. Detach and close whatever an earlier run attached before adding new ones.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
formatter = logging.Formatter(
    "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
for stream in [
    logging.FileHandler("get_user_from_id.log"),
    logging.StreamHandler(),
]:
    stream.setFormatter(formatter)
    logger.addHandler(stream)

In [4]:
# if we are rerunning the notebook, then resume execution where we last left off

# usernames collected by earlier runs: one per line, blank lines ignored
usernames = set()
if os.path.exists("usernames_from_id.txt"):
    with open("usernames_from_id.txt") as f:
        stripped_lines = (line.strip() for line in f)
        usernames = {name for name in stripped_lines if name}

# the user id to query next: the first non-blank line of the snapshot file,
# or 1 when no snapshot exists yet
userid = 1
if os.path.exists("current_userid.txt"):
    with open("current_userid.txt") as f:
        nonblank_lines = [line.strip() for line in f if line.strip()]
        userid = int(nonblank_lines[0])

logger.info(f"Starting with {len(usernames)} stored usernames at residue {userid}")

GetUserFromID:INFO:2022-07-13 04:36:56: Starting with 1649762 stored usernames at residue 8222427


In [5]:
# apply rate limiting with exponential backoff for unexpected errors
@sleep_and_retry
@limits(calls=1, period=3)
def _rate_limited_get(url, request_timeout):
    """Issue a single GET request, allowing at most one request per 3 seconds."""
    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would freeze the whole crawl.
    return requests.get(url, timeout=request_timeout)


def call_api(url, retry_timeout=1, request_timeout=60):
    """Fetch `url`, retrying with exponential backoff until a usable response arrives.

    A request is retried when it raises, or when the server answers with one of
    the status codes 403, 429, 500, 503 or 504. Any other response (including
    other error statuses such as 404) is returned to the caller unchanged.

    Args:
        url: the address to fetch.
        retry_timeout: seconds to wait before the first retry; the wait doubles
            after every failed attempt, capped at 3600 seconds.
        request_timeout: seconds to wait on a single request before treating it
            as failed and retrying.

    Returns:
        The `requests` response object of the first attempt that was not retried.
    """
    retryable_statuses = (403, 429, 500, 503, 504)
    # Retry in a loop rather than by recursion so that a long outage cannot
    # grow the call stack.
    while True:
        try:
            response = _rate_limited_get(url, request_timeout)
            if response.status_code not in retryable_statuses:
                return response
            # This can occur if MAL servers go down
            error = f"{response.status_code}"
        except Exception as e:
            # Deliberately broad: the crawl is meant to run unattended, so any
            # failure of a single request is logged and retried, not raised.
            error = str(e)
        logger.warning(
            f"Received error {error} while accessing {url}. Retrying in {retry_timeout} seconds"
        )
        time.sleep(retry_timeout)
        retry_timeout = min(retry_timeout * 2, 3600)

In [6]:
# returns all usernames that have commented on the given userid's profile
def get_usernames(userid):
    """Collect the usernames linked from the comments page of `userid`.

    Returns a set of username strings. The set is empty when the page answers
    404 or any other non-OK status; the latter case is also logged as a warning.
    """
    url = f"https://myanimelist.net/comments.php?id={userid}"
    response = call_api(url)

    if response.status_code == 404:
        # the user may have deleted their account
        return set()
    if not response.ok:
        logger.warning(f"Error {response} received when handling {url}")
        return set()

    # every match has the form /profile/<name>" - drop the prefix and the closing quote
    prefix_length = len("/profile/")
    profile_links = re.findall('''/profile/[^"/#%]+"''', response.text)
    return {link[prefix_length:-1] for link in profile_links}

In [7]:
# atomic saving utilities
@contextlib.contextmanager
def atomic_overwrite(filename):
    """Context manager that replaces `filename` only after a complete write.

    Yields a text-mode file object opened on a temporary sibling file
    (`filename` + "~"). When the `with` body finishes normally, the data is
    flushed to disk and the temporary file is renamed over `filename`. When the
    body raises, `filename` is left untouched, the temporary file is deleted,
    and the exception propagates.
    """
    temp = filename + "~"
    try:
        with open(temp, "w") as f:
            yield f
            # Push the data to disk before the rename, so that a crash shortly
            # after os.replace cannot leave an incomplete file under `filename`.
            f.flush()
            os.fsync(f.fileno())
    except BaseException:
        # Do not leave a half-written temporary file behind.
        with contextlib.suppress(OSError):
            os.remove(temp)
        raise
    os.replace(temp, filename)


def atomic_to_csv(collection, filename):
    """Write `collection` to `filename` as one CSV column, without header or index.

    The file is written through `atomic_overwrite`, so `filename` is replaced
    only once the whole column has been written.
    """
    with atomic_overwrite(filename) as out:
        column = pd.Series(collection)
        column.to_csv(out, header=False, index=False)

In [8]:
# snapshot hourly to amortize the cost of the disk I/O
def should_save(reason):
    """Report whether the caller should write a snapshot on this iteration.

    Each `reason` keeps its own schedule in `save_reasons` as a pair
    (iterations since the last write, iterations until the next write).
    The first call for a reason returns True; after every True result the
    interval doubles, up to a maximum of 1200 iterations.
    """
    since_last_write, until_next_write = save_reasons.get(reason, (0, 1))
    since_last_write += 1

    write_now = since_last_write >= until_next_write
    if write_now:
        since_last_write = 0
        until_next_write = min(2 * until_next_write, 1200)
        logger.info(
            f"Writing data for {reason}. Will next write data "
            f"after {until_next_write} iterations"
        )

    save_reasons[reason] = (since_last_write, until_next_write)
    return write_now


# reason -> (iterations since the last write, iterations until the next write)
save_reasons = {}

In [None]:
def save():
    """Snapshot the crawl state to disk.

    Writes the collected usernames (sorted) to usernames_from_id.txt and the
    current user id to current_userid.txt, then logs a confirmation.
    """
    ordered_usernames = sorted(usernames)
    atomic_to_csv(ordered_usernames, "usernames_from_id.txt")
    atomic_to_csv([userid], "current_userid.txt")
    logger.info(f"Successfully wrote {len(usernames)} users at residue {userid}")

# we search through user ids using a cyclic group to avoid temporal bias:
# each step multiplies the current id by cyclic_generator modulo maxid_prime_number
maxid_prime_number = 14999981  # a prime number that upper bounds the number of MAL ids
cyclic_generator = 3
while True:
    next_userid = (userid * cyclic_generator) % maxid_prime_number
    # a pass ends when the next step would wrap around to 1
    # NOTE(review): the id whose successor is 1 is therefore not queried before
    # the pass ends - confirm whether that is intended
    while next_userid != 1:
        usernames.update(get_usernames(userid))
        userid = next_userid
        if should_save("users"):
            save()
        next_userid = (userid * cyclic_generator) % maxid_prime_number
    # always snapshot at the end of a pass, then widen the modulus for the next one
    # NOTE(review): the doubled modulus is not written to disk, so a restarted
    # notebook starts again from the original value - confirm this is acceptable
    save()
    maxid_prime_number *= 2

GetUserFromID:INFO:2022-07-13 04:36:57: Writing data for users. Will next write data after 2 iterations
GetUserFromID:INFO:2022-07-13 04:37:00: Successfully wrote 1649763 users at residue 9667300
GetUserFromID:INFO:2022-07-13 04:37:04: Writing data for users. Will next write data after 4 iterations
GetUserFromID:INFO:2022-07-13 04:37:07: Successfully wrote 1649765 users at residue 12005795
GetUserFromID:INFO:2022-07-13 04:37:16: Writing data for users. Will next write data after 8 iterations
GetUserFromID:INFO:2022-07-13 04:37:20: Successfully wrote 1649768 users at residue 12470611
GetUserFromID:INFO:2022-07-13 04:37:41: Writing data for users. Will next write data after 16 iterations
GetUserFromID:INFO:2022-07-13 04:37:44: Successfully wrote 1649778 users at residue 9782397
GetUserFromID:INFO:2022-07-13 04:38:29: Writing data for users. Will next write data after 32 iterations
GetUserFromID:INFO:2022-07-13 04:38:33: Successfully wrote 1649794 users at residue 7764381
GetUserFromID:IN