# Generating a list of MAL users
* We start with a list of starting usernames, which can either be stored in `data/mal/user_facts/queue.txt` or specified in the notebook
* Then, we do a breadth-first search of their friends until the friend graph is spanned
* You can terminate or restart the notebook at any point without losing progress. All users found so far will be stored at `data/mal/user_facts/usernames.txt` 
* This notebook will run indefinitely. You must terminate it manually once an acceptable number of users has been found

In [1]:
import contextlib
import logging
import os
import re
import time

import numpy as np
import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [2]:
# Directory holding every state file this notebook reads and writes. Create it
# together with any missing parent directories (a plain os.mkdir fails when
# ../../data/mal does not exist yet), then make it the working directory so
# later cells can refer to the files by bare name.
data_path = "../../data/mal/user_facts"
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [3]:
# Configure the "GetUsers" logger to send every message both to get_users.log
# (in the current working directory) and to the notebook's output stream.
logger = logging.getLogger("GetUsers")
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object for the same name within a process,
# so re-running this cell would otherwise stack a second pair of handlers and
# emit each message twice. Detach and close anything left from a previous run.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
formatter = logging.Formatter(
    "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
for stream in [logging.FileHandler("get_users.log"), logging.StreamHandler()]:
    stream.setFormatter(formatter)
    logger.addHandler(stream)

In [4]:
# reasonable defaults if this is the first time running the notebook
queue = ["Fro116"]  # users still waiting to have their friends list fetched
usernames = set()  # every username discovered so far
closed_nodes = set()  # users whose friends list has already been fetched
shuffle_on_rerun = True

# if we are rerunning the notebook, then resume execution where we last left off
# (one username per line; blank lines are ignored)
if os.path.exists("queue.txt"):
    with open("queue.txt") as f:
        queue = [x.strip() for x in f if x.strip()]
if os.path.exists("usernames.txt"):
    with open("usernames.txt") as f:
        usernames = {x.strip() for x in f if x.strip()}
if os.path.exists("closed_nodes.txt"):
    with open("closed_nodes.txt") as f:
        closed_nodes = {x.strip() for x in f if x.strip()}
# handle the case where the notebook crashes mid-save: the three files may be
# out of sync, so rebuild a consistent view from their union.
# Drop users that were already processed, and collapse repeated entries while
# keeping first-seen order -- a name listed twice (e.g. in a hand-written
# queue.txt) would otherwise be fetched twice and break the one-removal-per-user
# bookkeeping on open_nodes in the crawl loop.
queue = list(dict.fromkeys(x for x in queue if x not in closed_nodes))
# Any known user that is neither processed nor queued still needs a visit.
queue = queue + list(usernames - closed_nodes - set(queue))
usernames |= set(queue)
open_nodes = set(queue)  # set view of the queue for O(1) membership tests
if shuffle_on_rerun:
    np.random.shuffle(queue)

logger.info(
    f"Starting with {len(queue)} users in queue and {len(closed_nodes)} processed "
    f"users for a total of {len(usernames)} users!"
)

GetUsers:INFO:2021-11-06 21:10:26: Starting with 309365 users in queue and 57902 processed users for a total of 367267 users!


In [5]:
# apply rate limiting with exponential backoff for unexpected errors
@sleep_and_retry
@limits(calls=1, period=3)
def call_api(url, retry_timeout=1):
    """GET ``url``, retrying with exponential backoff until a usable reply arrives.

    A request that raises, or that comes back with status 403, 429, 500 or 503,
    is logged, followed by a sleep of ``retry_timeout`` seconds and another
    attempt with the delay doubled (capped at 3600 seconds). The retry goes
    through the decorated ``call_api`` again, so it is subject to the same
    ``limits``/``sleep_and_retry`` wrappers as a fresh call.

    Args:
        url: Address to fetch.
        retry_timeout: Seconds to sleep before the next attempt if this one
            fails.

    Returns:
        The ``requests`` response object. Its status may still be a non-OK code
        outside the retried set (e.g. 404); callers must check it.
    """
    try:
        response = requests.get(url)
        if response.status_code in [403, 429, 500, 503]:
            # This can occur if MAL servers go down
            raise Exception(f"{response.status_code}")
    except Exception as e:
        # The message must interpolate retry_timeout: the previous `timeout`
        # name does not exist, so the first failure raised NameError from
        # inside this handler instead of retrying.
        logger.warning(
            f"Received error {str(e)} while accessing {url}. Retrying in {retry_timeout} seconds"
        )
        time.sleep(retry_timeout)
        retry_timeout = min(retry_timeout * 2, 3600)
        return call_api(url, retry_timeout)
    return response

In [6]:
def get_friends(username):
    """Return the set of usernames linked from ``username``'s friends page.

    The page is fetched through ``call_api`` and scanned for profile URLs that
    are immediately followed by a double quote. If the response is not OK, a
    warning is logged and an empty set is returned.
    """
    url = f"https://myanimelist.net/profile/{username}/friends"
    response = call_api(url)
    if response.ok:
        profile_links = re.findall(
            '''https://myanimelist.net/profile/[^"/#]+"''', response.text
        )
        # Each match is "<prefix><name>" plus the closing quote; keep the name.
        name_start = len("https://myanimelist.net/profile/")
        name_end = -len('"')
        return {link[name_start:name_end] for link in profile_links}

    logger.warning(f"Error {response} received when handling {url}")
    return set()

In [7]:
@contextlib.contextmanager
def atomic_overwrite(filename):
    """Context manager yielding a writable handle that replaces ``filename`` on success.

    Output goes to a sibling file named ``filename + "~"``. That file is moved
    over ``filename`` only if the ``with`` body finishes without raising;
    otherwise ``filename`` is left untouched.
    """
    scratch_path = filename + "~"
    handle = open(scratch_path, "w")
    try:
        yield handle
    finally:
        handle.close()
    # Reached only when the body did not raise: an exception propagates out of
    # the try/finally above and skips the swap.
    os.replace(scratch_path, filename)


def atomic_to_csv(collection, filename):
    """Write ``collection`` to ``filename`` as headerless, index-free CSV.

    The write goes through ``atomic_overwrite``, so ``filename`` is only
    replaced once the whole collection has been written.
    """
    with atomic_overwrite(filename) as sink:
        as_series = pd.Series(collection)
        as_series.to_csv(sink, index=False, header=False)

In [None]:
# we use a generator for profiling with tqdm
def generator(queue):
    """Yield once per loop iteration for as long as ``queue`` is non-empty.

    The emptiness test runs against the list object passed in, so the consumer
    must mutate that same list in place (pop/extend). Rebinding its own
    variable to a new list would leave this generator watching a stale object.
    """
    while queue:
        yield


def _write_snapshot():
    """Persist the current crawl state so a restarted notebook can resume."""
    atomic_to_csv(sorted(usernames), "usernames.txt")
    atomic_to_csv(sorted(closed_nodes), "closed_nodes.txt")
    atomic_to_csv(queue, "queue.txt")
    logger.info(
        f"Successfully wrote {len(queue)} users in queue and {len(closed_nodes)} "
        f"processed users for a total of {len(usernames)} users!"
    )


iterations_until_next_write = 1
iterations_since_last_write = 0
for _ in tqdm(generator(queue)):
    # Pop and extend in place. The previous `queue = queue[1:]` /
    # `queue = queue + new_friends` rebound the name to fresh lists, so the
    # generator kept seeing the original, never-shrinking list and the loop
    # ended in an IndexError instead of stopping once the queue was exhausted.
    # It also copied the whole queue twice per iteration.
    username = queue.pop(0)
    friends = get_friends(username)

    usernames |= friends
    new_friends = [x for x in friends if x not in closed_nodes and x not in open_nodes]
    queue.extend(new_friends)
    open_nodes.update(new_friends)
    # discard rather than remove: tolerate a user that is no longer tracked as
    # open (e.g. a duplicate queue entry) instead of aborting the crawl.
    open_nodes.discard(username)
    closed_nodes.add(username)

    # snapshot hourly to amortize the cost of the disk I/O; the interval doubles
    # after every write until it reaches 1200 iterations, which at one request
    # per 3 seconds is one hour
    iterations_since_last_write += 1
    if iterations_since_last_write >= iterations_until_next_write:
        iterations_since_last_write = 0
        iterations_until_next_write = min(2 * iterations_until_next_write, 1200)

        _write_snapshot()
        logger.info(
            f"Will next write data after {iterations_until_next_write} iterations"
        )

# The queue drained: flush whatever was processed since the last snapshot.
if iterations_since_last_write:
    _write_snapshot()

0it [00:00, ?it/s]GetUsers:INFO:2021-11-06 21:10:28: Successfully wrote 309365 users in queue and 57903 processed users for a total of 367268 users!
GetUsers:INFO:2021-11-06 21:10:28: Will next write data after 2 iterations
2it [00:03,  1.82s/it]GetUsers:INFO:2021-11-06 21:10:34: Successfully wrote 309371 users in queue and 57905 processed users for a total of 367276 users!
GetUsers:INFO:2021-11-06 21:10:34: Will next write data after 4 iterations
6it [00:15,  2.84s/it]GetUsers:INFO:2021-11-06 21:10:46: Successfully wrote 309374 users in queue and 57909 processed users for a total of 367283 users!
GetUsers:INFO:2021-11-06 21:10:46: Will next write data after 8 iterations
14it [00:39,  2.91s/it]GetUsers:INFO:2021-11-06 21:11:10: Successfully wrote 309394 users in queue and 57917 processed users for a total of 367311 users!
GetUsers:INFO:2021-11-06 21:11:10: Will next write data after 16 iterations
30it [01:27,  3.02s/it]GetUsers:INFO:2021-11-06 21:11:58: Successfully wrote 309426 users 