# Generating a list of MAL users
* We start with a list of starting usernames, which can either be stored in `data/mal/user_facts/queue.txt` or specified in the notebook
* Then, we do a breadth-first search of their friends until the friend graph is spanned
* You can terminate or restart the notebook at any point without losing progress. All users found so far will be stored at `data/mal/user_facts/usernames.txt` 
* This notebook will run indefinitely. You must terminate it manually once an acceptable number of users has been found

In [1]:
import contextlib
import os
import re
import time

import numpy as np
import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [2]:
# All crawl state (queue.txt, usernames.txt, closed_nodes.txt) lives in this
# directory; the rest of the notebook uses paths relative to it.
data_path = "../../data/mal/user_facts"
# makedirs also creates any missing parent directories and, with exist_ok,
# avoids the check-then-create race of a separate os.path.exists test
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [3]:
def _read_nonblank_lines(path):
    """Return the stripped, non-empty lines of `path`, or None if it is missing."""
    if not os.path.exists(path):
        return None
    with open(path) as handle:
        return [line.strip() for line in handle if line.strip()]


# If we are rerunning the notebook, resume from the files written by the
# previous run; otherwise fall back to reasonable first-run defaults.
saved_queue = _read_nonblank_lines("queue.txt")
saved_usernames = _read_nonblank_lines("usernames.txt")
saved_closed = _read_nonblank_lines("closed_nodes.txt")

queue = ["Fro116"] if saved_queue is None else saved_queue
usernames = set() if saved_usernames is None else set(saved_usernames)
closed_nodes = set() if saved_closed is None else set(saved_closed)

# Any known user that is neither processed nor already queued must still be visited.
unqueued = usernames - closed_nodes - set(queue)
queue = queue + list(unqueued)
np.random.shuffle(queue)
open_nodes = set(queue)

print(
    f"Starting with {len(queue)} users in queue and {len(closed_nodes)} processed users for a total of {len(usernames)} users!"
)

Starting with 284279 users in queue and 42701 processed users for a total of 326979 users!


In [4]:
# Status codes that are treated as transient (e.g. MAL servers being down)
# and therefore retried rather than returned to the caller.
_RETRYABLE_STATUS_CODES = frozenset({403, 429, 500, 503})


@sleep_and_retry
@limits(calls=1, period=3)
def _rate_limited_get(url, request_timeout):
    """Issue one GET request; the decorators allow at most one call per 3 seconds."""
    return requests.get(url, timeout=request_timeout)


def call_api(url, retry_delay=600, request_timeout=60):
    """Fetch `url`, retrying until a non-transient response is received.

    Args:
        url: The address to request.
        retry_delay: Seconds to sleep after a failed attempt before retrying.
        request_timeout: Seconds to wait on the server before giving up on a
            single attempt, so a stalled connection cannot hang the crawl.

    Returns:
        The `requests` response. Its status may still indicate an error that
        is not in the retryable set; callers must check it themselves.
    """
    # Retry with a loop rather than recursion: during a long outage a
    # recursive retry would add one stack frame per attempt without bound.
    while True:
        try:
            response = _rate_limited_get(url, request_timeout)
        except requests.RequestException as e:
            error = str(e)
        else:
            if response.status_code not in _RETRYABLE_STATUS_CODES:
                return response
            # This can occur if MAL servers go down
            error = str(response.status_code)
        print(
            f"Received error {error} while accessing {url}. Retrying in {retry_delay} seconds"
        )
        time.sleep(retry_delay)

In [5]:
# Matches a double-quote-terminated absolute profile link and captures the
# username segment. The dots are escaped so they only match a literal ".".
_PROFILE_LINK_PATTERN = re.compile(r'https://myanimelist\.net/profile/([^"/#]+)"')


def get_friends(username):
    """Return the set of usernames linked from `username`'s friends page.

    Args:
        username: The MAL username whose friends page is fetched.

    Returns:
        A set of usernames extracted from every profile link found in the
        page text, or an empty set if the response is not OK (the error is
        printed, not raised).
    """
    url = f"https://myanimelist.net/profile/{username}/friends"
    response = call_api(url)
    if not response.ok:
        print(f"Error {response} received when handling {url}")
        return set()
    # findall with a single capture group returns just the captured usernames
    return set(_PROFILE_LINK_PATTERN.findall(response.text))

In [6]:
@contextlib.contextmanager
def atomic_overwrite(filename):
    """Context manager that replaces `filename` only if the write succeeds.

    Yields a text file object opened for writing on a temporary sibling file
    (`filename` + "~"). When the `with` body finishes without raising, the
    temporary file is moved over `filename`. If the body raises, `filename`
    is left untouched, the temporary file is removed, and the exception
    propagates.

    Args:
        filename: Path of the file to overwrite.
    """
    temp = filename + "~"
    try:
        with open(temp, "w") as f:
            yield f
    except BaseException:
        # Don't leave a half-written temp file behind; the file is already
        # closed here because the inner `with` has exited.
        with contextlib.suppress(OSError):
            os.remove(temp)
        raise
    os.replace(temp, filename)  # this will only happen if no exception was raised


def atomic_to_csv(collection, filename):
    """Write `collection` to `filename` as CSV with no header and no index.

    The output goes through `atomic_overwrite`, so `filename` is only
    replaced once the whole collection has been written.
    """
    with atomic_overwrite(filename) as handle:
        column = pd.Series(collection)
        column.to_csv(handle, index=False, header=False)

In [None]:
# we use a generator for profiling with tqdm
def generator(queue):
    """Yield once per loop iteration for as long as `queue` is non-empty.

    The generator holds a reference to the list it was given, so the loop
    below must mutate that list in place; rebinding the name `queue` would
    leave this check looking at a stale list.
    """
    while queue:
        yield


for _ in tqdm(generator(queue)):
    # Pop/extend in place so the generator's `while queue` test sees the
    # same list and the loop stops cleanly once the queue is exhausted.
    username = queue.pop(0)
    friends = get_friends(username)

    usernames |= friends
    # Only enqueue users that are neither processed nor already waiting.
    new_friends = [x for x in friends if x not in closed_nodes and x not in open_nodes]
    queue.extend(new_friends)
    open_nodes.update(new_friends)
    # discard (not remove): a username duplicated in queue.txt is no longer
    # in open_nodes the second time it is processed.
    open_nodes.discard(username)
    closed_nodes.add(username)

    # Persist progress after every user so the notebook can be stopped at any point.
    atomic_to_csv(queue, "queue.txt")
    atomic_to_csv(sorted(usernames), "usernames.txt")
    atomic_to_csv(sorted(closed_nodes), "closed_nodes.txt")

11it [00:44,  6.81s/it]