# Generating a list of MAL usernames
* We start with a list of starting usernames, which can either be stored in `data/mal/user_facts/queue.txt` or specified in the notebook
* You can optionally run `notebooks/API/GetRecentUsernames` to seed the search 
* Then, we do a breadth-first search of their friends until the friend graph is spanned
* You can terminate or restart the notebook at any point without losing progress. All users found so far will be stored at `data/mal/user_facts/usernames.txt`. The adjacency list of the friend graph will be stored at `data/mal/user_facts/friends_list.csv`.
* This notebook will run indefinitely. You must manually terminate it once an acceptable number of users has been found.

In [None]:
import collections
import contextlib
import logging
import os
import re
import shutil
import time

import numpy as np
import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

## Basic Setup

In [None]:
# outdir
data_path = "../../data/mal/user_facts"
# makedirs also creates any missing parent directories and, with exist_ok=True,
# does nothing when the directory already exists (no check-then-create race)
os.makedirs(data_path, exist_ok=True)
# later cells open their files by bare name, so they resolve inside this directory
os.chdir(data_path)

In [None]:
# logging
logger = logging.getLogger("GetUserFriends")
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter(
    "%(name)s:%(levelname)s:%(asctime)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)
# logging.getLogger hands back the same logger object for the same name, so
# re-running this cell would otherwise stack another pair of handlers on it and
# every message would be emitted multiple times
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
# send every message both to a log file in the working directory and to the console
for stream in [logging.FileHandler("get_user_friends.log"), logging.StreamHandler()]:
    stream.setFormatter(formatter)
    logger.addHandler(stream)

## Parse MAL

In [None]:
# apply rate limiting with exponential backoff for unexpected errors
@sleep_and_retry
@limits(calls=1, period=3)
def call_api(url, retry_timeout=1, request_timeout=60):
    """Fetch ``url`` with a GET request, retrying until a usable response arrives.

    The decorators restrict this function to one call per 3-second period.

    Args:
        url: address to fetch.
        retry_timeout: seconds to sleep before the next attempt if this one
            fails. It doubles after every failure, capped at 3600.
        request_timeout: seconds a single attempt may wait on the server.
            An attempt that exceeds it raises and is retried like any other
            failure.

    Returns:
        The response of the first attempt that neither raised nor came back
        with status 403, 429, 500 or 503. Any other status (404 included) is
        handed to the caller untouched.
    """
    try:
        # without a timeout, requests can wait forever on a stalled connection
        response = requests.get(url, timeout=request_timeout)
        if response.status_code in [403, 429, 500, 503]:
            # This can occur if MAL servers go down
            raise Exception(f"{response.status_code}")
    except Exception as e:
        # deliberately broad: any failure should back off and retry rather
        # than stop a long-running crawl
        logger.warning(
            f"Received error {str(e)} while accessing {url}. Retrying in {retry_timeout} seconds"
        )
        time.sleep(retry_timeout)
        retry_timeout = min(retry_timeout * 2, 3600)
        # retry through the decorated function so retries stay rate limited too
        return call_api(url, retry_timeout, request_timeout)
    return response

In [None]:
# parse a user's MAL profile page for their friends
def get_friends(username):
    """Return the set of usernames linked from ``username``'s friends page.

    Every profile link found in the page text is collected, so the result is
    not necessarily limited to entries of the friend list itself.

    Args:
        username: MAL username whose friends page is fetched.

    Returns:
        A set of username strings. It is empty when the page returns 404 or
        any other non-OK status; the latter is also logged as a warning.
    """
    url = f"https://myanimelist.net/profile/{username}/friends"
    response = call_api(url)
    if response.status_code == 404:
        # the user may have deleted their account
        return set()
    if not response.ok:
        logger.warning(f"Error {response} received when handling {url}")
        return set()
    # The dots are escaped so they only match a literal ".", and the capture
    # group returns the name directly. A name ends at the closing quote and may
    # not contain '"', '/', '%' or '#', which excludes deeper URLs such as
    # ".../profile/<name>/friends".
    return set(
        re.findall(r'https://myanimelist\.net/profile/([^"/%#]+)"', response.text)
    )

## Manage username database

In [None]:
# atomic saving utilities
@contextlib.contextmanager
def atomic_overwrite(filename):
    """Context manager yielding a text file that replaces ``filename`` on success.

    The caller writes into a staging file named ``filename`` + "~". Only when
    the ``with`` body finishes without raising is the staging file moved over
    ``filename``; if the body raises, ``filename`` is left untouched and the
    staging file stays on disk.
    """
    staging_path = filename + "~"
    with open(staging_path, "w") as staging_file:
        yield staging_file
    # reached only after the staging file has been closed without an error
    os.replace(staging_path, filename)


def atomic_to_csv(collection, filename):
    """Write ``collection`` to ``filename`` as a headerless, index-free CSV column.

    The file is produced through ``atomic_overwrite``, so an existing
    ``filename`` is only replaced once the new contents are fully written.
    """
    with atomic_overwrite(filename) as sink:
        as_series = pd.Series(collection)
        as_series.to_csv(sink, index=False, header=False)


@contextlib.contextmanager
def atomic_append(filename):
    """Context manager yielding a text file whose contents are appended to ``filename``.

    The caller writes the new data into ``filename`` + "~". Afterwards the
    existing file and the new data are concatenated into ``filename`` + "~~",
    which then replaces ``filename`` in a single step. ``filename`` must
    already exist.
    """
    addition_path = filename + "~"
    with open(addition_path, "w") as addition:
        yield addition

    # build the combined file on the side so the original is never half-written
    combined_path = addition_path + "~"
    with open(combined_path, "wb") as combined:
        for part_path in (filename, addition_path):
            with open(part_path, "rb") as part:
                shutil.copyfileobj(part, combined)
    os.remove(addition_path)
    os.replace(combined_path, filename)


def atomic_append_dataframe_to_csv(df, filename):
    """Add the rows of ``df`` to the CSV at ``filename``, creating it if needed.

    A new file gets the column header followed by the rows; an existing file
    only gets the rows appended. The index is never written.
    """
    if os.path.exists(filename):
        # the header is already in the file, so only the rows are added
        with atomic_append(filename) as sink:
            df.to_csv(sink, index=False, header=False)
        return
    with atomic_overwrite(filename) as sink:
        df.to_csv(sink, index=False)

In [None]:
# snapshot hourly to amortize the cost of the disk I/O
def should_save(reason):
    """Report whether the caller should write its data on this iteration.

    Each ``reason`` has its own counter in ``save_reasons``, stored as
    ``(iterations since last write, iterations until next write)``. The first
    call for a reason says "save"; after every save the interval doubles,
    up to a maximum of 1200 iterations.

    Args:
        reason: key identifying the independent save schedule to advance.

    Returns:
        True when the interval has elapsed (the counter is then reset),
        otherwise False.
    """
    since_last_write, iterations_until_next_write = save_reasons.setdefault(
        reason, (0, 1)
    )
    since_last_write += 1
    is_due = since_last_write >= iterations_until_next_write
    if is_due:
        since_last_write = 0
        iterations_until_next_write = min(2 * iterations_until_next_write, 1200)
        logger.info(
            f"Writing data for {reason}. Will next write data "
            f"after {iterations_until_next_write} iterations"
        )
    save_reasons[reason] = (since_last_write, iterations_until_next_write)
    return is_due


# per-reason schedule state consulted and updated by should_save
save_reasons = {}

## Seed the graph search

In [None]:
def reseed_search(shuffle_on_rerun=True):
    """Build the starting state for the friend-graph search.

    State is loaded from the working directory when the files exist:
    ``queue.txt`` (users still to visit), ``usernames.txt`` and
    ``recent_usernames.txt`` (all known users) and ``closed_nodes.txt``
    (users already visited). Each file holds one username per line. Without
    a ``queue.txt`` the search starts from a single default user.

    Names containing "%" are discarded while loading. ``get_friends`` never
    produces such names, so any found in saved files are treated as invalid.

    Args:
        shuffle_on_rerun: when True (the default), the queue is shuffled
            before it is returned.

    Returns:
        A tuple ``(queue, usernames, open_nodes, closed_nodes)``: ``queue`` is
        a list without duplicates, ``open_nodes`` is the same names as a set,
        ``closed_nodes`` is the set of visited users and ``usernames`` is the
        set of all known users.
    """

    def read_names(path):
        # Usable names stored in path, or None when the file does not exist.
        try:
            with open(path) as f:
                names = [line.strip() for line in f]
        except FileNotFoundError:
            return None
        return [name for name in names if name and "%" not in name]

    # reasonable defaults if this is the first time running the notebook;
    # if we are rerunning the notebook, resume where we last left off
    queue = read_names("queue.txt")
    if queue is None:
        queue = ["Fro116"]
    usernames = set()
    for username_fn in ["usernames.txt", "recent_usernames.txt"]:
        usernames.update(read_names(username_fn) or ())
    closed_nodes = set(read_names("closed_nodes.txt") or ())

    # verify consistency of loaded data structures
    # they might be inconsistent if the notebook crashed mid-save
    # (dict.fromkeys drops duplicate entries while keeping first-seen order)
    queue = [x for x in dict.fromkeys(queue) if x not in closed_nodes]
    # any known user that is neither visited nor queued still has to be visited
    queue.extend(usernames - closed_nodes - set(queue))
    usernames.update(queue)
    open_nodes = set(queue)
    if shuffle_on_rerun:
        np.random.shuffle(queue)

    logger.info(
        f"Starting with {len(queue)} users in queue and {len(closed_nodes)} processed "
        f"users for a total of {len(usernames)} users!"
    )
    return queue, usernames, open_nodes, closed_nodes

# Continuously query user friends
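* Each iteration takes the next username off the queue, fetches their friends, and adds any friend that has not been seen before to the queue
* Progress is saved periodically; the number of iterations between saves doubles after each save, up to a maximum of 1200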

In [None]:
# we use a generator for profiling with tqdm
def generate_true():
    """Yield ``None`` endlessly, giving a progress bar an unbounded iterable to count."""
    while True:
        yield None


def atomic_save_progress(usernames, friends_list, queue, closed_nodes):
    """Write the current search state to disk and log a summary.

    Args:
        usernames: all known users; written sorted to ``usernames.txt``.
        friends_list: DataFrame of rows to add to ``friends_list.csv``.
        queue: users still to visit; written in order to ``queue.txt``.
        closed_nodes: visited users; written sorted to ``closed_nodes.txt``.

    Each file is replaced atomically on its own; the four writes together
    are not a single transaction.
    """
    atomic_to_csv(sorted(usernames), "usernames.txt")
    atomic_append_dataframe_to_csv(friends_list, "friends_list.csv")
    atomic_to_csv(queue, "queue.txt")
    atomic_to_csv(sorted(closed_nodes), "closed_nodes.txt")
    logger.info(
        f"Successfully wrote {len(queue)} users in queue and {len(closed_nodes)} "
        f"processed users for a total of {len(usernames)} users!"
    )


def search_friend_graph():
    """Run a breadth-first search over the friend graph until the queue is empty.

    The starting state comes from ``reseed_search``. Each iteration takes the
    next username off the queue, fetches their friends, and queues every
    friend that is neither visited nor already queued. One
    ``(Username, Friend)`` row is recorded per friend, plus one row pairing
    the user with themselves, so a user with no friends still appears in the
    output. State is written with ``atomic_save_progress`` whenever
    ``should_save("users")`` says so, and once more when the queue runs out.
    """
    columns = ["Username", "Friend"]
    seed_queue, usernames, open_nodes, closed_nodes = reseed_search()
    # a deque pops from the left in O(1); slicing a list copies it on every pop
    queue = collections.deque(seed_queue)
    # rows gathered since the last save; DataFrame.append no longer exists in
    # pandas 2.x, so the frame is built once per save instead of being grown
    pending_rows = []

    def save():
        # Write the current state and forget the rows that are now on disk.
        friends_list = pd.DataFrame(pending_rows, columns=columns)
        atomic_save_progress(usernames, friends_list, list(queue), closed_nodes)
        pending_rows.clear()

    for _ in tqdm(generate_true()):
        if not queue:
            break
        username = queue.popleft()
        if username in closed_nodes:
            # a repeated queue entry: this user was already handled
            open_nodes.discard(username)
            continue
        friends = get_friends(username)

        usernames |= friends
        new_friends = [
            x for x in friends if x not in closed_nodes and x not in open_nodes
        ]
        queue.extend(new_friends)
        open_nodes.update(new_friends)
        # discard rather than remove: never fail if the entry is already gone
        open_nodes.discard(username)
        closed_nodes.add(username)
        pending_rows.extend((username, friend) for friend in friends | {username})

        if should_save("users"):
            save()

    # Final save
    save()

In [None]:
# search_friend_graph only returns once its queue is empty. Calling it again
# reloads the username files, so names added to them since are picked up.
RESTART_DELAY_SECONDS = 60
while True:
    search_friend_graph()
    # if nothing new has appeared, the next pass returns immediately; pause so
    # the loop does not spin on file rewrites and log messages
    time.sleep(RESTART_DELAY_SECONDS)