# Generating a list of MAL users
* We start with a list of starting usernames, which can either be stored in `data/mal/user_facts/queue.txt` or specified in the notebook
* Then, we do a breadth-first search of their friends until the friend graph is spanned
* You can terminate or restart the notebook at any point without losing progress. All users found so far will be stored at `data/mal/user_facts/usernames.txt` 
* This notebook may take a long time to finish. Feel free to manually terminate once an acceptable number of users have been found

In [1]:
import os
import time

import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm

In [2]:
data_path = "../../data/mal/user_facts"

# Remember the directory the notebook was started in. Without this, re-running
# the cell after the chdir below would resolve the relative `data_path`
# against the data directory itself and create a stray nested folder.
if "notebook_dir" not in globals():
    notebook_dir = os.getcwd()
data_dir = os.path.abspath(os.path.join(notebook_dir, data_path))

# makedirs also creates missing parent directories and tolerates an existing one
os.makedirs(data_dir, exist_ok=True)
# all later cells read and write their state files relative to this directory
os.chdir(data_dir)

In [3]:
def _read_lines(path):
    """Return every line of the text file `path`, stripped of surrounding whitespace."""
    with open(path) as f:
        return [line.strip() for line in f]


# If a previous run left state files behind, resume from them; otherwise fall
# back to reasonable first-run defaults (a single seed user, nothing visited).
queue = _read_lines("queue.txt") if os.path.exists("queue.txt") else ["Fro116"]
usernames = (
    set(_read_lines("usernames.txt")) if os.path.exists("usernames.txt") else set()
)
closed_nodes = (
    set(_read_lines("closed_nodes.txt"))
    if os.path.exists("closed_nodes.txt")
    else set()
)
# set view of the queue for fast "is this user already pending?" checks
open_nodes = set(queue)

print(
    f"Starting with {len(queue)} users in queue and {len(closed_nodes)} processed users!"
)

Starting with 322554 users in queue and 61208 processed users!


In [4]:
def retry(url, error, timeout):
    """Report a failed request, wait, and then issue the request again.

    Args:
        url: the URL that failed.
        error: the status code or error text to show in the log message.
        timeout: number of seconds to sleep before retrying.

    Returns:
        Whatever `call_api(url)` returns for the repeated request.
    """
    message = (
        f"Received error {error} while accessing {url}. Retrying in {timeout} seconds"
    )
    print(message)
    time.sleep(timeout)
    return call_api(url)
@sleep_and_retry
@limits(calls=1, period=4)
def call_api(url):
    """Fetch `url` with a GET request, rate-limited to one call per 4 seconds.

    A request that raises (connection error, timeout, ...) or that comes back
    with status 500 or 503 is retried after a 600 second pause, repeating until
    some other response is received.

    Args:
        url: the URL to fetch.

    Returns:
        The `requests.Response` of the first attempt whose status is neither
        500 nor 503. Other error statuses (e.g. 404) are returned to the caller.
    """
    # Without a timeout a stalled connection would block the crawl forever;
    # with one, requests raises and the call falls into the retry path below.
    request_timeout = 60  # seconds
    try:
        response = requests.get(url, timeout=request_timeout)
    except Exception as e:
        # deliberately broad: this is a long-running, best-effort crawl
        return retry(url, str(e), 600)
    # 500 can occur if MAL servers go down; 503 is handled the same way
    if response.status_code in (500, 503):
        return retry(url, response.status_code, 600)
    return response

In [5]:
def get_friends(username):
    """Collect the usernames on `username`'s friend list, following pagination.

    Pages are requested one at a time starting from page 1. Fetching stops at
    the first page that holds fewer than a full page of friends, at a 404
    (the user has no friends), or at any other non-OK response, which is
    logged. Whatever was gathered up to that point is returned.

    Args:
        username: the user whose friends should be listed.

    Returns:
        A set of friend usernames (possibly empty).
    """
    page_size = 300  # property of the API
    collected = set()
    page = 1
    while True:
        response = call_api(f"https://api.jikan.moe/v3/user/{username}/friends/{page}")
        if response.status_code == 404:
            # The user has no friends
            break
        if not response.ok:
            print(f"received error {response.status_code} for user {username}")
            break
        batch = {entry["username"] for entry in response.json()["friends"]}
        collected.update(batch)
        if len(batch) < page_size:
            # a short page means there is nothing left to fetch
            break
        page += 1
    return collected

In [None]:
# we use a generator for profiling with tqdm
def generator(queue):
    """Yield once per loop iteration for as long as `queue` is non-empty.

    The generator holds a reference to the list object it was given, so the
    consuming loop must modify that list in place (pop/extend). Rebinding the
    name to a new list would leave this check looking at a stale object.
    """
    while queue:
        yield


def save_lines(lines, filename):
    """Write `lines` to `filename`, one entry per line, replacing it atomically.

    The data goes to a temporary sibling file first and is then moved over the
    target, so interrupting the notebook mid-write never leaves a truncated
    state file behind.
    """
    tmp_filename = filename + ".tmp"
    pd.Series(lines, dtype=object).to_csv(tmp_filename, header=False, index=False)
    os.replace(tmp_filename, filename)


for _ in tqdm(generator(queue)):
    # Only peek at the head: if the fetch below is interrupted, the user is
    # still at the front of the in-memory queue.
    username = queue[0]
    friends = get_friends(username)
    queue.pop(0)  # in place, so generator() observes the shrinking queue

    usernames |= friends
    new_friends = [x for x in friends if x not in closed_nodes and x not in open_nodes]
    queue.extend(new_friends)
    open_nodes.update(new_friends)
    # discard: a duplicated queue entry must not abort the crawl with a KeyError
    open_nodes.discard(username)
    closed_nodes.add(username)

    # Persist in an order that cannot lose work if interrupted between writes:
    # discovered users first, then the queue, and only then mark the user as
    # processed. The worst case on restart is fetching one user twice.
    save_lines(sorted(usernames), "usernames.txt")
    save_lines(queue, "queue.txt")
    save_lines(sorted(closed_nodes), "closed_nodes.txt")

10512it [11:43:19,  4.18s/it]

Recevied error 500 while accessing https://api.jikan.moe/v3/user/Shadow002/friends/1. Retrying in 600 seconds


11098it [12:32:40,  4.05s/it] 