# Generating a list of MAL users
* We start with a list of starting usernames, which can either be stored in `data/mal/user_facts/queue.txt` or specified in the notebook
* Then, we do a breadth-first search of their friends until the friend graph is spanned
* You can terminate or restart the notebook at any point without losing progress. All users found so far will be stored at `data/mal/user_facts/usernames.txt` 
* This notebook may run indefinitely. Please manually terminate once an acceptable number of users have been found

In [1]:
import os
import time
import contextlib

import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry
from tqdm import tqdm
import numpy as np

In [2]:
# Remember the directory the kernel started in, so that re-running this cell
# resolves the relative path against the same base instead of against the
# data directory we chdir into below.
if "_launch_dir" not in globals():
    _launch_dir = os.getcwd()
data_path = os.path.join(_launch_dir, "../../data/mal/user_facts")
# makedirs also creates missing parent directories (os.mkdir fails when the
# parent does not exist yet) and is a no-op when the directory is present.
os.makedirs(data_path, exist_ok=True)
# The cells below open queue.txt, usernames.txt and closed_nodes.txt by bare
# filename, so they rely on this being the working directory.
os.chdir(data_path)

In [3]:
def _read_entries(path):
    """Return the stripped, non-blank lines of ``path``, or None if the file is absent."""
    if not os.path.exists(path):
        return None
    with open(path) as handle:
        return [line.strip() for line in handle if line.strip()]


# If we are rerunning the notebook, resume execution where we last left off;
# otherwise fall back to reasonable defaults for a first run.
saved_queue = _read_entries("queue.txt")
saved_usernames = _read_entries("usernames.txt")
saved_closed = _read_entries("closed_nodes.txt")

queue = ["Fro116"] if saved_queue is None else saved_queue
usernames = set() if saved_usernames is None else set(saved_usernames)
closed_nodes = set() if saved_closed is None else set(saved_closed)

# Re-queue every known user that is neither processed nor already queued,
# then randomize the processing order.
queue.extend(usernames - closed_nodes - set(queue))
np.random.shuffle(queue)
open_nodes = set(queue)

summary = f"Starting with {len(queue)} users in queue and {len(closed_nodes)} processed users for a total of {len(usernames)} users!"
print(summary)

Starting with 415772 users in queue and 178039 processed users for a total of 593799 users!


In [4]:
@sleep_and_retry
@limits(calls=1, period=5)
def _rate_limited_get(url, timeout):
    """Issue a single GET request, limited to one call per 5-second window."""
    return requests.get(url, timeout=timeout)


def call_api(url, retry_delay=600, request_timeout=60):
    """GET ``url`` and keep retrying until the server gives a usable answer.

    Every attempt, including retries, goes through the rate limiter. A 500 or
    503 status, or any exception raised while making the request, is reported
    and retried after ``retry_delay`` seconds. Retries happen in a loop rather
    than by recursion, so a long outage cannot exhaust the call stack.

    Args:
        url: The URL to fetch.
        retry_delay: Seconds to sleep before retrying after a failure.
        request_timeout: Seconds passed to ``requests.get`` as ``timeout`` so
            a stalled connection raises (and is retried) instead of hanging.

    Returns:
        The ``requests`` response of the first attempt whose status code is
        neither 500 nor 503. Other error statuses (e.g. 404) are returned to
        the caller unchanged.
    """
    while True:
        try:
            response = _rate_limited_get(url, request_timeout)
            if response.status_code not in (500, 503):
                return response
            # This can occur if MAL servers go down
            error = str(response.status_code)
        except Exception as e:
            # Deliberately broad: any failure to reach the server is treated
            # as transient so the crawl keeps going; it is always logged below.
            error = str(e)
        print(
            f"Received error {error} while accessing {url}. Retrying in {retry_delay} seconds"
        )
        time.sleep(retry_delay)

In [5]:
def get_friends(username):
    """Return the set of usernames found on ``username``'s friends pages.

    Pages are requested one at a time through ``call_api``, starting at page 1,
    until a page holds fewer entries than a full page. A 404 or any other
    non-OK response stops the paging, and whatever was collected so far is
    returned.
    """
    page_size = 300  # property of the API
    # NOTE(review): confirm this page size against the API documentation; if
    # the real page size is smaller, paging always stops after the first page.
    collected = set()
    page = 1
    while True:
        response = call_api(f"https://api.jikan.moe/v3/user/{username}/friends/{page}")
        if response.status_code == 404:
            # The user has no friends
            break
        if not response.ok:
            print(f"received error {response.status_code} for user {username}")
            break
        page_friends = {entry["username"] for entry in response.json()["friends"]}
        collected.update(page_friends)
        if len(page_friends) < page_size:
            # a short page is the last page
            break
        page += 1
    return collected

In [6]:
@contextlib.contextmanager
def atomic_overwrite(filename):
    """Context manager that replaces ``filename`` only after a complete write.

    Yields a text-mode file handle for a temporary sibling file
    (``filename`` + ``'~'``). When the ``with`` body finishes without raising,
    the data is flushed and fsynced and the temporary file is moved over
    ``filename`` with ``os.replace``. When the body (or the final replace)
    raises, ``filename`` is left untouched, the temporary file is removed, and
    the exception is re-raised.
    """
    temp = filename + '~'
    try:
        with open(temp, "w") as f:
            yield f
            # make sure the bytes are on disk before the rename publishes them
            f.flush()
            os.fsync(f.fileno())
        os.replace(temp, filename)  # only reached if no exception was raised
    except BaseException:
        # BaseException so an interrupt (e.g. KeyboardInterrupt) during the
        # write also cleans up the partial temp file.
        with contextlib.suppress(OSError):
            os.remove(temp)
        raise
    
def atomic_to_csv(collection, filename):
    """Write ``collection`` to ``filename`` as CSV rows via ``atomic_overwrite``.

    The items are wrapped in a pandas Series and written one element per row,
    without a header row or an index column.
    """
    with atomic_overwrite(filename) as handle:
        series = pd.Series(collection)
        series.to_csv(handle, index=False, header=False)

In [7]:
# we use a generator for profiling with tqdm
def generator(queue):
    """Yield the username at the front of ``queue`` until ``queue`` is empty.

    The same list object is re-checked before every step, so the consumer must
    shrink ``queue`` in place (not rebind the name) for the iteration to end.
    """
    while queue:
        yield queue[0]


for username in tqdm(generator(queue)):
    friends = get_friends(username)

    usernames |= friends
    new_friends = [x for x in friends if x not in closed_nodes and x not in open_nodes]
    # Only drop the user from the queue once their friends were fetched, and
    # mutate the list in place so the generator above sees the queue drain and
    # stops cleanly instead of running into an IndexError on an empty queue.
    queue.pop(0)
    queue[:0] = new_friends  # TODO change back to bfs
    open_nodes.update(new_friends)
    # discard (not remove): a duplicated queue entry must not abort the crawl
    open_nodes.discard(username)
    closed_nodes.add(username)

    # persist progress after every user so the notebook can be stopped at any time
    atomic_to_csv(queue, "queue.txt")
    atomic_to_csv(sorted(usernames), "usernames.txt")
    atomic_to_csv(sorted(closed_nodes), "closed_nodes.txt")

6143it [8:33:25,  5.18s/it]

Received error 503 while accessing https://api.jikan.moe/v3/user/Habbit/friends/1. Retrying in 600 seconds


7818it [11:04:34,  5.10s/it]


KeyboardInterrupt: 