# Generating a list of MAL users
* We start with a list of starting usernames, which can either be stored in `data/user_facts/usernames.txt` or specified in the notebook
* Their friends are added, then the friends of their friends, and so on
* This notebook keeps running, saving a snapshot of the known usernames to `usernames.txt` after each round, until the whole friend graph has been traversed

In [1]:
import os
import time

import pandas as pd
import requests
from tqdm import tqdm

In [2]:
# Work inside the user_facts data directory so the cells below can refer to
# their files by bare relative names.
data_path = "../../data/user_facts"
# makedirs (unlike mkdir) also creates missing parent directories, and
# exist_ok=True makes the call a no-op when the directory is already there.
os.makedirs(data_path, exist_ok=True)
os.chdir(data_path)

In [3]:
# Seed usernames for the crawl: one per line in usernames.txt when that file
# exists, otherwise a single hard-coded default.
if os.path.exists("usernames.txt"):
    with open("usernames.txt", encoding="utf-8") as f:
        # Skip blank lines so an empty string is never treated as a username.
        starting_usernames = {line.strip() for line in f if line.strip()}
else:
    # A set, so both branches produce the same type.
    starting_usernames = {"Fro116"}

In [4]:
def get_friend_df(json):
    """Build a one-column dataframe of friend usernames from a parsed response.

    Args:
        json: Mapping with a "friends" entry that is an iterable of mappings,
            each of which has a "username" entry.

    Returns:
        A DataFrame with a single "friends" column holding the usernames in
        the order they appear in the response.
    """
    names = [entry["username"] for entry in json["friends"]]
    return pd.DataFrame({"friends": names})

In [5]:
def get_friends(usernames):
    """Fetch the friend lists of the given users from the Jikan API.

    Each user's friends are requested page by page until a page comes back
    with fewer than the maximum number of entries, or a request fails.

    Args:
        usernames: Iterable of usernames to look up.

    Returns:
        A DataFrame with columns "friends" (the friend's username) and
        "username" (the user the friend belongs to), one row per pair.
        Users for whom no page could be fetched contribute no rows. If
        nothing was fetched at all, an empty DataFrame with those two
        columns is returned.
    """
    max_items_per_page = 300  # property of the API
    friend_lists = []

    for username in tqdm(usernames):
        pages = []
        page = 1
        while True:
            time.sleep(4)  # Jikan requires a minimum rate limit of 1 request per 4 secs
            response = requests.get(
                f"https://api.jikan.moe/v3/user/{username}/friends/{page}"
            )
            if not response.ok:
                print(f"error: no friends found for {username}")
                # Stop paging for this user. Retrying the same page here
                # would loop forever on a persistent error; pages already
                # fetched for the user are kept.
                break
            page_df = get_friend_df(response.json())
            pages.append(page_df)
            page += 1
            if len(page_df) < max_items_per_page:
                # A short page means there are no further pages.
                break

        if not pages:
            # pd.concat raises ValueError on an empty list, so skip users
            # with no successfully fetched page.
            continue
        friend_df = pd.concat(pages)
        friend_df["username"] = username
        friend_lists.append(friend_df)

    if not friend_lists:
        # Keep the column layout so callers can still index by "friends".
        return pd.DataFrame(columns=["friends", "username"])
    return pd.concat(friend_lists, ignore_index=True)

In [None]:
# Round-by-round crawl of the friend graph: each round fetches the friends of
# the usernames that were not yet known in the previous round, and the loop
# ends once a round turns up nobody new.
usernames = set(starting_usernames)
usernames_by_round = [set()]
friends_by_round = [pd.DataFrame()]

while True:
    epoch = len(usernames_by_round) - 1
    new_usernames = usernames.difference(usernames_by_round[epoch])
    if not new_usernames:
        break

    new_friends = get_friends(new_usernames)
    friends_by_round.append(new_friends)
    usernames_by_round.append(usernames)

    # Rebind (rather than mutate) `usernames`, so the set just stored in
    # usernames_by_round keeps this round's contents.
    usernames = usernames_by_round[epoch].union(
        new_usernames, new_friends["friends"]
    )

    # Snapshot every username known so far, sorted, one per row.
    snapshot = pd.Series(sorted(usernames))
    snapshot.to_csv("usernames.txt", header=False, index=False)

100%|██████████| 1/1 [00:04<00:00,  4.51s/it]
100%|██████████| 8/8 [00:40<00:00,  5.08s/it]
100%|██████████| 113/113 [10:35<00:00,  5.62s/it]
  6%|▋         | 298/4728 [46:29<6:54:33,  5.61s/it]   