# Generating a list of MAL usernames
* We look up each username by fetching the MAL comments page for its numeric user id
* You can terminate or restart the notebook at any point without losing progress. All users found so far will be stored at `data/mal/user_facts`.
* This notebook will run indefinitely. You must manually terminate it once an acceptable number of users has been found.

In [None]:
import os
import random
import re

from tqdm import tqdm

In [None]:
# Index of the shard handled by this notebook instance: a userid belongs to
# this shard when `userid % NUM_PARTITIONS == PARTITION`.
PARTITION = 0
# Total number of shards the userid space is split into.
NUM_PARTITIONS = 1
# Not referenced in the cells of this notebook; presumably read by
# WebEndpointBase.ipynb to select a proxy -- TODO confirm.
PROXY_NUMBER = 0

In [None]:
# `name` and `source` are not referenced in the cells of this notebook;
# presumably they are read by WebEndpointBase.ipynb -- TODO confirm.
name = f"get_mal_users.{PARTITION}"
source = "mal"
# Per-partition CSV with a `userid,username` header; it is read to find the
# userids already searched and rewritten by save().
outfile = f"get_mal_users.{PARTITION}.csv"
# Inclusive upper bound of the userid range that is scanned.
# can get the most recent userid by searching the usernames in https://myanimelist.net/comments.php?id=17337283
max_mal_userid = 17337283  #  current as of 20231021

In [None]:
# Executes the shared endpoint notebook in this namespace. `call_api` and
# `logger`, used by the cells below, are not defined in this notebook, so they
# presumably come from here -- TODO confirm.
%run WebEndpointBase.ipynb

In [None]:
def get_username(userid):
    """Look up the username associated with a MAL userid.

    Fetches the comments page for `userid` and returns the name from the first
    `/profile/<name>"` link found in the response body.
    NOTE(review): this assumes the first profile link on the page belongs to
    the owner of the page -- confirm against the live HTML.

    Args:
        userid: numeric MAL user id to look up.

    Returns:
        The username as a string, or "" when the page returns 404, when the
        request fails with any other non-OK status (a warning is logged), or
        when the page contains no profile link at all.
    """
    url = f"https://myanimelist.net/comments.php?id={userid}"
    response = call_api(url)
    if response.status_code == 404:
        # the user may have deleted their account
        return ""
    if not response.ok:
        logger.warning(f"Error {response} received when handling {url}")
        return ""
    # Only the first link is needed, so stop at the first match instead of
    # collecting every profile link on the page.
    match = re.search(r'/profile/([^"/#%]+)"', response.text)
    if match is None:
        # A successful response without any profile link previously raised
        # IndexError and aborted the crawl; treat it like a missing account.
        return ""
    return match.group(1)

In [None]:
def save(records):
    """Merge `records` into `outfile`, replacing the file in one step.

    The new content is first written to `outfile + "~"`: the header, then every
    new record, then each row of the existing file whose userid passes
    `is_valid_userid` and is not among the new records. The temporary file is
    then moved over `outfile` with `os.replace`.

    Args:
        records: iterable of indexable rows; item 0 is the userid and item 1
            the username.
    """
    new_userids = set()
    for record in records:
        new_userids.add(int(record[0]))

    scratch = outfile + "~"
    with open(scratch, "w") as dst:
        dst.write("userid,username\n")
        for record in records:
            dst.write(f"{record[0]},{record[1]}\n")

        if os.path.exists(outfile):
            with open(outfile) as src:
                # The first line is the header; it tells us which column
                # holds the userid.
                first_line = next(src, None)
                if first_line is not None:
                    userid_col = first_line.strip().split(",").index("userid")
                    for line in src:
                        userid = int(line.strip().split(",")[userid_col])
                        if not is_valid_userid(userid) or userid in new_userids:
                            # Foreign partition, or superseded by a new record.
                            continue
                        dst.write(line)
    os.replace(scratch, outfile)

In [None]:
def is_valid_userid(x):
    """Return True when userid `x` falls in this notebook's partition.

    A userid belongs to the partition when its remainder modulo
    NUM_PARTITIONS equals PARTITION.
    """
    remainder = x % NUM_PARTITIONS
    return remainder == PARTITION


def get_remaining_userids():
    """Return a shuffled list of userids to query next.

    Userids already present in `outfile` (and belonging to this partition)
    count as searched. Every other userid of this partition in
    1..max_mal_userid is returned in random order. When fewer than 20000
    userids remain, previously searched userids are appended in random order
    and the list is capped at 20000 entries, so they get queried again.
    """
    searched_userids = set()
    if os.path.exists(outfile):
        with open(outfile) as f:
            for lineno, line in enumerate(tqdm(f)):
                fields = line.strip().split(",")
                if lineno == 0:
                    # Header row: locate the userid column, nothing to record.
                    userid_col = fields.index("userid")
                    continue
                userid = int(fields[userid_col])
                if not is_valid_userid(userid):
                    continue
                searched_userids.add(userid)

    in_partition = (x for x in range(1, max_mal_userid + 1) if is_valid_userid(x))
    remaining_userids = [x for x in in_partition if x not in searched_userids]
    random.shuffle(remaining_userids)
    logger.info(
        f"Starting with {len(searched_userids)} userids out of {len(remaining_userids)}"
    )

    if len(remaining_userids) >= 20000:
        return remaining_userids

    # Nearly done: top the batch up with already-searched userids.
    recheck = list(searched_userids)
    random.shuffle(recheck)
    return (remaining_userids + recheck)[:20000]

In [None]:
# Crawl forever: each pass fetches the list of userids still to query, looks
# them up one by one and checkpoints the results to disk in batches.
while True:
    records = []
    userids = get_remaining_userids()
    try:
        for userid in userids:
            username = get_username(userid)
            records.append((userid, username))
            if len(records) > 10000:
                save(records)
                records = []
    finally:
        # Flush the current batch both at the end of a pass and when the
        # notebook is interrupted, so stopping it does not discard lookups
        # made since the last checkpoint. Re-saving a batch is harmless
        # because save() drops old rows that share a userid with new ones.
        save(records)