# Get Mal Users From ID
* We look up each user's username by querying their user ID.
* You can terminate or restart the notebook at any point without losing progress. All users found so far will be stored at `data/mal/user_facts`.
* This notebook will run indefinitely. You must manually terminate it once an acceptable number of users has been found.

In [None]:
import os
import random
import re
import traceback

from filelock import FileLock
from tqdm import tqdm

In [None]:
# This notebook handles the userids for which
# userid % NUM_PARTITIONS == PARTITION (see is_valid_userid).
PARTITION = 0
NUM_PARTITIONS = 1
# Not referenced anywhere in this notebook; presumably read by
# web_endpoint_base.py to select a proxy -- TODO confirm.
PROXY_NUMBER = 0
NUM_PROXIES = 1

In [None]:
# `name` is not referenced in this notebook; presumably consumed by
# web_endpoint_base.py -- TODO confirm.
name = f"get_mal_users.{PARTITION}"
# Data source key; repartition_all uses it to build the data directory path.
source = "mal"
# Per-partition CSV that save() rewrites and get_remaining_userids() reads.
outfile = f"get_mal_users.{PARTITION}.csv"

In [None]:
# Run the shared helper script in this notebook's global namespace.
# NOTE(review): names used below but never imported here (`logging`, `time`,
# `get_username`, `should_save`) presumably come from this script -- confirm.
# The `with` block closes the file handle instead of leaving it to the GC.
with open("web_endpoint_base.py") as _base_script:
    exec(_base_script.read())

In [None]:
# The lock path does not depend on PARTITION, so presumably every partition's
# notebook contends on the same lock -- TODO confirm. save() runs under it.
LOCK_FILE = "../../mal_id.lock"
# timeout=-1: block until the lock is acquired rather than raising a Timeout.
LOCK = FileLock(LOCK_FILE, timeout=-1)

In [None]:
def get_max_id():
    """Return the largest userid known to exist, probing for a newer one.

    The last known maximum is read from ``maxid.txt``; a built-in default is
    used when that file does not exist. One randomly chosen id between 1 and
    10000 above the maximum is then looked up with ``get_username``. If that
    lookup returns a truthy value, the probed id becomes the new maximum and
    is written back to ``maxid.txt``.

    Returns:
        int: the (possibly updated) maximum userid.

    Raises:
        AssertionError: if ``maxid.txt`` does not contain exactly one line.
    """
    fn = "maxid.txt"
    if not os.path.exists(fn):
        maxid = 17337283
    else:
        with open(fn) as f:
            lines = f.readlines()
        assert len(lines) == 1
        maxid = int(lines[0].strip())
    # see if any new users have joined
    testid = maxid + random.randint(1, 10000)
    if get_username(testid):
        maxid = testid
        # write to a per-partition temp file first, then move it into place
        partfn = f"{fn}.{PARTITION}.txt"
        with open(partfn, "w") as f:
            f.write(f"{maxid}\n")
        # os.replace overwrites an existing destination on every platform,
        # whereas os.rename raises on Windows when `fn` already exists.
        os.replace(partfn, fn)
    return maxid

In [None]:
@LOCK
def save(records):
    """Merge freshly fetched ``(userid, username)`` records into ``outfile``.

    The merged CSV is written to ``outfile + "~"`` and then moved over
    ``outfile`` with ``os.replace``. Existing rows whose userid fails
    ``is_valid_userid`` are dropped; rows for userids that are not in
    ``records`` are copied through unchanged. Each record is then appended:

    * non-empty username: written with ``failed_attempts`` 0;
    * empty username and no previous row: written with ``failed_attempts`` 1;
    * empty username and a previous row: the previous counter is incremented;
      if the result is 1 the previous row's userid/username are kept,
      otherwise the record's (empty) username is written.

    The whole function runs while holding ``LOCK``.

    Args:
        records: iterable of ``(userid, username)`` pairs.
    """
    logging.info(f"Saving {len(records)} extra userids")
    fresh_ids = {int(rec[0]) for rec in records}
    header_fields = ["userid", "username", "failed_attempts"]
    column = {field: pos for pos, field in enumerate(header_fields)}
    userid_col = column["userid"]
    username_col = column["username"]
    attempt_col = column["failed_attempts"]

    tmpfile = outfile + "~"
    previous_rows = {}
    with open(tmpfile, "w") as out:
        out.write(",".join(header_fields) + "\n")
        if os.path.exists(outfile):
            with open(outfile) as existing:
                # the first line is the header; it was already rewritten above
                next(existing, None)
                for line in existing:
                    fields = line.strip().split(",")
                    userid = int(fields[userid_col])
                    if not is_valid_userid(userid):
                        continue
                    if userid in fresh_ids:
                        # held back so the new record can consult it below
                        previous_rows[userid] = fields
                    else:
                        out.write(line)
        for record in records:
            userid = int(record[userid_col])
            if record[username_col] != "":
                row = f"{record[0]},{record[1]},{0}\n"
            elif userid not in previous_rows:
                row = f"{record[0]},{record[1]},{1}\n"
            else:
                earlier = previous_rows[userid]
                attempts = int(earlier[attempt_col]) + 1
                if attempts == 1:
                    # to guard against transient failures, keep the last good state
                    row = f"{earlier[0]},{earlier[1]},{attempts}\n"
                else:
                    # after repeated failures, assume the userid is empty
                    row = f"{record[0]},{record[1]},{attempts}\n"
            out.write(row)
    os.replace(tmpfile, outfile)

In [None]:
def is_valid_userid(x):
    """Return True when userid ``x`` falls in this notebook's partition."""
    assigned_partition = x % NUM_PARTITIONS
    return assigned_partition == PARTITION


def get_remaining_userids(N):
    """Choose up to ``N`` userids from this partition to look up next.

    Userids in ``1..get_max_id()`` that pass ``is_valid_userid`` and have no
    row in ``outfile`` come first, in random order. If fewer than ``N`` of
    those exist, the remainder is filled with already-searched userids in
    the order their rows appear in ``outfile``.

    Args:
        N: maximum number of userids to return.

    Returns:
        list of int userids, at most ``N`` long.
    """
    searched_userids = []
    if os.path.exists(outfile):
        with open(outfile) as f:
            userid_col = None
            for line in tqdm(f):
                fields = line.strip().split(",")
                if userid_col is None:
                    # first line is the header: locate the userid column
                    userid_col = fields.index("userid")
                    continue
                userid = int(fields[userid_col])
                if is_valid_userid(userid):
                    searched_userids.append(userid)

    searched_userids_set = set(searched_userids)
    remaining_userids = [
        x
        for x in range(1, get_max_id() + 1)
        if is_valid_userid(x) and x not in searched_userids_set
    ]
    random.shuffle(remaining_userids)
    logging.info(
        f"{len(searched_userids)} searched userids, "
        f"and {len(remaining_userids)} remaining userids"
    )

    # we revisit searched_userids because users can change names over time
    to_search = remaining_userids[:N] + searched_userids[:N]
    return to_search[:N]

In [None]:
def repartition(fn, N, M):
    """Rebalance partition files when NUM_PARTITIONS changes from N to M.

    First, ``{fn}.{t}.csv`` for ``t`` in ``0..N-1`` are concatenated into
    ``{fn}.unified.csv`` (only the header of file 0 is kept), and each input
    file is removed once it has been copied. Then the unified file is split
    into ``{fn}.{t}.csv`` for ``t`` in ``0..M-1``: every output starts with
    the header and each row goes to file ``userid % M``. Finally the unified
    file is removed.

    Args:
        fn: path prefix of the partition files (without ``.{t}.csv``).
        N: current number of partition files.
        M: desired number of partition files.
    """
    from contextlib import ExitStack

    unified = f"{fn}.unified.csv"
    with open(unified, "w") as f:
        for t in range(N):
            header = False
            with open(fn + f".{t}.csv") as infile:
                for line in tqdm(infile):
                    if not header:
                        header = True
                        if t == 0:
                            f.write(line)
                        continue
                    f.write(line)
            os.remove(fn + f".{t}.csv")

    # ExitStack closes every output file even if an open() or write() raises
    with open(unified) as infile, ExitStack() as stack:
        files = [
            stack.enter_context(open(fn + f".{t}.csv", "w")) for t in range(M)
        ]
        header = False
        for line in tqdm(infile):
            if not header:
                header = True
                usercol = line.strip().split(",").index("userid")
                for f in files:
                    f.write(line)
                continue
            userid = int(line.strip().split(",")[usercol])
            files[userid % M].write(line)
    os.remove(unified)


def repartition_all(N, M):
    """Repartition this source's ``get_mal_users`` CSVs from N to M files."""
    base_path = f"../../../data/{source}/user_facts/get_mal_users"
    repartition(base_path, N, M)

In [None]:
# Main crawl: look up a batch of userids and flush the results via save().
try:
    records = []
    for userid in get_remaining_userids(20000):
        username = get_username(userid)
        if "," in username:
            # The output is comma-separated, so a name containing a comma is
            # stored as "" (which save() counts as a failed lookup).
            username = "" # old mal accounts may have commas in them
        records.append((userid, username))
        # should_save is defined outside this notebook; presumably it
        # throttles how often the batch is flushed -- TODO confirm.
        if should_save("records", 2000):
            save(records)
            records = []
    # flush whatever is left after the loop
    save(records)
except Exception as e:
    logging.info(traceback.format_exc())
    logging.info(str(e))
    # After logging the failure, idle forever instead of exiting.
    while True:
        time.sleep(3600)