## Get User Lists

In [None]:
import datetime
import os

import pandas as pd
from tqdm import tqdm

In [None]:
# Entries in user_anime_list.csv can be malformed if the notebook crashes in the
# middle of saving a file. This function removes any malformed lines.
def verify_user_anime_list_consistency(input_fn, check_user_status=False):
    """Rewrite ``input_fn`` in place, keeping only well-formed entries.

    The file is streamed line by line into ``input_fn + "~"``, which then
    replaces ``input_fn``. A first line that differs from the expected header
    is replaced by the expected header, and any later line that does not split
    into exactly 8 comma-separated fields is dropped.

    Args:
        input_fn: Path of the CSV file to clean. When ``check_user_status`` is
            set, the second-to-last dot-separated component of the path is
            used as the partition name.
        check_user_status: If true, also drop every entry whose username (the
            8th field, parsed as an integer) is not listed with a truthy
            ``success`` value in
            ``user_anime_facts/user_status.<partition>.csv`` under
            ``source_dir``.
    """
    logger.info(f"Verifying consistency of entries in {input_fn}")
    output_fn = input_fn + "~"
    correct_header = "uid,score,status,num_episodes_watched,updated_at,created_at,is_rewatching,username"
    successful_users = set()
    invalid_users = set()
    if check_user_status:
        # The partition is only needed to locate the matching status file.
        partition = input_fn.split(".")[-2]
        user_status = pd.read_csv(
            os.path.join(source_dir, f"user_anime_facts/user_status.{partition}.csv"),
            keep_default_na=False,
        )
        successful_users = set(user_status.loc[lambda x: x["success"]]["username"])
    with open(input_fn, "r") as in_file, open(output_fn, "w") as out_file:
        for line_number, line in enumerate(tqdm(in_file)):
            if line_number == 0:
                if line.strip() != correct_header:
                    logger.warning(
                        f"Replacing malformed header line {line.strip()} "
                        f"with correct header {correct_header.strip()}"
                    )
                    # Terminate the replacement header so that the first data
                    # row is not appended to the header line.
                    line = correct_header + "\n"
                out_file.write(line)
                continue
            fields = line.strip().split(",")
            if len(fields) != 8:
                logger.warning(
                    f"Deleting malformed line in user_anime_list.csv {line} "
                )
                continue
            if check_user_status:
                try:
                    username = int(fields[7])
                except ValueError:
                    # A write cut off right after the last comma leaves an
                    # empty (or partial) username; treat it as malformed
                    # instead of aborting the whole clean-up.
                    logger.warning(
                        f"Deleting malformed line in user_anime_list.csv {line} "
                    )
                    continue
                if username not in successful_users:
                    if username not in invalid_users:
                        # Warn only once per unrecognized user.
                        invalid_users.add(username)
                        logger.warning(
                            f"Deleting entries in user_anime_list.csv for "
                            f" unrecognized username {username}"
                        )
                    continue
            out_file.write(line)
    # Swap the cleaned copy in only after both files have been closed.
    os.replace(output_fn, input_fn)

In [None]:
def to_unix_time(date, fmt):
    """Parse ``date`` with the strptime format ``fmt`` and return whole epoch seconds as a string.

    The parsed datetime carries no time zone unless ``fmt`` parses one, so a
    naive value is converted using the local time zone of the machine.
    """
    parsed = datetime.datetime.strptime(date, fmt)
    epoch_seconds = int(parsed.timestamp())
    return str(epoch_seconds)

In [None]:
def process_uid(uid):
    """Return ``uid`` without a trailing ``".0"``; the result must be all digits.

    Raises:
        AssertionError: If the trimmed value is not purely numeric.
    """
    float_suffix = ".0"
    trimmed = uid[: -len(float_suffix)] if uid.endswith(float_suffix) else uid
    assert trimmed.isdigit()
    return trimmed


def process_status(status):
    """Translate a watch-status name into its numeric code, returned as a string.

    The empty string maps to "0"; any status missing from the table fails the
    assertion.
    """
    status_codes = {
        "completed": "5",
        "watching": "4",
        "on_hold": "3",
        "dropped": "2",
        "plan_to_watch": "1",
        "": "0",
    }
    assert status in status_codes
    return status_codes[status]


def isfloat(x):
    """Report whether ``float(x)`` succeeds; any exception counts as "not a float"."""
    try:
        float(x)
    except Exception:
        return False
    else:
        return True


def process_score(score):
    """Validate that ``score`` parses as a number in [0, 10] and return it unchanged.

    Raises:
        AssertionError: If ``score`` is not float-parsable or lies outside the range.
    """
    assert isfloat(score)
    numeric = float(score)
    assert 0 <= numeric <= 10
    return score


def process_num_episodes_watched(num):
    """Check that the episode count is made of digits only and return it unchanged."""
    all_digits = num.isdigit()
    assert all_digits
    return num


def process_is_rewatching(rewatching):
    """Map the strings "True"/"False" to "1"/"0"; any other value fails the assertion."""
    flag_codes = {"True": "1", "False": "0"}
    assert rewatching in flag_codes
    return flag_codes[rewatching]


def process_start_date(date):
    """Convert a ``YYYY-MM-DD`` start date to a Unix-time string.

    An empty ``date`` is returned as "0". The emptiness check now tests the
    ``date`` argument itself; it previously referenced an undefined name and
    raised NameError on every call.
    """
    if not date:
        return "0"
    return to_unix_time(date, "%Y-%m-%d")


def process_finish_date(date):
    """Convert a ``YYYY-MM-DD`` finish date to a Unix-time string.

    An empty ``date`` is returned as "0". The emptiness check now tests the
    ``date`` argument itself; it previously referenced an undefined name and
    raised NameError on every call.
    """
    if not date:
        return "0"
    return to_unix_time(date, "%Y-%m-%d")


def process_priority(priority):
    """Return ``priority`` unchanged after asserting it is the string "-1"."""
    only_allowed_value = "-1"
    assert priority == only_allowed_value
    return priority


def process_num_times_rewatched(rewatch):
    """Pass the value through untouched; no validation or conversion is applied."""
    passthrough = rewatch
    return passthrough


def process_rewatch_value(value):
    """Pass the value through untouched; no validation or conversion is applied."""
    passthrough = value
    return passthrough


def process_updated_at(time):
    """Convert a ``YYYY-MM-DDTHH:MM:SS+00:00`` timestamp to a Unix-time string.

    The format requires the literal ``+00:00`` offset, so the parsed value is
    tagged as UTC before conversion. Converting the naive datetime directly
    would apply the machine's local time zone and shift the result on any
    host that is not running in UTC.

    Raises:
        ValueError: If ``time`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(time, "%Y-%m-%dT%H:%M:%S+00:00")
    utc_moment = parsed.replace(tzinfo=datetime.timezone.utc)
    return str(int(utc_moment.timestamp()))


def process_timestamp(time):
    """Return ``time`` as a digit string, replacing any value that starts with "-" by "0".

    Raises:
        AssertionError: If the resulting value is not made of digits only.
    """
    normalized = "0" if time[0] == "-" else time
    assert normalized.isdigit()
    return normalized


# Load the userid/username table once, when this cell runs, so that
# process_username can resolve ids from memory.
df = pd.read_csv("../../data/anilist/user_facts/userid_to_username.csv")
# Dictionary keyed by the "userid" column, holding the "username" value of the same row.
userid_to_username = df.set_index("userid")["username"].to_dict()


def process_username(username):
    """Resolve a numeric user id to its name via ``userid_to_username``.

    ``username`` is parsed as an integer for the lookup; when the id is not in
    the table, the original argument is returned unchanged.

    Raises:
        ValueError: If ``username`` cannot be parsed as an integer.
    """
    numeric_id = int(username)
    return userid_to_username.get(numeric_id, username)

In [None]:
def process_header(header):
    """Validate the input CSV header and return the header of the converted file.

    Raises:
        AssertionError: If ``header`` does not list exactly the expected input columns.
    """
    expected_columns = [
        "uid",
        "score",
        "status",
        "num_episodes_watched",
        "updated_at",
        "created_at",
        "is_rewatching",
        "username",
    ]
    output_columns = [
        "username",
        "animeid",
        "score",
        "timestamp",
        "status",
        "episodes",
        "rewatch",
        "source",
    ]
    found_columns = header.split(",")
    assert found_columns == expected_columns
    return ",".join(output_columns)


def process_line(line):
    """Convert one comma-separated input row into the output row format.

    Fields are read by position: 0 uid, 1 score, 2 status, 3 episodes watched,
    4 timestamp, 6 rewatching flag, 7 username. Field 5 is not used. The
    constant source "AniList" is appended as the last output column.

    Raises:
        Exception: Whatever a field processor raised, re-raised after the
            offending line has been printed.
    """
    source = "AniList"
    try:
        columns = line.split(",")
        anime_uid = process_uid(columns[0])
        score = process_score(columns[1])
        status = process_status(columns[2])
        episodes = process_num_episodes_watched(columns[3])
        timestamp = process_timestamp(columns[4])
        rewatch = process_is_rewatching(columns[6])
        username = process_username(columns[7])
    except Exception:
        print(f"Error: could not parse {line}")
        raise
    output_fields = [
        username,
        anime_uid,
        score,
        timestamp,
        status,
        episodes,
        rewatch,
        source,
    ]
    return ",".join(output_fields)

In [None]:
def process_user_anime_lists(infile, outfile):
    """Convert every row of ``infile`` and append the result to ``outfile``.

    The first line is converted with ``process_header`` and every following
    line with ``process_line``. Each line is stripped before conversion and
    written with a trailing newline. ``outfile`` is opened in append mode.
    """
    logger.info(f"processing entries in {infile}")
    with open(infile, "r") as in_file, open(outfile, "a") as out_file:
        for index, raw_line in enumerate(tqdm(in_file)):
            stripped = raw_line.strip()
            # Only the very first line is the header; the rest are data rows.
            convert = process_header if index == 0 else process_line
            out_file.write(convert(stripped) + "\n")