In [None]:
import os
import random

import pandas as pd
from tqdm import tqdm

## Load encodings

In [None]:
# Relative path of the directory containing the processed CSV files that the
# cells below read (item_timestamp_encodings.csv, processing_encodings.csv, anime.csv).
outdir = "../../data/processed_data"

In [None]:
def is_missing_timestamp(ts):
    """Return True when ``ts`` is one of the placeholder values for a missing timestamp.

    MAL entries with a missing timestamp are mapped to 18000, and AniList
    entries with a missing timestamp are mapped to 0.
    """
    return ts in (18000, 0)

In [None]:
def get_user_to_first_timestamp(user_anime_lists_fn):
    """Find the earliest non-missing timestamp recorded for every user.

    The file is read as comma-separated text: the first line is skipped as a
    header, column 0 is the user and column 3 is an integer timestamp.

    Args:
        user_anime_lists_fn: path of the user anime list CSV file.

    Returns:
        dict mapping each user to their smallest non-missing timestamp, or to
        0 when every timestamp of that user is missing.
    """
    first_seen = {}
    with open(user_anime_lists_fn, "r") as in_file:
        for row_idx, raw in enumerate(tqdm(in_file)):
            if row_idx == 0:
                # Header row carries no data.
                continue
            cols = raw.strip().split(",")
            user = cols[0]
            ts = int(cols[3])
            # Register the user even if this row's timestamp turns out to be missing;
            # 0 doubles as the "nothing seen yet" sentinel.
            earliest = first_seen.setdefault(user, 0)
            if is_missing_timestamp(ts):
                continue
            if earliest == 0 or ts < earliest:
                first_seen[user] = ts
    return first_seen

In [None]:
# Build the item -> first timestamp lookup from item_timestamp_encodings.csv.
# Every line is parsed (no header is skipped): column 0 is kept as a string key
# and column 1 is converted to int.
item_to_first_timestamp = {}
with open(os.path.join(outdir, "item_timestamp_encodings.csv"), "r") as in_file:
    for row in tqdm(in_file):
        cols = row.split(",")
        item_key = cols[0]
        item_to_first_timestamp[item_key] = int(cols[1])

In [None]:
with open(os.path.join(outdir, "processing_encodings.csv"), "r") as in_file:

    def parse_line(field):
        """Consume the next line of the encodings file and return its integer value.

        The line must have exactly two comma-separated fields, the first of
        which equals ``field``; otherwise an AssertionError is raised.
        """
        key, *rest = in_file.readline().split(",")
        assert len(rest) == 1
        assert key == field
        return int(rest[0])

    # The file is read strictly in this order, one entry per line.
    min_timestamp = parse_line("min_timestamp")
    max_timestamp = parse_line("max_timestamp")
    mal_id = parse_line("MAL")
    anilist_id = parse_line("AniList")

In [None]:
# Approximate number of seconds in one year. Used as the divisor that rescales
# relative timestamp differences (presumably expressed in seconds) into years.
seconds_in_year = 3.156e7

In [None]:
# Load the anime metadata table and build a lookup from "anime_id" to
# "num_episodes"; completion_percentage uses it as the denominator.
# NOTE(review): if num_episodes can be NaN in anime.csv, the lookup will hold NaN
# values and completion_percentage will return NaN for those titles -- confirm.
anime = pd.read_csv(os.path.join(outdir, "anime.csv"))
anime_to_eps = anime.set_index("anime_id")["num_episodes"].to_dict()

## Process fields

In [None]:
def completion_percentage(uid, episodes):
    """Return the fraction of a title's episodes that have been watched.

    Args:
        uid: anime id (int or int-like string) looked up in ``anime_to_eps``.
        episodes: number of watched episodes (int or int-like string).

    Returns:
        0 when the id is unknown or its episode count is 0, otherwise
        ``episodes`` divided by the episode count.
    """
    anime_id = int(uid)
    watched = int(episodes)
    # A missing id is treated the same as a title with zero episodes.
    total = anime_to_eps.get(anime_id, 0)
    if total == 0:
        return 0
    return watched / total

In [None]:
def is_missing_timestamp(ts):
    """Tell whether ``ts`` is a placeholder standing in for a missing timestamp.

    NOTE: this repeats the identical helper defined earlier in the notebook;
    this later definition is the one in effect for the cells below.
    """
    # AniList maps missing timestamps to 0.
    if ts == 0:
        return True
    # MAL maps missing timestamps to 18000.
    return ts == 18000

In [None]:
def process_timestamp(ts):
    """Scale an absolute timestamp into the [0, 1] range.

    Values below ``min_timestamp`` clamp to 0 and values above
    ``max_timestamp`` clamp to 1; values in between are min-max normalized.

    Args:
        ts: timestamp as an int or a string parseable by ``int``.

    Returns:
        0 or 1 when clamped, otherwise a float in [0, 1].
    """
    ts = int(ts)
    if ts < min_timestamp:
        return 0
    if ts > max_timestamp:
        return 1
    span = max_timestamp - min_timestamp
    if span == 0:
        # Only reachable when ts == min_timestamp == max_timestamp; avoid a
        # division by zero and report the lower bound.
        return 0.0
    # Divide by the width of the range rather than by max_timestamp so that
    # ts == max_timestamp maps to 1 and the scale is continuous with the
    # upper clamp above.
    return (ts - min_timestamp) / span

In [None]:
def process_user_timestamp(ts, user):
    """Express a timestamp relative to the user's first recorded timestamp.

    Args:
        ts: timestamp as an int or int-like string.
        user: key into ``user_to_first_timestamp``.

    Returns:
        0 for a missing timestamp, otherwise the difference to the user's
        first timestamp divided by ``seconds_in_year``.
    """
    stamp = int(ts)
    if not is_missing_timestamp(stamp):
        elapsed = stamp - user_to_first_timestamp[user]
        return elapsed / seconds_in_year
    return 0


def process_item_timestamp(ts, item):
    """Express a timestamp relative to the item's first recorded timestamp.

    Args:
        ts: timestamp as an int or int-like string.
        item: key into ``item_to_first_timestamp`` (the item id as a string).

    Returns:
        0 for a missing timestamp, otherwise the difference to the item's
        first timestamp divided by ``seconds_in_year``.
    """
    stamp = int(ts)
    if not is_missing_timestamp(stamp):
        elapsed = stamp - item_to_first_timestamp[item]
        return elapsed / seconds_in_year
    return 0

In [None]:
def process_source(source):
    """Map a source name to its integer encoding.

    Args:
        source: either "MAL" or "AniList".

    Returns:
        ``mal_id`` for "MAL" and ``anilist_id`` for "AniList" (both are read
        from processing_encodings.csv).

    Raises:
        AssertionError: if ``source`` is any other value.
    """
    if source == "MAL":
        return mal_id
    if source == "AniList":
        return anilist_id
    # Raise explicitly instead of `assert False`: assert statements are removed
    # under `python -O`, which would make this function silently return None.
    raise AssertionError(f"unknown source: {source!r}")

In [None]:
def process_status(status):
    """Convert a status value to int, remapping status 0 to 5.

    Args:
        status: status code as an int or int-like string.

    Returns:
        5 when the code is 0, otherwise the code itself as an int.
    """
    code = int(status)
    return 5 if code == 0 else code

In [None]:
def format_float(x):
    """Render a number as fixed-point text with nine digits after the decimal point."""
    return format(x, ".9f")


def process_line(line):
    """Transform one comma-separated input record into its processed CSV form.

    The input columns are read by position: 0 user, 1 anime id, 2 score,
    3 timestamp, 4 status, 5 episodes, 6 rewatch, 7 source. The timestamp is
    expanded into three columns (absolute, user-relative, item-relative).

    Args:
        line: one record without its trailing newline.

    Returns:
        The processed record as a comma-joined string of ten columns.
    """
    cols = line.split(",")
    username, anime_id, score = cols[0], cols[1], cols[2]
    raw_ts = cols[3]

    out = [username, anime_id, score]
    out.append(format_float(process_timestamp(raw_ts)))
    out.append(format_float(process_user_timestamp(raw_ts, username)))
    out.append(format_float(process_item_timestamp(raw_ts, anime_id)))
    out.append(str(process_status(cols[4])))
    out.append(format_float(completion_percentage(anime_id, cols[5])))
    out.append(cols[6])
    out.append(str(process_source(cols[7])))
    return ",".join(out)

In [None]:
def process(source, dest):
    """Rewrite the CSV at ``source`` into its processed form at ``dest``.

    The source file is read twice: once to compute every user's first
    timestamp (stored in the module-level ``user_to_first_timestamp``), and
    once to convert each record with ``process_line``. The source header line
    is replaced by the processed column names.

    Args:
        source: path of the input CSV file.
        dest: path of the output CSV file (overwritten).

    Raises:
        Exception: any error from ``process_line`` is re-raised after the
            offending input line has been printed.
    """
    global user_to_first_timestamp
    user_to_first_timestamp = get_user_to_first_timestamp(source)
    header_row = "username,animeid,score,timestamp,user_rel_timestamp,item_rel_timestamp,status,completion,rewatch,source\n"
    with open(source, "r") as in_file, open(dest, "w") as out_file:
        for row_idx, raw in enumerate(tqdm(in_file)):
            if row_idx == 0:
                # Swap the input header for the processed column names.
                out_file.write(header_row)
                continue
            try:
                out_file.write(f"{process_line(raw.strip())}\n")
            except Exception as err:
                # Show the record that failed before propagating the error.
                print(raw)
                raise err