In [None]:
import logging
import os
import time
import traceback
from functools import cache

import pandas as pd
from tqdm import tqdm

## Load encodings

In [None]:
# Directory (relative to this notebook) from which timestamps.csv and the
# *_to_uid.csv encoding files are read by the cells below.
outdir = "../../data/processed_data"

In [None]:
def parse_line(file, field, format=int):
    """Read the next ``name,value`` line from ``file`` and return the value.

    Args:
        file: An open text file (anything with ``readline``); exactly one
            line is consumed.
        field: The name that must appear before the comma on that line.
        format: Callable applied to the text after the comma (default ``int``).

    Returns:
        ``format(value)`` for the line that was read.

    Raises:
        AssertionError: If the line does not split into exactly two
            comma-separated parts, or if its name differs from ``field``.
    """
    parts = file.readline().strip().split(",")
    assert len(parts) == 2
    name, raw_value = parts
    assert name == field
    return format(raw_value)


# timestamps.csv must start with a "min_timestamp,<int>" line followed by a
# "max_timestamp,<int>" line: parse_line consumes one line per call and
# asserts the label, so the order of these two reads matters.
with open(os.path.join(outdir, "timestamps.csv")) as f:
    min_timestamp = parse_line(f, "min_timestamp")
    max_timestamp = parse_line(f, "max_timestamp")

In [None]:
def get_mapping(fn, col):
    """Load an encoding CSV from ``outdir`` as a ``{col value: uid}`` dict.

    Args:
        fn: File name of the CSV inside ``outdir``.
        col: Name of the column whose values become the dictionary keys.

    Returns:
        A dict mapping each value of ``col`` to the value of the ``uid``
        column on the same row. Key types are whatever pandas infers for
        ``col`` when reading the file.
    """
    table = pd.read_csv(f"{outdir}/{fn}")
    uid_by_key = table.set_index(col)["uid"]
    return uid_by_key.to_dict()

In [None]:
# Lookup tables from raw identifiers to the `uid` values stored in the
# encoding CSVs.
username_to_uid = get_mapping("username_to_uid.csv", "userid")
# Keyed by the string found in a row's `medium` field: "0" selects the manga
# table and "1" the anime table.
media_to_uid = {
    "0": get_mapping("manga_to_uid.csv", "mediaid"),
    "1": get_mapping("anime_to_uid.csv", "mediaid"),
}

## Process fields

In [None]:
def format_timestamp(ts, min_ts, max_ts):
    """Rescale a timestamp linearly so ``min_ts`` maps to 0 and ``max_ts`` to 1.

    Args:
        ts: Timestamp as an int or an int-like string; it is compared against
            ``time.time()``, i.e. seconds since the epoch.
        min_ts: Timestamp that maps to 0.
        max_ts: Timestamp that maps to 1.

    Returns:
        The integer ``0`` when ``ts`` is earlier than ``min_ts`` or later than
        the current time; otherwise the float
        ``(ts - min_ts) / (max_ts - min_ts)``. The result can exceed 1 for a
        ``ts`` that lies between ``max_ts`` and the current time.

    Raises:
        ValueError: If ``ts`` cannot be converted with ``int``.
        ZeroDivisionError: If ``max_ts == min_ts`` and ``ts`` is not rejected.
    """
    seconds = int(ts)
    # Timestamps typed in by hand can be wrong; anything before the known
    # minimum or in the future is treated as invalid.
    if seconds < min_ts or seconds > time.time():
        return 0
    span = max_ts - min_ts
    return (seconds - min_ts) / span


def process_timestamp(ts):
    """Rescale ``ts`` with ``format_timestamp`` using the notebook-level bounds.

    Reads the globals ``min_timestamp`` and ``max_timestamp``, so the cell
    that loads them from ``timestamps.csv`` must have been run first.
    """
    return format_timestamp(ts, min_timestamp, max_timestamp)

In [None]:
def process_line(line, header, username_map):
    """Re-encode one comma-separated data row.

    The four timestamp columns are replaced by ``process_timestamp`` of their
    value, ``userid`` by its uid, and ``mediaid`` by the uid found in the
    ``media_to_uid`` table selected by the row's ``medium`` field. All other
    columns are passed through unchanged.

    Args:
        line: One raw row; surrounding whitespace is stripped and the row is
            split on every comma (no quoting support).
        header: List of column names, in the same order as the row's fields.
        username_map: Mapping from raw ``userid`` to uid, or ``None`` to use
            the global ``username_to_uid``.

    Returns:
        The re-encoded row terminated by a newline, or ``None`` (after logging
        a warning) when the media id is missing from the selected table.

    Raises:
        ValueError: If ``header`` lacks a required column, or a timestamp or
            media id is not an integer.
        KeyError: If the user id is not in the user mapping, or ``medium`` is
            not a key of ``media_to_uid``.
    """
    values = line.strip().split(",")

    for column in ("updated_at", "created_at", "started_at", "finished_at"):
        position = header.index(column)
        values[position] = str(process_timestamp(values[position]))

    user_map = username_to_uid if username_map is None else username_map
    user_position = header.index("userid")
    values[user_position] = str(user_map[values[user_position]])

    medium = values[header.index("medium")]
    media_position = header.index("mediaid")
    media_id = int(values[media_position])
    uid_by_media_id = media_to_uid[medium]
    if media_id not in uid_by_media_id:
        # Unknown items are dropped from the output rather than failing the run.
        logging.warning(f"Item {media_id} not found")
        return None
    values[media_position] = str(uid_by_media_id[media_id])

    return ",".join(values) + "\n"

In [None]:
def process(source, dest, username_map=None):
    """Re-encode every data row of ``source`` and write the result to ``dest``.

    The first line of ``source`` is treated as the header: it is copied to
    ``dest`` verbatim and split on commas to obtain the column names. Every
    following line goes through ``process_line``; rows for which it returns
    ``None`` are left out of ``dest``.

    Args:
        source: Path of the input CSV.
        dest: Path of the output CSV (overwritten).
        username_map: Optional user-id mapping forwarded to ``process_line``.

    Raises:
        Exception: Any error raised while processing or writing a row is
            logged (the offending line, then the message) and re-raised, so
            ``dest`` may be left partially written.
    """
    with open(source, "r") as in_file, open(dest, "w") as out_file:
        for line_number, line in enumerate(tqdm(in_file)):
            if line_number == 0:
                # Header row: remember the column names and copy it through.
                header_fields = line.strip().split(",")
                out_file.write(line)
                continue
            try:
                converted = process_line(line, header_fields, username_map)
                if converted is None:
                    continue
                out_file.write(converted)
            except Exception as e:
                logging.warning(line)
                logging.warning(str(e))
                raise e