In [None]:
import os
import random

import pandas as pd
from tqdm import tqdm

## Load encodings

In [None]:
# Directory holding the processed data: the encodings file and the per-media
# CSV are both loaded from here via os.path.join(outdir, ...).
# The path is relative, so it is resolved against the kernel's working directory.
outdir = "../../data/processed_data"

In [None]:
def is_missing_timestamp(ts):
    """Return True when `ts` is one of the placeholder values for "no timestamp"."""
    # 18000 is what a missing timestamp on a MAL entry gets mapped to.
    # 0 is what a missing timestamp on an AniList entry gets mapped to.
    return ts in (18000, 0)

In [None]:
# Load the integer encodings written by an earlier processing step.
# NOTE: `media` is not defined in this cell; it must already exist in the
# kernel (an earlier cell or an injected parameter) before this cell runs.
with open(os.path.join(outdir, f"{media}_processing_encodings.csv"), "r") as in_file:

    def parse_line(field):
        """Consume the next line of the encodings file and return its integer value.

        The line must have exactly two comma-separated parts, "<field>,<int>",
        and the first part must equal `field`; calls therefore have to follow
        the order in which the entries appear in the file.
        """
        parts = in_file.readline().split(",")
        assert len(parts) == 2
        name, value = parts
        assert name == field
        return int(value)

    # Bounds used to rescale timestamps.
    min_timestamp = parse_line("min_timestamp")
    max_timestamp = parse_line("max_timestamp")
    # Integer codes assigned to each list source.
    mal_id = parse_line("MAL")
    anilist_id = parse_line("AniList")
    kitsu_id = parse_line("Kitsu")

In [None]:
# Approximate number of seconds in one year (365.25 days * 86400 s is about 3.156e7).
# Not referenced by any of the cells visible in this part of the notebook.
seconds_in_year = 3.156e7

In [None]:
# Build, for every progress column of the input lists, a lookup from item id
# to that item's total count; these totals turn raw progress into a fraction.
medium = pd.read_csv(os.path.join(outdir, f"{media}.csv"))


def _totals_by_id(column):
    """Return a dict mapping the `<media>_id` column of `medium` to `column`."""
    return medium.set_index(f"{media}_id")[column].to_dict()


if media == "anime":
    media_to_completion = {"episodes": _totals_by_id("num_episodes")}
elif media == "manga":
    media_to_completion = {
        "volumes": _totals_by_id("num_volumes"),
        "chapters": _totals_by_id("num_chapters"),
    }
else:
    # Only anime and manga are supported.
    assert False

## Process fields

In [None]:
def completion_percentage(uid, value, field):
    """Return the fraction of item `uid` completed, clamped to the range 0..1.

    Args:
        uid: item id (anything `int()` accepts).
        value: the user's progress count for `field` (anything `int()` accepts).
        field: a key of `media_to_completion` naming the progress column.

    Returns:
        0 when the item's total is unknown or zero, otherwise progress / total
        limited to at least 0 and at most 1.
    """
    item_id = int(uid)
    progress = int(value)
    assert field in media_to_completion
    # A missing total is treated exactly like a total of zero.
    total = media_to_completion[field].get(item_id, 0)
    if total == 0:
        return 0
    fraction = progress / total
    return min(max(fraction, 0), 1)

In [None]:
def process_timestamp(ts):
    """Rescale a raw timestamp relative to `min_timestamp` and `max_timestamp`.

    Returns -1 for a missing timestamp, 0 for anything earlier than
    `min_timestamp`, and otherwise the offset from `min_timestamp` divided by
    the width of the [min_timestamp, max_timestamp] interval.
    """
    raw = int(ts)
    if is_missing_timestamp(raw):
        return -1
    if raw < min_timestamp:
        return 0
    elapsed = raw - min_timestamp
    span = max_timestamp - min_timestamp
    return elapsed / span

In [None]:
def process_source(source):
    """Translate a source name ("MAL", "AniList" or "Kitsu") into its integer code."""
    if source == "MAL":
        return mal_id
    if source == "AniList":
        return anilist_id
    if source == "Kitsu":
        return kitsu_id
    # Any other source name is unexpected.
    assert False

In [None]:
def process_status(status):
    """Convert `status` to an int, remapping the value 0 to 5."""
    code = int(status)
    return 5 if code == 0 else code

In [None]:
def format_float(x):
    """Render `x` in fixed-point notation with nine digits after the decimal point."""
    return format(x, ".9f")

In [None]:
def get_header(media):
    """Return the ordered column names expected in the input list file.

    Args:
        media: "anime" or "manga".

    Returns:
        A new list of column names. Both media share the same leading and
        trailing columns; anime has an "episodes" progress column while manga
        has "volumes" and "chapters".

    Raises:
        ValueError: if `media` is neither "anime" nor "manga".
    """
    if media == "anime":
        progress_columns = ["episodes"]
    elif media == "manga":
        progress_columns = ["volumes", "chapters"]
    else:
        # Returning False here would only defer the failure: callers index into
        # or join the result, which crashes with an unrelated error. Fail
        # loudly at the source instead.
        raise ValueError(f"unsupported media: {media!r}")
    return [
        "username",
        f"{media}id",
        "score",
        "timestamp",
        "status",
        *progress_columns,
        "userid",
        "source",
    ]


def get_output_header(media):
    """Return the ordered column names of the processed output file for `media`."""
    id_column = f"{media}id"
    leading = ["username", id_column, "score"]
    trailing = ["timestamp", "status", "completion", "source"]
    return leading + trailing


def process_line(line):
    """Convert one comma-separated input row into its processed output row.

    The username, item id and score are copied through unchanged. The
    timestamp, status and source go through their `process_*` helpers, and the
    progress columns are collapsed into a single completion value: the largest
    `completion_percentage` over the keys of `media_to_completion`.

    Returns:
        The output fields joined with commas, in the order given by
        `get_output_header`.
    """
    header = get_header(media)
    fields = line.split(",")

    def column(name):
        # Value on this row of the input column called `name`.
        return fields[header.index(name)]

    username = column("username")
    item_id = column(f"{media}id")
    score = column("score")
    timestamp_text = format_float(process_timestamp(column("timestamp")))
    status_text = str(process_status(column("status")))
    best_completion = max(
        completion_percentage(item_id, column(progress_field), progress_field)
        for progress_field in media_to_completion
    )
    completion_text = format_float(best_completion)
    source_text = str(process_source(column("source")))
    return ",".join(
        [
            username,
            item_id,
            score,
            timestamp_text,
            status_text,
            completion_text,
            source_text,
        ]
    )

In [None]:
def process(source, dest):
    """Stream the list file at `source` through `process_line` into `dest`.

    The first line of `source` must be exactly the header from `get_header`
    followed by a newline; it is replaced by the `get_output_header` columns.
    Every later line is stripped, processed and written on its own line. If a
    row fails, the offending line is printed and the exception is re-raised.
    """
    with open(source, "r") as in_file, open(dest, "w") as out_file:
        for line_number, line in enumerate(tqdm(in_file)):
            if line_number == 0:
                # Header row: validate the input schema, then emit the output schema.
                assert line == ",".join(get_header(media)) + "\n"
                out_file.write(",".join(get_output_header(media)) + "\n")
                continue
            try:
                processed = process_line(line.strip())
                out_file.write(f"{processed}\n")
            except Exception as e:
                # Show the row that could not be processed before propagating.
                print(line)
                raise e