# Process Anime Lists
* Replace raw features like number_of_episodes_watched with processed features like completion_percentage

In [None]:
import os
import time

import pandas as pd
from tqdm import tqdm

In [None]:
# Media type interpolated into the file names used later in this notebook
# (f"user_{media}_list.csv" and f"{media}_processing_encodings.csv").
# NOTE(review): with the empty string these resolve to "user__list.csv" and
# "_processing_encodings.csv"; presumably this is meant to be set to "anime"
# or "manga" before running -- confirm.
media = ""

## Save encodings

In [None]:
# Directory the raw user_{media}_list.csv files are read from.
source_dir = "../../data/raw_data"
# Directory the encodings file and the processed list path are rooted at.
outdir = "../../data/processed_data"

In [None]:
def is_missing_timestamp(ts):
    """Return True when ``ts`` is one of the placeholder values for a missing timestamp.

    Missing timestamps on MAL entries get mapped to 18000, and missing
    timestamps on Anilist entries get mapped to 0.
    """
    missing_markers = (18000, 0)  # (MAL placeholder, Anilist placeholder)
    return ts in missing_markers

In [None]:
def get_timestamps():
    """Return the earliest and latest valid timestamps across the raw user lists.

    Streams ``user_manga_list.csv`` and ``user_anime_list.csv`` from
    ``source_dir`` line by line. The first line of each file is treated as the
    header and used to locate the ``timestamp`` column; every later line
    contributes its timestamp unless ``is_missing_timestamp`` flags it.

    Returns:
        A ``(min_timestamp, max_timestamp)`` tuple of ints.

    Raises:
        ValueError: if no valid timestamp is found in either file (previously
            this silently returned ``(inf, -inf)``), if a header has no
            ``timestamp`` column, or if a timestamp field is not an integer.
        AssertionError: if the observed range fails the sanity checks
            (earlier than Jan 1, 2000 or later than the current time).
    """
    min_timestamp = float("inf")
    max_timestamp = float("-inf")
    # Local loop name chosen so it does not shadow the notebook-level `media`.
    for media_type in ["manga", "anime"]:
        with open(
            os.path.join(source_dir, f"user_{media_type}_list.csv"), "r"
        ) as in_file:
            ts_col = None  # column index of "timestamp"; None until the header is read
            for line in tqdm(in_file):
                fields = line.strip().split(",")
                if ts_col is None:
                    # First line of the file is the header row.
                    ts_col = fields.index("timestamp")
                    continue
                ts = int(fields[ts_col])
                if is_missing_timestamp(ts):
                    continue
                min_timestamp = min(min_timestamp, ts)
                max_timestamp = max(max_timestamp, ts)
    # The sentinels are only left in place when no valid timestamp was seen;
    # both asserts below would pass on them, so check this case explicitly.
    if min_timestamp > max_timestamp:
        raise ValueError("no valid timestamps found in the raw user lists")
    # Jan 1, 2000. no rating site existed before then
    assert min_timestamp > 946702800, f"implausibly early timestamp: {min_timestamp}"
    assert max_timestamp < time.time(), f"timestamp in the future: {max_timestamp}"
    return min_timestamp, max_timestamp

In [None]:
# Scan the raw manga and anime lists for the earliest and latest valid
# timestamps; both values are written to the encodings file further down.
min_timestamp, max_timestamp = get_timestamps()

In [None]:
# Integer codes for the three list sources; they are written to the encodings
# file under the labels "MAL", "AniList" and "Kitsu".
mal_id, anilist_id, kitsu_id = range(3)

In [None]:
# Persist the timestamp range and the source-id codes as "name,value" rows,
# one per line, in the order listed below.
with open(os.path.join(outdir, f"{media}_processing_encodings.csv"), "w") as out_file:
    out_file.writelines(
        f"{label},{value}\n"
        for label, value in (
            ("min_timestamp", min_timestamp),
            ("max_timestamp", max_timestamp),
            ("MAL", mal_id),
            ("AniList", anilist_id),
            ("Kitsu", kitsu_id),
        )
    )

## Process file

In [None]:
# Execute the shared base notebook via the IPython %run magic.
# NOTE(review): presumably this is what defines process(), which is called in
# the following cell -- confirm.
%run ProcessMediaListsBase.ipynb

In [None]:
# Call process() with the raw list path under source_dir and the same file
# name under outdir.
# NOTE(review): process() is not defined in this notebook; presumably it comes
# from ProcessMediaListsBase.ipynb and reads the first path / writes the
# second -- confirm.
process(
    os.path.join(source_dir, f"user_{media}_list.csv"),
    os.path.join(outdir, f"user_{media}_list.csv"),
)