# Process Anime Lists
* Replace raw features like number_of_episodes_watched with processed features like completion_percentage

In [None]:
import os

import pandas as pd
from tqdm import tqdm

## Save encodings

In [None]:
# Directory (relative to the notebook's working directory) that holds the raw
# input file user_anime_list.csv.
source_dir = "../../data/raw_data"
# Directory (relative to the notebook's working directory) that receives the
# encoding files and the processed user_anime_list.csv.
outdir = "../../data/processed_data"

In [None]:
def is_missing_timestamp(ts):
    """Return True if `ts` is a placeholder value meaning "no timestamp".

    Missing timestamps on MAL entries get mapped to 18000, and missing
    timestamps on Anilist entries get mapped to 0.
    """
    return ts in (18000, 0)

In [None]:
# Single pass over the raw list file that collects:
#   * the smallest and largest non-missing timestamp seen in any row, and
#   * for every item, the earliest non-missing timestamp it appears with
#     (0 if the item only ever appears with missing timestamps).
min_timestamp = float("inf")
max_timestamp = float("-inf")
item_to_first_timestamp = {}
with open(os.path.join(source_dir, "user_anime_list.csv"), "r") as in_file:
    for row_number, line in enumerate(tqdm(in_file)):
        if row_number == 0:
            # The first line is the CSV header, not a data row.
            continue
        columns = line.strip().split(",")
        item = columns[1]
        ts = int(columns[3])
        # Register the item even when this row's timestamp turns out unusable.
        earliest = item_to_first_timestamp.setdefault(item, 0)
        if is_missing_timestamp(ts):
            continue
        # 0 means "no valid timestamp recorded yet" for this item.
        if earliest == 0 or ts < earliest:
            item_to_first_timestamp[item] = ts
        min_timestamp = min(min_timestamp, ts)
        max_timestamp = max(max_timestamp, ts)
# Sanity check: the earliest valid timestamp must be after Jan 1, 2000
# (946702800 is 2000-01-01 05:00 UTC); no rating site existed before then.
assert min_timestamp > 946702800

In [None]:
# Integer codes for the two list sources. They are saved to
# processing_encodings.csv under the labels "MAL" and "AniList".
mal_id = 0
anilist_id = 1

In [None]:
# Persist the global encodings (timestamp range and source ids) as
# "name,value" rows, one per line.
encodings_path = os.path.join(outdir, "processing_encodings.csv")
with open(encodings_path, "w") as out_file:
    out_file.writelines(
        [
            f"min_timestamp,{min_timestamp}\n",
            f"max_timestamp,{max_timestamp}\n",
            f"MAL,{mal_id}\n",
            f"AniList,{anilist_id}\n",
        ]
    )

In [None]:
# Write one "item_id,first_timestamp" row per item, ordered by numeric item id.
item_encodings_path = os.path.join(outdir, "item_timestamp_encodings.csv")
with open(item_encodings_path, "w") as out_file:
    # The dict is keyed by the item id as a string; convert to int so the
    # rows are sorted numerically rather than lexicographically.
    item_ids = [int(item_key) for item_key in item_to_first_timestamp]
    item_ids.sort()
    for item_id in tqdm(item_ids):
        first_ts = item_to_first_timestamp[str(item_id)]
        out_file.write(f"{item_id},{first_ts}\n")

## Process file

In [None]:
# Execute the shared base notebook.
# NOTE(review): presumably this is what defines `process`, which is called
# later in this notebook but never defined here — confirm.
%run ProcessAnimeListsBase.ipynb

In [None]:
# Call `process` with the raw list file in source_dir and a path of the same
# file name in outdir.
# NOTE(review): `process` is not defined in this notebook; the arguments look
# like (input path, output path) — confirm against ProcessAnimeListsBase.ipynb.
process(
    os.path.join(source_dir, "user_anime_list.csv"),
    os.path.join(outdir, "user_anime_list.csv"),
)