# Process Media Lists
* Replace raw fields with processed fields

In [None]:
import datetime
import glob
import os
import random

import pandas as pd
from tqdm import tqdm

In [None]:
# Seed value handed to random.seed() in a later cell.
seed = 0

# Save encodings

In [None]:
# Seed Python's global RNG, which the random.shuffle() calls in the
# id-shuffling functions below draw from.
random.seed(seed)

## Timestamps

In [None]:
# Directory the user_{media}_list.csv input files are read from.
source_dir = "../../data/raw_data"
# Directory every output file of this notebook is written to.
outdir = "../../data/processed_data"

In [None]:
def save_timestamps():
    """Write the min/max timestamp bounds to ``timestamps.csv`` in `outdir`.

    The minimum is the fixed date 2000-01-01. The maximum is the largest
    ``access_timestamp`` value found across every ``user_status.*.csv`` file
    of the four sources. Each file's first line is treated as the header and
    fields are split on bare commas.

    Returns:
        The largest ``access_timestamp`` seen, as an int.

    Raises:
        AssertionError: if no data row was found in any file.
        ValueError: if a header lacks ``access_timestamp`` or a value is not
            an integer literal.
    """
    # no rating site existed before then
    # NOTE(review): the datetime is naive, so .timestamp() uses the machine's
    # local timezone — confirm that is intended.
    earliest = int(datetime.datetime(2000, 1, 1).timestamp())

    latest = float("-inf")
    for source in tqdm(["mal", "anilist", "kitsu", "animeplanet"]):
        pattern = f"../../data/{source}/user_media_facts/user_status.*.csv"
        for path in sorted(glob.glob(pattern)):
            with open(path, "r") as handle:
                rows = (raw.strip().split(",") for raw in handle)
                header_row = next(rows, None)
                if header_row is None:
                    # Empty file: nothing to scan.
                    continue
                column = header_row.index("access_timestamp")
                for row in rows:
                    latest = max(latest, int(row[column]))
    assert latest != float("-inf")

    target = os.path.join(outdir, "timestamps.csv")
    with open(target, "w") as out:
        out.write(f"min_timestamp,{earliest}\n")
        out.write(f"max_timestamp,{latest}\n")
    return latest

## User and media ids

In [None]:
def get_unique_values(media, col):
    """Return the distinct values of column `col` in a raw media list.

    Reads ``{source_dir}/user_{media}_list.csv`` line by line. The first line
    is the header and is used to locate `col`; every later line contributes
    the field at that position. Fields are split on bare commas, with no
    handling of quoted fields.

    Args:
        media: Name substituted into the file name (e.g. "manga", "anime").
        col: Header name of the column to collect.

    Returns:
        A sorted list of the distinct raw string values.

    Raises:
        ValueError: if `col` is not present in the header line.
    """
    values = set()
    path = f"{source_dir}/user_{media}_list.csv"
    with open(path, "r") as in_file:
        idx = None
        for line in tqdm(in_file):
            fields = line.strip().split(",")
            if idx is None:
                # First line is the header: remember the column position.
                idx = fields.index(col)
                continue
            values.add(fields[idx])
    # Iteration order of a set of str changes between interpreter runs
    # (string hash randomization). Callers shuffle this list under a fixed
    # seed, so return a stable order to keep their output reproducible.
    return sorted(values)

In [None]:
def shuffle_usernames():
    """Assign a shuffled integer uid to every userid and save the mapping.

    Collects the union of the ``userid`` values from the manga and anime
    lists, shuffles them with the global ``random`` RNG, and writes
    ``username_to_uid.csv`` to `outdir` with columns ``userid`` and ``uid``,
    where ``uid`` is the row position after shuffling.
    """
    userids = set(get_unique_values("manga", "userid")) | set(
        get_unique_values("anime", "userid")
    )
    # Sort before shuffling: the iteration order of a set of str differs
    # between interpreter runs (string hash randomization), which would make
    # the seeded shuffle produce a different mapping on every run.
    shuffled_usernames = sorted(userids)
    random.shuffle(shuffled_usernames)
    usernames = pd.DataFrame({"userid": shuffled_usernames})
    usernames["uid"] = usernames.index
    usernames.to_csv(f"{outdir}/username_to_uid.csv", index=False)

In [None]:
def shuffle_media_ids(media):
    """Assign a shuffled integer uid to every mediaid and save the mapping.

    Collects the distinct ``mediaid`` values of the given list, shuffles them
    with the global ``random`` RNG, and writes ``{media}_to_uid.csv`` to
    `outdir` with columns ``mediaid`` and ``uid``, where ``uid`` is the row
    position after shuffling.

    Args:
        media: Name of the list to process (e.g. "manga", "anime").
    """
    # Sort before shuffling so the seeded shuffle starts from an order that
    # does not depend on set iteration order, which varies between
    # interpreter runs for str values.
    shuffled_items = sorted(get_unique_values(media, "mediaid"))
    random.shuffle(shuffled_items)
    items = pd.DataFrame({"mediaid": shuffled_items})
    items["uid"] = items.index
    items.to_csv(f"{outdir}/{media}_to_uid.csv", index=False)

# Process files

In [None]:
# Writes timestamps.csv into outdir; the returned max timestamp is not
# assigned to a variable here.
save_timestamps()

In [None]:
# Build the id -> uid mapping files. All three calls draw from the same
# global RNG, so their order affects the resulting shuffles.
shuffle_usernames()
for media in ["manga", "anime"]:
    shuffle_media_ids(media)

In [None]:
# NOTE(review): presumably this notebook defines process(), which the next
# cell calls — confirm.
%run ProcessMediaListsBase.ipynb

In [None]:
# Call process() for each list with the raw input path and the same-named
# path under outdir.
# NOTE(review): process() is not defined in this file; presumably it comes
# from the ProcessMediaListsBase.ipynb notebook run above — confirm.
for medium in ["manga", "anime"]:
    process(
        f"{source_dir}/user_{medium}_list.csv",
        f"{outdir}/user_{medium}_list.csv",
    )