# Generate Media Splits

In [None]:
import os

import pandas as pd
import yaml

In [None]:
# Notebook parameters. All are blank here, so they are presumably filled in
# by whatever executes this notebook — TODO confirm how they are injected.
username = ""  # last component of the data directory path
source = ""  # sub-directory under ../../data/recommendations
original_source = ""  # the value "training" enables the knowledge-cutoff filter
media = ""  # inserted into the file name user_<media>_list.csv

In [None]:
# Directory that fullpath() resolves file names against:
# ../../data/recommendations/<source>/<username>
data_path = os.path.join("../../data/recommendations", source, username)

In [None]:
def fullpath(x):
    """Return the path of ``user_<x>_list.csv`` inside ``data_path``."""
    filename = f"user_{x}_list.csv"
    return os.path.join(data_path, filename)

## Knowledge Cutoff

In [None]:
def get_settings():
    """Load and merge the project settings files.

    Reads ``default_settings.yml`` and then ``private_settings.yml`` from
    ``../../environment``. The files are merged in that order, so a key
    present in both takes its value from ``private_settings.yml``.

    Returns:
        dict: The merged top-level settings.

    Raises:
        OSError: If either settings file cannot be opened.
        yaml.YAMLError: If either file is not valid YAML.
    """
    merged = {}
    for name in ("default_settings", "private_settings"):
        with open(f"../../environment/{name}.yml", "r") as f:
            loaded = yaml.safe_load(f)
        # safe_load returns None for an empty document; merging None would
        # raise TypeError, so an empty file simply contributes no settings.
        if loaded is not None:
            merged |= loaded
    return merged

In [None]:
def get_knowledge_cutoff(days):
    """Return the knowledge cutoff as a fraction of the recorded time range.

    Reads two ``name,value`` lines from
    ``../../data/processed_data/timestamps.csv`` (``min_timestamp`` first,
    then ``max_timestamp``) and returns
    ``1.0 - days * 86400 / (max_timestamp - min_timestamp)``. The timestamp
    span is therefore treated as a number of seconds.

    Args:
        days: Number of days to step back from ``max_timestamp``.

    Returns:
        float: ``1.0`` for ``days == 0``, decreasing as ``days`` grows.

    Raises:
        ValueError: If a line is not the expected ``name,value`` pair or its
            value is not an integer.
        ZeroDivisionError: If both timestamps are equal.
    """

    def parse_line(file, field, parse=int):
        # Explicit raise instead of assert so the check survives `python -O`.
        line = file.readline()
        fields = line.strip().split(",")
        if len(fields) != 2 or fields[0] != field:
            raise ValueError(
                f"expected a '{field},<value>' line in timestamps.csv, "
                f"got {line!r}"
            )
        return parse(fields[1])

    with open("../../data/processed_data/timestamps.csv") as f:
        # The two lines must appear in exactly this order.
        min_timestamp = parse_line(f, "min_timestamp")
        max_timestamp = parse_line(f, "max_timestamp")

    seconds_in_day = 24 * 60 * 60
    return 1.0 - days * seconds_in_day / (max_timestamp - min_timestamp)

In [None]:
# Load the list stored at fullpath(media).
df = pd.read_csv(fullpath(media))
# The knowledge-cutoff filter only applies to training data in research mode.
if original_source == "training":
    settings = get_settings()
    if settings["mode"] == "research":
        cutoff_days = settings["cutoff_days"]
        # get_knowledge_cutoff returns a fraction of the timestamp range, so
        # updated_at is presumably stored on that same normalized scale —
        # TODO confirm.
        cutoff = get_knowledge_cutoff(cutoff_days)
        # Keep only rows updated at or before the cutoff.
        df = df.query(f"updated_at <= {cutoff}")

## Generate Splits

In [None]:
# Sort by update_order, then updated_at, and renumber the index so row
# position matches the sorted order.
df = df.sort_values(by=["update_order", "updated_at"]).reset_index(drop=True)
# Constant column; retained because it is part of the CSV written below.
df["unit"] = 1
# 1-based position of each row within its user, counted from the first row.
# cumcount is index-aligned, so it stays correct when several users' rows
# are interleaved in the sorted frame.
df["forward_order"] = df.groupby("userid").cumcount() + 1
# 1-based position counted from the user's last row (last row == 1).
# Reversing each group and assigning by position would put values on the
# wrong rows once more than one user is present.
df["backward_order"] = df.groupby("userid").cumcount(ascending=False) + 1
# Overwrite the input file with the added columns.
df.to_csv(fullpath(media), index=False)