# Generate Splits
* Creates training and test splits

In [None]:
import hashlib
import os

import numpy as np
import pandas as pd
import yaml

In [None]:
# Run parameters: both values are interpolated into the input and output
# file names built by the split functions below.
# NOTE(review): presumably overridden per run by whatever launches this
# notebook (the defaults alone yield names like "user__list.0.csv") - confirm.
part = 0
media = ""

In [None]:
# Directory the input CSV files (including timestamps.csv) are read from.
source_dir = "../../data/processed_data"

In [None]:
# Directory the generated split files are written to; it is created up
# front so the writers below can assume it exists.
outdir = "../../data/splits"
os.makedirs(outdir, exist_ok=True)

In [None]:
def get_settings():
    """Load and merge the environment settings.

    Reads ``default_settings.yml`` and then ``private_settings.yml`` from
    ``../../environment``; keys from the later file override the earlier one.

    Returns:
        dict: the merged settings.

    Raises:
        FileNotFoundError: if either settings file is missing.
    """
    settings = {}
    for name in ["default_settings", "private_settings"]:
        with open(f"../../environment/{name}.yml", "r") as f:
            loaded = yaml.safe_load(f)
        # safe_load returns None for an empty document; treat that as
        # "no overrides" instead of failing on `dict |= None`.
        settings |= loaded or {}
    return settings

In [None]:
def is_training(userid, mode):
    """Decide whether all of a user's interactions belong to the training split.

    In ``"research"`` mode every user is a training user. Otherwise the user
    id is hashed (SHA-256, salted with the digest of ``"get_split"``) into one
    of 100 buckets, so the decision is deterministic per user.

    Args:
        userid: user identifier; converted with ``str`` before hashing, so
            integer and string ids hash identically.
        mode: settings mode string; ``"research"`` short-circuits to ``True``.

    Returns:
        bool: ``True`` if the user is a training-only user.
    """
    if mode == "research":
        return True

    def hex_digest(value):
        return hashlib.sha256(str(value).encode("ASCII")).hexdigest()

    salt = hex_digest("get_split")
    # userid is stringified explicitly: ids read from CSV are usually ints
    # and cannot be concatenated to the salt directly.
    digest = hex_digest(salt + str(userid))
    # The digest is a hex string; convert it to an integer before taking the
    # bucket (applying `%` to the string itself is string formatting).
    bucket = int(digest, 16) % 100
    p = bucket / 100
    # NOTE(review): p is at most 0.99, so this comparison is never true and
    # no user is treated as training-only outside research mode. Confirm the
    # intended threshold (e.g. `>=`) before relying on this path.
    return p > 0.99

In [None]:
def get_cutoff(days):
    """Express "``days`` before the latest timestamp" as a fraction of the range.

    Reads ``min_timestamp`` and ``max_timestamp`` (in that order, one
    ``name,value`` pair per line) from ``timestamps.csv`` in ``source_dir``.

    Args:
        days: length of the trailing window, in days.

    Returns:
        float: ``1.0`` minus the window length divided by the full timestamp
        span, both measured in seconds.

    Raises:
        AssertionError: if a line is not exactly ``<expected name>,<value>``.
    """

    def read_field(handle, expected_name, convert=int):
        # Consume one line and check it has the expected "name,value" shape.
        parts = handle.readline().strip().split(",")
        assert len(parts) == 2
        assert parts[0] == expected_name
        return convert(parts[1])

    timestamps_path = os.path.join(source_dir, "timestamps.csv")
    with open(timestamps_path) as f:
        min_timestamp = read_field(f, "min_timestamp")
        max_timestamp = read_field(f, "max_timestamp")

    seconds_in_day = 24 * 60 * 60
    window_seconds = days * seconds_in_day
    span_seconds = max_timestamp - min_timestamp
    return 1.0 - window_seconds / span_seconds

In [None]:
def temporal_sort(fn):
    """Load an interaction CSV and number each user's rows in sorted order.

    The frame is sorted by ``userid``, ``update_order`` and ``updated_at``
    (index reset), then three columns are added:

    * ``unit``: constant ``1``.
    * ``forward_order``: 1-based position of the row within its user,
      counted from the user's first row.
    * ``backward_order``: 1-based position counted from the user's last row.

    Args:
        fn: path of the CSV file to read.

    Returns:
        pandas.DataFrame: the sorted frame with the added columns.
    """
    df = pd.read_csv(fn, keep_default_na=False, engine="pyarrow")
    df.sort_values(
        by=["userid", "update_order", "updated_at"], inplace=True, ignore_index=True
    )
    # Retained so the written files keep the same set of columns as before.
    df["unit"] = 1
    # cumcount is vectorised and keeps the frame's index, so it avoids both
    # the per-group Python callback and any reliance on how groupby.apply
    # stitches re-indexed group results back together.
    per_user = df.groupby("userid")
    forward_order = per_user.cumcount() + 1
    backward_order = per_user.cumcount(ascending=False) + 1
    df["forward_order"] = forward_order
    df["backward_order"] = backward_order
    return df

In [None]:
def generate_training_splits(timestamp_cutoff, num_interactions, mode):
    """Write the training and test split files for the current media/part.

    A row goes to the training file when any of these holds: its user is a
    training user per ``is_training``, its ``forward_order`` exceeds
    ``num_interactions``, or its ``updated_at`` is below ``timestamp_cutoff``.
    All remaining rows go to the test file.

    Args:
        timestamp_cutoff: rows with ``updated_at`` below this are training rows.
        num_interactions: rows with ``forward_order`` above this are training rows.
        mode: settings mode, forwarded to ``is_training``.
    """
    # NOTE(review): the filter uses forward_order (position counted from the
    # start of the user's sorted rows); confirm whether the user's N most
    # recent rows (backward_order) were intended instead.
    input_path = os.path.join(source_dir, f"user_{media}_list.{part}.csv")
    df = temporal_sort(input_path)

    user_is_training = df["userid"].apply(is_training, args=(mode,))
    beyond_limit = df["forward_order"] > num_interactions
    before_cutoff = df["updated_at"] < timestamp_cutoff
    training = user_is_training | beyond_limit | before_cutoff

    training_path = os.path.join(outdir, f"training.user_{media}_list.{part}.h5")
    df[training].to_hdf(training_path, key="df", mode="w")

    test_path = os.path.join(outdir, f"test.user_{media}_list.{part}.h5")
    df[~training].to_hdf(test_path, key="df", mode="w")

In [None]:
def generate_future_split(num_interactions):
    """Write the sorted knowledge-cutoff interactions as the test split file.

    The output path is the same ``test.user_{media}_list.{part}.h5`` file that
    ``generate_training_splits`` writes, and it is opened with ``mode="w"``,
    so an existing test file is replaced.

    Args:
        num_interactions: currently unused.
            NOTE(review): no per-user limit is applied here; presumably the
            input CSV is already pruned upstream - confirm.
    """
    future_csv = os.path.join(source_dir, f"prune.{media}.knowledge_cutoff.{part}.csv")
    ordered = temporal_sort(future_csv)
    test_path = os.path.join(outdir, f"test.user_{media}_list.{part}.h5")
    ordered.to_hdf(test_path, key="df", mode="w")

In [None]:
# Build the splits from the merged settings.
# A row ends up in the test split only when all of the following hold:
#   * is_training() is False for its user,
#   * its forward_order is at most cutoff_interactions (N),
#   * its updated_at is not below the cutoff derived from cutoff_days (M).
# NOTE(review): forward_order counts from the start of each user's sorted
# rows; confirm whether the user's N most recent interactions were intended.
settings = get_settings()
generate_training_splits(
    timestamp_cutoff=get_cutoff(settings["cutoff_days"]),
    num_interactions=settings["cutoff_interactions"],
    mode=settings["mode"],
)
if settings["mode"] == "research":
    # In research mode the test file is then rewritten from the
    # knowledge-cutoff CSV.
    # NOTE(review): generate_future_split applies no M-day / first-N filter
    # itself; presumably that happens when the CSV is produced - confirm.
    generate_future_split(num_interactions=settings["cutoff_interactions"])