# Knowledge Cutoff
* Drop interactions updated after the knowledge cutoff date
* Used to collect out-of-sample metrics

In [None]:
import os

import numpy as np
import pandas as pd
import yaml
from tqdm import tqdm

In [None]:
# Directory holding the processed CSV files that this notebook reads and rewrites.
outdir = "../../data/processed_data"

In [None]:
# Column names of the CSV currently being processed; overwritten by process()
# with the comma-split header row of each file it reads.
HEADER_FIELDS = []

In [None]:
def get_settings():
    """Load the project settings YAML file and return its parsed contents."""
    settings_path = "../../environment/settings.yml"
    with open(settings_path, "r") as handle:
        parsed = yaml.safe_load(handle)
    return parsed

In [None]:
# ProcessData.KnowledgeCutoff.cutoff_days from the settings file; later passed
# to get_knowledge_cutoff as its `days` argument.
cutoff_days = get_settings()["ProcessData"]["KnowledgeCutoff"]["cutoff_days"]

In [None]:
def get_knowledge_cutoff(days):
    """Return the knowledge cutoff as a fraction of the recorded timestamp range.

    Reads ``timestamps.csv`` from ``outdir``. Its first line must be
    ``min_timestamp,<int>`` and its second ``max_timestamp,<int>``. The result
    is ``1.0 - days * 86400 / (max_timestamp - min_timestamp)``, i.e. the
    point ``days`` days before ``max_timestamp`` expressed relative to the
    full range.

    NOTE(review): the day-to-seconds conversion implies the timestamps are in
    seconds -- confirm against whatever writes ``timestamps.csv``.

    Args:
        days: Number of days before ``max_timestamp`` to place the cutoff.

    Returns:
        The cutoff as a float (``1.0`` when ``days`` is 0).

    Raises:
        ValueError: If a line is not a ``name,value`` pair, has an unexpected
            name, or its value is not an integer.
        ZeroDivisionError: If ``min_timestamp`` equals ``max_timestamp``.
    """

    def parse_line(file, field, parse=int):
        # Validate with an explicit raise rather than `assert`, so the check
        # still runs when Python is started with -O.
        line = file.readline()
        parts = line.strip().split(",")
        if len(parts) != 2 or parts[0] != field:
            raise ValueError(
                f"expected a '{field},<value>' line in timestamps.csv, got {line!r}"
            )
        return parse(parts[1])

    with open(os.path.join(outdir, "timestamps.csv")) as f:
        # Order matters: the file is consumed line by line.
        min_timestamp = parse_line(f, "min_timestamp")
        max_timestamp = parse_line(f, "max_timestamp")

    seconds_in_day = 24 * 60 * 60
    return 1.0 - days * seconds_in_day / (max_timestamp - min_timestamp)

In [None]:
def process(media, remove_line, error_file):
    """Filter ``user_{media}_list.csv`` in place, diverting removed rows to a side file.

    The first line of the source file is treated as the header: it is split on
    commas into the module-level ``HEADER_FIELDS`` and copied to both output
    files. Every later line is passed to ``remove_line(media, line)``; lines
    for which it returns a truthy value go to the error file, all others are
    kept. Kept lines are written to a temporary ``...csv~`` file that replaces
    the source once the whole file has been read.

    Args:
        media: Media name used to build the source filename.
        remove_line: Callable ``(media, line)``; a truthy result drops the line.
        error_file: Filename, joined onto ``outdir``, that receives the header
            and every dropped line.

    Raises:
        Exception: Anything raised while classifying or writing a line is
            re-raised after that line has been printed. The source file is
            left untouched in that case.
    """
    global HEADER_FIELDS
    source = os.path.join(outdir, f"user_{media}_list.csv")
    dest = os.path.join(outdir, f"user_{media}_list.csv~")
    error_path = os.path.join(outdir, error_file)
    with open(source, "r") as in_file, open(dest, "w") as out_file, open(
        error_path, "w"
    ) as err_file:
        for index, line in enumerate(tqdm(in_file)):
            if index == 0:
                # Header row: remember the column names and keep it in both outputs.
                HEADER_FIELDS = line.strip().split(",")
                out_file.write(line)
                err_file.write(line)
                continue
            try:
                if remove_line(media, line):
                    err_file.write(line)
                else:
                    out_file.write(line)
            except Exception:
                # Show the offending row, then propagate the original error.
                print(line)
                raise
    # os.replace overwrites an existing target on every platform, whereas
    # os.rename fails on Windows when the target (the source file) exists.
    os.replace(dest, source)

In [None]:
def enforce_knowledge_cutoff(media, line):
    """Return True when the row's ``updated_at`` value is greater than ``cutoff``.

    ``line`` is one comma-separated data row; the ``updated_at`` column is
    located through the module-level ``HEADER_FIELDS``. ``media`` is unused
    here and is accepted only to match the ``remove_line(media, line)`` call
    made by ``process``.
    """
    values = line.strip().split(",")
    timestamp_index = HEADER_FIELDS.index("updated_at")
    updated_at = float(values[timestamp_index])
    return updated_at > cutoff

In [None]:
# Compute the cutoff once; enforce_knowledge_cutoff reads this module-level value.
cutoff = get_knowledge_cutoff(cutoff_days)
# Record the cutoff in outdir as a single "knowledge_cutoff,<value>" line.
with open(os.path.join(outdir, "knowledge_cutoff.csv"), "w") as f:
    f.write(f"knowledge_cutoff,{cutoff}\n")

In [None]:
# Prune both interaction lists in place; rows whose updated_at exceeds the
# cutoff are diverted to prune.<media>.knowledge_cutoff.csv in outdir.
for media in ["manga", "anime"]:
    process(media, enforce_knowledge_cutoff, f"prune.{media}.knowledge_cutoff.csv")