# Prune Media Lists
* Drop any rows with unknown items
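* Drop any duplicate rows, keeping only the most recently updated entry for each (user, item) pair
* Drop any users with too few item interactions (fewer than the `min_user_interactions` setting)
* Drop any items with too few user interactions (fewer than the `min_item_interactions` setting)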

In [None]:
import logging
import os

import pandas as pd
import yaml

In [None]:
# Partition number of the list files to process; it is interpolated into the
# input and output file names used by prune().
part = 0

In [None]:
# Emit INFO-level messages so the row counts logged by prune() are shown.
logging.basicConfig(level=logging.INFO)
# Directory holding the raw list CSVs; pruned output is written here as well.
outdir = "../../data/raw_data"
# Media types whose user lists are pruned.
ALL_MEDIUMS = ["manga", "anime"]

In [None]:
def get_settings():
    """Load the project settings, letting private settings override defaults.

    Reads ``default_settings.yml`` and then ``private_settings.yml`` from
    ``../../environment`` and merges their top-level keys into one dict.
    Because the private file is merged last, its values win on key clashes.

    Returns:
        dict: The merged settings.

    Raises:
        OSError: If either settings file cannot be opened.
        yaml.YAMLError: If either file is not valid YAML.
    """
    merged = {}
    for name in ["default_settings", "private_settings"]:
        with open(f"../../environment/{name}.yml", "r") as f:
            # safe_load returns None for an empty document; treat that as
            # "no settings" instead of failing on `dict |= None`.
            merged |= yaml.safe_load(f) or {}
    return merged

In [None]:
# Merged settings; prune() reads its minimum-interaction thresholds from here.
settings = get_settings()

In [None]:
def _drop_sparse(df, column, min_count):
    """Return the rows of `df` whose `column` value occurs at least `min_count` times."""
    counts = df.groupby(column).size()
    frequent = counts[counts >= min_count].index
    return df.loc[df[column].isin(frequent)]


def prune(media):
    """Prune the raw user list of one medium and write the result to disk.

    Reads ``user_{media}_list.{part}.csv`` from ``outdir`` and applies, in
    order:

    1. drop rows whose ``mediaid`` is not listed in ``{media}.csv``;
    2. collapse rows sharing a (``userid``, ``mediaid``) pair into one,
       preferring the most recently updated values;
    3. drop users with fewer than ``settings["min_user_interactions"]`` rows;
    4. drop media with fewer than ``settings["min_item_interactions"]`` rows.

    The number of remaining rows is logged after every step, and the result
    is written to ``user_{media}_list.pruned.{part}.csv`` in ``outdir``.

    Args:
        media: Medium name used to build the file names and the id column of
            the title list (``{media}_id``).

    Returns:
        pandas.DataFrame: The pruned list, identical to what was written.
    """
    fn = os.path.join(outdir, f"user_{media}_list.{part}.csv")
    df = pd.read_csv(fn)
    logging.info(f"{len(df)} entries after loading")

    titles = pd.read_csv(os.path.join(outdir, f"{media}.csv"))
    valid_titles = set(titles[f"{media}_id"])
    df = df.loc[df["mediaid"].isin(valid_titles)]
    logging.info(f"{len(df)} entries after valid title filter")

    # Sort oldest-to-newest so that "last" within each pair is the latest one.
    # NOTE(review): GroupBy.last takes the last non-null value of every
    # column, so a missing value in the newest row is filled from an older
    # row of the same pair -- confirm this is intended.
    df = df.sort_values(by=["updated_at", "update_order"])
    df = df.groupby(["userid", "mediaid"]).last().reset_index()
    logging.info(f"{len(df)} entries after duplicate filter")

    df = _drop_sparse(df, "userid", settings["min_user_interactions"])
    logging.info(f"{len(df)} entries after sparse user filter")

    # NOTE(review): this runs once, after the user filter, so removing sparse
    # media can leave some users below the user threshold again.
    df = _drop_sparse(df, "mediaid", settings["min_item_interactions"])
    logging.info(f"{len(df)} entries after sparse media filter")

    df.to_csv(os.path.join(outdir, f"user_{media}_list.pruned.{part}.csv"), index=False)
    # Hand the frame back so callers that bind the result get real data.
    return df

In [None]:
# prune() writes its result to disk, so its return value is not needed here.
for media in ALL_MEDIUMS:
    prune(media)