# TODO: Document
# TODO: Cleanup
* Note, this script may take several days to run
* TODO: Document the related-signal residualization
* TODO: Document that this signal is prone to underfitting

In [1]:
# CHANGE THIS PARAMETER
username = "taapaye"

In [2]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
from tqdm import tqdm as tqdm


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    return smf.ols(*args, **kwargs).fit()

In [3]:
np.random.seed(0)

In [4]:
os.chdir(f"../../data/recommendations/{username}")

In [5]:
filtered_df = pickle.load(open("../../processed_data/user_anime_lists.pkl", "rb"))
user_df = pickle.load(open("user_anime_list.pkl", "rb"))
filtered_df = filtered_df.loc[lambda x: ~x["username"].isin(user_df.username)]
filtered_df = pd.concat([filtered_df, user_df], ignore_index=True)
filtered_df = filtered_df.set_index("username")

In [6]:
related_df = pickle.load(open("related_loocv.pkl", "rb"))

In [7]:
@functools.lru_cache()
def get_item_corrs(nonneg_corrs=False):
    corrs = pickle.load(open("../../processed_data/item_correlations.pkl", "rb"))
    if nonneg_corrs:
        corrs["similarity"] = corrs["corr"]
    else:
        corrs["similarity"] = corrs["corr"].abs()
    corrs = corrs.dropna()
    corrs = corrs.loc[
        lambda x: x.index.get_level_values("anime_id_x")
        != x.index.get_level_values("anime_id_y")
    ]
    corrs = corrs.sort_values(by="similarity")
    return corrs


def get_item_corrs_wrapper(df, username):
    return get_item_corrs()


def get_item_scores(df, corrs, username, neighborhood_size):
    corrs = corrs.groupby("anime_id_x").tail(neighborhood_size)
    score = df.loc[username].merge(
        corrs.reset_index("anime_id_x"), left_on="anime_id", right_on="anime_id_y",
    )
    score = score.drop("anime_id", axis=1).rename({"anime_id_x": "anime_id"}, axis=1)
    return score

In [8]:
def get_delta(score):
    return score.groupby("anime_id").apply(
        lambda x: np.dot(x["score"], x["corr"]) / x["corr"].abs().sum()
    )

In [9]:
def get_squared_error(df, pred_df, username):
    pred_df = pred_df.loc[pred_df.index.intersection(df.loc[username].anime_id)]
    pred_df = pred_df.merge(
        df.loc[username].set_index("anime_id")["score"], on="anime_id"
    )
    errors = pred_df["pred_score"] - pred_df["score"]
    return np.dot(errors, errors)

In [15]:
get_item_corrs()

Unnamed: 0_level_0,Unnamed: 1_level_0,corr,corr_var,size,similarity
anime_id_x,anime_id_y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [12]:
def compute_accuracy_metrics(
    is_df, oos_df, score_fn, corrs_fn, username, neighborhood_sizes,
):
    corrs = corrs_fn(is_df, username)
    metrics = pd.DataFrame()
    for neighborhood_size in tqdm(
        sorted(neighborhood_sizes), total=len(neighborhood_sizes),
    ):
        score = score_fn(is_df, corrs, username, neighborhood_size)
        pred_df = pd.DataFrame()
        pred_df["related"] = related_df['delta']  
        
        
        pred_df["delta"] = get_delta(score)
        pred_df = pred_df.fillna(0)

        # train linear model
        seen_shows = is_df.loc[username].merge(pred_df, on='anime_id')
        model = lm("score ~ delta + related", seen_shows)

        # inference
        pred_df["pred_score"] = model.predict(pred_df)
        is_pred_df = pred_df.loc[lambda x: x.index.isin(is_df.loc[username].anime_id)]
        oos_pred_df = pred_df.loc[lambda x: x.index.isin(oos_df.anime_id)]

        # compute coverage
        is_coverage = len(is_pred_df.loc[lambda x: ~np.isclose(x['delta'], 0)]) / len(is_df.loc[username])
        oos_coverage = len(oos_pred_df.loc[lambda x: ~np.isclose(x['delta'], 0)]) / len(oos_df)

        # compute rmse
        # TODO delete the 'missing' components as nothing should be missing anymore
        missing_is = is_df.loc[username].loc[
            lambda x: ~x.anime_id.isin(is_pred_df.index)
            & ~x.anime_id.isin(oos_df.anime_id)
        ]
        missing_oos = oos_df.loc[lambda x: ~x.anime_id.isin(oos_pred_df.index)]
        is_se = get_squared_error(is_df, is_pred_df, username)
        oos_se = get_squared_error(oos_df, oos_pred_df, username)
        missing_is_se = np.dot(missing_is["score"], missing_is["score"])
        missing_oos_se = np.dot(missing_oos["score"], missing_oos["score"])
        is_rmse = np.sqrt((is_se + missing_is_se) / len(is_df.loc[username]))
        oos_rmse = np.sqrt((oos_se + missing_oos_se) / len(oos_df))
        metrics = metrics.append(
            {
                "neighborhood_size": neighborhood_size,
                "is_rsquared": model.rsquared_adj,
                "is_rmse": is_rmse,
                "is_coverage": is_coverage,
                "oos_rmse": oos_rmse,
                "oos_coverage": oos_coverage,
            },
            ignore_index=True,
        )
        print({
                "neighborhood_size": neighborhood_size,
                "is_rsquared": model.rsquared_adj,
                "is_rmse": is_rmse,
                "is_coverage": is_coverage,
                "oos_rmse": oos_rmse,
                "oos_coverage": oos_coverage,
            },)
    return metrics

In [13]:
errors_by_neighborhood_size = []
max_size = len(filtered_df.anime_id.unique())
base = np.sqrt(2)
neighborhood_sizes = [
    int(base ** i) for i in range(int(np.log(max_size) / np.log(base)) + 1)
] + [max_size]
neighborhood_sizes = sorted(list(set(neighborhood_sizes)))

oos_df = filtered_df.loc[username].sample(frac=1)
is_df = filtered_df
    
item_metrics = compute_accuracy_metrics(
        is_df.copy(),
        oos_df.copy(),
        get_item_scores,
        get_item_corrs_wrapper,
        username,
        neighborhood_sizes,
    )
item_metrics["signal"] = "item"
errors_by_neighborhood_size.append(item_metrics)    

  0%|                                                                                           | 0/27 [00:00<?, ?it/s]


ValueError: Wrong number of items passed 0, placement implies 1

In [None]:
allerrors = pd.concat(errors_by_neighborhood_size, ignore_index=True)

## Visualizations

In [None]:
wide_data = pd.melt(allerrors, ["neighborhood_size", "signal"])

In [None]:
for signal in wide_data["signal"].unique():
    wide_data.loc[lambda x: x["signal"] == signal, "variable"] = (
        f"{signal}_" + wide_data.loc[lambda x: x["signal"] == signal, "variable"]
    )

In [None]:
plt.figure(figsize=(20, 10))
_ = sns.lineplot(
    x="neighborhood_size",
    y="value",
    hue="variable",
    data=wide_data.loc[lambda x: x.variable.str.contains("coverage")],
).set(xscale="log", title="Prediction Coverage")

In [None]:
plt.figure(figsize=(20, 10))
_ = sns.lineplot(
    x="neighborhood_size",
    y="value",
    hue="variable",
    data=wide_data.loc[lambda x: x.variable.str.contains("rmse")],
).set(xscale="log", title="Root Mean Squared Error")

In [None]:
allerrors.groupby(["signal", "neighborhood_size"]).mean().sort_values(
    by="oos_rmse"
).head(50)

In [None]:
allerrors.groupby(["signal", "neighborhood_size"]).mean().rolling(3).mean().xs('item', level='signal').sort_values(by='oos_rmse')

In [None]:
(1.199971 + 1.200345 + 1.200527) / 3

In [None]:
(1.198847 + 1.198858 + 1.202293 ) / 3

## Store best hyperparameters

In [None]:
allerrors.groupby(["signal", "neighborhood_size"]).mean().sort_values(
    by="oos_rmse"
).reset_index().groupby("signal").first()

In [None]:
outdir = "parameters"
if not os.path.exists(outdir):
    os.mkdir(outdir)
os.chdir(outdir)

In [None]:
allerrors.groupby(["signal", "neighborhood_size"]).mean().sort_values(
    by="oos_rmse"
).reset_index().groupby("signal").first().to_pickle("neighborhoodcf.best.pkl")
allerrors.to_pickle("neighborhoodcf.all.pkl")