# TODO: Document
# TODO: Cleanup
* Note: this script may take several days to run.
* TODO: Document the related-signal residualization

In [1]:
# CHANGE THIS PARAMETER
# Name of the user being tuned for. Later cells use it to pick the per-user
# data directory and to select this user's rows in the combined ratings table.
username = "taapaye"

In [2]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt
from tqdm import tqdm as tqdm


# Convenience wrapper: build an OLS model with ``smf.ols`` and return the
# fitted results in one call. ``functools.wraps`` copies the metadata
# (name, docstring) of ``smf.ols`` onto this function.
@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    # Every argument is forwarded untouched to ``smf.ols``.
    unfitted_model = smf.ols(*args, **kwargs)
    return unfitted_model.fit()

In [3]:
# Seed NumPy's global RNG so repeated runs are reproducible.
# NOTE(review): presumably this is what makes the shuffled split
# (`.sample(frac=1)` in the cross-validation cell) deterministic -- confirm.
np.random.seed(0)

In [4]:
# Switch into this user's recommendation directory; the relative paths used
# by the cells below are resolved from here.
os.chdir(f"../../data/recommendations/{username}")

In [5]:
# Load the shared ratings table and this user's own list. The files are opened
# in `with` blocks so the handles are closed as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code while loading; only use trusted files.
with open("../../processed_data/user_anime_lists.pkl", "rb") as fh:
    filtered_df = pickle.load(fh)
with open("user_anime_list.pkl", "rb") as fh:
    user_df = pickle.load(fh)

# Drop any rows already stored for this user (case-insensitive username match)
# and append the freshly loaded list instead, then index the table by username.
filtered_df = filtered_df.loc[lambda x: x['username'].str.lower() != username.lower()]
filtered_df = pd.concat([filtered_df, user_df], ignore_index=True)
filtered_df = filtered_df.set_index("username")

In [6]:
# One row per anime in the user's list, indexed by anime_id, with a
# `residual` column initialised to 0.
residual_df = pd.DataFrame(
    {"anime_id": user_df["anime_id"], "residual": 0}
).set_index("anime_id")

In [7]:
@functools.lru_cache()
def get_item_corrs(nonneg_corrs=False):
    """Load the pickled item-item correlations, memoized by ``lru_cache``.

    The path is relative, so the first call must be made from the expected
    working directory; later calls with the same argument return the cached
    object without touching the disk.

    Args:
        nonneg_corrs: Currently unused by the body. It is kept so existing
            callers keep working; note that it still forms part of the
            cache key.

    Returns:
        Whatever object is stored in ``item_correlations.pkl``.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use trusted files.
    # The context manager closes the handle, which the bare open() never did.
    with open("../../processed_data/item_correlations.pkl", "rb") as fh:
        return pickle.load(fh)


def get_item_corrs_wrapper(df, username):
    """Return the cached item correlations.

    ``df`` and ``username`` are accepted but ignored; they let this function
    be passed wherever a ``corrs_fn(df, username)`` callable is expected.
    """
    item_corrs = get_item_corrs()
    return item_corrs


def get_item_scores(df, corrs, username, neighborhood_size):
    """Pair each of the user's rated anime with its correlated anime.

    Args:
        df: Ratings indexed by username, with an ``anime_id`` column.
        corrs: Item correlations with an ``anime_id_x`` index level and an
            ``anime_id_y`` key (index level or column).
        username: Index label of the user whose ratings are used.
        neighborhood_size: Number of trailing rows kept per ``anime_id_x``
            (presumably the strongest correlations, if ``corrs`` is sorted
            ascending -- verify against the code that builds it).

    Returns:
        The user's rows joined to the kept correlation rows on
        ``anime_id == anime_id_y``. The user's own ``anime_id`` column is
        dropped and ``anime_id_x`` is renamed to ``anime_id``, so each row
        refers to the anime being scored.
    """
    nearest = corrs.groupby("anime_id_x").tail(neighborhood_size)
    # A list indexer always yields a DataFrame. `df.loc[username]` collapses
    # to a Series (which has no `.merge`) when the user has a single row.
    user_ratings = df.loc[[username]]
    score = user_ratings.merge(
        nearest.reset_index("anime_id_x"), left_on="anime_id", right_on="anime_id_y",
    )
    score = score.drop("anime_id", axis=1).rename({"anime_id_x": "anime_id"}, axis=1)
    return score

In [8]:
def get_user_corrs(df, username):
    """Compute a cosine-style similarity between ``username`` and other users.

    For every user sharing at least one anime with ``username``:
      * ``corr`` is the dot product of the two users' scores over the shared
        anime, divided by the product of each user's score norm taken over
        all of their rows in ``df``;
      * ``similarity`` is ``abs(corr)``;
      * ``corr_size`` is the number of shared rows.

    The target user, rows containing NaN, and users with ``corr_size <= 2``
    are removed. The result is sorted by ascending ``similarity``, so the
    most similar users come last.
    """
    # Self-join on anime_id: score_x is the target user's score and score_y
    # is the other user's score for the same anime.
    overlap = df.loc[[username]].merge(df.reset_index(), on="anime_id")
    overlap_by_user = overlap.groupby("username")

    dot_products = overlap_by_user.apply(
        lambda grp: np.dot(grp["score_x"], grp["score_y"])
    )
    norms = df.groupby("username").apply(
        lambda grp: np.sqrt(np.dot(grp["score"], grp["score"]))
    )
    norm_products = norms * norms.loc[username]

    result = pd.DataFrame((dot_products / norm_products), columns=["corr"])
    result["similarity"] = result["corr"].abs()
    result["corr_size"] = overlap_by_user.size()

    result = result.drop(username).dropna()
    result = result[result["corr_size"] > 2]
    return result.sort_values(by="similarity")


def get_user_scores(df, corrs, recommendee, neighborhood_size):
    """Return the ratings of the last ``neighborhood_size`` users in ``corrs``.

    Args:
        df: Ratings with ``username`` as index level (or column).
        corrs: Per-user correlations keyed by ``username``. The trailing rows
            are kept, so it should be sorted with the best neighbours last.
        recommendee: Unused; kept so the signature matches the other
            ``score_fn`` callables.
        neighborhood_size: Number of trailing ``corrs`` rows to keep.

    Returns:
        ``df`` joined with the kept ``corrs`` rows on ``username``, with any
        row containing NaN dropped.
    """
    # `.tail(0)` is empty, whereas the slice `corrs[-0:]` returned every row.
    # This also matches the `.tail()` call in get_item_scores.
    neighbours = corrs.tail(neighborhood_size)
    score = (df.merge(pd.DataFrame(neighbours), on="username")).dropna()
    return score

In [9]:
def get_delta(score):
    """Return, per ``anime_id``, the correlation-weighted average score.

    Each group's value is ``sum(score * corr) / sum(|corr|)``. The result is
    a Series indexed by ``anime_id``.
    """

    def weighted_average(group):
        weights = group["corr"]
        return np.dot(group["score"], weights) / weights.abs().sum()

    return score.groupby("anime_id").apply(weighted_average)

In [10]:
def get_squared_error(df, pred_df, username):
    """Sum of squared prediction errors over the anime ``username`` has rated.

    Args:
        df: Ratings indexed by username, with ``anime_id`` and ``score``
            columns.
        pred_df: Predictions indexed by ``anime_id``, with a ``pred_score``
            column.
        username: Index label of the user whose ratings are compared.

    Returns:
        ``sum((pred_score - score) ** 2)`` over the anime present in both the
        user's rows of ``df`` and ``pred_df``.
    """
    # A list indexer always yields a DataFrame. With a single rated row,
    # `df.loc[username]` is a Series and `.anime_id` would be a scalar.
    user_rows = df.loc[[username]]
    pred_df = pred_df.loc[pred_df.index.intersection(user_rows.anime_id)]
    pred_df = pred_df.merge(
        user_rows.set_index("anime_id")["score"], on="anime_id"
    )
    errors = pred_df["pred_score"] - pred_df["score"]
    return np.dot(errors, errors)

In [11]:
def compute_accuracy_metrics(
    is_df, oos_df, score_fn, corrs_fn, username, neighborhood_sizes,
):
    """Evaluate one neighbourhood signal over a range of neighbourhood sizes.

    For every size (largest first) this computes per-anime ``delta`` values
    with ``score_fn`` and ``get_delta``, fits ``score ~ delta + residual`` on
    the user's in-sample ratings, predicts scores, and records coverage and
    RMSE on the in-sample and out-of-sample ratings.

    Args:
        is_df: In-sample ratings indexed by username.
        oos_df: The user's held-out ratings (``anime_id`` and ``score``
            columns).
        score_fn: ``score_fn(is_df, corrs, username, neighborhood_size)``,
            returning rows with ``anime_id``, ``score`` and ``corr``.
        corrs_fn: ``corrs_fn(is_df, username)``, returning the correlations
            handed to ``score_fn``; it is evaluated once.
        username: Index label of the user being evaluated.
        neighborhood_sizes: Sized collection of neighbourhood sizes to try.

    Returns:
        DataFrame with one row per neighbourhood size and the columns
        ``neighborhood_size``, ``is_rsquared``, ``is_rmse``, ``is_coverage``,
        ``oos_rmse`` and ``oos_coverage``.

    Note:
        Reads the module-level ``residual_df`` for the ``residual`` regressor.
    """
    corrs = corrs_fn(is_df, username)
    # Loop-invariant slice of the user's in-sample rows. The list indexer
    # keeps it a DataFrame even when only one row is left.
    user_is_df = is_df.loc[[username]]
    n_is = len(user_is_df)
    n_oos = len(oos_df)

    # Rows are collected in a list and turned into a frame once at the end,
    # because DataFrame.append was removed in pandas 2.0.
    rows = []
    for neighborhood_size in tqdm(
        sorted(neighborhood_sizes, reverse=True), total=len(neighborhood_sizes),
    ):
        score = score_fn(is_df, corrs, username, neighborhood_size)
        pred_df = pd.DataFrame()
        pred_df["residual"] = residual_df["residual"]
        pred_df["delta"] = get_delta(score)
        # Anime without any neighbour get delta 0; coverage below counts them
        # as not covered.
        pred_df = pred_df.fillna(0)

        # train linear model
        seen_shows = user_is_df.merge(pred_df, on="anime_id")
        model = lm("score ~ delta + residual", seen_shows)

        # inference
        pred_df["pred_score"] = model.predict(pred_df)
        is_pred_df = pred_df.loc[lambda x: x.index.isin(user_is_df.anime_id)]
        oos_pred_df = pred_df.loc[lambda x: x.index.isin(oos_df.anime_id)]

        # compute coverage: share of rated shows with a non-zero delta
        is_coverage = len(is_pred_df.loc[lambda x: ~np.isclose(x["delta"], 0)]) / n_is
        oos_coverage = len(oos_pred_df.loc[lambda x: ~np.isclose(x["delta"], 0)]) / n_oos

        # compute rmse
        # TODO delete the 'missing' components as nothing should be missing anymore
        # Shows without a prediction contribute their full squared score.
        missing_is = user_is_df.loc[
            lambda x: ~x.anime_id.isin(is_pred_df.index)
            & ~x.anime_id.isin(oos_df.anime_id)
        ]
        missing_oos = oos_df.loc[lambda x: ~x.anime_id.isin(oos_pred_df.index)]
        is_se = get_squared_error(is_df, is_pred_df, username)
        oos_se = get_squared_error(oos_df, oos_pred_df, username)
        missing_is_se = np.dot(missing_is["score"], missing_is["score"])
        missing_oos_se = np.dot(missing_oos["score"], missing_oos["score"])
        is_rmse = np.sqrt((is_se + missing_is_se) / n_is)
        oos_rmse = np.sqrt((oos_se + missing_oos_se) / n_oos)
        rows.append(
            {
                "neighborhood_size": neighborhood_size,
                "is_rsquared": model.rsquared_adj,
                "is_rmse": is_rmse,
                "is_coverage": is_coverage,
                "oos_rmse": oos_rmse,
                "oos_coverage": oos_coverage,
            }
        )

    return pd.DataFrame(rows)

In [None]:
# K-fold cross-validation over the user's own ratings: each fold is held out
# in turn and both signals (item- and user-based) are evaluated on a
# geometric grid of neighbourhood sizes.
K = 10
base = np.sqrt(2)


def _neighborhood_sizes(max_size, growth):
    """Return the sorted, de-duplicated geometric size grid up to ``max_size``.

    The grid is ``int(growth ** i)`` for ``i`` in
    ``range(int(log(max_size) / log(growth)) + 1)``, plus ``max_size`` itself.
    Truncating to int repeats small sizes (for sqrt(2): 1, 1, 2, 2, ...), and
    each repeat would otherwise be fully re-evaluated for no new information.
    """
    n_steps = int(np.log(max_size) / np.log(growth)) + 1
    return sorted({int(growth ** i) for i in range(n_steps)} | {max_size})


errors_by_neighborhood_size = []
item_max_size = len(filtered_df.anime_id.unique())
item_neighborhood_sizes = _neighborhood_sizes(item_max_size, base)
user_max_size = len(filtered_df.index.unique())
user_neighborhood_sizes = _neighborhood_sizes(user_max_size, base)
# Shuffle the user's ratings, then cut them into K folds.
splits = np.array_split(filtered_df.loc[username].sample(frac=1), K)

for split in splits:
    oos_df = split
    # In-sample data: everything except this user's held-out ratings.
    is_df = filtered_df.loc[
        lambda x: ~(
            (x.index.get_level_values("username") == username)
            & x.anime_id.isin(oos_df.anime_id)
        )
    ]

    item_metrics = compute_accuracy_metrics(
        is_df.copy(),
        oos_df.copy(),
        get_item_scores,
        get_item_corrs_wrapper,
        username,
        item_neighborhood_sizes,
    )
    item_metrics["signal"] = "item"
    errors_by_neighborhood_size.append(item_metrics)

    user_metrics = compute_accuracy_metrics(
        is_df.copy(),
        oos_df.copy(),
        get_user_scores,
        get_user_corrs,
        username,
        user_neighborhood_sizes,
    )
    user_metrics["signal"] = "user"
    errors_by_neighborhood_size.append(user_metrics)

  0%|                                                                                           | 0/29 [00:00<?, ?it/s]

In [None]:
# Stack the per-fold, per-signal metric frames into one table with a fresh index.
allerrors = pd.concat(errors_by_neighborhood_size, ignore_index=True)

## Visualizations

In [None]:
# Despite its name this is long-format data: pd.melt keeps `neighborhood_size`
# and `signal` as identifiers and turns every other column into a
# (`variable`, `value`) pair, which is the shape the line plots below consume.
wide_data = pd.melt(allerrors, ["neighborhood_size", "signal"])

In [None]:
# Prefix each metric name with its signal (e.g. "item_" / "user_") so both
# signals get separate hues when plotted together.
for signal in wide_data["signal"].unique():
    rows_for_signal = wide_data["signal"] == signal
    prefixed = f"{signal}_" + wide_data.loc[rows_for_signal, "variable"]
    wide_data.loc[rows_for_signal, "variable"] = prefixed

In [None]:
# Coverage metrics (variables whose name contains "coverage") against
# neighbourhood size on a log-scaled x axis, one line per variable.
plt.figure(figsize=(20, 10))
# The result of `.set(...)` is bound to `_` so it is not echoed as cell output.
_ = sns.lineplot(
    x="neighborhood_size",
    y="value",
    hue="variable",
    data=wide_data.loc[lambda x: x.variable.str.contains("coverage")],
).set(xscale="log", title="Prediction Coverage")

In [None]:
# RMSE metrics (variables whose name contains "rmse") against neighbourhood
# size on a log-scaled x axis, one line per variable.
plt.figure(figsize=(20, 10))
# The result of `.set(...)` is bound to `_` so it is not echoed as cell output.
_ = sns.lineplot(
    x="neighborhood_size",
    y="value",
    hue="variable",
    data=wide_data.loc[lambda x: x.variable.str.contains("rmse")],
).set(xscale="log", title="Root Mean Squared Error")

In [None]:
# Average every metric per (signal, neighborhood_size) over all collected
# rows, then show the 50 configurations with the lowest out-of-sample RMSE.
allerrors.groupby(["signal", "neighborhood_size"]).mean().sort_values(
    by="oos_rmse"
).head(50)

## Store best hyperparameters

In [None]:
# Same averaged table, sorted by out-of-sample RMSE; taking the first row per
# signal leaves the best neighbourhood size for each signal.
allerrors.groupby(["signal", "neighborhood_size"]).mean().sort_values(
    by="oos_rmse"
).reset_index().groupby("signal").first()

In [None]:
# Create the output directory if it is missing and make it the working
# directory, so files written afterwards land inside it. `exist_ok=True`
# replaces the separate exists-then-mkdir check, which could fail if the
# directory appeared between the two calls.
outdir = "parameters"
os.makedirs(outdir, exist_ok=True)
os.chdir(outdir)

In [None]:
# Persist the best (lowest mean out-of-sample RMSE) row per signal ...
allerrors.groupby(["signal", "neighborhood_size"]).mean().sort_values(
    by="oos_rmse"
).reset_index().groupby("signal").first().to_pickle("neighborhoodcf.best.pkl")
# ... and the full, unaggregated metrics table for later inspection.
allerrors.to_pickle("neighborhoodcf.all.pkl")