In [1]:
# TODO: Document
# TODO: Cleanup
# TODO: Merge with ItemCF if possible

In [2]:
# CHANGE THIS PARAMETER
# Username whose recommendations are evaluated. It is also interpolated into
# the data directory path that the notebook changes into further down.
recommendee = "taapaye"

In [3]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
import statsmodels.formula.api as smf
from matplotlib import pyplot as plt


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    # Convenience wrapper: forward every argument to ``smf.ols`` and return
    # the result of calling ``.fit()`` on it instead of the unfitted model.
    # A comment is used rather than a docstring because ``functools.wraps``
    # replaces this function's ``__doc__`` with the one from ``smf.ols``.
    unfitted_model = smf.ols(*args, **kwargs)
    return unfitted_model.fit()

In [4]:
# Seed NumPy's global RNG so repeated runs are comparable.
# Presumably this is what makes the `.sample(frac=1)` shuffle used for the
# cross-validation splits repeatable - TODO confirm.
np.random.seed(0)

In [5]:
# Work from the recommendee's directory; the relative paths used by the
# following cells are resolved from here.
# NOTE(review): the target is relative to the current working directory, so
# re-running this cell without restarting the kernel resolves to a different
# location (or fails).
os.chdir(f"../../data/recommendations/{recommendee}")

In [6]:
# Load the processed ratings of all users.
# The file is opened in a `with` block so the handle is closed deterministically
# (the previous `pickle.load(open(...))` form left it open).
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("../../processed_data/user_anime_lists.pkl", "rb") as all_lists_file:
    filtered_df = pickle.load(all_lists_file)

In [7]:
# Load the recommendee's own list from the current directory, closing the file
# handle once it has been read.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("user_anime_list.pkl", "rb") as user_list_file:
    user_df = pickle.load(user_list_file)
# Drop any rows the global dataset already holds for the usernames in user_df,
# then append user_df so those users appear exactly once, from this file.
filtered_df = filtered_df.loc[lambda x: ~x["username"].isin(user_df.username)]
filtered_df = pd.concat([filtered_df, user_df], ignore_index=True)

In [8]:
# Index by username so a user's rows can be selected with `.loc[username]`,
# which the helper functions below rely on.
# NOTE(review): not idempotent - re-running this cell raises KeyError because
# "username" is no longer a column after the first run.
filtered_df = filtered_df.set_index("username")

In [9]:
# average_rating = filtered_df["my_score"].mean()
# user_bias = (
#     pd.DataFrame(filtered_df.groupby("username")["my_score"].mean()).rename(
#         {"my_score": "user_bias"}, axis=1
#     )
#     - average_rating
# )
# anime_bias = (
#     pd.DataFrame(filtered_df.groupby("anime_id")["my_score"].mean()).rename(
#         {"my_score": "anime_bias"}, axis=1
#     )
#     - average_rating
# )

In [10]:
# filtered_df = filtered_df.merge(anime_bias, on=["anime_id"]).merge(
#     user_bias, on=["username"]
# )
# filtered_df["normalized_score"] = (
#     filtered_df["my_score"]
#     - filtered_df["anime_bias"]
#     - filtered_df["user_bias"]
#     - average_rating
# )
# filtered_df["orig_normalized_score"] = filtered_df["normalized_score"]
# filtered_df = filtered_df.set_index("username")
# filtered_df = filtered_df.dropna()

In [11]:
def prepare_prediction(recommendee, neighborhood):
    """Aggregate neighbor scores into one ``delta`` value per show.

    For every ``anime_id`` in ``neighborhood`` the ``score`` column is
    averaged using the ``corr`` column as weights, normalised by the sum of
    the absolute weights.  Shows whose value is NaN (for example when the
    weights sum to zero) are dropped.

    Args:
        recommendee: Unused by this function; kept for the call signature.
        neighborhood: DataFrame with ``anime_id``, ``score`` and ``corr``
            columns.

    Returns:
        DataFrame with a single ``delta`` column, one row per show.
    """

    def _weighted_mean(ratings):
        weights = ratings["corr"]
        return np.dot(ratings["score"], weights) / weights.abs().sum()

    per_show = neighborhood.groupby("anime_id").apply(_weighted_mean)
    return pd.DataFrame().assign(delta=per_show).dropna()

In [12]:
def get_correlation(df, recommendee):
    """Compute a cosine-style similarity between ``recommendee`` and each user.

    ``df`` must be indexed by ``username`` and have ``anime_id`` and ``score``
    columns.  For each user the numerator is the dot product of their scores
    with the recommendee's scores over the shows both have rows for; the
    denominator is the product of the two users' score norms, each taken over
    all of that user's rows.

    Args:
        df: Ratings indexed by ``username``.
        recommendee: Username to compare everyone against.

    Returns:
        DataFrame indexed by username with one ``corr`` column.  Users whose
        value is NaN (for example, no show in common with the recommendee)
        are dropped.
    """

    def _shared_dot(pairs):
        return np.dot(pairs["score_x"], pairs["score_y"])

    def _score_norm(rows):
        return np.sqrt(np.dot(rows["score"], rows["score"]))

    # Pair every recommendee rating with all ratings of the same show.
    paired = df.loc[[recommendee]].merge(df.reset_index(), on="anime_id")
    numerators = paired.groupby("username").apply(_shared_dot)

    norms = df.groupby("username").apply(_score_norm)
    denominators = norms * norms.loc[recommendee]

    similarities = pd.DataFrame(numerators / denominators, columns=["corr"])
    return similarities.dropna()

In [13]:
def get_squared_error(df, pred_df, recommendee):
    """Return the sum of squared prediction errors for ``recommendee``.

    Args:
        df: Ratings indexed by ``username`` with ``anime_id`` and ``score``
            columns.
        pred_df: Predictions indexed by ``anime_id`` with a ``pred_score``
            column.
        recommendee: Username whose rows of ``df`` are scored.

    Returns:
        Sum over the shows present in both inputs of
        ``(pred_score - score) ** 2``.
    """
    # Select with a list so the result is always a DataFrame.  A scalar label
    # yields a Series when the recommendee has exactly one row, which broke
    # the `.anime_id` / `.set_index` calls for single-row folds.
    actual_scores = df.loc[[recommendee]].set_index("anime_id")["score"]
    scored = pred_df.loc[pred_df.index.intersection(actual_scores.index)]
    scored = scored.merge(actual_scores, on="anime_id")
    errors = scored["pred_score"] - scored["score"]
    return np.dot(errors, errors)

In [14]:
# Preview the first rows of the username-indexed ratings table.
filtered_df.head()

Unnamed: 0_level_0,anime_id,score,score_var
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
karthiga,21,0.605479,2.031881
karthiga,59,-0.47416,1.593917
karthiga,74,-0.750239,1.456926
karthiga,120,-0.743815,1.658394
karthiga,178,-0.206618,1.55604


In [18]:
def compute_accuracy_metrics(
    is_df, oos_df, recommendee, neighborhood_sizes, nonneg_corrs
):
    """Measure coverage and RMSE of the neighborhood model for several sizes.

    For each neighborhood size the rows of the most similar users are taken
    from ``is_df``, a per-show ``delta`` is computed with
    ``prepare_prediction``, a no-intercept linear model ``score ~ delta`` is
    fit on the recommendee's in-sample rows, and predictions are scored both
    in sample and out of sample.  Shows that receive no prediction contribute
    their squared ``score`` to the error.

    Args:
        is_df: In-sample ratings indexed by ``username``.
        oos_df: Held-out ratings of the recommendee.
        recommendee: Username being evaluated.
        neighborhood_sizes: Iterable of neighborhood sizes to try.
        nonneg_corrs: If true, users are ranked by the signed correlation;
            otherwise by its absolute value.

    Returns:
        DataFrame with one row per neighborhood size and the columns
        ``neighborhood_size``, ``nonneg_corrs``, ``is_rmse``, ``is_coverage``,
        ``oos_rmse`` and ``oos_coverage``, all stored as floats.
    """
    # compute correlations
    corrs = get_correlation(is_df, recommendee)
    if nonneg_corrs:
        corrs["similarity"] = corrs["corr"]
    else:
        corrs["similarity"] = corrs["corr"].abs()
    corrs = corrs.sort_values(by="similarity").dropna()
    corrs = corrs.drop(recommendee)  # makes insample score more meaningful

    # Loop-invariant views, hoisted out of the per-size loop.
    recommendee_is = is_df.loc[recommendee]
    n_is = len(recommendee_is)
    n_oos = len(oos_df)

    # Collect one record per size and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
    # on every iteration.
    records = []
    for neighborhood_size in tqdm(neighborhood_sizes):
        # extract model features; corrs is sorted ascending, so the tail holds
        # the most similar users
        neighborhood = (
            is_df.merge(pd.DataFrame(corrs[-neighborhood_size:]), on="username")
        ).dropna()
        pred_df = prepare_prediction(recommendee, neighborhood)

        # train linear model
        recommendee_seen_shows = recommendee_is.merge(pred_df, on=["anime_id"])
        model = lm("score ~ delta + 0", recommendee_seen_shows)

        # inference
        pred_df["pred_score"] = model.predict(pred_df)
        is_pred_df = pred_df.loc[lambda x: x.index.isin(recommendee_is.anime_id)]
        oos_pred_df = pred_df.loc[lambda x: x.index.isin(oos_df.anime_id)]

        # compute coverage
        is_coverage = len(is_pred_df) / n_is
        oos_coverage = len(oos_pred_df) / n_oos

        # compute rmse; shows without a prediction are charged their squared
        # score
        missing_is = recommendee_is.loc[
            lambda x: ~x.anime_id.isin(is_pred_df.index)
            & ~x.anime_id.isin(oos_df.anime_id)
        ]
        missing_oos = oos_df.loc[lambda x: ~x.anime_id.isin(oos_pred_df.index)]
        is_se = get_squared_error(is_df, is_pred_df, recommendee)
        oos_se = get_squared_error(oos_df, oos_pred_df, recommendee)
        missing_is_se = np.dot(missing_is["score"], missing_is["score"])
        missing_oos_se = np.dot(missing_oos["score"], missing_oos["score"])
        is_rmse = np.sqrt((is_se + missing_is_se) / n_is)
        oos_rmse = np.sqrt((oos_se + missing_oos_se) / n_oos)
        records.append(
            {
                "neighborhood_size": neighborhood_size,
                "nonneg_corrs": nonneg_corrs,
                "is_rmse": is_rmse,
                "is_coverage": is_coverage,
                "oos_rmse": oos_rmse,
                "oos_coverage": oos_coverage,
            }
        )

    metrics = pd.DataFrame(
        records,
        columns=[
            "neighborhood_size",
            "nonneg_corrs",
            "is_rmse",
            "is_coverage",
            "oos_rmse",
            "oos_coverage",
        ],
    )
    # Keep every column numeric: later cells select the flag with
    # `.xs(0, level="nonneg_corrs")` and compare sizes against `1.0`.
    return metrics.astype(float)

In [20]:
%%time
from tqdm import tqdm

K = 10
base = np.sqrt(2)

errors_by_neighborhood_size = []
neighborhood_sizes = [
    int(base ** i) for i in range(int(np.log(len(filtered_df)) / np.log(base)) + 1)
] + [len(filtered_df)]
neighborhood_sizes = sorted(list(set(neighborhood_sizes)))
splits = np.array_split(filtered_df.loc[recommendee].sample(frac=1), K)

for split in tqdm(splits):
    oos_df = split
    is_df = filtered_df.loc[
        lambda x: ~(
            (x.index.get_level_values("username") == recommendee)
            & x.anime_id.isin(oos_df.anime_id)
        )
    ]
    errors_by_neighborhood_size.append(
        compute_accuracy_metrics(
            is_df.copy(), oos_df.copy(), recommendee, neighborhood_sizes, False
        )
    )
    errors_by_neighborhood_size.append(
        compute_accuracy_metrics(
            is_df.copy(), oos_df.copy(), recommendee, neighborhood_sizes, True
        )
    )

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/50 [00:00<?, ?it/s][A
  2%|▏         | 1/50 [00:08<06:34,  8.06s/it][A
  4%|▍         | 2/50 [00:14<05:35,  6.99s/it][A
  6%|▌         | 3/50 [00:20<05:15,  6.71s/it][A
  8%|▊         | 4/50 [00:26<04:59,  6.51s/it][A
 10%|█         | 5/50 [00:33<04:53,  6.53s/it][A
 12%|█▏        | 6/50 [00:40<04:56,  6.73s/it][A
 14%|█▍        | 7/50 [00:48<05:01,  7.02s/it][A
 16%|█▌        | 8/50 [00:55<04:52,  6.96s/it][A
 18%|█▊        | 9/50 [01:01<04:40,  6.85s/it][A
 20%|██        | 10/50 [01:08<04:28,  6.71s/it][A
 22%|██▏       | 11/50 [01:14<04:21,  6.70s/it][A
 24%|██▍       | 12/50 [01:22<04:27,  7.05s/it][A
 26%|██▌       | 13/50 [01:29<04:15,  6.89s/it][A
 28%|██▊       | 14/50 [01:35<04:05,  6.82s/it][A
 30%|███       | 15/50 [01:42<03:57,  6.78s/it][A
 32%|███▏      | 16/50 [01:49<03:58,  7.01s/it][A
 34%|███▍      | 17/50 [01:56<03:48,  6.94s/it][A
 36%|███▌      | 18/50 [02:04<03:45,  7.04s/it][A
 38%|███▊ 

In [21]:
# Stack the per-fold, per-variant metric tables into a single frame.
allerrors = pd.concat(errors_by_neighborhood_size, ignore_index=True)

In [None]:
# Display the combined metrics table.
allerrors

In [None]:
# Melt the metric columns into (variable, value) pairs for plotting with `hue`.
# NOTE(review): despite the name, the result of `pd.melt` is long-format data.
wide_data = pd.melt(allerrors, ["neighborhood_size", "nonneg_corrs"])

In [None]:
# Prefix the metric names of the nonneg_corrs runs so both variants show up as
# separate series in the plots.
# NOTE(review): re-running this cell adds the prefix a second time.
nonneg_rows = wide_data.nonneg_corrs == True
wide_data.loc[nonneg_rows, "variable"] = (
    "nonneg_" + wide_data.loc[nonneg_rows, "variable"]
)

In [None]:
# Inspect the rows for the smallest neighborhood (a single neighbor).
allerrors.loc[lambda x: x["neighborhood_size"] == 1.0]

In [None]:
# Coverage metrics versus neighborhood size, on a log-scaled x axis.
coverage_rows = wide_data.loc[wide_data.variable.str.contains("coverage")]
plt.figure(figsize=(20, 10))
coverage_ax = sns.lineplot(
    data=coverage_rows, x="neighborhood_size", y="value", hue="variable"
)
_ = coverage_ax.set(xscale="log", title="Prediction Coverage")

In [None]:
# RMSE metrics versus neighborhood size, on a log-scaled x axis.
rmse_rows = wide_data.loc[wide_data.variable.str.contains("rmse")]
plt.figure(figsize=(20, 10))
rmse_ax = sns.lineplot(
    data=rmse_rows, x="neighborhood_size", y="value", hue="variable"
)
_ = rmse_ax.set(xscale="log", title="Root Mean Squared Error")

In [None]:
# Widen pandas' display limits so the summary tables are not truncated.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
# Average the metrics over folds for each (variant, size) pair and list the
# ten combinations with the lowest out-of-sample RMSE.
allerrors.groupby(["nonneg_corrs", "neighborhood_size"]).mean().sort_values(
    by="oos_rmse"
).head(10)

In [None]:
# Display options repeated from the previous cell (no additional effect).
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
# Keep only the runs with nonneg_corrs == 0 (similarity taken as the absolute
# correlation) and list the ten sizes with the lowest out-of-sample RMSE.
allerrors.groupby(["nonneg_corrs", "neighborhood_size"]).mean().xs(
    0, level="nonneg_corrs"
).sort_values(by="oos_rmse").head(10)

In [None]:
# Display options repeated from the earlier cells (no additional effect).
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
# Same selection (nonneg_corrs == 0), ranked by in-sample RMSE instead.
allerrors.groupby(["nonneg_corrs", "neighborhood_size"]).mean().xs(
    0, level="nonneg_corrs"
).sort_values(by="is_rmse").head(10)

In [None]:
# Display options repeated from the earlier cells (no additional effect).
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
# Fold-averaged metrics for every neighborhood size of the nonneg_corrs == 0
# runs, in index order.
allerrors.groupby(["nonneg_corrs", "neighborhood_size"]).mean().xs(
    0, level="nonneg_corrs"
)