# Neighborhood-based Collaborative Filtering
* Implements Item-Item and User-User Collaborative Filtering
* See [Item-Based Collaborative Filtering Recommendation Algorithms](http://www.ra.ethz.ch/cdstore/www10/papers/pdf/p519.pdf) and [An Empirical Analysis of Design Choices in Neighborhood-Based Collaborative Filtering Algorithms](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.586.3847&rep=rep1&type=pdf)
* For a given user, we compute:
  1) A predicted score and variance for each item they have not rated
  2) The leave-one-out-cross-validation prediction for each item they have rated

In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# TODO: read these hyperparameters from a file instead of hard-coding them.
# Username whose recommendations are computed; change this to get
# recommendations for a different user.
recommendee = "Fro116"

# Parameters chosen by cross-validation.
# *_neighborhood_size: how many of the most similar items/users are kept.
# *_nonneg_corrs: when True neighbours are ranked by the signed correlation,
# when False they are ranked by its absolute value.
item_neighborhood_size = 724
item_nonneg_corrs = True
user_neighborhood_size = 11585
user_nonneg_corrs = False

In [3]:
# Work from the recommendee's output directory: the relative paths used by the
# rest of this notebook (input pickles and stored results) are resolved
# against it, so this must run before anything is loaded or written.
outdir = f"../../data/recommendations/{recommendee}"
os.chdir(outdir)

In [4]:
# Load the ratings database. The context manager closes the file handle as
# soon as the pickle has been read instead of leaving it open.
with open("../../processed_data/user_anime_lists.pkl", "rb") as f:
    df = pickle.load(f)

In [5]:
# replace ratings in the database with the updated list
# (the context manager closes the file handle once the pickle has been read)
with open("user_anime_list.pkl", "rb") as f:
    user_df = pickle.load(f)
# Drop the stored rows of every username present in the updated list, then
# append the updated rows.
df = pd.concat(
    [df.loc[~df.username.isin(user_df.username)], user_df], ignore_index=True
)

In [6]:
# Index by username so that df.loc[<username>] selects that user's ratings.
df = df.set_index('username')

In [7]:
def get_delta(score):
    """Return the correlation-weighted mean score for every anime.

    Rows of ``score`` are grouped by ``anime_id``. Within each group the
    result is Σ(score · corr) / Σ|corr|, using the ``score`` and ``corr``
    columns. The per-group values are returned indexed by ``anime_id``.
    """

    def weighted_mean(group):
        # Signed weights in the numerator, absolute weights in the normaliser.
        numerator = np.dot(group["score"], group["corr"])
        normaliser = group["corr"].abs().sum()
        return numerator / normaliser

    per_anime = score.groupby("anime_id")
    return per_anime.apply(weighted_mean)

In [8]:
def get_delta_variance(score):
    """Estimate the variance of the weighted-average delta for each anime.

    Rows of ``score`` are grouped by ``anime_id``; the columns ``score_var``,
    ``corr``, ``corr_var`` and ``effective_sample_size`` are read.

    Returns the per-``anime_id`` variance estimates. An estimate is set to
    ``np.inf`` when the approximation yields a negative value or when the
    group's effective sample size is at most 1; otherwise it is scaled by
    ``n / (n - 1)`` with ``n`` the effective sample size.
    """
    # The following formulae are used to compute the variance of delta. Delta
    # is a weighted sum of the form δ = Σ(s_i * w_i) / (Σw_i), where s_i is
    # a vector of scores and w_i is the weight. (The code normalises by Σ|w_i|.)
    #
    # By linearity, it suffices to compute the variance of each term
    # (s_i * w_i) / (Σw_i).
    # The variance for (w_i) / (Σw_i) can be estimated by doing a Taylor Approximation.
    # See equation 20 of https://www.stat.cmu.edu/~hseltman/files/ratio.pdf. The
    # formula for the ratio of two correlated variables R,S is
    # Var(R/S) = E[R]^2/E[S]^2(Var[R]/E[R]^2 - 2Cov(R,S)/(E[R]E[S]) + Var[S]/E[S]^2)
    #
    # Lastly we take the product distribution of s_i and (w_i) / (Σw_i).
    def correction_factor(x):
        # Per-row factor 1 + Var[R]/E[R]^2 - 2Cov(R,S)/(E[R]E[S]) + Var[S]/E[S]^2
        # with R = w_i and S = Σw.
        # NOTE(review): the covariance term uses corr_var of the row itself and
        # Var[S] is the plain sum of corr_var, which looks like it treats the
        # weights as independent -- confirm.
        return (
            1
            + x["corr_var"] / (x["corr"] ** 2)
            - 2 * x["corr_var"] / (x["corr"].abs().sum() * x["corr"].abs())
            + x["corr_var"].sum() / (x["corr"].abs().sum() ** 2)
        )

    # Sum the per-row contributions and divide by the squared normaliser (Σ|w|)^2.
    delta_var = score.groupby("anime_id").apply(
        lambda x: np.sum(
            x["score_var"] * x["corr"] ** 2 * correction_factor(x)
        )
        / (x["corr"].abs().sum() ** 2)
    )

    # if the var < 0, then the ratio distribution approximation failed,
    # usually because sample size is too small
    delta_var.loc[lambda x: x < 0] = np.inf

    # Apply a bessel-like correction to unbias the variance.
    # The median collapses the effective_sample_size column to one value per
    # anime_id. The masks and the scaling factor are aligned with delta_var by
    # anime_id; the inf assignment must come before the rescaling.
    effective_sample_size = score.groupby("anime_id")["effective_sample_size"].median()
    delta_var.loc[effective_sample_size <= 1] = np.inf
    delta_var.loc[effective_sample_size > 1] *= effective_sample_size / (
        effective_sample_size - 1
    )

    return delta_var

In [9]:
def get_deltas(is_df, anime_ids, recommendee, neighborhood_size, score_fn):
    """Compute ``delta`` and ``delta_var`` for the requested anime ids.

    Parameters:
        is_df: in-sample ratings, passed straight to ``score_fn``.
        anime_ids: iterable of anime ids to report; duplicates are ignored.
        recommendee: username forwarded to ``score_fn``.
        neighborhood_size: neighbourhood size forwarded to ``score_fn``.
        score_fn: callable ``(is_df, recommendee, neighborhood_size)`` that
            returns the neighbourhood score frame consumed by ``get_delta``
            and ``get_delta_variance``.

    Returns:
        A DataFrame with columns ``delta`` and ``delta_var`` and one row per
        requested anime id. Ids for which no prediction could be made get a
        row of NaN.
    """
    # get the neighborhood for each item
    score = score_fn(is_df, recommendee, neighborhood_size)

    # extract model features
    pred_df = pd.DataFrame()
    pred_df["delta"] = get_delta(score)
    pred_df["delta_var"] = get_delta_variance(score)

    requested = set(anime_ids)
    pred_df = pred_df.loc[pred_df.index.isin(requested)]

    # fill in missing predictions with nan. A single reindex replaces the
    # per-row DataFrame.append calls: append no longer exists in pandas >= 2.0
    # and copying the frame once per missing id was quadratic.
    missing = list(requested - set(pred_df.index))
    if missing:
        pred_df = pred_df.reindex(list(pred_df.index) + missing)
    return pred_df

In [10]:
@functools.lru_cache()
def get_item_corrs():
    """Load the item-item correlations, ranked by similarity.

    Reads ``../../processed_data/item_correlations.pkl`` relative to the
    current working directory and adds a ``similarity`` column: the signed
    ``corr`` when ``item_nonneg_corrs`` is set, otherwise its absolute value.
    Rows containing NaN and rows pairing an item with itself are dropped.

    Returns:
        The correlation frame sorted by ``similarity`` in ascending order.
        The result is cached by ``lru_cache``, so every caller receives the
        same object and must not modify it in place.
    """
    # Close the file handle as soon as the pickle has been read.
    with open("../../processed_data/item_correlations.pkl", "rb") as f:
        corrs = pickle.load(f)

    if item_nonneg_corrs:
        corrs["similarity"] = corrs["corr"]
    else:
        corrs["similarity"] = corrs["corr"].abs()
    corrs = corrs.dropna()

    # Remove self-correlations (anime_id_x == anime_id_y).
    ids_x = corrs.index.get_level_values("anime_id_x")
    ids_y = corrs.index.get_level_values("anime_id_y")
    corrs = corrs.loc[ids_x != ids_y]

    return corrs.sort_values(by="similarity")


def get_item_scores(df, recommendee, neighborhood_size):
    """Build the item-item neighbourhood score frame for ``recommendee``.

    Parameters:
        df: ratings indexed by username, with an ``anime_id`` column.
        recommendee: username whose ratings are used as neighbours.
        neighborhood_size: number of most similar items kept per item.

    Returns:
        One row per (target anime, rated neighbour) pair. ``anime_id`` holds
        the target anime and ``effective_sample_size`` is the total |corr| of
        the neighbours the user has rated divided by the mean |corr| of the
        target's whole neighbourhood.
    """
    corrs = get_item_corrs()
    # get_item_corrs() sorts by similarity in ascending order, so the tail of
    # each group holds that item's most similar neighbours.
    corrs = corrs.groupby("anime_id_x").tail(neighborhood_size)

    # Select with a list of labels so the result is a DataFrame even when the
    # user has a single rating. A scalar label would then return a Series,
    # which has no .merge.
    user_ratings = df.loc[[recommendee]]
    score = user_ratings.merge(
        corrs.reset_index("anime_id_x"), left_on="anime_id", right_on="anime_id_y",
    )
    # After the merge "anime_id" is the rated neighbour; relabel the target
    # item as "anime_id" so downstream code groups by the item being predicted.
    score = score.drop("anime_id", axis=1).rename({"anime_id_x": "anime_id"}, axis=1)

    weights = score.groupby("anime_id").apply(lambda x: x["corr"].abs().sum())
    average_weight = corrs.groupby("anime_id_x").apply(lambda x: x["corr"].abs().mean())
    average_weight.index.rename("anime_id", inplace=True)
    effective_sample_size = pd.DataFrame(weights / average_weight).rename(
        {0: "effective_sample_size"}, axis=1
    )
    score = score.merge(effective_sample_size, on="anime_id")

    return score

In [11]:
def get_user_corrs(df, recommendee):
    """Compute the similarity between ``recommendee`` and every other user.

    ``corr`` is the dot product of the two users' scores over the anime both
    have rated, divided by the product of the norms of each user's full score
    vector. ``similarity`` is ``corr`` itself when ``user_nonneg_corrs`` is
    set and ``|corr|`` otherwise. ``corr_size`` is the number of shared anime.

    Returns a frame indexed by username, without the recommendee and without
    rows containing NaN.
    """
    # Pair each of the recommendee's ratings with every rating of the same anime.
    shared = df.loc[[recommendee]].merge(df.reset_index(), on="anime_id")
    shared_by_user = shared.groupby("username")

    def shared_dot(group):
        return np.dot(group["score_x"], group["score_y"])

    def score_norm(group):
        return np.sqrt(np.dot(group["score"], group["score"]))

    numerator = shared_by_user.apply(shared_dot)
    denominator = df.groupby("username").apply(score_norm)
    denominator *= denominator.loc[recommendee]

    corrs = pd.DataFrame((numerator / denominator), columns=["corr"])
    corrs["similarity"] = corrs["corr"] if user_nonneg_corrs else corrs["corr"].abs()
    corrs["corr_size"] = shared_by_user.size()

    return corrs.drop(recommendee).dropna()


def get_user_scores(df, recommendee, neighborhood_size):
    """Build the user-user neighbourhood score frame for ``recommendee``.

    Keeps the ``neighborhood_size`` most similar users that share more than
    two anime with the recommendee, attaches their ratings, and adds an
    ``effective_sample_size`` column: the total |corr| of the neighbours who
    rated an anime divided by the mean |corr| of the neighbourhood.
    """
    all_corrs = get_user_corrs(df, recommendee)

    # We assume variance is the same as the variance for pearson correlation.
    # see https://www.jstor.org/stable/2277400?seq=1
    # The estimate divides by (corr_size - 2), hence the overlap filter.
    overlapping = all_corrs[all_corrs["corr_size"] > 2]
    corr_squared = overlapping["corr"] * overlapping["corr"]
    overlapping = overlapping.assign(
        corr_var=(1 - corr_squared) ** 2 / (overlapping["corr_size"] - 2)
    )

    # Ascending sort, so the last rows are the most similar users.
    ranked = overlapping.sort_values(by="similarity").dropna()
    corrs = ranked.iloc[-neighborhood_size:]

    score = df.merge(corrs, on="username").dropna()

    def total_abs_corr(group):
        return group["corr"].abs().sum()

    weights = score.groupby("anime_id").apply(total_abs_corr)
    average_weight = corrs["corr"].abs().mean()
    effective_sample_size = pd.DataFrame(weights / average_weight).rename(
        columns={0: "effective_sample_size"}
    )

    return score.merge(effective_sample_size, on="anime_id")

In [12]:
def store_deltas(df, recommendee, neighborhood_size, score_fn, signal_name):
    """Compute and pickle leave-one-out and full-data deltas for one signal.

    Parameters:
        df: ratings indexed by username, with an ``anime_id`` column.
        recommendee: username to compute predictions for.
        neighborhood_size: neighbourhood size forwarded to ``score_fn``.
        score_fn: neighbourhood builder forwarded to ``get_deltas``.
        signal_name: prefix of the output files.

    Writes ``{signal_name}_loocv.pkl`` (one prediction per rated anime, each
    computed with that rating held out) and ``{signal_name}.pkl``
    (predictions from the full data) to the current working directory.

    The shuffle uses a fixed local seed, so the global NumPy random state is
    left untouched.
    """
    # Select with a list of labels so a user with a single rating still
    # yields a DataFrame, and shuffle reproducibly without reseeding np.random.
    held_out = df.loc[[recommendee]].sample(frac=1, random_state=1)

    # Loop-invariant: which rows of df belong to the recommendee.
    is_recommendee = df.index.get_level_values("username") == recommendee

    # compute out-of-sample deltas, holding out one rating at a time. Slicing
    # single rows directly avoids handing a DataFrame to np.array_split.
    oos_pred_dfs = []
    for position in tqdm(range(len(held_out))):
        oos_df = held_out.iloc[[position]]
        is_df = df.loc[~(is_recommendee & df.anime_id.isin(oos_df.anime_id))]
        oos_pred_df = get_deltas(
            is_df=is_df,
            anime_ids=list(oos_df.anime_id),
            recommendee=recommendee,
            neighborhood_size=neighborhood_size,
            score_fn=score_fn,
        )
        oos_pred_dfs.append(oos_pred_df)
    oos_pred_df = pd.concat(oos_pred_dfs)
    oos_pred_df.to_pickle(f"{signal_name}_loocv.pkl")

    # compute deltas over the full data. get_deltas only uses the ids for
    # membership tests, so pass each id once instead of one entry per rating.
    pred_df = get_deltas(
        is_df=df,
        anime_ids=list(df.anime_id.unique()),
        recommendee=recommendee,
        neighborhood_size=neighborhood_size,
        score_fn=score_fn,
    )

    # store deltas
    pred_df.to_pickle(f"{signal_name}.pkl")

In [13]:
# Item-item signal. Slow: one get_deltas pass per anime the recommendee has
# rated, plus a final pass over the full data. The recorded run below averaged
# about 50 s per iteration (close to 5 hours in total).
store_deltas(
    df=df,
    recommendee=recommendee,
    neighborhood_size=item_neighborhood_size,
    score_fn=get_item_scores,
    signal_name="item",
)

100%|██████████| 349/349 [4:48:16<00:00, 49.56s/it]  


In [14]:
# User-user signal. Slow: one get_deltas pass per anime the recommendee has
# rated, plus a final pass over the full data. The recorded run below averaged
# about 94 s per iteration (over 9 hours in total).
store_deltas(
    df=df,
    recommendee=recommendee,
    neighborhood_size=user_neighborhood_size,
    score_fn=get_user_scores,
    signal_name="user",
)

100%|██████████| 349/349 [9:09:24<00:00, 94.45s/it]     
