In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf
from scipy.special import gamma


# Convenience wrapper in the spirit of R's lm(): build an OLS model from the
# given arguments and return the fitted results in one call. functools.wraps
# copies the metadata of smf.ols (name, docstring) onto this wrapper.
@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    unfitted_model = smf.ols(*args, **kwargs)
    return unfitted_model.fit()

In [2]:
# Work from the data directory so that the relative paths passed to
# pd.read_csv / open in the following cells resolve.
os.chdir("../data")

In [3]:
# Notebook configuration.
# Username whose rows are selected from the ratings table to build predictions.
recommendee = "Fro116"
# Number of most-similar neighbours kept per anime when neighbourhoods are truncated.
neighborhood_size = 64
# Two-sided confidence level used to turn the standard error into score bounds.
confidence_interval = 0.99
# If True, keep every correlated anime and truncate to neighborhood_size only
# after joining with the recommendee's ratings; if False, truncate up front.
full_neighborhoods = False
# If True, rescale delta with a regression fitted on the recommendee's rated
# shows; if False, add delta to the baseline unchanged.
perform_regression = True

In [4]:
# Anime metadata: only the id, title and type columns are used downstream.
anime_columns = ["anime_id", "title", "type"]
anime = pd.read_csv("AnimeList.csv")[anime_columns]

In [5]:
# Load the raw user/anime list table; the following cells read its
# "username", "anime_id" and "my_score" columns.
df = pd.read_csv("UserAnimeList.csv")

In [6]:
# Dataset summary: (#users, #anime, filled fraction of the user x anime
# matrix, #rows). dropna=False keeps the counts equal to len(col.unique()).
n_users = df["username"].nunique(dropna=False)
n_anime = df["anime_id"].nunique(dropna=False)
n_users, n_anime, len(df) / (n_users * n_anime), len(df)

(283045, 14478, 0.01954064606703893, 80076112)

In [7]:
# Keep only rows with a non-zero score, restricted to the three rating columns.
rating_columns = ["username", "anime_id", "my_score"]
filtered_df = df.loc[df["my_score"] != 0, rating_columns]

In [8]:
# Add additional user anime-lists.
# NOTE: pickle.load can execute arbitrary code; only load trusted local files.
# The context manager closes the file handle, which a bare open() call leaks.
with open("user_profiles/ExtraUserAnimeLists.pkl", "rb") as extra_users_file:
    extraUsers = pickle.load(extra_users_file)
# Drop any existing rows for these users first so the extra lists replace,
# rather than duplicate, their ratings.
filtered_df = filtered_df.loc[lambda x: ~x["username"].isin(extraUsers.username)]
filtered_df = pd.concat([filtered_df, extraUsers], ignore_index=True)

In [9]:
# Global mean score, plus per-user and per-anime mean scores expressed as
# offsets from that global mean. Both biases are one-column DataFrames indexed
# by their grouping key.
average_rating = filtered_df["my_score"].mean()

user_mean_score = filtered_df.groupby("username")["my_score"].mean()
user_bias = user_mean_score.to_frame("user_bias") - average_rating

anime_mean_score = filtered_df.groupby("anime_id")["my_score"].mean()
anime_bias = anime_mean_score.to_frame("anime_bias") - average_rating

In [10]:
# Attach the per-anime and per-user biases to every rating.
# NOTE: this cell rebinds filtered_df; running it a second time on the already
# merged frame would merge the bias columns again, so re-run from the load
# cells instead.
filtered_df = filtered_df.merge(anime_bias, on=["anime_id"])
filtered_df = filtered_df.merge(user_bias, on=["username"])

# Residual of each score after removing anime bias, user bias and global mean.
filtered_df["normalized_score"] = (
    filtered_df["my_score"] - filtered_df["anime_bias"] - filtered_df["user_bias"] - average_rating
)

# Index by username so a single user's ratings can be selected with .loc.
filtered_df = filtered_df.set_index("username")
filtered_df = filtered_df.dropna()

In [11]:
# Inspect the ratings table with the bias and normalized_score columns added.
filtered_df

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
karthiga,21,9,0.960563,-0.059899,0.605474
karthiga,59,7,0.040202,-0.059899,-0.474165
karthiga,74,7,0.316282,-0.059899,-0.750244
karthiga,120,7,0.309858,-0.059899,-0.743820
karthiga,178,7,-0.227339,-0.059899,-0.206623
...,...,...,...,...,...
temptemptemp,10040,6,-1.636718,-1.493861,1.636718
cinnamoroller,12963,10,-0.798861,2.506139,0.798861
inactiveX,5143,7,-0.652952,-0.493861,0.652952
omgm,5581,5,-1.857497,-2.493861,1.857497


In [12]:
# Load the precomputed item-item correlations.
# NOTE: pickle.load can execute arbitrary code; only load trusted local files.
# The context manager closes the file handle, which a bare open() call leaks.
with open("item_correlations/correlations.pkl", "rb") as correlations_file:
    all_corrs = pickle.load(correlations_file)

In [13]:
# Similarity is the magnitude of the correlation, so strongly negative
# correlations rank as highly as strongly positive ones. Rows containing any
# missing value are then discarded.
all_corrs["similarity"] = np.abs(all_corrs["corr"])
all_corrs = all_corrs.dropna(axis=0, how="any")

In [14]:
def _is_cross_pair(frame):
    # True for rows whose two index levels name different anime.
    pair_index = frame.index
    return pair_index.get_level_values("anime_id_x") != pair_index.get_level_values(
        "anime_id_y"
    )


# Remove each anime's correlation with itself.
corrs = all_corrs.loc[_is_cross_pair]

In [None]:
# Sort each anime's neighbours by ascending similarity. Unless full
# neighbourhoods are requested, keep only the last (most similar)
# neighborhood_size rows of each group.
_neighbor_window = slice(None) if full_neighborhoods else slice(-neighborhood_size, None)
corrs = corrs.groupby("anime_id_x").apply(
    lambda group: group.sort_values(by="similarity")[_neighbor_window]
)

In [None]:
# Drop the outermost index level in place (Index.droplevel defaults to level 0).
# NOTE: not idempotent — each execution of this cell removes one more level,
# so run it exactly once after the groupby/apply cell above.
corrs.index = corrs.index.droplevel()

In [None]:
# Inspect the per-anime neighbour correlations.
corrs

In [None]:
# Pair each of the recommendee's ratings (anime_id) with every anime
# (anime_id_x) that lists the rated anime as a neighbour (anime_id_y).
score = filtered_df.loc[recommendee].merge(
    corrs.reset_index("anime_id_x"), left_on="anime_id", right_on="anime_id_y"
)

# Variance of each user's normalized scores; the recommendee's value is
# broadcast to every row of score.
user_var = (
    filtered_df.groupby("username")["normalized_score"]
    .var()
    .to_frame("user_var")
    .dropna()
)
score["user_var"] = user_var.loc[recommendee].squeeze()

# Variance of each rated anime's normalized scores. The (inner) merge also
# removes rated anime for which no variance could be computed.
anime_var = (
    filtered_df.groupby("anime_id")["normalized_score"]
    .var()
    .to_frame("anime_var")
    .dropna()
)
score = score.merge(anime_var, on="anime_id")

# From here on "anime_id" names the candidate anime being scored, not the
# anime the recommendee rated.
score = score.drop(columns="anime_id").rename(columns={"anime_id_x": "anime_id"})

if full_neighborhoods:
    # Neighbourhoods were not truncated earlier: keep, per candidate anime,
    # the neighborhood_size most similar anime the recommendee has rated.
    candidates = score.groupby("anime_id")
    score = candidates.apply(
        lambda group: group.sort_values(by="similarity")[-neighborhood_size:]
    ).reset_index(drop=True)

In [None]:
# Preview the first rows of the (candidate anime, rated neighbour) table.
score.head()

In [None]:
# Per candidate anime:
#   deltas  - correlation-weighted mean of the recommendee's normalized scores
#   weights - total absolute correlation of the rated neighbours
#   counts  - number of rated neighbours
score_by_anime = score.groupby("anime_id")
deltas = score_by_anime.apply(
    lambda group: np.dot(group["normalized_score"], group["corr"])
    / group["corr"].abs().sum()
)
weights = score_by_anime.apply(lambda group: group["corr"].abs().sum())
counts = score_by_anime.size()

In [None]:
# The following formulae are used to compute the variance of the delta. Delta
# is a weighted sum of the form δ = Σ(s_i * w_i) / (Σw_i), where s_i is
# the vector of scores for item i and w_i is the weight for item i.
#
# By linearity, it suffices to compute the variance of each term
# (s_i * w_i) / (Σw_i). We assume that Var(s_i) is the same as the variance
# over the vector s_i (i.e. over all users who have rated item i). We treat
# w_i as a random variable with mean w_i and variance corr['corr_var'].
#
# The variance of (w_i) / (Σw_i) can be estimated with a Taylor approximation.
# See equation 20 of https://www.stat.cmu.edu/~hseltman/files/ratio.pdf. The
# formula for the ratio of two correlated variables R, S is
# Var(R/S) ≈ E[R]^2/E[S]^2 * (Var[R]/E[R]^2 - 2Cov(R,S)/(E[R]E[S]) + Var[S]/E[S]^2)
#
# Lastly, we take the product distribution of s_i and (w_i) / (Σw_i).
def correction_factor(x):
    """Return the per-row variance inflation factor for one anime's neighbours.

    x is a DataFrame with a "corr" column (the weights) and a "corr_var"
    column (their variances). The result is a Series aligned with x:
    1 + corr_var / corr^2 - 2 * corr_var / (sum|corr| * |corr|)
      + sum(corr_var) / (sum|corr|)^2.
    """
    abs_corr = x["corr"].abs()
    total_weight = abs_corr.sum()
    # Relative variance of the individual weight.
    own_term = x["corr_var"] / (x["corr"] ** 2)
    # Covariance between the weight and the sum of weights it belongs to.
    cross_term = 2 * x["corr_var"] / (total_weight * abs_corr)
    # Relative variance of the sum of weights.
    total_term = x["corr_var"].sum() / (total_weight ** 2)
    return 1 + own_term - cross_term + total_term


# Per candidate anime, the variance of delta: the sum over rated neighbours of
# user_var * corr^2 * correction_factor, divided by (sum |corr|)^2.
# NOTE(review): the explanatory comment above correction_factor describes
# Var(s_i) as a per-item variance, which would correspond to the anime_var
# column; this expression uses user_var instead — confirm that is intended.
delta_var = score.groupby("anime_id").apply(
    lambda x: np.sum(x["user_var"] * x["corr"] ** 2 * correction_factor(x))
    / (x["corr"].abs().sum() ** 2)
)

# if the var < 0, then the ratio distribution approximation failed,
# usually because sample size is too small
delta_var.loc[lambda x: x < 0] = np.inf

# Apply a Bessel correction to unbias the variance.
# average_weight is the mean |corr| over each anime's neighbourhood in corrs,
# so weights / average_weight expresses the total weight of the rated
# neighbours as an effective number of samples n.
average_weight = corrs.groupby("anime_id_x").apply(lambda x: x["corr"].abs().mean())
effective_sample_size = weights / average_weight
# n / (n - 1) is undefined (or negative) for n <= 1, so those variances are
# marked as unknown (infinite); the rest are scaled by n / (n - 1). Both
# assignments rely on pandas aligning the masks and factors on anime id.
delta_var.loc[effective_sample_size <= 1] = np.inf
delta_var.loc[effective_sample_size > 1] *= effective_sample_size / (effective_sample_size - 1)

In [None]:
# Collect the per-anime prediction inputs. All four series come from grouping
# `score` by anime_id, so they share one index.
pred_df = pd.DataFrame().assign(
    delta=deltas,
    weight=weights,
    counts=counts,
    delta_sem=np.sqrt(delta_var),
)

# Disabled experiment, kept for reference:
# # Unbias the standard deviation estimate
# # See https://en.wikipedia.org/wiki/Unbiased_estimation_of_standard_deviation#Results_for_the_normal_distribution
# def standard_deviation_bias(n):
#     if n < 1:
#         return np.inf
#     if gamma((n - 1) / 2) == np.inf:
#         return 1
#     return np.sqrt(2 / (n - 1)) * gamma(n / 2) / gamma((n - 1) / 2)
# pred_df["delta_sem"] /= effective_sample_size.apply(standard_deviation_bias)

# Baseline predictor (blp): anime bias + recommendee bias + global mean.
pred_df["blp"] = anime_bias + user_bias.loc[recommendee].squeeze() + average_rating
pred_df = pred_df.dropna()

# Shows the recommendee has already rated, with the part of the rating the
# baseline does not explain as the regression target.
recomendee_seen_shows = (
    filtered_df.loc[recommendee]
    .merge(pred_df, on=["anime_id"])
    .assign(target=lambda seen: seen["my_score"] - seen["blp"])
)

if perform_regression:
    # Fit target = slope * delta (no intercept) on the rated shows.
    model = lm("target ~ delta + 0", recomendee_seen_shows)
    slope = model.params["delta"]
    slope_se = model.bse["delta"]
    pred_df["score"] = model.predict(pred_df) + pred_df["blp"]

    # Variance of a product of two independent variables:
    # Var(XY) = (Var X + E[X]^2) * (Var Y + E[Y]^2) - E[X]^2 * E[Y]^2,
    # with X = delta and Y = the fitted slope.
    delta_second_moment = pred_df["delta_sem"] ** 2 + pred_df["delta"] ** 2
    slope_second_moment = slope_se ** 2 + slope ** 2
    pred_df["sem"] = np.sqrt(
        delta_second_moment * slope_second_moment - pred_df["delta"] ** 2 * slope ** 2
    )
else:
    pred_df["score"] = pred_df["delta"] + pred_df["blp"]
    pred_df["sem"] = pred_df["delta_sem"]

# Two-sided normal interval around the predicted score.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
interval_half_width = pred_df["sem"] * zscore
pred_df["score_lower_bound"] = pred_df["score"] - interval_half_width
pred_df["score_upper_bound"] = pred_df["score"] + interval_half_width

# Attach title/type metadata and index the predictions by anime_id.
pred_df = pred_df.merge(anime, on="anime_id").set_index("anime_id")

In [None]:
# Sanity check: the prediction above adds blp with an implicit coefficient
# of 1. Fit delta and blp jointly (no intercept) on the rated shows and confirm
# in the summary that the fitted blp coefficient is close to 1.
print(lm("my_score ~ delta + blp + 0", recomendee_seen_shows).summary())

In [None]:
# Sanity check: among shows predicted above baseline (delta > 0), the 20 with
# the highest lower confidence bound should be ones the user rates highly.
above_baseline = pred_df[pred_df["delta"] > 0]
above_baseline.sort_values(by="score_lower_bound", ascending=False).head(20)

In [None]:
# Candidate recommendations: predictions for anime the recommendee has not
# rated, excluding the listed media types.
excluded_types = ["Movie", "Special", "OVA", "ONA", "Music"]
already_rated_ids = filtered_df.loc[recommendee].anime_id
new_recs = pred_df.drop(already_rated_ids, errors="ignore").loc[
    lambda x: ~x["type"].isin(excluded_types)
]

In [None]:
# Top 20 unseen shows predicted above baseline, ranked by the lower
# confidence bound of the predicted score.
positive_new_recs = new_recs[new_recs["delta"] > 0]
positive_new_recs.sort_values(by="score_lower_bound", ascending=False).head(20)

In [None]:
# Spot-check the prediction row for anime_id 21. Raises KeyError when 21 is
# not in new_recs (e.g. already rated by the recommendee or filtered by type).
new_recs.loc[21]

In [None]:
# Look up the recommendee's own ratings for a hand-picked set of anime ids.
# Uses the configured `recommendee` rather than a hard-coded username so this
# cell stays consistent with the rest of the notebook when the config changes.
spot_check_anime_ids = [31964, 1575, 2904, 1535, 121, 2001, 245]
filtered_df.loc[recommendee].loc[lambda x: x["anime_id"].isin(spot_check_anime_ids)]