In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf
from scipy.special import gamma


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    # Accepts exactly what smf.ols accepts (functools.wraps copies its
    # metadata onto this wrapper) and returns the fitted results rather
    # than the unfitted model.
    unfitted_model = smf.ols(*args, **kwargs)
    return unfitted_model.fit()

In [2]:
# Work from the data directory so the relative file paths used below resolve there.
os.chdir("../data")

In [3]:
# Alternative configuration, kept for reference:
# recommendee = "Fro116"
# neighborhood_size = 8192
# Username whose ratings drive the similarity search and the recommendations.
recommendee = "mpfei"
# Number of most-similar users kept as the neighborhood.
neighborhood_size = 724
# Two-sided confidence level used to turn the standard error into score bounds.
confidence_interval = 0.99
# When True, each user's normalized scores are divided by that user's standard deviation.
normalize_variance = False

In [4]:
# Anime metadata: keep only the id plus the title/type columns used when presenting results.
anime = pd.read_csv("AnimeList.csv").loc[:, ["anime_id", "title", "type"]]

In [5]:
# Raw per-user anime list; the columns used below are username, anime_id and my_score.
df = pd.read_csv("UserAnimeList.csv")

In [6]:
# (number of distinct users, number of distinct anime) in the raw ratings.
tuple(len(df[column].unique()) for column in ("username", "anime_id"))

(283045, 14478)

In [7]:
# Keep only the rating columns, and only rows whose my_score is not 0.
filtered_df = df.loc[df["my_score"] != 0, ["username", "anime_id", "my_score"]]

In [8]:
# add additional user anime-lists
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
# The context manager closes the file handle as soon as the load finishes.
with open("user_profiles/ExtraUserAnimeLists.pkl", "rb") as extra_users_file:
    extraUsers = pickle.load(extra_users_file)
# Remove existing rows for any user present in the extra lists, then append the
# extra lists, so those users' ratings come only from the pickle.
filtered_df = filtered_df.loc[lambda x: ~x["username"].isin(extraUsers.username)]
filtered_df = pd.concat([filtered_df, extraUsers], ignore_index=True)

In [9]:
# Global mean rating, and each user's / each anime's mean offset from it.
average_rating = filtered_df["my_score"].mean()
user_bias = (
    filtered_df.groupby("username")["my_score"].mean() - average_rating
).to_frame("user_bias")
anime_bias = (
    filtered_df.groupby("anime_id")["my_score"].mean() - average_rating
).to_frame("anime_bias")

In [10]:
# Attach both biases to every rating, subtract them (and the global mean) to
# get the residual score, then index by username and drop incomplete rows.
filtered_df = (
    filtered_df.merge(anime_bias, on=["anime_id"])
    .merge(user_bias, on=["username"])
    .assign(
        normalized_score=lambda x: x["my_score"]
        - x["anime_bias"]
        - x["user_bias"]
        - average_rating
    )
    .set_index("username")
    .dropna()
)

In [11]:
# Display the ratings table with its bias and normalized_score columns.
filtered_df

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
karthiga,21,9,0.960563,-0.059899,0.605474
karthiga,59,7,0.040202,-0.059899,-0.474165
karthiga,74,7,0.316282,-0.059899,-0.750244
karthiga,120,7,0.309858,-0.059899,-0.743820
karthiga,178,7,-0.227339,-0.059899,-0.206623
...,...,...,...,...,...
temptemptemp,10040,6,-1.636718,-1.493861,1.636718
cinnamoroller,12963,10,-0.798861,2.506139,0.798861
inactiveX,5143,7,-0.652952,-0.493861,0.652952
omgm,5581,5,-1.857497,-2.493861,1.857497


In [12]:
if normalize_variance:
    # Rescale every user's normalized scores by that user's own standard deviation.
    user_stds = (
        filtered_df.groupby("username")["normalized_score"].std().to_frame("user_std")
    )
    filtered_df = filtered_df.merge(user_stds, on="username")
    filtered_df["normalized_score"] = (
        filtered_df["normalized_score"] / filtered_df["user_std"]
    )
    filtered_df = filtered_df.drop(columns="user_std")

In [13]:
# Pair each of the recommendee's ratings (suffix _x) with every rating of the
# same anime by any user (suffix _y).
user_subset = pd.merge(
    filtered_df.loc[[recommendee]],
    filtered_df.reset_index(),
    on="anime_id",
)

In [14]:
# Numerator: dot product of the recommendee's and each user's normalized
# scores over the anime both have rated.
adj_cos_corr_numerator = user_subset.groupby("username").apply(
    lambda shared: np.dot(shared["normalized_score_x"], shared["normalized_score_y"])
)
# Denominator: the norm of each user's full normalized-score vector times the
# recommendee's own norm.
adj_cos_corr_denom = filtered_df.groupby("username").apply(
    lambda rated: np.sqrt(np.dot(rated["normalized_score"], rated["normalized_score"]))
)
adj_cos_corr_denom = adj_cos_corr_denom * adj_cos_corr_denom.loc[recommendee]
adj_cos_corrs = (
    (adj_cos_corr_numerator / adj_cos_corr_denom).to_frame("corr").dropna()
)

In [15]:
# Build the neighborhood table: one row per candidate user with the signed
# correlation, its magnitude, and the number of anime shared with the recommendee.
corrs = adj_cos_corrs.copy()
corrs["similarity"] = corrs["corr"].abs()
corrs["size"] = user_subset.groupby("username").size()
corrs = corrs.drop(
    recommendee
)  # Technically not needed because it's a noop for new series, but it's useful for debugging

# We assume variance is the same as the variance for pearson correlation.
# see https://www.jstor.org/stable/2277400?seq=1
# The variance formula divides by (size - 2), so at least 3 shared anime are required.
corrs = corrs.loc[lambda x: x["size"] > 2]
corrs["corr_var"] = (1 - corrs["corr"] * corrs["corr"]) ** 2 / (corrs["size"] - 2)
# Keep the neighborhood_size most similar users. tail() is used rather than
# [-neighborhood_size:] because the slice returns every row when the size is 0.
corrs = corrs.sort_values(by="similarity").dropna().tail(neighborhood_size)

In [16]:
# Summary statistics of the selected neighborhood.
corrs.describe()

Unnamed: 0,corr,similarity,size,corr_var
count,724.0,724.0,724.0,724.0
mean,0.17857,0.183528,9.631215,0.313845
std,0.048464,0.023472,7.072294,0.310182
min,-0.201931,0.15754,3.0,0.020526
25%,0.165567,0.165803,4.0,0.085777
50%,0.177293,0.177802,7.0,0.180202
75%,0.194189,0.194585,13.0,0.466145
max,0.292924,0.292924,48.0,0.950926


In [17]:
# Ratings made by neighborhood users, annotated with each user's correlation stats.
score = filtered_df.merge(corrs, on="username").dropna()

# Per-user variance of normalized scores (users with an undefined variance are dropped).
user_var = (
    filtered_df.groupby("username")["normalized_score"]
    .var()
    .dropna()
    .to_frame("user_var")
)
score = score.merge(user_var, on="username")

# Per-anime variance of normalized scores (anime with an undefined variance are dropped).
anime_var = (
    filtered_df.groupby("anime_id")["normalized_score"]
    .var()
    .dropna()
    .to_frame("anime_var")
)
score = score.merge(anime_var, on="anime_id")

In [18]:
# Display the neighborhood ratings with their correlation and variance columns.
score

Unnamed: 0,anime_id,my_score,anime_bias,user_bias,normalized_score,corr,similarity,size,corr_var,user_var,anime_var
0,21,10,0.960563,0.675631,0.869945,0.165713,0.165713,13,0.085985,0.814852,2.650513
1,21,9,0.960563,1.422806,-0.877230,0.167934,0.167934,3,0.944392,0.876487,2.650513
2,21,8,0.960563,1.294601,-1.749025,0.162019,0.162019,10,0.118524,1.211759,2.650513
3,21,8,0.960563,-0.523273,0.068848,0.186428,0.186428,7,0.186339,2.551526,2.650513
4,21,8,0.960563,-2.452925,1.998501,-0.196875,0.196875,19,0.054352,3.664946,2.650513
...,...,...,...,...,...,...,...,...,...,...,...
40376,5307,8,-0.518828,1.506139,-0.481172,0.160856,0.160856,3,0.948920,0.497053,1.717632
40377,34414,7,-0.177038,-0.620177,0.303354,0.191632,0.191632,16,0.066279,1.089463,1.483714
40378,36864,6,-1.271639,-0.620177,0.397954,0.191632,0.191632,16,0.066279,1.089463,1.864312
40379,20973,10,-0.421879,2.395028,0.532990,0.180202,0.180202,4,0.468054,0.704874,1.390564


In [19]:
# Per-anime aggregates over the neighborhood's ratings.
_score_by_anime = score.groupby("anime_id")
# Total absolute correlation of the neighbors who rated the anime.
weights = _score_by_anime.apply(lambda grp: grp["corr"].abs().sum())
# Correlation-weighted mean of the neighbors' normalized scores.
deltas = _score_by_anime.apply(
    lambda grp: np.dot(grp["normalized_score"], grp["corr"]) / grp["corr"].abs().sum()
)
# Number of neighbors who rated the anime.
counts = _score_by_anime.size()

In [20]:
# The following formulae are used to compute the variance of the delta. Delta
# is a weighted sum of the form δ = Σ(s_i * w_i) / (Σw_i), where s_i is
# a vector of scores for user i and w_i is the weight for user i.
#
# By linearity, it suffices to compute the variance of each term
# (s_i * w_i) / (Σw_i). We assume that Var(s_i) is the same as the variance
# over the vector s_i (i.e. over all items user i has rated). We treat w_i as
# a random variable with mean w_i and variance given by the corr_var column.
#
# The variance for (w_i) / (Σw_i) can be estimated by doing a Taylor Approximation.
# See equation 20 of https://www.stat.cmu.edu/~hseltman/files/ratio.pdf. The
# formula for the ratio of two correlated variables R,S is
# Var(R/S) = E[R]^2/E[S]^2(Var[R]/E[R]^2 - 2Cov(R,S)/(E[R]E[S]) + Var[S]/E[S]^2)
#
# Lastly we take the product distribution of s_i and (w_i) / (Σw_i).
def correction_factor(x):
    """Return the per-row variance inflation factor for one group of ratings.

    ``x`` must provide the columns ``corr`` and ``corr_var``. The result has
    one value per row of ``x``:
    1 + corr_var / corr^2 - 2 * corr_var / (S * |corr|) + sum(corr_var) / S^2,
    where S is the sum of |corr| over all rows of ``x``.
    """
    abs_corr = x["corr"].abs()
    total_weight = abs_corr.sum()
    relative_var = x["corr_var"] / (x["corr"] ** 2)
    covariance_term = 2 * x["corr_var"] / (total_weight * abs_corr)
    total_weight_term = x["corr_var"].sum() / (total_weight ** 2)
    return 1 + relative_var - covariance_term + total_weight_term


# Per-anime variance of the delta: sum over the anime's raters of
# user_var * corr^2 * correction_factor, divided by the squared sum of |corr|.
delta_var = score.groupby("anime_id").apply(
    lambda x: np.sum(x["user_var"] * x["corr"] ** 2 * correction_factor(x))
    / (x["corr"].abs().sum() ** 2)
)

# if the var < 0, then the ratio distribution approximation failed,
# usually because sample size is too small
delta_var.loc[lambda x: x < 0] = np.inf

# Apply a bessel correction to unbias the variance
# The effective sample size is the anime's total |corr| weight divided by the
# mean |corr| over the neighborhood. At or below 1 the variance is set to inf;
# above 1 it is scaled by n / (n - 1).
average_weight = corrs["corr"].abs().mean()
effective_sample_size = weights / average_weight
delta_var.loc[effective_sample_size <= 1] = np.inf
delta_var.loc[effective_sample_size > 1] *= effective_sample_size / (effective_sample_size - 1)

In [30]:
# Collect the per-anime neighborhood statistics into one frame indexed by anime_id.
pred_df = pd.DataFrame()
pred_df["delta"] = deltas
pred_df["weight"] = weights
pred_df["counts"] = counts
pred_df["delta_sem"] = np.sqrt(delta_var)

# # Unbias the standard deviation estimate
# # See https://en.wikipedia.org/wiki/Unbiased_estimation_of_standard_deviation#Results_for_the_normal_distribution
# def standard_deviation_bias(n):
#     if n < 1:
#         return np.inf
#     if gamma((n - 1) / 2) == np.inf:
#         return 1
#     return np.sqrt(2 / (n - 1)) * gamma(n / 2) / gamma((n - 1) / 2)
# pred_df["delta_sem"] /= effective_sample_size.apply(standard_deviation_bias)

# Baseline predictor: anime bias + the recommendee's bias + the global mean.
# The anime_bias column is selected explicitly so a Series (not a one-column
# DataFrame) is assigned to the new column.
pred_df["blp"] = (
    anime_bias["anime_bias"] + user_bias.loc[recommendee].squeeze() + average_rating
)
pred_df = pred_df.dropna()

# The list-style .loc always yields a DataFrame, even when the recommendee has
# a single rating (a scalar label would yield a Series, which has no .merge).
recomendee_seen_shows = filtered_df.loc[[recommendee]].merge(pred_df, on=["anime_id"])
recomendee_seen_shows["target"] = (
    recomendee_seen_shows["my_score"] - recomendee_seen_shows["blp"]
)
# Regress the recommendee's residual (score minus baseline) on delta, without an intercept.
model = lm("target ~ delta + 0", recomendee_seen_shows)
pred_df["score"] = model.predict(pred_df) + pred_df["blp"]
# Variance of the product of two independent random variables (delta and the
# fitted coefficient): (Var[X] + E[X]^2)(Var[Y] + E[Y]^2) - E[X]^2 E[Y]^2.
pred_df["sem"] = np.sqrt(
    (
        (pred_df["delta_sem"] ** 2 + pred_df["delta"] ** 2)
        * (model.bse["delta"] ** 2 + model.params["delta"] ** 2)
    )
    - pred_df["delta"] ** 2 * model.params["delta"] ** 2
)

# Two-sided normal interval at the configured confidence level.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
pred_df["score_lower_bound"] = pred_df["score"] - pred_df["sem"] * zscore
pred_df["score_upper_bound"] = pred_df["score"] + pred_df["sem"] * zscore

# Attach title/type metadata and restore anime_id as the index.
pred_df = pred_df.merge(anime, on="anime_id")
pred_df = pred_df.set_index("anime_id")

In [31]:
# Regression summary for the no-intercept fit of target on delta.
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:                 target   R-squared (uncentered):                   0.688
Model:                            OLS   Adj. R-squared (uncentered):              0.683
Method:                 Least Squares   F-statistic:                              143.4
Date:                Wed, 05 May 2021   Prob (F-statistic):                    4.25e-18
Time:                        00:41:12   Log-Likelihood:                         -74.780
No. Observations:                  66   AIC:                                      151.6
Df Residuals:                      65   BIC:                                      153.7
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [32]:
# confirm that fixing the blp coefficient at 1 is reasonable: refit my_score
# with blp as a free regressor and inspect its fitted coefficient
print(lm("my_score ~ delta + blp + 0", recomendee_seen_shows).summary())

                                 OLS Regression Results                                
Dep. Variable:               my_score   R-squared (uncentered):                   0.992
Model:                            OLS   Adj. R-squared (uncentered):              0.992
Method:                 Least Squares   F-statistic:                              3882.
Date:                Wed, 05 May 2021   Prob (F-statistic):                    1.59e-67
Time:                        00:41:13   Log-Likelihood:                         -72.886
No. Observations:                  66   AIC:                                      149.8
Df Residuals:                      64   BIC:                                      154.2
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [33]:
# confirm that the top shows are ones that the user rates highly
pred_df[pred_df["delta"] > 0].sort_values(
    by="score_lower_bound", ascending=False
).head(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
34240,0.100875,6.214468,35,0.381753,8.65293,8.764009,0.421934,7.677179,9.850839,Shelter,Music
35843,0.312741,1.821889,10,0.639196,8.996565,9.340941,0.706888,7.520118,11.161763,Gintama.: Porori-hen,TV
28675,0.006043,2.602445,15,0.469666,8.686027,8.692681,0.518975,7.35589,10.029471,Kyoukai no Kanata Movie 2: I&#039;ll Be Here -...,Movie
16762,0.201072,6.399317,36,0.291071,7.96308,8.184491,0.322161,7.35466,9.014322,Mirai Nikki: Redial,OVA
4382,0.457,12.995172,74,0.319069,7.675116,8.178343,0.355062,7.263763,9.092923,Suzumiya Haruhi no Yuuutsu (2009),TV
21647,0.013847,2.22298,13,0.478635,8.591741,8.606989,0.528886,7.244669,9.96931,Tamako Love Story,Movie
23277,0.009706,3.23976,19,0.332299,8.106333,8.117021,0.367187,7.171211,9.062832,Saenai Heroine no Sodatekata,TV
9379,0.346729,2.025613,11,0.398194,7.839452,8.221253,0.441152,7.084921,9.357585,Denpa Onna to Seishun Otoko,TV
12115,0.180873,2.829725,16,0.669304,8.757693,8.956862,0.739759,7.05137,10.862354,Berserk: Ougon Jidai-hen III - Kourin,Movie
20057,0.146885,4.016882,22,0.456682,8.127082,8.288824,0.504808,6.988524,9.589124,Space☆Dandy,TV


In [34]:
# Drop anime the recommendee has already rated, then keep only entries whose
# type is none of Movie / Special / OVA / ONA / Music.
new_recs = pred_df.drop(filtered_df.loc[recommendee].anime_id, errors="ignore").loc[
    lambda x: ~x["type"].isin(["Movie", "Special", "OVA", "ONA", "Music"])
]

In [35]:
# Top 20 unseen shows with a positive delta, ranked by the lower score bound.
new_recs[new_recs["delta"] > 0].sort_values(
    by="score_lower_bound", ascending=False
).head(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
35843,0.312741,1.821889,10,0.639196,8.996565,9.340941,0.706888,7.520118,11.161763,Gintama.: Porori-hen,TV
23277,0.009706,3.23976,19,0.332299,8.106333,8.117021,0.367187,7.171211,9.062832,Saenai Heroine no Sodatekata,TV
9379,0.346729,2.025613,11,0.398194,7.839452,8.221253,0.441152,7.084921,9.357585,Denpa Onna to Seishun Otoko,TV
20057,0.146885,4.016882,22,0.456682,8.127082,8.288824,0.504808,6.988524,9.589124,Space☆Dandy,TV
4981,0.402074,2.524512,13,0.543075,8.07694,8.519684,0.601228,6.971022,10.068346,Casshern Sins,TV
10357,0.190667,2.133051,12,0.557507,8.308907,8.518861,0.616287,6.93141,10.106312,Jinrui wa Suitai Shimashita,TV
10456,0.430597,0.721665,4,0.453258,7.629628,8.103782,0.502407,6.809667,9.397896,Kyoukaisenjou no Horizon,TV
3091,0.019325,2.648743,15,0.747373,8.902932,8.924211,0.825838,6.796993,11.051429,xxxHOLiC Kei,TV
3701,0.145152,1.832095,10,0.699397,8.618936,8.77877,0.77294,6.78781,10.769731,Kaiba,TV
22265,0.156697,2.241829,13,0.605908,8.31334,8.485888,0.669675,6.760918,10.210857,Free!: Eternal Summer,TV


In [36]:
# Titles of the 20 best new recommendations, in ascending order of lower score bound.
list(
    new_recs[new_recs["delta"] > 0]
    .sort_values(by="score_lower_bound")
    .tail(20)["title"]
)

['Rokudenashi Majutsu Koushi to Akashic Records',
 'Doraemon (1979)',
 'Date A Live',
 'Inu x Boku SS',
 'Re:␣Hamatora',
 'Danna ga Nani wo Itteiru ka Wakaranai Ken 2 Sure-me',
 'Dragon Ball Super',
 'Date A Live II',
 'D.Gray-man Hallow',
 'Ore no Kanojo to Osananajimi ga Shuraba Sugiru',
 'Free!: Eternal Summer',
 'Kaiba',
 'xxxHOLiC Kei',
 'Kyoukaisenjou no Horizon',
 'Jinrui wa Suitai Shimashita',
 'Casshern Sins',
 'Space☆Dandy',
 'Denpa Onna to Seishun Otoko',
 'Saenai Heroine no Sodatekata',
 'Gintama.: Porori-hen']

In [28]:
# Persist the full prediction table (path is relative to the data directory).
pred_df.to_pickle("deltas/user.pkl")

In [29]:
# Persist the recommendee's own rows of the normalized ratings table.
filtered_df.loc[recommendee].to_pickle("deltas/recommendee.pkl")

delta                     -0.115515
weight                      6.50257
counts                           36
delta_sem                  0.573064
blp                         7.94427
score                       7.81707
sem                        0.633317
score_lower_bound           6.18576
score_upper_bound           9.44839
title                Vampire Knight
type                             TV
Name: 3457, dtype: object