In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf
from scipy.special import gamma


# Convenience wrapper: build an OLS model from a formula and fit it in one call.
# functools.wraps copies smf.ols's metadata (name, docstring) onto the wrapper.
@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    unfitted_model = smf.ols(*args, **kwargs)
    return unfitted_model.fit()

In [2]:
# Make ../data the working directory so the relative file names used by the
# loading cells below resolve there.
os.chdir("../data")

In [3]:
# User whose ratings drive the neighborhood search and who receives the recommendations.
recommendee = "Fro116"
# Number of most-similar users (ranked by |correlation|) kept as the neighborhood.
neighborhood_size = 11585
# Alternate configuration, kept for reference:
# recommendee = "mpfei"
# neighborhood_size = 724
# Two-sided confidence level used to turn the standard error into score bounds.
confidence_interval = 0.99
# When True, each user's normalized scores are divided by that user's standard deviation.
normalize_variance = False

In [4]:
# Load the anime catalogue, keeping only the id plus the title/type columns
# that later cells use for display and filtering.
anime = pd.read_csv("AnimeList.csv").loc[:, ["anime_id", "title", "type"]]

In [5]:
# Load the raw user/anime list entries; later cells read the username,
# anime_id and my_score columns from this frame.
df = pd.read_csv("UserAnimeList.csv")

In [6]:
# Number of distinct usernames and distinct anime ids in the raw table
# (dropna=False so missing values are counted exactly as unique() would).
df["username"].nunique(dropna=False), df["anime_id"].nunique(dropna=False)

(283045, 14478)

In [7]:
# Keep only the three columns used downstream and drop rows whose my_score is 0.
filtered_df = df.loc[df["my_score"] != 0, ["username", "anime_id", "my_score"]]

In [8]:
# add additional user anime-lists
# NOTE: pickle.load can execute arbitrary code while deserializing, so this
# file must come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("user_profiles/ExtraUserAnimeLists.pkl", "rb") as extra_users_file:
    extraUsers = pickle.load(extra_users_file)
# Rows for users present in the extra lists are replaced by the extra copy:
# first remove them from the base table, then append the extra lists.
filtered_df = filtered_df.loc[lambda x: ~x["username"].isin(extraUsers.username)]
filtered_df = pd.concat([filtered_df, extraUsers], ignore_index=True)

In [9]:
# Global mean rating, plus each user's and each anime's mean rating expressed
# as an offset from that global mean (one-column frames indexed by the group key).
average_rating = filtered_df["my_score"].mean()
user_bias = (
    filtered_df.groupby("username")["my_score"].mean().to_frame("user_bias")
    - average_rating
)
anime_bias = (
    filtered_df.groupby("anime_id")["my_score"].mean().to_frame("anime_bias")
    - average_rating
)

In [10]:
# Attach both bias columns to every rating, compute the residual score
# (rating minus anime bias, user bias and global mean), then index by
# username and discard rows with any missing value.
filtered_df = (
    filtered_df.merge(anime_bias, on=["anime_id"])
    .merge(user_bias, on=["username"])
    .assign(
        normalized_score=lambda x: x["my_score"]
        - x["anime_bias"]
        - x["user_bias"]
        - average_rating
    )
    .set_index("username")
    .dropna()
)

In [11]:
# Inspect the normalized ratings table (indexed by username).
filtered_df

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
karthiga,21,9,0.960564,-0.059898,0.605474
karthiga,59,7,0.040203,-0.059898,-0.474165
karthiga,74,7,0.316282,-0.059898,-0.750245
karthiga,120,7,0.309858,-0.059898,-0.743821
karthiga,178,7,-0.227339,-0.059898,-0.206624
...,...,...,...,...,...
temptemptemp,10040,6,-1.636717,-1.493860,1.636717
cinnamoroller,12963,10,-0.798860,2.506140,0.798860
inactiveX,5143,7,-0.652951,-0.493860,0.652951
omgm,5581,5,-1.857497,-2.493860,1.857497


In [12]:
# Optional step (skipped unless normalize_variance is set): rescale each
# user's normalized scores by that user's standard deviation.
if normalize_variance:
    # Per-user standard deviation of the normalized scores, one row per username.
    user_stds = (
        filtered_df.groupby("username")[["normalized_score"]]
        .std()
        .rename({"normalized_score": "user_std"}, axis=1)
    )
    # Attach the per-user std to every rating row, divide, then drop the helper column.
    filtered_df = filtered_df.merge(user_stds, on="username")
    # NOTE(review): a user whose std is NaN or 0 would produce NaN/inf scores
    # here, and nothing in this cell filters them — confirm downstream handling.
    filtered_df["normalized_score"] /= filtered_df["user_std"]
    filtered_df = filtered_df.drop("user_std", axis=1)

In [13]:
# Pair every rating by the recommendee (suffix _x) with every rating of the
# same anime by any user (suffix _y, with username as a regular column).
user_subset = pd.merge(
    filtered_df.loc[[recommendee]],
    filtered_df.reset_index(),
    on="anime_id",
)

In [14]:
# Adjusted-cosine similarity between the recommendee and every other user.
# Numerator: dot product of normalized scores over the anime both users rated.
adj_cos_corr_numerator = user_subset.groupby("username").apply(
    lambda shared: np.dot(shared["normalized_score_x"], shared["normalized_score_y"])
)
# Denominator: product of the two users' L2 norms, each taken over all of
# that user's normalized scores (not only the shared anime).
adj_cos_corr_denom = filtered_df.groupby("username").apply(
    lambda ratings: np.sqrt(
        np.dot(ratings["normalized_score"], ratings["normalized_score"])
    )
)
adj_cos_corr_denom = adj_cos_corr_denom * adj_cos_corr_denom.loc[recommendee]
# One-column frame of correlations; users without a defined value are dropped.
adj_cos_corrs = (
    (adj_cos_corr_numerator / adj_cos_corr_denom).to_frame("corr").dropna()
)

In [15]:
# Build the neighborhood table: |corr| as similarity, the number of anime
# shared with the recommendee as size, and an estimated variance of corr.
#
# Dropping the recommendee is technically not needed (it is a no-op for new
# series), but it is useful for debugging.
#
# We assume the variance is the same as the variance of a Pearson correlation.
# see https://www.jstor.org/stable/2277400?seq=1
# Only users with more than 2 shared anime are kept so (size - 2) is positive;
# finally the neighborhood_size most similar users are retained.
corrs = (
    adj_cos_corrs.copy()
    .assign(
        similarity=lambda x: x["corr"].abs(),
        size=user_subset.groupby("username").size(),
    )
    .drop(recommendee)
    .loc[lambda x: x["size"] > 2]
    .assign(
        corr_var=lambda x: (1 - x["corr"] * x["corr"]) ** 2 / (x["size"] - 2)
    )
    .sort_values(by="similarity")
    .dropna()
    .iloc[-neighborhood_size:]
)

In [16]:
# Summary statistics of the selected neighborhood (correlation, similarity,
# overlap size and correlation variance).
corrs.describe()

Unnamed: 0,corr,similarity,size,corr_var
count,11585.0,11585.0,11585.0,11585.0
mean,0.032637,0.076451,44.025464,0.086411
std,0.070724,0.014899,36.445194,0.169494
min,-0.14589,0.060875,3.0,0.003474
25%,-0.063246,0.065456,16.0,0.016468
50%,0.066861,0.07204,35.0,0.030059
75%,0.078683,0.08279,62.0,0.07053
max,0.188266,0.188266,287.0,0.992572


In [17]:
# Variance of the normalized scores per user and per anime; groups whose
# variance is undefined are dropped.
user_var = (
    filtered_df.groupby("username")["normalized_score"]
    .var()
    .to_frame("user_var")
    .dropna()
)
anime_var = (
    filtered_df.groupby("anime_id")["normalized_score"]
    .var()
    .to_frame("anime_var")
    .dropna()
)

# Every rating made by a neighborhood user, annotated with that user's
# correlation statistics and with the per-user / per-anime variances.
score = (
    filtered_df.merge(corrs, on="username")
    .dropna()
    .merge(user_var, on="username")
    .merge(anime_var, on="anime_id")
)

In [18]:
# Inspect the neighborhood ratings table with its correlation and variance columns.
score

Unnamed: 0,anime_id,my_score,anime_bias,user_bias,normalized_score,corr,similarity,size,corr_var,user_var,anime_var
0,21,10,0.960564,1.191325,0.354251,0.067317,0.067317,4,0.495479,2.420727,2.650513
1,21,7,0.960564,-0.441573,-1.012852,0.061035,0.061035,34,0.031018,2.844932,2.650513
2,21,8,0.960564,-0.183376,-0.271048,-0.065715,0.065715,99,0.010220,1.783043,2.650513
3,21,10,0.960564,-0.271638,1.817214,-0.086887,0.086887,26,0.041040,5.162514,2.650513
4,21,10,0.960564,1.216666,0.328910,0.073203,0.073203,62,0.016489,0.629076,2.650513
...,...,...,...,...,...,...,...,...,...,...,...
1859541,36714,5,-1.919392,-1.410527,0.836059,0.097138,0.097138,11,0.109024,0.948404,2.138055
1859542,7376,8,-1.863091,-0.110882,2.480112,-0.095870,0.095870,26,0.040904,3.507532,2.873370
1859543,4332,10,-0.227194,1.654288,1.079045,-0.122913,0.122913,6,0.242503,5.846162,1.396115
1859544,7216,5,-1.933860,0.238283,-0.798283,-0.068684,0.068684,22,0.049529,1.733849,1.220686


In [19]:
# Per-anime neighborhood aggregates.
# counts: number of neighborhood ratings for the anime.
counts = score.groupby("anime_id").size()
# weights: total absolute correlation of the neighbors who rated the anime.
weights = score.groupby("anime_id").apply(lambda grp: grp["corr"].abs().sum())
# deltas: correlation-weighted sum of the neighbors' normalized scores,
# normalized by the total absolute correlation.
deltas = score.groupby("anime_id").apply(
    lambda grp: np.dot(grp["normalized_score"], grp["corr"]) / grp["corr"].abs().sum()
)

In [20]:
# The following formulae are used to compute the variance of the delta. Delta
# is a weighted sum of the form δ = Σ(s_i * w_i) / (Σw_i), where s_i is
# a vector of scores for user i and w_i is the weight for user i.
#
# By linearity, it suffices to compute the variance of each term
# (s_i * w_i) / (Σw_i). We assume that Var(s_i) is the same as the variance
# over the vector s_i (i.e. over all items user i has rated). We treat w_i as
# a random variable with mean w_i and variance corrs['corr_var'].
#
# The variance for (w_i) / (Σw_i) can be estimated by doing a Taylor Approximation.
# See equation 20 of https://www.stat.cmu.edu/~hseltman/files/ratio.pdf. The
# formula for the ratio of two correlated variables R,S is
# Var(R/S) = E[R]^2/E[S]^2(Var[R]/E[R]^2 - 2Cov(R,S)/(E[R]E[S]) + Var[S]/E[S]^2)
#
# Lastly we take the product distribution of s_i and (w_i) / (Σw_i).
def correction_factor(x):
    """Per-row variance inflation factor for the weight ratio w_i / Σw_i.

    `x` is a frame with `corr` and `corr_var` columns (one row per neighbor).
    The result is 1 plus the Taylor-approximated relative variance of the
    ratio: Var[w_i]/w_i^2 - 2*Var[w_i]/(Σ|w| * |w_i|) + ΣVar[w]/(Σ|w|)^2.
    """
    abs_corr = x["corr"].abs()
    total_weight = abs_corr.sum()
    weight_var = x["corr_var"]

    own_term = weight_var / (x["corr"] ** 2)
    covariance_term = 2 * weight_var / (total_weight * abs_corr)
    total_term = weight_var.sum() / (total_weight ** 2)
    return 1 + own_term - covariance_term + total_term


# Variance of each anime's delta: the sum over neighbors of
# user_var * corr^2 * correction_factor, divided by (Σ|corr|)^2.
delta_var = score.groupby("anime_id").apply(
    lambda x: np.sum(x["user_var"] * x["corr"] ** 2 * correction_factor(x))
    / (x["corr"].abs().sum() ** 2)
)

# if the var < 0, then the ratio distribution approximation failed,
# usually because sample size is too small
delta_var.loc[lambda x: x < 0] = np.inf

# Apply a bessel correction to unbias the variance
# The effective sample size is the anime's total weight expressed in units of
# the average neighbor weight. Anime with at most one effective sample get an
# infinite variance; the rest are scaled by n / (n - 1).
average_weight = corrs["corr"].abs().mean()
effective_sample_size = weights / average_weight
delta_var.loc[effective_sample_size <= 1] = np.inf
delta_var.loc[effective_sample_size > 1] *= effective_sample_size / (effective_sample_size - 1)

In [21]:
# Assemble the per-anime prediction table from the neighborhood aggregates.
pred_df = pd.DataFrame()
pred_df["delta"] = deltas
pred_df["weight"] = weights
pred_df["counts"] = counts
# Standard error of delta is the square root of its estimated variance.
pred_df["delta_sem"] = np.sqrt(delta_var)

# Disabled experiment, kept for reference:
# # Unbias the standard deviation estimate
# # See https://en.wikipedia.org/wiki/Unbiased_estimation_of_standard_deviation#Results_for_the_normal_distribution
# def standard_deviation_bias(n):
#     if n < 1:
#         return np.inf
#     if gamma((n - 1) / 2) == np.inf:
#         return 1
#     return np.sqrt(2 / (n - 1)) * gamma(n / 2) / gamma((n - 1) / 2)
# pred_df["delta_sem"] /= effective_sample_size.apply(standard_deviation_bias)

# Baseline prediction (blp): anime bias + the recommendee's bias + global mean.
pred_df["blp"] = anime_bias + user_bias.loc[recommendee].squeeze() + average_rating
pred_df = pred_df.dropna()

# Calibrate delta on the shows the recommendee has already rated: regress the
# residual (my_score - blp) on delta with no intercept.
recomendee_seen_shows = filtered_df.loc[recommendee].merge(pred_df, on=["anime_id"])
recomendee_seen_shows["target"] = (
    recomendee_seen_shows["my_score"] - recomendee_seen_shows["blp"]
)
model = lm("target ~ delta + 0", recomendee_seen_shows)
# Predicted score = fitted slope * delta + baseline.
pred_df["score"] = model.predict(pred_df) + pred_df["blp"]
# Variance of the product (slope * delta), treating the two as independent:
# (Var[d] + d^2) * (Var[b] + b^2) - d^2 * b^2, where d is delta and b the slope.
pred_df["sem"] = np.sqrt(
    (
        (pred_df["delta_sem"] ** 2 + pred_df["delta"] ** 2)
        * (model.bse["delta"] ** 2 + model.params["delta"] ** 2)
    )
    - pred_df["delta"] ** 2 * model.params["delta"] ** 2
)

# Two-sided normal quantile for the configured confidence level.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
pred_df["score_lower_bound"] = pred_df["score"] - pred_df["sem"] * zscore
pred_df["score_upper_bound"] = pred_df["score"] + pred_df["sem"] * zscore

# Attach title/type from the catalogue, then make anime_id the index again.
pred_df = pred_df.merge(anime, on="anime_id")
pred_df = pred_df.set_index("anime_id")

In [22]:
# Regression diagnostics for the no-intercept fit of target on delta.
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:                 target   R-squared (uncentered):                   0.758
Model:                            OLS   Adj. R-squared (uncentered):              0.757
Method:                 Least Squares   F-statistic:                              1081.
Date:                Sat, 15 May 2021   Prob (F-statistic):                   2.55e-108
Time:                        23:42:30   Log-Likelihood:                         -429.29
No. Observations:                 346   AIC:                                      860.6
Df Residuals:                     345   BIC:                                      864.4
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [23]:
# Confirm that fixing the blp coefficient at 1 is reasonable: the main model
# regresses (my_score - blp) on delta, which implicitly gives blp a
# coefficient of 1. Here blp gets a free coefficient so its estimate can be
# compared against 1.
print(lm("my_score ~ delta + blp + 0", recomendee_seen_shows).summary())

                                 OLS Regression Results                                
Dep. Variable:               my_score   R-squared (uncentered):                   0.985
Model:                            OLS   Adj. R-squared (uncentered):              0.985
Method:                 Least Squares   F-statistic:                          1.102e+04
Date:                Sat, 15 May 2021   Prob (F-statistic):                   1.35e-312
Time:                        23:42:30   Log-Likelihood:                         -424.01
No. Observations:                 346   AIC:                                      852.0
Df Residuals:                     344   BIC:                                      859.7
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [24]:
# confirm that the top shows are ones that the user rates highly
# (positive-delta entries ranked by the lower confidence bound, top 20)
pred_df[pred_df["delta"] > 0].sort_values(
    by="score_lower_bound", ascending=False
).head(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
32,1.123164,264.337452,3414,0.059279,7.237932,10.693833,0.210602,10.151359,11.236308,Neon Genesis Evangelion: The End of Evangelion,Movie
2476,1.780514,262.775764,3470,0.070316,5.086119,10.564641,0.273187,9.860957,11.268326,School Days,TV
30,0.866942,378.889908,4929,0.052304,7.105061,9.772582,0.180304,9.308151,10.237013,Neon Genesis Evangelion,TV
3784,0.602298,209.95784,2710,0.058736,7.459656,9.312886,0.189397,8.825032,9.80074,Evangelion: 2.0 You Can (Not) Advance,Movie
3297,0.759445,38.679767,502,0.142009,7.564902,9.901662,0.442896,8.760837,11.042486,Aria The Origination,TV
16201,1.283876,72.023541,942,0.10204,5.628043,9.578444,0.336321,8.712139,10.444748,Aku no Hana,TV
11981,0.565735,138.125703,1798,0.068434,7.323216,9.063945,0.217218,8.504429,9.62346,Mahou Shoujo Madoka★Magica Movie 3: Hangyaku n...,Movie
9756,0.454082,376.437754,4934,0.04982,7.362036,8.759216,0.159143,8.34929,9.169142,Mahou Shoujo Madoka★Magica,TV
227,0.681158,266.840816,3493,0.060536,6.749039,8.844916,0.196957,8.337589,9.352243,FLCL,OVA
3785,0.801955,140.972111,1810,0.070563,6.442667,8.910227,0.229823,8.318241,9.502213,Evangelion: 3.0 You Can (Not) Redo,Movie


In [25]:
# Candidate recommendations: remove every anime the recommendee has already
# rated, then exclude the listed non-series types.
new_recs = pred_df.drop(filtered_df.loc[recommendee].anime_id, errors="ignore").loc[
    lambda x: ~x["type"].isin(["Movie", "Special", "OVA", "ONA", "Music"])
]

In [26]:
# Top 20 unseen candidates with a positive delta, ranked by lower confidence bound.
new_recs[new_recs["delta"] > 0].sort_values(
    by="score_lower_bound", ascending=False
).head(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
33089,0.464653,17.453091,231,0.181717,6.325985,7.755691,0.561078,6.310451,9.200931,Kemono Friends,TV
26,0.222104,52.731606,691,0.106694,6.427794,7.111193,0.3291,6.263488,7.958898,Texhnolyze,TV
32681,0.281922,38.791515,515,0.118006,6.292856,7.16031,0.364222,6.222137,8.098482,Uchuu Patrol Luluco,TV
3604,0.16685,18.584464,247,0.153329,6.783952,7.297339,0.472261,6.080877,8.513801,Hidamari Sketch x 365,TV
7062,0.12549,14.967163,197,0.16019,6.912503,7.298629,0.493263,6.028068,8.569189,Hidamari Sketch x ☆☆☆,TV
593,0.365377,14.308963,190,0.200884,6.470005,7.594246,0.619338,5.998937,9.189554,Mugen no Ryvius,TV
26165,0.387438,28.981661,389,0.133272,5.809242,7.001362,0.411859,5.940482,8.062241,Yuri Kuma Arashi,TV
147,0.124684,77.212074,1022,0.091238,6.276711,6.660355,0.281105,5.936277,7.384433,Kimi ga Nozomu Eien,TV
31771,0.275771,20.766134,283,0.143811,6.19248,7.04101,0.443453,5.898751,8.183268,Amanchu!,TV
29787,0.049193,21.558546,288,0.13813,6.798197,6.949561,0.425237,5.854223,8.044899,Gochuumon wa Usagi desu ka??,TV


In [27]:
# Spot check: candidate rows whose title matches the pattern 'Eien'.
new_recs[new_recs["title"].str.contains("Eien")]

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
147,0.124684,77.212074,1022,0.091238,6.276711,6.660355,0.281105,5.936277,7.384433,Kimi ga Nozomu Eien,TV
2717,-0.729008,0.138964,2,1.972038,5.88984,3.646733,6.071017,-11.991172,19.284637,Ginga Tetsudou Monogatari: Eien e no Bunkiten,TV


In [28]:
# Spot check: candidate rows whose title matches the pattern 'Koi Kaze'.
new_recs[new_recs["title"].str.contains("Koi Kaze")]

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [29]:
# Same lookup against the full prediction table (before already-rated anime are removed).
pred_df[pred_df["title"].str.contains("Koi Kaze")]

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
634,0.79253,37.426445,493,0.129147,6.187901,8.626462,0.404421,7.584741,9.668182,Koi Kaze,TV


In [30]:
# Titles of the top 20 positive-delta candidates, ranked by lower confidence bound.
(
    new_recs[new_recs["delta"] > 0]
    .sort_values(by="score_lower_bound", ascending=False)
    .head(20)["title"]
    .tolist()
)

['Kemono Friends',
 'Texhnolyze',
 'Uchuu Patrol Luluco',
 'Hidamari Sketch x 365',
 'Hidamari Sketch x ☆☆☆',
 'Mugen no Ryvius',
 'Yuri Kuma Arashi',
 'Kimi ga Nozomu Eien',
 'Amanchu!',
 'Gochuumon wa Usagi desu ka??',
 'Hidamari Sketch',
 'Kodomo no Jikan (TV)',
 'Girls & Panzer',
 'Hidamari Sketch x Honeycomb',
 'Shakunetsu no Takkyuu Musume',
 'Mobile Suit Gundam',
 'Ichigo Mashimaro',
 'Tenkuu no Escaflowne',
 'Macross',
 'Yama no Susume: Second Season']

In [31]:
# Hard-coded list of 20 titles, compared against `b` below via set differences.
# NOTE(review): presumably a top-20 list pasted from an earlier run — record
# which configuration produced it.
a = ['Texhnolyze',
 'Mugen no Ryvius',
 'Kemono Friends',
 'Pingu in the City',
 'Uchuu Patrol Luluco',
 'Yuri Kuma Arashi',
 'Girls & Panzer',
 'Hidamari Sketch x 365',
 'Kodomo no Jikan (TV)',
 'Amanchu!',
 'Kimi ga Nozomu Eien',
 'Hidamari Sketch x ☆☆☆',
 'Hidamari Sketch',
 'Koi Kaze',
 'Tenkuu no Escaflowne',
 'Yosuga no Sora: In Solitude, Where We Are Least Alone.',
 'Hidamari Sketch x Honeycomb',
 'Ichigo Mashimaro',
 'Soredemo Machi wa Mawatteiru',
 'Mobile Suit Gundam']

In [32]:
# Second hard-coded list of 20 titles, compared against `a` via set differences.
# NOTE(review): presumably a top-20 list pasted from a different run — record
# which configuration produced it.
b = ['Texhnolyze',
 'Uchuu Patrol Luluco',
 'Kemono Friends',
 'Mugen no Ryvius',
 'Yuri Kuma Arashi',
 'Girls & Panzer',
 'Kodomo no Jikan (TV)',
 'Hidamari Sketch x 365',
 'Tenkuu no Escaflowne',
 'Kemonozume',
 'Kimi ga Nozomu Eien',
 'Hidamari Sketch x ☆☆☆',
 'Mobile Suit Gundam',
 'Ichigo Mashimaro',
 'Yosuga no Sora: In Solitude, Where We Are Least Alone.',
 'Koi Kaze',
 'Hidamari Sketch',
 'Amanchu!',
 'Pingu in the City',
 'Macross']

In [33]:
# Titles that appear in list a but not in list b.
set(a).difference(b)

{'Hidamari Sketch x Honeycomb', 'Soredemo Machi wa Mawatteiru'}

In [34]:
# Titles that appear in list b but not in list a.
set(b).difference(a)

{'Kemonozume', 'Macross'}

In [35]:
# Bare list literal displayed as the cell output; it is not assigned to any
# name. Its elements are identical, in order, to the list `b` defined above.
['Texhnolyze',
 'Uchuu Patrol Luluco',
 'Kemono Friends',
 'Mugen no Ryvius',
 'Yuri Kuma Arashi',
 'Girls & Panzer',
 'Kodomo no Jikan (TV)',
 'Hidamari Sketch x 365',
 'Tenkuu no Escaflowne',
 'Kemonozume',
 'Kimi ga Nozomu Eien',
 'Hidamari Sketch x ☆☆☆',
 'Mobile Suit Gundam',
 'Ichigo Mashimaro',
 'Yosuga no Sora: In Solitude, Where We Are Least Alone.',
 'Koi Kaze',
 'Hidamari Sketch',
 'Amanchu!',
 'Pingu in the City',
 'Macross']

['Texhnolyze',
 'Uchuu Patrol Luluco',
 'Kemono Friends',
 'Mugen no Ryvius',
 'Yuri Kuma Arashi',
 'Girls & Panzer',
 'Kodomo no Jikan (TV)',
 'Hidamari Sketch x 365',
 'Tenkuu no Escaflowne',
 'Kemonozume',
 'Kimi ga Nozomu Eien',
 'Hidamari Sketch x ☆☆☆',
 'Mobile Suit Gundam',
 'Ichigo Mashimaro',
 'Yosuga no Sora: In Solitude, Where We Are Least Alone.',
 'Koi Kaze',
 'Hidamari Sketch',
 'Amanchu!',
 'Pingu in the City',
 'Macross']