In [1]:
import functools
import os
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    # Convenience wrapper: build an OLS model through the statsmodels formula
    # API and return the fitted result in one call. All arguments are forwarded
    # unchanged to smf.ols; functools.wraps copies its metadata (including the
    # docstring) onto this function.
    unfitted_model = smf.ols(*args, **kwargs)
    return unfitted_model.fit()

In [2]:
# Every relative path used in the cells below (the CSV inputs, user_profiles/
# and deltas/) is resolved against this directory.
os.chdir("../data")

In [3]:
# User whose ratings drive the neighbourhood search and for whom scores are predicted.
recommendee = "Fro116"
# Upper bound on how many of the most similar users are kept as neighbours.
neighborhood_size = 8192
# Two-sided confidence level used to turn the standard error into score bounds.
confidence_interval = 0.99
# When True, each user's normalized scores are divided by that user's standard deviation.
normalize_variance = True

In [4]:
# Anime metadata; only the id, title and type columns are used further down
# (to label the predictions and to filter them by type).
anime = pd.read_csv("AnimeList.csv").loc[:, ["anime_id", "title", "type"]]

In [5]:
# Raw per-user rating list; later cells read its username, anime_id and my_score columns.
df = pd.read_csv("UserAnimeList.csv")

In [6]:
# (number of distinct usernames, number of distinct anime ids) in the raw rating list
len(df["username"].unique()), len(df["anime_id"].unique())

(283045, 14478)

In [7]:
# Keep the three columns the model needs and discard rows whose score is 0
# (presumably "not scored" entries -- TODO confirm against the data source).
filtered_df = df.loc[df["my_score"] != 0, ["username", "anime_id", "my_score"]]

In [8]:
def read_xml(file, username):
    """Load one user's list from an XML file as a ratings frame.

    Each direct child of the XML root is treated as one record, and the text
    of its sub-elements is read *positionally*: position 0 is taken as the
    anime id and position 9 as the user's score. Records that lack either
    value are discarded, as are records whose score is 0.

    Parameters
    ----------
    file : str or path-like
        Path of the XML file to read.
    username : str
        Value stored in the ``username`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``anime_id`` (int), ``my_score`` (int) and ``username``
        (converted with ``str``), with a fresh 0..n-1 index.
    """
    import xml.etree.ElementTree as ET

    # Read through a context manager so the file handle is closed
    # deterministically instead of being left for the garbage collector.
    with open(file, "r") as handle:
        root = ET.XML(handle.read())

    # One list of sub-element texts per record, plus the record's tag name.
    data = [[subchild.text for subchild in child] for child in root]
    cols = [child.tag for child in root]

    # After the transpose, rows are field positions and columns are records.
    new_list = pd.DataFrame(data).T
    new_list.columns = cols

    # Pick field positions 0 and 9, flip back to one row per record, and drop
    # records where either field is missing (shorter records are padded with
    # missing values when the frame is built).
    df = new_list.loc[[0, 9]].T.dropna().rename({0: "anime_id", 9: "my_score"}, axis=1)
    df["username"] = username
    df["anime_id"] = df["anime_id"].astype(int)
    df["my_score"] = df["my_score"].astype(int)
    df["username"] = df["username"].astype(str)
    df = df.loc[lambda x: x["my_score"] != 0]
    df = df.reset_index(drop=True)
    return df


def add_user(full_df, xml_file, username):
    """Return ``full_df`` with ``username``'s ratings replaced by those in ``xml_file``.

    Any rows already present for ``username`` are removed, the ratings parsed
    from ``xml_file`` by ``read_xml`` are appended, and the result is
    re-indexed from 0. ``full_df`` itself is not modified.
    """
    fresh_rows = read_xml(xml_file, username)
    belongs_to_others = full_df["username"] != username
    return pd.concat([full_df.loc[belongs_to_others], fresh_rows], ignore_index=True)

In [9]:
# Splice the recommendee's own list into the rating data. The profile path and
# the username are derived from the `recommendee` setting, so changing that
# setting is enough to retarget the whole notebook.
filtered_df = add_user(
    filtered_df, "user_profiles/" + recommendee + ".xml", recommendee
)

In [10]:
# Global mean score, and each user's / each anime's mean score expressed as an
# offset from that global mean. Both bias tables are single-column frames
# indexed by username / anime_id respectively.
average_rating = filtered_df["my_score"].mean()
user_bias = (
    filtered_df.groupby("username")["my_score"]
    .mean()
    .sub(average_rating)
    .to_frame("user_bias")
)
anime_bias = (
    filtered_df.groupby("anime_id")["my_score"]
    .mean()
    .sub(average_rating)
    .to_frame("anime_bias")
)

In [11]:
# Attach both biases to every rating, then compute the residual that is left
# once the global mean, the anime bias and the user bias are subtracted.
# The frame ends up indexed by username with incomplete rows removed.
filtered_df = (
    filtered_df.merge(anime_bias, on=["anime_id"])
    .merge(user_bias, on=["username"])
    .assign(
        normalized_score=lambda x: x["my_score"]
        - x["anime_bias"]
        - x["user_bias"]
        - average_rating
    )
    .set_index("username")
    .dropna()
)

In [12]:
# Inspect the normalized ratings (indexed by username)
filtered_df

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
karthiga,21,9,0.960564,-0.059898,0.605473
karthiga,59,7,0.040203,-0.059898,-0.474166
karthiga,74,7,0.316283,-0.059898,-0.750245
karthiga,120,7,0.309858,-0.059898,-0.743821
karthiga,178,7,-0.227338,-0.059898,-0.206624
...,...,...,...,...,...
temptemptemp,10040,6,-1.636717,-1.493860,1.636717
cinnamoroller,12963,10,-0.798860,2.506140,0.798860
inactiveX,5143,7,-0.652951,-0.493860,0.652951
omgm,5581,5,-1.857496,-2.493860,1.857496


In [13]:
# Optional per-user variance normalization: divide every normalized score by
# the standard deviation of that user's normalized scores.
# NOTE(review): a user whose std is NaN or 0 (presumably e.g. a user with a
# single rating) ends up with NaN/inf normalized scores; nothing in this cell
# filters those rows out -- confirm the later dropna() calls cover them.
if normalize_variance:
    user_stds = (
        filtered_df.groupby("username")[["normalized_score"]]
        .std()
        .rename({"normalized_score": "user_std"}, axis=1)
    )
    filtered_df = filtered_df.merge(user_stds, on="username")
    filtered_df["normalized_score"] /= filtered_df["user_std"]
    # user_std was only needed for the division above
    filtered_df = filtered_df.drop("user_std", axis=1)

In [14]:
# Pair each of the recommendee's ratings with every rating of the same anime
# in the data set. Overlapping columns get the suffix _x (recommendee side) or
# _y (other side); the `username` column comes from the reset_index() on the
# right-hand frame.
user_subset = filtered_df.loc[[recommendee]].merge(
    filtered_df.reset_index(), on="anime_id"
)

In [15]:
# Adjusted-cosine similarity between the recommendee and each user.
# Numerator: dot product of the two users' normalized scores over the anime
# both have rated.
adj_cos_corr_numerator = user_subset.groupby("username").apply(
    lambda x: np.dot(x["normalized_score_x"], x["normalized_score_y"])
)
# Denominator: each user's Euclidean norm over all of their normalized scores ...
adj_cos_corr_denom = filtered_df.groupby("username").apply(
    lambda x: np.sqrt(np.dot(x["normalized_score"], x["normalized_score"]))
)
# ... multiplied by the recommendee's own norm.
adj_cos_corr_denom *= adj_cos_corr_denom.loc[recommendee]
adj_cos_corrs = pd.DataFrame(
    (adj_cos_corr_numerator / adj_cos_corr_denom), columns=["corr"]
)
# Users for whom the ratio is undefined (NaN) are dropped
adj_cos_corrs = adj_cos_corrs.dropna()

In [16]:
# Build the neighbourhood table: one row per candidate neighbour.
corrs = adj_cos_corrs.copy()
# Neighbours are ranked by |corr|, so strongly negative correlations count too
corrs["similarity"] = corrs["corr"].abs()
# Number of anime rated by both the recommendee and this user
corrs["size"] = user_subset.groupby("username").size()
corrs = corrs.drop(
    recommendee
)  # Technically not needed because it's a noop for new series, but it's useful for debugging

# We assume variance is the same as the variance for pearson correlation.
# see https://www.jstor.org/stable/2277400?seq=1
# The formula below divides by (size - 2), hence the size > 2 filter.
corrs = corrs.loc[lambda x: x["size"] > 2]
corrs["corr_var"] = (1 - corrs["corr"] * corrs["corr"]) ** 2 / (corrs["size"] - 2)
# Ascending sort, so the tail holds the `neighborhood_size` most similar users
corrs = corrs.sort_values(by="similarity").dropna()[-neighborhood_size:]

In [17]:
# Summary statistics of the selected neighbourhood
corrs.describe()

Unnamed: 0,corr,similarity,size,corr_var
count,8192.0,8192.0,8192.0,8192.0
mean,0.041739,0.083613,43.052368,0.090394
std,0.074055,0.015314,35.555,0.17769
min,-0.146668,0.067726,3.0,0.003536
25%,-0.068237,0.072488,16.0,0.016735
50%,0.074879,0.079044,34.0,0.03077
75%,0.086438,0.090059,61.0,0.07068
max,0.193013,0.193013,282.0,0.990836


In [18]:
# Ratings made by the selected neighbours, each row carrying that neighbour's
# correlation statistics.
score = filtered_df.merge(corrs, on="username").dropna()

# Variance of the normalized scores per user and per anime (groups whose
# variance is undefined are dropped, which also removes them from `score`
# through the inner merges below).
user_var = (
    filtered_df.groupby("username")["normalized_score"]
    .var()
    .to_frame("user_var")
    .dropna()
)
anime_var = (
    filtered_df.groupby("anime_id")["normalized_score"]
    .var()
    .to_frame("anime_var")
    .dropna()
)

score = score.merge(user_var, on="username").merge(anime_var, on="anime_id")

In [19]:
# Inspect the neighbour ratings table
score

Unnamed: 0,anime_id,my_score,anime_bias,user_bias,normalized_score,corr,similarity,size,corr_var,user_var,anime_var
0,21,8,0.960564,0.520846,-0.837534,0.067872,0.067872,27,0.039632,1.0,1.791939
1,21,10,0.960564,-0.271638,0.799789,-0.083694,0.083694,27,0.039442,1.0,1.791939
2,21,10,0.960564,1.216666,0.414691,0.076136,0.076136,62,0.016474,1.0,1.791939
3,21,8,0.960564,-0.970051,0.192857,0.094892,0.094892,10,0.122759,1.0,1.791939
4,21,10,0.960564,0.481750,0.666542,0.081828,0.081828,20,0.054814,1.0,1.791939
...,...,...,...,...,...,...,...,...,...,...,...
1235163,4332,10,-0.227193,1.654288,0.446276,-0.120465,0.120465,6,0.242797,1.0,0.731552
1235164,12347,8,-0.972881,1.800258,-0.313881,0.078642,0.078642,44,0.023516,1.0,0.776905
1235165,7216,5,-1.933860,0.238283,-0.606248,-0.070809,0.070809,21,0.052105,1.0,0.717074
1235166,1745,6,-2.743860,-0.136717,0.769228,0.090460,0.090460,9,0.140529,1.0,2.376743


In [20]:
# Per anime: correlation-weighted average of the neighbours' normalized scores.
# The signed correlation is the weight in the numerator, its absolute value in
# the denominator.
deltas = score.groupby("anime_id").apply(
    lambda x: np.dot(x["normalized_score"], x["corr"]) / x["corr"].abs().sum()
)
# Per anime: total absolute correlation of the neighbours who rated it
weights = score.groupby("anime_id").apply(lambda x: x["corr"].abs().sum())
# Per anime: number of neighbours who rated it
counts = score.groupby("anime_id").size()

In [21]:
# The following formulae are used to compute the variance of the delta. Delta
# is a weighted sum of the form δ = Σ(s_i * w_i) / (Σw_i), where s_i is
# the vector of scores for user i and w_i is the weight for user i.
#
# By linearity, it suffices to compute the variance of (s_i * w_i) / (Σw_i). We
# assume that Var(s_i) is the same as the variance over the vector s_i (i.e. over
# all items user i has rated). We treat w_i as a random variable with mean w_i
# and variance corrs['corr_var'].
#
# The variance for (w_i) / (Σw_i) can be estimated by doing a Taylor Approximation.
# See equation 20 of https://www.stat.cmu.edu/~hseltman/files/ratio.pdf. The
# formula for the ratio of two correlated variables R,S is
# Var(R/S) = E[R]^2/E[S]^2(Var[R]/E[R]^2 - 2Cov(R,S)/(E[R]E[S]) + Var[S]/E[S]^2)
#
# Lastly we take the product distribution of s_i and (w_i) / (Σw_i).
def correction_factor(x):
    """Per-row multiplier that accounts for the uncertainty of the weights.

    ``x`` is a frame with the columns ``corr`` (a neighbour's weight) and
    ``corr_var`` (the variance of that weight). For every row the result is

        1 + corr_var / corr**2
          - 2 * corr_var / (sum|corr| * |corr|)
          + sum(corr_var) / (sum|corr|)**2

    where both sums run over all rows of ``x``. Returns a Series aligned with
    ``x``.
    """
    abs_weight = x["corr"].abs()
    total_weight = abs_weight.sum()

    own_term = x["corr_var"] / (x["corr"] ** 2)
    cross_term = 2 * x["corr_var"] / (total_weight * abs_weight)
    total_term = x["corr_var"].sum() / (total_weight ** 2)

    return 1 + own_term - cross_term + total_term


# Variance of each anime's delta: the sum over its neighbours of
# user_var * corr^2 * correction_factor, divided by the squared total weight.
delta_var = score.groupby("anime_id").apply(
    lambda x: np.sum(x["user_var"] * x["corr"] ** 2 * correction_factor(x))
    / (x["corr"].abs().sum() ** 2)
)

# if the var < 0, then the ratio distribution approximation failed,
# usually because sample size is too small
delta_var.loc[lambda x: x < 0] = np.inf

# The above is a biased estimator of the variance. To unbias the estimator,
# we need to apply a Bessel-like correction. See the formula in
# (https://stats.stackexchange.com/questions/47325/bias-correction-in-weighted-variance)
# Only anime rated by more than one neighbour get a correction factor; for the
# rest the multiplication below finds no matching entry in bias_correction.
bias_correction = (
    score.set_index("anime_id")
    .loc[counts > 1]
    .groupby("anime_id")
    .apply(
        lambda x: (x["corr"].abs().sum() ** 2)
        / (x["corr"].abs().sum() ** 2 - (x["corr"] ** 2).sum())
    )
)
delta_var *= bias_correction

In [22]:
# Assemble the per-anime prediction table (indexed by anime_id).
pred_df = pd.DataFrame()
pred_df["delta"] = deltas
pred_df["weight"] = weights
pred_df["counts"] = counts
pred_df["delta_sem"] = np.sqrt(delta_var)
# blp: the score expected from the biases alone (anime bias + the
# recommendee's user bias + global mean)
pred_df["blp"] = anime_bias + user_bias.loc[recommendee].squeeze() + average_rating
pred_df = pred_df.dropna()

# Calibrate delta on the anime the recommendee has already rated: regress the
# part of their score not explained by blp on delta, without an intercept.
recomendee_seen_shows = filtered_df.loc[recommendee].merge(pred_df, on=["anime_id"])
recomendee_seen_shows["target"] = (
    recomendee_seen_shows["my_score"] - recomendee_seen_shows["blp"]
)
model = lm("target ~ delta + 0", recomendee_seen_shows)
pred_df["score"] = model.predict(pred_df) + pred_df["blp"]
# Standard error of (slope * delta), using the variance of a product:
# (Var[X] + E[X]^2) * (Var[Y] + E[Y]^2) - E[X]^2 * E[Y]^2
# NOTE(review): this form treats delta and the fitted slope as independent -- confirm.
pred_df["sem"] = np.sqrt(
    (
        (pred_df["delta_sem"] ** 2 + pred_df["delta"] ** 2)
        * (model.bse["delta"] ** 2 + model.params["delta"] ** 2)
    )
    - pred_df["delta"] ** 2 * model.params["delta"] ** 2
)

# Two-sided normal quantile for the configured confidence level
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
pred_df["score_lower_bound"] = pred_df["score"] - pred_df["sem"] * zscore
pred_df["score_upper_bound"] = pred_df["score"] + pred_df["sem"] * zscore

# Attach title and type, then restore anime_id as the index
pred_df = pred_df.merge(anime, on="anime_id")
pred_df = pred_df.set_index("anime_id")

In [23]:
# Fit diagnostics for the delta-only calibration regression
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:                 target   R-squared (uncentered):                   0.737
Model:                            OLS   Adj. R-squared (uncentered):              0.736
Method:                 Least Squares   F-statistic:                              961.2
Date:                Fri, 30 Apr 2021   Prob (F-statistic):                   1.67e-101
Time:                        14:47:36   Log-Likelihood:                         -431.90
No. Observations:                 344   AIC:                                      865.8
Df Residuals:                     343   BIC:                                      869.6
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [24]:
# Confirm that fixing the coefficient of blp at 1 (as the model above does by
# subtracting blp from the score) is reasonable: fit blp freely and inspect it.
print(lm("my_score ~ delta + blp + 0", recomendee_seen_shows).summary())

                                 OLS Regression Results                                
Dep. Variable:               my_score   R-squared (uncentered):                   0.986
Model:                            OLS   Adj. R-squared (uncentered):              0.986
Method:                 Least Squares   F-statistic:                          1.195e+04
Date:                Fri, 30 Apr 2021   Prob (F-statistic):                   3.57e-317
Time:                        14:47:36   Log-Likelihood:                         -409.99
No. Observations:                 344   AIC:                                      824.0
Df Residuals:                     342   BIC:                                      831.7
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [25]:
# Sanity check: confirm that the top shows are ones that the user rates highly.
# Ranked by the lower confidence bound; this table still includes anime the
# recommendee has already rated.
pred_df.sort_values(by="score_lower_bound", ascending=False)[:20]

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
32,0.780101,209.438307,2492,0.047543,7.32393,10.024063,0.186261,9.544286,10.50384,Neon Genesis Evangelion: The End of Evangelion,Movie
2476,1.429737,192.384041,2323,0.057288,5.172089,10.12078,0.254631,9.464894,10.776667,School Days,TV
30,0.591355,295.833562,3539,0.042311,7.191034,9.237869,0.160713,8.823899,9.651838,Neon Genesis Evangelion,TV
3784,0.440139,165.465317,1965,0.049115,7.545625,9.069063,0.177043,8.61303,9.525095,Evangelion: 2.0 You Can (Not) Advance,Movie
16201,1.006494,53.858939,648,0.081294,5.714013,9.197752,0.303123,8.416958,9.978546,Aku no Hana,TV
3297,0.521179,29.249896,349,0.117432,7.650871,9.454808,0.410817,8.396614,10.513003,Aria The Origination,TV
11981,0.382913,106.771625,1275,0.057117,7.409164,8.734527,0.202368,8.213263,9.255792,Mahou Shoujo Madoka★Magica Movie 3: Hangyaku n...,Movie
3785,0.600165,110.615684,1309,0.057118,6.528636,8.605964,0.208843,8.06802,9.143908,Evangelion: 3.0 You Can (Not) Redo,Movie
9756,0.287266,291.35941,3501,0.040994,7.447999,8.442302,0.145542,8.06741,8.817195,Mahou Shoujo Madoka★Magica,TV
2759,0.3822,182.253897,2169,0.048219,7.150698,8.47359,0.172351,8.029642,8.917538,Evangelion: 1.0 You Are (Not) Alone,Movie


In [26]:
# Candidate recommendations: remove the anime the recommendee has already
# rated, then drop every entry whose type is Movie, Special, OVA, ONA or Music.
new_recs = pred_df.drop(filtered_df.loc[recommendee].anime_id, errors="ignore").loc[
    lambda x: ~x["type"].isin(["Movie", "Special", "OVA", "ONA", "Music"])
]

In [27]:
# Top 20 unseen titles with a positive delta, ranked by the lower confidence bound
new_recs[new_recs["delta"] > 0].sort_values(
    "score_lower_bound", ascending=False
).head(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
33089,0.349839,13.231112,160,0.141471,6.411954,7.62284,0.491476,6.356882,8.888798,Kemono Friends,TV
32681,0.220282,28.561676,345,0.099791,6.378826,7.141279,0.346455,6.248871,8.033687,Uchuu Patrol Luluco,TV
26,0.100118,39.230045,470,0.085892,6.513763,6.860299,0.297661,6.093576,7.627021,Texhnolyze,TV
26165,0.339868,20.688893,254,0.109973,5.895212,7.071584,0.38273,6.085736,8.057432,Yuri Kuma Arashi,TV
2403,0.233305,31.566416,391,0.103614,6.076891,6.884422,0.359766,5.957726,7.811119,Kodomo no Jikan (TV),TV
14131,0.037387,49.964611,608,0.084542,6.539773,6.66918,0.292803,5.91497,7.42339,Girls & Panzer,TV
147,0.058847,55.898825,674,0.07914,6.362681,6.566364,0.274144,5.860217,7.272512,Kimi ga Nozomu Eien,TV
1454,0.188562,17.838523,218,0.130223,6.370114,7.022777,0.45146,5.859892,8.185661,Kemonozume,TV
593,0.248242,9.408438,112,0.174892,6.555974,7.415203,0.606296,5.853488,8.976919,Mugen no Ryvius,TV
182,0.024339,35.077449,426,0.105496,6.693356,6.777599,0.365348,5.836526,7.718672,Tenkuu no Escaflowne,TV


In [28]:
# Titles of the same top 20, listed in ascending order (best recommendation last)
list(
    new_recs[new_recs["delta"] > 0]
    .sort_values("score_lower_bound")
    .tail(20)["title"]
)

['Hidamari Sketch',
 'Pingu in the City',
 'Casshern Sins',
 'Amanchu!',
 'Mobile Suit Gundam',
 'Soredemo Machi wa Mawatteiru',
 'Mobile Suit Zeta Gundam',
 'Koi Kaze',
 'Hidamari Sketch x 365',
 'Macross',
 'Tenkuu no Escaflowne',
 'Mugen no Ryvius',
 'Kemonozume',
 'Kimi ga Nozomu Eien',
 'Girls & Panzer',
 'Kodomo no Jikan (TV)',
 'Yuri Kuma Arashi',
 'Texhnolyze',
 'Uchuu Patrol Luluco',
 'Kemono Friends']

In [29]:
# Persist the full prediction table (path is relative to the data directory set above)
pred_df.to_pickle("deltas/user.pkl")

In [30]:
# Persist the recommendee's own normalized ratings alongside the predictions
filtered_df.loc[recommendee].to_pickle("deltas/recommendee.pkl")