In [1]:
import functools
import os
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf


# Convenience wrapper: build an OLS model from the given arguments and return
# the fitted results in one call. functools.wraps copies smf.ols's metadata
# (name, docstring) onto lm.
@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    ols_model = smf.ols(*args, **kwargs)
    return ols_model.fit()

In [2]:
# Make ../data the working directory so the relative CSV/XML paths used by the
# later cells resolve against it.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel changes directory again, starting from the new working directory.
os.chdir("../data")

In [3]:
# User whose ratings are compared against everyone else's and for whom the
# predictions are produced.
recommendee = "Fro116"
# A neighborhood size of 32768 minimizes cross-validation oos rmse, but 23170 was the second best model
# and empirically yields better results
neighborhood_size = 23170
# Two-sided confidence level used to turn the standard error of a predicted
# score into lower/upper score bounds.
confidence_interval = 0.99

In [4]:
# Anime metadata, reduced to the id plus the two descriptive columns that are
# attached to the predictions later on.
anime = pd.read_csv("AnimeList.csv").loc[:, ["anime_id", "title", "type"]]

In [5]:
# Raw per-user list entries. The cells below only read the username, anime_id
# and my_score columns from this frame.
df = pd.read_csv("UserAnimeList.csv")

In [6]:
# (number of distinct users, number of distinct anime) in the raw data
tuple(len(df[column].unique()) for column in ("username", "anime_id"))

(283045, 14478)

In [7]:
# Keep only the rating triple, and only rows with a non-zero score.
filtered_df = df.loc[df["my_score"] != 0, ["username", "anime_id", "my_score"]]

In [8]:
def read_xml(file, username):
    """Read one user's scored entries from an XML list export.

    Every child element of the document root is treated as one entry. The text
    of its first sub-element is used as the anime id and the text of its tenth
    sub-element as the score (positional layout -- presumably that of a
    MyAnimeList export; verify if the export format changes). Entries with
    fewer than ten sub-elements, or with empty text in either position, are
    skipped, as are entries whose score is 0.

    Args:
        file: Path (or open file object) of the XML document.
        username: Value stored in the ``username`` column of every row.

    Returns:
        DataFrame with integer ``anime_id`` and ``my_score`` columns, a string
        ``username`` column and a fresh 0..n-1 index. It is empty (rather than
        raising) when the document has no usable entry.

    Raises:
        OSError: if the file cannot be opened.
        xml.etree.ElementTree.ParseError: if the document is not well-formed.
        ValueError: if an id or score is not an integer literal.
    """
    import xml.etree.ElementTree as ET

    # ET.parse opens and closes the file itself and decodes it using the
    # encoding declared in the XML prolog instead of the locale default.
    root = ET.parse(file).getroot()

    rows = []
    for entry in root:
        fields = [field.text for field in entry]
        # Header-like children have fewer than ten fields and carry no rating.
        if len(fields) > 9:
            rows.append((fields[0], fields[9]))

    # dropna removes entries whose id or score element had no text.
    df = pd.DataFrame(rows, columns=["anime_id", "my_score"]).dropna()
    df = df.assign(username=username)
    df = df.astype({"anime_id": int, "my_score": int, "username": str})
    df = df.loc[df["my_score"] != 0]
    return df.reset_index(drop=True)


def add_user(full_df, xml_file, username):
    """Return ``full_df`` with ``username``'s rows replaced by those read from ``xml_file``.

    Rows already present for ``username`` are dropped, the rows parsed by
    ``read_xml`` are appended, and the result gets a fresh integer index.
    """
    new_rows = read_xml(xml_file, username)
    is_other_user = full_df["username"] != username
    return pd.concat([full_df.loc[is_other_user], new_rows], ignore_index=True)

In [9]:
# Load the recommendee's own list export. The profile path and username are
# derived from `recommendee` so that changing the config cell is enough to
# switch users (previously the name was hard-coded here a second time).
filtered_df = add_user(filtered_df, f"user_profiles/{recommendee}.xml", recommendee)

In [10]:
# Global mean score, and each user's / each anime's mean score expressed as an
# offset from that global mean.
average_rating = filtered_df["my_score"].mean()
user_bias = (
    filtered_df.groupby("username")["my_score"].mean().to_frame("user_bias")
    - average_rating
)
anime_bias = (
    filtered_df.groupby("anime_id")["my_score"].mean().to_frame("anime_bias")
    - average_rating
)

In [11]:
# Attach both biases to every rating, then store the residual score left after
# removing the anime bias, the user bias and the global mean. The frame ends up
# indexed by username with incomplete rows removed.
filtered_df = (
    filtered_df.merge(anime_bias, on=["anime_id"])
    .merge(user_bias, on=["username"])
    .assign(
        normalized_score=lambda x: x["my_score"]
        - x["anime_bias"]
        - x["user_bias"]
        - average_rating
    )
    .set_index("username")
    .dropna()
)

In [12]:
# Inspect the normalized ratings (indexed by username).
filtered_df

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
karthiga,21,9,0.960564,-0.059898,0.605473
karthiga,59,7,0.040203,-0.059898,-0.474166
karthiga,74,7,0.316283,-0.059898,-0.750245
karthiga,120,7,0.309858,-0.059898,-0.743821
karthiga,178,7,-0.227338,-0.059898,-0.206624
...,...,...,...,...,...
temptemptemp,10040,6,-1.636717,-1.493860,1.636717
cinnamoroller,12963,10,-0.798860,2.506140,0.798860
inactiveX,5143,7,-0.652951,-0.493860,0.652951
omgm,5581,5,-1.857496,-2.493860,1.857496


In [13]:
# Pair every rating of the recommendee (suffix _x) with every other rating of
# the same anime (suffix _y, with the rater's username as a column).
user_subset = pd.merge(
    filtered_df.loc[[recommendee]], filtered_df.reset_index(), on="anime_id"
)

In [14]:
# Numerator: per user, dot product of normalized scores over the anime shared
# with the recommendee.
adj_cos_corr_numerator = user_subset.groupby("username").apply(
    lambda shared: np.dot(shared["normalized_score_x"], shared["normalized_score_y"])
)
# Denominator: product of the two users' norms, each taken over ALL of that
# user's normalized scores (not only the shared ones).
adj_cos_corr_denom = filtered_df.groupby("username").apply(
    lambda ratings: np.sqrt(
        np.dot(ratings["normalized_score"], ratings["normalized_score"])
    )
)
adj_cos_corr_denom = adj_cos_corr_denom * adj_cos_corr_denom.loc[recommendee]
# Users without any shared anime have no numerator and are dropped as NaN.
adj_cos_corrs = (adj_cos_corr_numerator / adj_cos_corr_denom).to_frame("corr").dropna()

In [15]:
# Build the neighbourhood table in one pass:
#  * similarity is the absolute correlation; size is the number of shared anime
#  * the recommendee is removed (technically not needed because it's a no-op
#    for new series, but it's useful for debugging)
#  * corr_var assumes the variance is the same as for a Pearson correlation,
#    see https://www.jstor.org/stable/2277400?seq=1 -- it needs size > 2
#  * finally keep the `neighborhood_size` most similar users
corrs = (
    adj_cos_corrs.assign(
        similarity=lambda x: x["corr"].abs(),
        size=user_subset.groupby("username").size(),
    )
    .drop(recommendee)
    .loc[lambda x: x["size"] > 2]
    .assign(corr_var=lambda x: (1 - x["corr"] * x["corr"]) ** 2 / (x["size"] - 2))
    .sort_values(by="similarity")
    .dropna()
    .iloc[-neighborhood_size:]
)

In [16]:
# Summary statistics of the selected neighbourhood (correlations, overlap sizes, variances).
corrs.describe()

Unnamed: 0,corr,similarity,size,corr_var
count,23170.0,23170.0,23170.0,23170.0
mean,0.024385,0.066656,44.497497,0.082274
std,0.064067,0.016003,36.701538,0.164167
min,-0.146668,0.050052,3.0,0.003205
25%,-0.054021,0.054925,17.0,0.016544
50%,0.055601,0.061784,35.0,0.03003
75%,0.068587,0.073451,62.0,0.066164
max,0.193013,0.193013,312.0,0.994985


In [17]:
# Ratings made by neighbourhood members, with their correlation statistics attached.
score = filtered_df.merge(corrs, on="username").dropna()

# Sample variance of each user's normalized scores (NaN for single-rating users is dropped).
user_var = (
    filtered_df.groupby("username")["normalized_score"]
    .var()
    .to_frame("user_var")
    .dropna()
)
score = score.merge(user_var, on="username")

# Sample variance of each anime's normalized scores.
anime_var = (
    filtered_df.groupby("anime_id")["normalized_score"]
    .var()
    .to_frame("anime_var")
    .dropna()
)
score = score.merge(anime_var, on="anime_id")

In [18]:
# Inspect the neighbour ratings with correlation and variance columns attached.
score

Unnamed: 0,anime_id,my_score,anime_bias,user_bias,normalized_score,corr,similarity,size,corr_var,user_var,anime_var
0,21,9,0.960564,0.102140,0.443436,0.056223,0.056223,142,0.007098,1.639109,2.650513
1,21,8,0.960564,0.520846,-0.975270,0.067872,0.067872,27,0.039632,1.355953,2.650513
2,21,8,0.960564,-0.011404,-0.443020,0.052580,0.052580,68,0.015068,0.869279,2.650513
3,21,10,0.960564,1.191325,0.354251,0.067607,0.067607,4,0.495440,2.420728,2.650513
4,21,7,0.960564,-0.131698,-1.322726,-0.058118,0.058118,52,0.019865,1.266169,2.650513
...,...,...,...,...,...,...,...,...,...,...,...
4034585,15961,9,-0.743860,1.273582,0.976418,0.058982,0.058982,19,0.058415,1.273887,0.770146
4034586,24121,7,-1.093860,1.900079,-1.300079,-0.064992,0.064992,27,0.039663,1.048035,2.944382
4034587,33946,10,-3.827193,-0.102556,6.435889,0.057176,0.057176,30,0.035481,3.731532,9.060473
4034588,31673,6,-1.711251,-0.232990,0.450382,0.056620,0.056620,32,0.033120,2.229561,4.062124


In [19]:
# Per anime: correlation-weighted mean of the neighbours' normalized scores,
# the total absolute weight behind it, and the number of contributing ratings.
score_by_anime = score.groupby("anime_id")
deltas = score_by_anime.apply(
    lambda g: np.dot(g["normalized_score"], g["corr"]) / g["corr"].abs().sum()
)
weights = score_by_anime.apply(lambda g: g["corr"].abs().sum())
counts = score_by_anime.size()

In [20]:
# The following formulae are used to compute the variance of the delta. Delta
# is a weighted sum of the form δ = Σ(s_i * w_i) / (Σw_i), where s_i is
# the vector of scores for user i and w_i is the weight for user i.
#
# By linearity, it suffices to compute (s_i * w_i) / (Σw_i). We assume that
# Var(s_i) is the same as the variance over the vector s_i (i.e. over
# all items s_i has rated). We treat w_i as a random variable with mean w_i
# and variance corr['corr_var']
#
# The variance for (w_i) / (Σw_i) can be estimated by doing a Taylor Approximation.
# See equation 20 of https://www.stat.cmu.edu/~hseltman/files/ratio.pdf. The
# formula for the ratio of two correlated variables R,S is
# Var(R/S) = E[R]^2/E[S]^2(Var[R]/E[R]^2 - 2Cov(R,S)/(E[R]E[S]) + Var[S]/E[S]^2)
#
# Lastly we take the product distribution of s_i and (w_i) / (Σw_i).
def correction_factor(x):
    """Per-row multiplier that accounts for the uncertainty of the weights.

    ``x`` is a frame with ``corr`` (the weight) and ``corr_var`` (its variance)
    columns. The result is one value per row:
    1 + Var[w]/w^2 - 2*Var[w]/(|w| * sum|w|) + sum(Var[w]) / (sum|w|)^2,
    i.e. one plus the ratio-variance terms described in the comment above.
    """
    weight_var = x["corr_var"]
    abs_weight = x["corr"].abs()
    total_weight = abs_weight.sum()

    own_term = weight_var / (x["corr"] ** 2)
    cross_term = 2 * weight_var / (total_weight * abs_weight)
    total_term = weight_var.sum() / (total_weight ** 2)
    return 1 + own_term - cross_term + total_term


def _weighted_delta_var(x):
    # Sum of per-user variance contributions, scaled by the squared total weight.
    contributions = x["user_var"] * x["corr"] ** 2 * correction_factor(x)
    return np.sum(contributions) / (x["corr"].abs().sum() ** 2)


def _bessel_like_correction(x):
    # (sum|w|)^2 / ((sum|w|)^2 - sum(w^2)); only defined for more than one rating.
    squared_total = x["corr"].abs().sum() ** 2
    return squared_total / (squared_total - (x["corr"] ** 2).sum())


delta_var = score.groupby("anime_id").apply(_weighted_delta_var)

# if the var < 0, then the ratio distribution approximation failed,
# usually because sample size is too small
delta_var = delta_var.mask(delta_var < 0, np.inf)

# The above is a biased estimator of the variance. To unbias the estimator,
# we need to apply a Bessel-like correction. See the formula in
# (https://stats.stackexchange.com/questions/47325/bias-correction-in-weighted-variance)
# Anime with a single rating get no correction value, so their variance
# becomes NaN after the multiplication below.
bias_correction = (
    score.set_index("anime_id")
    .loc[counts > 1]
    .groupby("anime_id")
    .apply(_bessel_like_correction)
)
delta_var *= bias_correction

In [21]:
# Per-anime inputs to the prediction: neighbourhood delta, total weight,
# number of ratings and the standard error of the delta.
pred_df = pd.DataFrame(
    {
        "delta": deltas,
        "weight": weights,
        "counts": counts,
        "delta_sem": np.sqrt(delta_var),
    }
)
# Baseline predictor: anime bias + the recommendee's bias + global mean.
pred_df["blp"] = anime_bias + user_bias.loc[recommendee].squeeze() + average_rating
pred_df = pred_df.dropna()

# Fit how strongly delta should move the score, using the shows the
# recommendee has already rated (regression through the origin).
recomendee_seen_shows = pd.merge(
    filtered_df.loc[recommendee], pred_df, on=["anime_id"]
)
recomendee_seen_shows["target"] = (
    recomendee_seen_shows["my_score"] - recomendee_seen_shows["blp"]
)
model = lm("target ~ delta + 0", recomendee_seen_shows)
pred_df["score"] = model.predict(pred_df) + pred_df["blp"]

# Variance of a product of two independent estimates (delta and its
# coefficient): (Var[X] + E[X]^2)(Var[Y] + E[Y]^2) - E[X]^2 E[Y]^2.
delta_sq = pred_df["delta"] ** 2
slope, slope_se = model.params["delta"], model.bse["delta"]
pred_df["sem"] = np.sqrt(
    (pred_df["delta_sem"] ** 2 + delta_sq) * (slope_se ** 2 + slope ** 2)
    - delta_sq * slope ** 2
)

# Two-sided normal interval at the configured confidence level.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
margin = pred_df["sem"] * zscore
pred_df["score_lower_bound"] = pred_df["score"] - margin
pred_df["score_upper_bound"] = pred_df["score"] + margin

# Attach title/type and restore anime_id as the index.
pred_df = pred_df.merge(anime, on="anime_id").set_index("anime_id")

In [22]:
# Sanity check: the prediction model fixes the coefficient on blp at 1 (it
# regresses my_score - blp on delta). Fitting blp freely here should therefore
# give it a coefficient close to 1 if that choice is reasonable.
print(lm("my_score ~ delta + blp + 0", recomendee_seen_shows).summary())

                                 OLS Regression Results                                
Dep. Variable:               my_score   R-squared (uncentered):                   0.987
Model:                            OLS   Adj. R-squared (uncentered):              0.987
Method:                 Least Squares   F-statistic:                          1.334e+04
Date:                Tue, 27 Apr 2021   Prob (F-statistic):                        0.00
Time:                        17:34:41   Log-Likelihood:                         -391.29
No. Observations:                 344   AIC:                                      786.6
Df Residuals:                     342   BIC:                                      794.3
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [23]:
# Sanity check: the 20 highest lower bounds should belong to shows the user
# actually rates highly.
pred_df.sort_values(by="score_lower_bound").tail(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
820,0.274404,89.036768,1331,0.107827,7.872021,8.795004,0.363825,7.857854,9.732154,Ginga Eiyuu Densetsu,OVA
2724,0.851866,21.335829,316,0.173045,6.507725,9.373049,0.588204,7.857937,10.888161,Daicon Opening Animations,Special
962,0.419216,67.34816,1012,0.092685,7.262308,8.672377,0.314558,7.862129,9.482625,Aria The Natural,TV
2563,0.515477,44.288698,664,0.103095,7.076386,8.810238,0.350544,7.907296,9.71318,Aria The OVA: Arietta,OVA
2001,0.18498,649.978306,9787,0.038981,7.661604,8.283799,0.132407,7.942741,8.624857,Tengen Toppa Gurren Lagann,TV
1689,0.3914,603.129721,9043,0.038961,7.022911,8.339419,0.136555,7.987677,8.691162,Byousoku 5 Centimeter,Movie
849,0.403701,662.417191,9995,0.038866,7.05583,8.413712,0.136589,8.061882,8.765542,Suzumiya Haruhi no Yuuutsu,TV
12467,0.766201,140.389921,2163,0.074868,6.198381,8.775565,0.262794,8.098653,9.452477,Nazo no Kanojo X,TV
3785,0.672145,227.921676,3347,0.05636,6.528636,8.789455,0.200676,8.272548,9.306362,Evangelion: 3.0 You Can (Not) Redo,Movie
2759,0.458383,380.629629,5622,0.046101,7.150698,8.692509,0.161451,8.276639,9.108379,Evangelion: 1.0 You Are (Not) Alone,Movie


In [24]:
# Candidate recommendations: remove everything the recommendee has already
# rated, then remove entries whose type is one of the excluded types.
excluded_types = ["Movie", "Special", "OVA", "ONA", "Music"]
new_recs = pred_df.drop(filtered_df.loc[recommendee].anime_id, errors="ignore")
new_recs = new_recs.loc[~new_recs["type"].isin(excluded_types)]

In [25]:
# Top 20 candidates with a positive neighbourhood delta, ordered by lower bound (best last).
new_recs[new_recs["delta"] > 0].sort_values(by="score_lower_bound").tail(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
23269,0.062654,36.869885,573,0.110786,6.572363,6.783106,0.372843,5.822725,7.743487,Hello!! Kiniro Mosaic,TV
2403,0.16733,86.216131,1347,0.091952,6.076891,6.639719,0.309848,5.841603,7.437835,Kodomo no Jikan (TV),TV
1852,0.118173,55.676867,852,0.103041,6.394334,6.791817,0.346925,5.898197,7.685437,Hidamari Sketch,TV
634,0.150549,57.624397,875,0.100454,6.273674,6.780059,0.338347,5.908534,7.651584,Koi Kaze,TV
1088,0.026222,45.7528,701,0.126069,6.935756,7.023957,0.42423,5.931213,8.116701,Macross,TV
122,0.096829,52.248521,823,0.152885,6.936235,7.261927,0.514544,5.936549,8.587304,Full Moon wo Sagashite,TV
26165,0.278867,52.71425,812,0.102497,5.895212,6.833204,0.345973,5.942036,7.724372,Yuri Kuma Arashi,TV
31771,0.17447,40.208815,636,0.10531,6.27845,6.865294,0.354776,5.951452,7.779136,Amanchu!,TV
593,0.254619,25.066945,383,0.168361,6.555974,7.412406,0.567081,5.951702,8.87311,Mugen no Ryvius,TV
14131,0.028748,122.058312,1867,0.078637,6.539773,6.636469,0.264627,5.954835,7.318103,Girls & Panzer,TV


In [26]:
# Titles of the same top-20 selection, in the same order (best last).
(
    new_recs[new_recs["delta"] > 0]
    .sort_values(by="score_lower_bound")
    .tail(20)["title"]
    .tolist()
)

['Hello!! Kiniro Mosaic',
 'Kodomo no Jikan (TV)',
 'Hidamari Sketch',
 'Koi Kaze',
 'Macross',
 'Full Moon wo Sagashite',
 'Yuri Kuma Arashi',
 'Amanchu!',
 'Mugen no Ryvius',
 'Girls & Panzer',
 'Mobile Suit Gundam',
 'Hidamari Sketch x Honeycomb',
 'Mobile Suit Zeta Gundam',
 'Kimi ga Nozomu Eien',
 'Hidamari Sketch x 365',
 'Maison Ikkoku',
 'Hidamari Sketch x ☆☆☆',
 'Texhnolyze',
 'Kemono Friends',
 'Uchuu Patrol Luluco']