In [1]:
import functools
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    # R-style convenience wrapper: build the OLS model from the given formula
    # arguments and return the fitted results rather than the unfitted model.
    model = smf.ols(*args, **kwargs)
    return model.fit()

In [2]:
# Username whose ratings the recommendations are generated for.
recommendee = "Fro116"
# Number of highest-similarity users kept when the neighborhood is selected.
neighborhood_size = 8192
# Confidence level used to turn the standard error into lower/upper score bounds.
confidence_interval = 0.95

In [3]:
# Anime metadata, reduced to the id, title and type columns.
anime = pd.read_csv("AnimeList.csv").loc[:, ["anime_id", "title", "type"]]

In [4]:
# Load the raw user rating lists; the cells below read the "username",
# "anime_id" and "my_score" columns from this frame.
df = pd.read_csv("UserAnimeList.csv")

In [5]:
# Number of distinct users and distinct anime in the raw ratings
# (missing values, if any, count as one distinct value).
df["username"].nunique(dropna=False), df["anime_id"].nunique(dropna=False)

(283045, 14478)

In [6]:
# Keep only the columns needed for collaborative filtering and discard rows
# whose score is 0.
filtered_df = df.loc[df["my_score"] != 0, ["username", "anime_id", "my_score"]]

In [7]:
def read_xml(file, username):
    """Read an exported XML list and return its non-zero ratings.

    Every direct child of the XML root is treated as one entry. The text of
    its first sub-element (position 0) is taken as the anime id and the text
    of its tenth sub-element (position 9) as the score. Entries that have
    fewer than ten sub-elements, or whose value at either position is empty,
    are skipped.

    Parameters
    ----------
    file : path or file object
        XML file to parse. It is handed to ``ElementTree.parse``, which opens
        and closes paths itself and honours the encoding given in the XML
        declaration.
    username : str
        Name stored in the ``username`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``anime_id`` (int), ``my_score`` (int) and ``username`` (str),
        with a fresh 0..n-1 index. Rows with a score of 0 are removed. The
        frame is empty (but has all three columns) when no entry qualifies.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    ValueError
        If an id or score cannot be converted to ``int``.
    """
    import xml.etree.ElementTree as ET

    root = ET.parse(file).getroot()

    # Collect (id, score) pairs positionally; short entries (such as a header
    # element with fewer fields) and entries with empty values are dropped.
    records = []
    for entry in root:
        fields = [field.text for field in entry]
        if len(fields) > 9 and fields[0] is not None and fields[9] is not None:
            records.append((fields[0], fields[9]))

    df = pd.DataFrame(records, columns=["anime_id", "my_score"])
    df["anime_id"] = df["anime_id"].astype(int)
    df["my_score"] = df["my_score"].astype(int)
    df["username"] = username
    df["username"] = df["username"].astype(str)

    # Drop rows with a score of 0, matching the filter applied to the main ratings.
    df = df.loc[lambda x: x["my_score"] != 0]
    return df.reset_index(drop=True)


def add_user(full_df, xml_file, username):
    """Return ``full_df`` with ``username``'s rows replaced by those in ``xml_file``.

    Any rows already present for ``username`` are removed, the ratings parsed
    by ``read_xml`` are appended, and the result gets a fresh integer index.
    ``full_df`` itself is not modified.
    """
    imported_rows = read_xml(xml_file, username)
    is_other_user = full_df["username"] != username
    return pd.concat([full_df.loc[is_other_user], imported_rows], ignore_index=True)

In [8]:
# Merge the recommendee's exported list into the ratings, replacing any rows
# already present for that user. The name comes from the `recommendee` setting
# so that changing the configuration is enough to switch users.
filtered_df = add_user(filtered_df, f"user_profiles/{recommendee}.xml", recommendee)

In [9]:
# Global mean rating, plus each user's and each anime's mean offset from it.
average_rating = filtered_df["my_score"].mean()
user_bias = (
    filtered_df.groupby("username")["my_score"]
    .mean()
    .sub(average_rating)
    .to_frame("user_bias")
)
anime_bias = (
    filtered_df.groupby("anime_id")["my_score"]
    .mean()
    .sub(average_rating)
    .to_frame("anime_bias")
)

In [10]:
# Attach both bias terms to every rating, then subtract them (and the global
# mean) to get the residual "normalized" score. The frame is indexed by
# username afterwards and rows containing missing values are dropped.
filtered_df = (
    filtered_df.merge(anime_bias, on=["anime_id"])
    .merge(user_bias, on=["username"])
    .assign(
        normalized_score=lambda ratings: ratings["my_score"]
        - ratings["anime_bias"]
        - ratings["user_bias"]
        - average_rating
    )
    .set_index("username")
    .dropna()
)

In [11]:
# Inspect the normalized ratings table (the rendered output is truncated).
filtered_df

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
karthiga,21,9,0.960564,-0.059898,0.605473
karthiga,59,7,0.040203,-0.059898,-0.474166
karthiga,74,7,0.316283,-0.059898,-0.750245
karthiga,120,7,0.309858,-0.059898,-0.743821
karthiga,178,7,-0.227338,-0.059898,-0.206624
...,...,...,...,...,...
temptemptemp,10040,6,-1.636717,-1.493860,1.636717
cinnamoroller,12963,10,-0.798860,2.506140,0.798860
inactiveX,5143,7,-0.652951,-0.493860,0.652951
omgm,5581,5,-1.857496,-2.493860,1.857496


In [12]:
# # Should we normalize by variance?
# filtered_df.groupby("username")["normalized_score"].std().hist(bins=100)
# user_stds = (
#     filtered_df.groupby("username")[["normalized_score"]]
#     .std()
#     .rename({"normalized_score": "user_std"}, axis=1)
# )
# user_stds = user_stds.dropna().loc[lambda x: x["user_std"] != 0]
# filtered_df = filtered_df.merge(user_stds, on="username")
# filtered_df["normalized_score"] /= filtered_df["user_std"]
# filtered_df = filtered_df.drop("user_std", axis=1)

In [13]:
# Pair each of the recommendee's ratings (columns suffixed _x) with every
# rating of the same anime by any user (columns suffixed _y).
user_subset = pd.merge(
    filtered_df.loc[[recommendee]],
    filtered_df.reset_index(),
    on="anime_id",
)

In [14]:
# Adjusted-cosine similarity between the recommendee and every user.
# Numerator: dot product of normalized scores over the anime both have rated.
adj_cos_corr_numerator = user_subset.groupby("username").apply(
    lambda shared: np.dot(shared["normalized_score_x"], shared["normalized_score_y"])
)
# Denominator: norm of each user's full normalized-score vector, multiplied by
# the recommendee's own norm.
adj_cos_corr_denom = filtered_df.groupby("username").apply(
    lambda ratings: np.sqrt(
        np.dot(ratings["normalized_score"], ratings["normalized_score"])
    )
)
adj_cos_corr_denom = adj_cos_corr_denom * adj_cos_corr_denom.loc[recommendee]
# Users with an undefined ratio (e.g. no shared anime) are dropped.
adj_cos_corrs = (
    (adj_cos_corr_numerator / adj_cos_corr_denom).to_frame("corr").dropna()
)

In [15]:
# NOTE(review): these repeat the values already assigned in the configuration
# cell near the top of the notebook; keep the two places in sync when tuning.
neighborhood_size = 8192
confidence_interval = 0.95

In [16]:
# Rank users by absolute correlation and keep the `neighborhood_size` strongest.
corrs = (
    adj_cos_corrs.assign(similarity=lambda frame: frame["corr"].abs())
    .sort_values(by="similarity")
    .dropna()
    .iloc[-neighborhood_size:]
)
# Remove the recommendee's own row. Technically not needed because it's a
# no-op for new series, but it's useful for debugging.
corrs = corrs.drop(index=recommendee)

In [17]:
# Distribution of the absolute correlations inside the selected neighborhood.
corrs["similarity"].describe()

count    8191.000000
mean        0.085811
std         0.015910
min         0.069309
25%         0.074172
50%         0.081014
75%         0.092570
max         0.193013
Name: similarity, dtype: float64

In [18]:
# Ratings made by the neighborhood users, each annotated with that user's
# correlation and similarity to the recommendee.
score = filtered_df.merge(corrs, on="username").dropna()

In [19]:
# add standard error of the weighted mean
# TODO make the formula accurate
user_var = (
    pd.DataFrame(filtered_df.groupby("username")["normalized_score"].var())
    .rename({"normalized_score": "user_var"}, axis=1)
    .dropna()
)
score = score.merge(user_var, on="username")

anime_var = (
    pd.DataFrame(filtered_df.groupby("anime_id")["normalized_score"].var())
    .rename({"normalized_score": "anime_var"}, axis=1)
    .dropna()
)
score = score.merge(anime_var, on="anime_id")


# This is the formula for the standard deviation of the delta. Delta
# is a weighted sum of the form δ = Σ(s_i * w_i) / (Σw_i), where s_i is
# the vector of scores for user i and w_i is the weight for user i.
#
# By linearity, it suffices to compute the variance of (s_i * w_i) / (Σw_i).
# We can compute Var(s_i) directly, by taking the variance over the vector s_i
# (i.e. over all items user i has rated). The error for w_i does not have a
# closed-form solution. We assume that w_i follows a Poisson distribution, in
# particular, that Var(w_i) = E[w_i]. There is no theoretical justification for
# this, but it makes the math pretty.
#
# The variance of (w_i) / (Σw_i) can be estimated with a Taylor approximation.
# See equation 20 of https://www.stat.cmu.edu/~hseltman/files/ratio.pdf. The
# formula for the ratio of two correlated variables R, S is
# Var(R/S) = E[R]^2/E[S]^2 (Var[R]/E[R]^2 - 2Cov(R,S)/(E[R]E[S]) + Var[S]/E[S]^2)
#
# Lastly we take the product distribution of s_i and (w_i) / (Σw_i).
def _delta_variance(group):
    """Approximate variance of the weighted-mean delta for one anime's ratings."""
    weight = group["corr"].abs()
    total_weight = weight.sum()
    ratio_term = 1 + 1 / weight - 2 / (total_weight - weight) + 1 / total_weight
    return np.sum(group["user_var"] * weight * weight * ratio_term) / (
        total_weight * total_weight
    )


delta_sem = score.groupby("anime_id").apply(_delta_variance)
# A negative variance means the ratio distribution approximation failed,
# usually because the sample size is too small; treat the error as unbounded.
# (The original cell applied this clamp to an undefined name, `sem`.)
delta_sem = np.sqrt(delta_sem.mask(delta_sem < 0, np.inf))

In [25]:
# Inspect the neighbor ratings after the variance columns have been merged in.
score

Unnamed: 0,anime_id,my_score,anime_bias,user_bias,normalized_score,corr,similarity,user_var,anime_var
0,21,10,0.960564,-0.271638,1.817213,-0.083694,0.083694,5.162513,2.650513
1,21,10,0.960564,1.216666,0.328909,0.076136,0.076136,0.629076,2.650513
2,21,8,0.960564,-0.970051,0.515626,0.094892,0.094892,7.148230,2.650513
3,21,10,0.960564,0.481750,1.063826,0.081828,0.081828,2.547332,2.650513
4,21,7,0.960564,0.355655,-1.810079,0.098160,0.098160,1.699027,2.650513
...,...,...,...,...,...,...,...,...,...
1113087,4703,10,-0.140799,0.003494,2.643444,0.073157,0.073157,3.679393,1.307741
1113088,4703,5,-0.140799,-1.065289,-1.287773,0.077408,0.077408,1.162838,1.307741
1113089,7216,5,-1.933860,0.238283,-0.798283,-0.070809,0.070809,1.733856,1.220686
1113090,1745,6,-2.743860,-0.136717,1.386717,0.090460,0.090460,3.249869,4.382262


In [26]:
# Per-anime neighborhood signal: the correlation-weighted average of the
# neighbors' normalized scores (signed correlations in the numerator,
# absolute correlations in the denominator).
deltas = score.groupby("anime_id").apply(
    lambda x: np.dot(x["normalized_score"], x["corr"]) / x["corr"].abs().sum()
)
# Total absolute correlation and number of neighbor ratings behind each delta.
weights = score.groupby("anime_id").apply(lambda x: x["corr"].abs().sum())
counts = score.groupby("anime_id").size()

# Assemble the per-anime prediction table, indexed by anime_id.
pred_df = pd.DataFrame()
pred_df["delta"] = deltas
pred_df["weight"] = weights
pred_df["counts"] = counts
pred_df["delta_sem"] = delta_sem
# Baseline prediction: anime bias + the recommendee's user bias + global mean.
pred_df["blp"] = anime_bias + user_bias.loc[recommendee].squeeze() + average_rating
# Drop anime for which any of the columns above is missing.
pred_df = pred_df.dropna()

# Calibrate delta on the shows the recommendee has already rated: regress the
# residual (actual score minus baseline) on delta with no intercept.
recomendee_seen_shows = filtered_df.loc[recommendee].merge(pred_df, on=["anime_id"])
recomendee_seen_shows["target"] = (
    recomendee_seen_shows["my_score"] - recomendee_seen_shows["blp"]
)
model = lm("target ~ delta + 0", recomendee_seen_shows)
# Predicted score = fitted coefficient * delta + baseline.
pred_df["score"] = model.predict(pred_df) + pred_df["blp"]
# Standard error of (coefficient * delta). The expression has the form
# (sd_d^2 + d^2) * (sd_b^2 + b^2) - d^2 * b^2, i.e. the variance of a product
# of two variables treated as independent, with d = delta and b = the fitted
# coefficient (model.bse is its standard error).
pred_df["sem"] = np.sqrt(
    (
        (
            pred_df["delta_sem"] * pred_df["delta_sem"]
            + pred_df["delta"] * pred_df["delta"]
        )
        * (
            model.bse["delta"] * model.bse["delta"]
            + model.params["delta"] * model.params["delta"]
        )
    )
    - pred_df["delta"]
    * pred_df["delta"]
    * model.params["delta"]
    * model.params["delta"]
)
# Two-sided normal quantile for the configured confidence level.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)

# account for variance scaling
# pred_df['score'] *= user_stds.loc[recommendee].squeeze()
# pred_df['sem'] *= user_stds.loc[recommendee].squeeze()

pred_df["score_lower_bound"] = pred_df["score"] - pred_df["sem"] * zscore
pred_df["score_upper_bound"] = pred_df["score"] + pred_df["sem"] * zscore

# Attach title/type metadata; the merge drops the index, so restore anime_id.
pred_df = pred_df.merge(anime, on="anime_id")
pred_df = pred_df.set_index("anime_id")

In [27]:
# Sanity check for the model above, which fixes the coefficient of blp at 1
# (target = my_score - blp): fit my_score on both delta and blp with no
# intercept and confirm that the fitted blp coefficient is close to 1.
print(lm("my_score ~ delta + blp + 0", recomendee_seen_shows).summary())

                                 OLS Regression Results                                
Dep. Variable:               my_score   R-squared (uncentered):                   0.986
Model:                            OLS   Adj. R-squared (uncentered):              0.986
Method:                 Least Squares   F-statistic:                          1.205e+04
Date:                Sat, 24 Apr 2021   Prob (F-statistic):                   8.55e-318
Time:                        01:57:58   Log-Likelihood:                         -408.55
No. Observations:                 344   AIC:                                      821.1
Df Residuals:                     342   BIC:                                      828.8
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [28]:
# Sanity check: the shows with the highest lower bound should be ones that
# the user rates highly.
pred_df.sort_values(by="score_lower_bound").tail(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
11061,-0.021535,137.836131,1646,0.113567,7.998886,7.937581,0.323453,7.303624,8.571538,Hunter x Hunter (2011),TV
4382,0.646165,170.781009,2012,0.103202,6.114352,7.953832,0.299239,7.367334,8.540331,Suzumiya Haruhi no Yuuutsu (2009),TV
9253,-0.067285,303.484013,3582,0.077387,8.054028,7.862484,0.220484,7.430344,8.294624,Steins;Gate,TV
820,0.343905,37.592547,445,0.243022,7.872021,8.851039,0.692794,7.493187,10.208891,Ginga Eiyuu Densetsu,OVA
2001,0.16347,265.344336,3140,0.083716,7.661604,8.126965,0.238854,7.658819,8.59511,Tengen Toppa Gurren Lagann,TV
12467,0.9218,49.824591,596,0.186482,6.198381,8.822531,0.537119,7.769796,9.875266,Nazo no Kanojo X,TV
849,0.417324,265.237207,3132,0.083534,7.05583,8.243853,0.240658,7.772171,8.715535,Suzumiya Haruhi no Yuuutsu,TV
1689,0.462214,250.276396,2944,0.084308,7.022911,8.338727,0.24345,7.861574,8.815881,Byousoku 5 Centimeter,Movie
2759,0.526898,169.194421,1978,0.106709,7.150698,8.650652,0.307344,8.048269,9.253036,Evangelion: 1.0 You Are (Not) Alone,Movie
3785,0.821355,102.754406,1194,0.138486,6.528636,8.866842,0.400822,8.081245,9.652439,Evangelion: 3.0 You Can (Not) Redo,Movie


In [29]:
# Candidate recommendations: anime the recommendee has not rated, excluding
# movies, specials, OVAs and ONAs (movies tend to be recaps of TV series).
new_recs = pred_df.drop(
    filtered_df.loc[recommendee]["anime_id"], errors="ignore"
).loc[lambda recs: ~recs["type"].isin(["Movie", "Special", "OVA", "ONA"])]

In [30]:
# Predictions restricted to the anime the recommendee has already rated.
seen_shows = pred_df.loc[
    pred_df.index.intersection(filtered_df.loc[recommendee, "anime_id"])
]

In [31]:
# Attach the recommendee's actual score to each seen show, aligned on anime_id.
# `assign` builds a new frame instead of writing a column into a frame that
# was sliced out of pred_df, which avoids pandas' SettingWithCopyWarning.
seen_shows = seen_shows.assign(
    my_score=filtered_df.loc[recommendee].set_index("anime_id")["my_score"]
)

In [32]:
# Mean squared error of the predicted scores on the shows the recommendee
# has already rated.
errors = seen_shows["my_score"].sub(seen_shows["score"])
mse = np.dot(errors, errors) / errors.shape[0]
print(mse)

0.6637918122857163


In [33]:
# using all data gets you 0.6569931689855164

In [34]:
# Recompute the interval bounds of the candidate recommendations from the
# configured confidence level (rather than a second hardcoded 0.95), so this
# cell stays consistent with the rest of the notebook when the setting changes.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
print(zscore)
# `assign` returns a new frame, so nothing is written into a filtered slice.
new_recs = new_recs.assign(
    score_lower_bound=new_recs["score"] - new_recs["sem"] * zscore,
    score_upper_bound=new_recs["score"] + new_recs["sem"] * zscore,
)

1.959963984540054


In [35]:
# Spot check: prediction for a single title.
new_recs[new_recs["title"] == "Koi Kaze"]

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
634,0.180416,21.736596,256,0.292218,6.273674,6.787277,0.832411,5.155782,8.418772,Koi Kaze,TV


In [36]:
# Spot check: prediction for a single title.
new_recs[new_recs["title"] == "Pingu in the City"]

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
36259,0.503339,6.65149,79,0.628341,6.738497,8.171387,1.790107,4.662842,11.679932,Pingu in the City,TV


In [37]:
# Shortlist: candidates whose upper bound exceeds 8 and whose neighborhood
# signal is positive, showing the 20 with the highest lower bound.
new_recs[(new_recs["score_upper_bound"] > 8) & (new_recs["delta"] > 0)].sort_values(
    by="score_lower_bound"
).tail(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
593,0.383089,8.861453,104,0.484339,6.555974,7.646539,1.379842,4.942098,10.350979,Mugen no Ryvius,TV
32526,0.122845,20.420723,245,0.306683,6.370506,6.720217,0.873525,5.008139,8.432295,Love Live! Sunshine!!,TV
1088,0.00091,16.825802,202,0.340776,6.935756,6.938348,0.970561,5.036084,8.840612,Macross,TV
8726,0.153821,16.220343,195,0.317402,6.37779,6.815684,0.904088,5.043705,8.587663,Soredemo Machi wa Mawatteiru,TV
85,0.153128,13.297574,161,0.411727,6.911561,7.347481,1.172712,5.049009,9.645954,Mobile Suit Zeta Gundam,TV
122,0.019954,16.779641,205,0.341407,6.936235,6.993039,0.97236,5.087248,8.89883,Full Moon wo Sagashite,TV
1852,0.15826,20.348025,240,0.304512,6.394334,6.844863,0.867387,5.144815,8.54491,Hidamari Sketch,TV
1454,0.262425,16.769692,202,0.351584,6.370114,7.117176,1.001603,5.15407,9.080282,Kemonozume,TV
634,0.180416,21.736596,256,0.292218,6.273674,6.787277,0.832411,5.155782,8.418772,Koi Kaze,TV
3604,0.133615,12.446086,148,0.374607,6.869922,7.250291,1.066979,5.159052,9.341531,Hidamari Sketch x 365,TV
