In [1]:
import functools
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    # Convenience wrapper: build the formula-based OLS model with the given
    # arguments and hand back the fitted results instead of the unfitted model.
    # (functools.wraps replaces this function's metadata with smf.ols's, so
    # the explanation lives in a comment rather than a docstring.)
    unfitted_model = smf.ols(*args, **kwargs)
    return unfitted_model.fit()

In [2]:
# Username whose recommendations are computed by the cells below.
recommendee = "Fro116"
# NOTE(review): not referenced by any cell visible here -- presumably the number
# of neighbours to keep per item; confirm whether a truncation step is missing.
neighborhood_size = 100
# Two-sided confidence level used for the score bounds (fed to st.norm.ppf).
confidence_interval = 0.95

In [3]:
# Anime metadata: keep only the id plus the title/type columns that are
# attached to the prediction table later on.
anime = pd.read_csv("AnimeList.csv")[["anime_id", "title", "type"]]

In [4]:
# Raw user/anime rating table; later cells read the "username", "anime_id"
# and "my_score" columns from it.
df = pd.read_csv("UserAnimeList.csv")

In [5]:
# (number of distinct usernames, number of distinct anime ids) in the raw table.
len(df["username"].unique()), len(df["anime_id"].unique())

(283045, 14478)

In [6]:
# Keep only the three rating columns and discard rows whose score is 0.
filtered_df = df.loc[df["my_score"] != 0, ["username", "anime_id", "my_score"]]

In [7]:
def read_xml(file, username):
    """Load one user's scored entries from an XML list export.

    Every child element of the XML root is treated as one entry. The text of
    the entry's 1st sub-element (position 0) is used as the anime id and the
    text of its 10th sub-element (position 9) as the score.
    NOTE(review): these positions presumably match the MyAnimeList export
    layout -- confirm against a real export.

    Entries with fewer than ten sub-elements, or with missing text in either
    of the two positions, are skipped. Entries whose score is 0 are dropped.

    Args:
        file: Path of the XML file to read.
        username: Name stored in the "username" column of every row.

    Returns:
        DataFrame with integer columns "anime_id" and "my_score", a string
        column "username", and a fresh 0..n-1 index. The frame is empty when
        the file contains no usable entry.
    """
    import xml.etree.ElementTree as ET

    id_pos, score_pos = 0, 9

    # Close the file deterministically instead of leaking the handle.
    with open(file, "r") as handle:
        root = ET.XML(handle.read())

    rows = []
    for child in root:
        texts = [subchild.text for subchild in child]
        if len(texts) <= score_pos:
            # Too short to carry a score (e.g. a header/summary element).
            continue
        anime_id, my_score = texts[id_pos], texts[score_pos]
        if anime_id is None or my_score is None:
            continue
        rows.append((anime_id, my_score))

    df = pd.DataFrame(rows, columns=["anime_id", "my_score"])
    df["username"] = username
    df["anime_id"] = df["anime_id"].astype(int)
    df["my_score"] = df["my_score"].astype(int)
    df["username"] = df["username"].astype(str)
    df = df.loc[lambda x: x["my_score"] != 0]
    df = df.reset_index(drop=True)
    return df


def add_user(full_df, xml_file, username):
    """Return `full_df` with `username`'s rows replaced by the XML export.

    The user's list is read from `xml_file` with `read_xml`, every row of
    `full_df` already stored under `username` is left out, and the freshly
    read rows are appended at the end. The result gets a new 0..n-1 index;
    `full_df` itself is not modified.
    """
    fresh_rows = read_xml(xml_file, username)
    belongs_to_others = full_df["username"] != username
    everyone_else = full_df.loc[belongs_to_others]
    return pd.concat([everyone_else, fresh_rows], ignore_index=True)

In [8]:
# Splice the recommendee's exported list into the ratings, replacing any rows
# already stored under that username. Path and name follow `recommendee`, so
# editing the configuration cell is enough to switch users.
filtered_df = add_user(
    filtered_df, f"user_profiles/{recommendee}.xml", recommendee
)

In [9]:
# Global mean score: the reference point for both bias tables.
average_rating = filtered_df["my_score"].mean()

# Per-user offset from the global mean (indexed by username).
user_bias = (
    filtered_df.groupby("username")["my_score"].mean().to_frame("user_bias")
    - average_rating
)

# Per-anime offset from the global mean (indexed by anime_id).
anime_bias = (
    filtered_df.groupby("anime_id")["my_score"].mean().to_frame("anime_bias")
    - average_rating
)

In [10]:
# Attach both biases to every rating, subtract them (and the global mean) to
# obtain the residual score, then index by username and drop incomplete rows.
filtered_df = (
    filtered_df.merge(anime_bias, on="anime_id")
    .merge(user_bias, on="username")
    .assign(
        normalized_score=lambda x: x["my_score"]
        - x["anime_bias"]
        - x["user_bias"]
        - average_rating
    )
    .set_index("username")
    .dropna()
)

In [11]:
# Inspect the normalized rating table (the rendered output is truncated).
filtered_df

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
karthiga,21,9,0.960564,-0.059898,0.605473
karthiga,59,7,0.040203,-0.059898,-0.474166
karthiga,74,7,0.316283,-0.059898,-0.750245
karthiga,120,7,0.309858,-0.059898,-0.743821
karthiga,178,7,-0.227338,-0.059898,-0.206624
...,...,...,...,...,...
temptemptemp,10040,6,-1.636717,-1.493860,1.636717
cinnamoroller,12963,10,-0.798860,2.506140,0.798860
inactiveX,5143,7,-0.652951,-0.493860,0.652951
omgm,5581,5,-1.857496,-2.493860,1.857496


In [12]:
# NOTE: unpickling can execute arbitrary code; only load a correlations file
# produced by a trusted run of this project.
# The context manager closes the file handle once the table is loaded.
with open("item_correlations/correlations.pkl", "rb") as corr_file:
    all_corrs = pickle.load(corr_file)

In [13]:
# Placeholder values until real estimates are available. With corr_var == 0
# every corr_var term in correction_factor (defined further down) vanishes,
# so that factor evaluates to 1.
all_corrs['corr_var'] = 0 # TODO fix
all_corrs['size'] = -1 # TODO fix

In [None]:
# Similarity is the magnitude of the correlation; rows with any missing value
# are discarded afterwards.
all_corrs = all_corrs.assign(similarity=all_corrs["corr"].abs()).dropna()

In [None]:
# Drop the self-pairs: keep only rows whose two index levels name different anime.
corrs = all_corrs[
    all_corrs.index.get_level_values("anime_id_x")
    != all_corrs.index.get_level_values("anime_id_y")
]

In [None]:
# Within each anime_id_x group, order the rows by ascending similarity. The
# following cell removes the outermost index level left behind by this step.
# NOTE(review): neighborhood_size is never applied in the visible cells -- if
# only the most similar neighbours were meant to be kept, that truncation is
# missing here; confirm.
corrs = corrs.groupby("anime_id_x").apply(
    lambda x: x.sort_values(by="similarity")
)

In [None]:
# Remove the outermost index level (droplevel() defaults to level 0).
# Not idempotent: running this cell again strips one more level.
corrs.index = corrs.index.droplevel()

In [None]:
# Inspect the per-anime correlation table after filtering and sorting.
corrs

In [18]:
# Pair every show the recommendee has rated (anime_id) with the candidate
# shows it is correlated with (anime_id_x).
score = filtered_df.loc[recommendee].merge(
    corrs.reset_index("anime_id_x"), left_on="anime_id", right_on="anime_id_y"
)

# Variance of each user's normalized scores; users for which pandas cannot
# compute a variance (NaN) are dropped.
user_var = (
    pd.DataFrame(filtered_df.groupby("username")["normalized_score"].var())
    .rename({"normalized_score": "user_var"}, axis=1)
    .dropna()
)
# Look the variance up for the configured recommendee rather than a
# hard-coded username, so it stays consistent with the rows selected above.
score["user_var"] = user_var.loc[recommendee].squeeze()

# Variance of each anime's normalized scores, attached via the rated show's id.
anime_var = (
    pd.DataFrame(filtered_df.groupby("anime_id")["normalized_score"].var())
    .rename({"normalized_score": "anime_var"}, axis=1)
    .dropna()
)
score = score.merge(anime_var, on="anime_id")

# From here on "anime_id" identifies the candidate show, not the rated one.
score = score.drop("anime_id", axis=1).rename({"anime_id_x": "anime_id"}, axis=1)

In [19]:
# Peek at the first rows of the (rated show, candidate show) pairing table.
score.head()

Unnamed: 0,my_score,anime_bias,user_bias,normalized_score,anime_id,corr,corr_var,size,similarity,user_var,anime_var
0,7,0.219362,-1.027193,0.313971,24,0.035363,0,-1,0.035363,2.582855,1.630601
1,7,0.219362,-1.027193,0.313971,45,0.055748,0,-1,0.055748,2.582855,1.630601
2,7,0.219362,-1.027193,0.313971,50,0.034549,0,-1,0.034549,2.582855,1.630601
3,7,0.219362,-1.027193,0.313971,120,0.026038,0,-1,0.026038,2.582855,1.630601
4,7,0.219362,-1.027193,0.313971,123,0.048346,0,-1,0.048346,2.582855,1.630601


In [20]:
# Per candidate anime: correlation-weighted mean of the recommendee's
# normalized scores. The sign of corr is kept in the numerator, while the
# denominator normalizes by the total absolute correlation.
deltas = score.groupby("anime_id").apply(
    lambda x: np.dot(x["normalized_score"], x["corr"]) / x["corr"].abs().sum()
)
# Total absolute correlation backing each candidate.
weights = score.groupby("anime_id").apply(lambda x: x["corr"].abs().sum())
# Number of rows (rated shows paired with the candidate) behind each estimate.
counts = score.groupby("anime_id").size()

In [21]:
# The following formulae are used to compute the variance of the delta. Delta
# is a weighted sum of the form δ = Σ(s_i * w_i) / (Σw_i), where s_i is
# a vector of scores for user i and w_i is the weight for user i.
#
# By linearity, it suffices to compute (s_i * w_i) / (Σw_i). We can compute
# Var(s_i) directly, by taking the variance over the vector s_i (i.e. over
# all items s_i has rated). We treat w_i as a random variable with mean w_i
# and variance corr['corr_var']
#
# The variance for (w_i) / (Σw_i) can be estimated by doing a Taylor Approximation.
# See equation 20 of https://www.stat.cmu.edu/~hseltman/files/ratio.pdf. The
# formula for the ratio of two correlated variables R,S is
# Var(R/S) = E[R]^2/E[S]^2(Var[R]/E[R]^2 - 2Cov(R,S)/(E[R]E[S]) + Var[S]/E[S]^2)
#
# Lastly we take the product distribution of s_i and (w_i) / (Σw_i).
def correction_factor(x):
    """Per-row multiplier that accounts for uncertainty in the weights.

    `x` is one group of rows with the columns "corr" (the weight) and
    "corr_var" (the variance assumed for that weight). The result is a Series
    aligned with `x` holding

        1 + corr_var / corr^2
          - 2 * corr_var / (sum|corr| * |corr|)
          + sum(corr_var) / (sum|corr|)^2

    i.e. every row evaluates to exactly 1 when all corr_var values are 0.
    """
    weight_var = x["corr_var"]
    abs_weight = x["corr"].abs()
    total_abs_weight = abs_weight.sum()

    own_term = weight_var / (x["corr"] ** 2)
    cross_term = 2 * weight_var / (total_abs_weight * abs_weight)
    total_term = weight_var.sum() / (total_abs_weight ** 2)

    return 1 + own_term - cross_term + total_term


# Variance of delta per candidate anime: each row contributes
# user_var * corr^2 * correction_factor, and the sum is normalized by the
# squared total absolute correlation of the group.
delta_var = score.groupby("anime_id").apply(
    lambda x: np.sum(x["user_var"] * x["corr"] ** 2 * correction_factor(x))
    / (x["corr"].abs().sum() ** 2)
)

# if the var < 0, then the ratio distribution approximation failed,
# usually because sample size is too small
# (such entries are marked as infinitely uncertain rather than removed)
delta_var.loc[lambda x: x < 0] = np.inf

# The above is a biased estimator of the variance. To unbias the estimator,
# we need to apply a Bessel-like correction. See the formula in
# (https://stats.stackexchange.com/questions/47325/bias-correction-in-weighted-variance)
# Only candidates backed by more than one row get a correction factor.
bias_correction = (
    score.set_index("anime_id")
    .loc[counts > 1]
    .groupby("anime_id")
    .apply(
        lambda x: (x["corr"].abs().sum() ** 2)
        / (x["corr"].abs().sum() ** 2 - (x["corr"] ** 2).sum())
    )
)
# The multiplication aligns on anime_id; candidates without a factor
# (counts <= 1) presumably end up as NaN here -- verify; the later dropna()
# on the prediction table would then remove them.
delta_var *= bias_correction

In [22]:
# Assemble one row per candidate anime from the per-anime series computed above.
pred_df = pd.DataFrame()
pred_df["delta"] = deltas
pred_df["weight"] = weights
pred_df["counts"] = counts
pred_df["delta_sem"] = np.sqrt(delta_var)
# blp: baseline score for the recommendee = anime bias + the recommendee's
# own bias + global average rating.
pred_df["blp"] = anime_bias + user_bias.loc[recommendee].squeeze() + average_rating
# Drop candidates with any missing component (e.g. no usable variance).
pred_df = pred_df.dropna()

# Calibrate delta on the shows the recommendee has already rated: regress the
# residual (actual score minus baseline) on delta, without an intercept.
recomendee_seen_shows = filtered_df.loc[recommendee].merge(pred_df, on=["anime_id"])
recomendee_seen_shows["target"] = (
    recomendee_seen_shows["my_score"] - recomendee_seen_shows["blp"]
)
model = lm("target ~ delta + 0", recomendee_seen_shows)
# Predicted score = fitted slope * delta + baseline.
pred_df["score"] = model.predict(pred_df) + pred_df["blp"]
# Standard error of slope * delta, combining the uncertainty of both factors:
# (Var[d] + d^2) * (Var[b] + b^2) - d^2 * b^2, i.e. the variance of a product
# when the two factors are treated as independent.
pred_df["sem"] = np.sqrt(
    (
        (pred_df["delta_sem"] ** 2 + pred_df["delta"] ** 2)
        * (model.bse["delta"] ** 2 + model.params["delta"] ** 2)
    )
    - pred_df["delta"] ** 2 * model.params["delta"] ** 2
)

# Two-sided normal interval at the configured confidence level.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
pred_df["score_lower_bound"] = pred_df["score"] - pred_df["sem"] * zscore
pred_df["score_upper_bound"] = pred_df["score"] + pred_df["sem"] * zscore

# Attach title/type metadata and restore anime_id as the index.
pred_df = pred_df.merge(anime, on="anime_id")
pred_df = pred_df.set_index("anime_id")

In [23]:
# Confirm that fixing the blp coefficient to 1 (as the model above does by
# regressing my_score - blp on delta) is reasonable: fit blp freely here and
# check that its estimated coefficient is close to 1.
print(lm("my_score ~ delta + blp + 0", recomendee_seen_shows).summary())

                                 OLS Regression Results                                
Dep. Variable:               my_score   R-squared (uncentered):                   0.954
Model:                            OLS   Adj. R-squared (uncentered):              0.954
Method:                 Least Squares   F-statistic:                              3416.
Date:                Mon, 26 Apr 2021   Prob (F-statistic):                   2.44e-220
Time:                        14:43:43   Log-Likelihood:                         -589.49
No. Observations:                 330   AIC:                                      1183.
Df Residuals:                     328   BIC:                                      1191.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [24]:
# Sanity check: the 20 shows with the highest lower bound should be ones the
# user rates highly.
pred_df.sort_values(by="score_lower_bound").tail(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
444,0.803849,1.602563,30,0.312727,6.859356,7.800036,0.379064,7.057084,8.542989,Maria-sama ga Miteru: Haru,TV
820,-0.179875,4.64374,41,0.260532,7.872021,7.661528,0.307029,7.059761,8.263294,Ginga Eiyuu Densetsu,OVA
34284,1.94988,1.854379,13,0.690602,6.432449,8.714239,0.842182,7.063592,10.364886,Yuuki Yuuna wa Yuusha de Aru: Washio Sumi no Shou,TV
427,0.889891,0.812481,19,0.385871,6.94396,7.985329,0.465025,7.073897,8.896761,Kaleido Star,TV
962,0.531688,6.359496,43,0.344323,7.262308,7.884501,0.409416,7.08206,8.686941,Aria The Natural,TV
32281,-0.27576,3.78305,45,0.263708,8.049697,7.726997,0.311677,7.116122,8.337873,Kimi no Na wa.,Movie
5114,-0.338808,4.070269,42,0.267056,8.145486,7.749006,0.316399,7.128876,8.369135,Fullmetal Alchemist: Brotherhood,TV
9253,-0.302975,4.766364,49,0.240978,8.054028,7.69948,0.285464,7.139982,8.258979,Steins;Gate,TV
7311,-0.143919,5.67354,62,0.218752,7.817092,7.648675,0.25774,7.143514,8.153836,Suzumiya Haruhi no Shoushitsu,Movie
397,1.203396,3.967437,31,0.507063,6.945007,8.353246,0.611942,7.153861,9.552632,Seikai no Senki II,TV


In [25]:
# The 20 shows with the highest predicted score, in ascending order.
pred_df.sort_values(by="score").tail(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
429,1.438941,0.511119,12,0.488978,6.961248,8.645127,0.598111,7.472852,9.817402,Kaleido Star: Legend of Phoenix - Layla Hamilt...,OVA
34284,1.94988,1.854379,13,0.690602,6.432449,8.714239,0.842182,7.063592,10.364886,Yuuki Yuuna wa Yuusha de Aru: Washio Sumi no Shou,TV
10153,1.164725,1.837688,17,0.585393,7.360889,8.723874,0.701126,7.349692,10.098057,Mahou Shoujo Lyrical Nanoha: The Movie 2nd A&#...,Movie
34394,2.976166,0.618526,4,0.949308,5.296001,8.778772,1.167117,6.491264,11.066279,Yuuki Yuuna wa Yuushabu Shozoku,Movie
32153,2.575549,0.209981,2,1.681569,5.771713,8.785674,1.999106,4.867498,12.703851,Mahou Shoujo Madoka★Magica: Concept Movie,Movie
21573,2.135821,0.357805,5,0.818402,6.29411,8.793492,0.992907,6.847429,10.739555,Senki Zesshou Symphogear GX,TV
35472,2.988982,0.642722,4,0.965784,5.409426,8.907196,1.186082,6.582518,11.231873,Yuuki Yuuna wa Yuushabu Shozoku 2,Movie
35473,2.9868,0.622313,4,0.961984,5.428503,8.923719,1.181732,6.607567,11.23987,Yuuki Yuuna wa Yuushabu Shozoku 3,Movie
32836,2.302232,0.490889,6,0.756194,6.365195,9.059315,0.927434,7.241578,10.877051,Senki Zesshou Symphogear AXZ,TV
30344,2.368955,0.206458,4,0.931419,6.29438,9.066581,1.128298,6.855157,11.278006,The iDOLM@STER Cinderella Girls 2nd Season,TV


In [26]:
# Candidate recommendations: drop everything the recommendee has already
# rated, then exclude the listed non-series formats.
new_recs = pred_df.drop(
    filtered_df.loc[recommendee]["anime_id"], errors="ignore"
).loc[lambda x: ~x["type"].isin(["Movie", "Special", "OVA", "ONA", "Music"])]

In [27]:
# Unseen shows predicted above their baseline, ranked by lower bound (top 20).
new_recs[new_recs["delta"] > 0].sort_values(by="score_lower_bound").tail(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
122,0.565032,0.427999,14,0.457627,6.936235,7.597447,0.541964,6.535217,8.659678,Full Moon wo Sagashite,TV
16706,0.112754,1.797444,26,0.375066,7.269059,7.401006,0.441197,6.536276,8.265736,Kami nomi zo Shiru Sekai: Megami-hen,TV
36220,1.16917,0.49682,11,0.514974,6.384743,7.75293,0.620149,6.537459,8.9684,Itsudatte Bokura no Koi wa 10 cm Datta.,TV
11239,0.369812,2.376435,22,0.359809,6.94698,7.379742,0.425185,6.546395,8.213089,Hidamari Sketch x Honeycomb,TV
2164,0.005827,4.646545,49,0.236995,7.092265,7.099084,0.278663,6.552915,7.645253,Dennou Coil,TV
345,0.498122,1.675145,29,0.307022,6.739508,7.32242,0.365483,6.606086,8.038754,Eikoku Koi Monogatari Emma,TV
3604,0.412,2.828137,30,0.30996,6.869922,7.352053,0.367499,6.631769,8.072337,Hidamari Sketch x 365,TV
18195,0.061632,1.43083,33,0.291607,7.260824,7.332946,0.342948,6.660781,8.005112,Little Busters!: Refrain,TV
21,0.413234,0.550693,11,0.528568,7.427231,7.910806,0.623298,6.689164,9.132448,One Piece,TV
7062,0.353671,2.835073,30,0.309655,6.998473,7.412346,0.366344,6.694325,8.130368,Hidamari Sketch x ☆☆☆,TV


In [28]:
# Unseen shows predicted above their baseline, ranked by predicted score (top 20).
new_recs[new_recs["delta"] > 0].sort_values(by="score").tail(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
33041,1.801796,0.09048,2,1.608105,5.749169,7.857669,1.902065,4.129689,11.585648,Bubuki Buranki: Hoshi no Kyojin,TV
35828,1.44475,0.117439,2,1.611444,6.219783,7.91046,1.901974,4.18266,11.638259,Miira no Kaikata,TV
21,0.413234,0.550693,11,0.528568,7.427231,7.910806,0.623298,6.689164,9.132448,One Piece,TV
34834,2.120829,0.336806,4,0.938927,5.461001,7.942839,1.13042,5.727256,10.158423,Hina Logi: From Luck & Logic,TV
427,0.889891,0.812481,19,0.385871,6.94396,7.985329,0.465025,7.073897,8.896761,Kaleido Star,TV
890,1.120799,0.16196,2,1.614668,6.681039,7.992622,1.902887,4.263032,11.722211,Yuusha-Ou GaoGaiGar,TV
2929,1.360667,0.255276,4,0.929616,6.456567,8.048849,1.104114,5.884825,10.212872,Moonlight Mile 2nd Season: Touch Down,TV
3750,0.897188,1.367768,26,0.333189,7.011968,8.061876,0.405026,7.26804,8.855713,Maria-sama ga Miteru 4th,TV
2582,1.136278,0.316164,2,1.612547,6.758869,8.088565,1.90052,4.363615,11.813516,Soukou Kihei Votoms,TV
1728,1.448524,0.096433,4,0.928225,6.433931,8.129023,1.103963,5.965295,10.292751,Super GALS! Kotobuki Ran,TV
