In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf


# Convenience wrapper: build an OLS model from a formula and fit it in one call.
# functools.wraps copies metadata (including the docstring) from smf.ols onto
# this wrapper, so a docstring written here would be replaced -- hence comments.
# Accepts exactly the arguments of smf.ols and returns the fitted results object.
@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    unfitted_model = smf.ols(*args, **kwargs)
    return unfitted_model.fit()

In [2]:
# All file paths below are relative to the data directory.
# NOTE: this is a relative chdir, so re-running this cell without restarting
# the kernel moves one more level away from the notebook's directory.
os.chdir("../data")

In [3]:
# Username whose ratings are used to produce the recommendations.
recommendee = "Fro116"
# Number of highest-similarity (|corr|) neighbors kept per anime.
neighborhood_size = 64
# Two-sided confidence level used for the score lower/upper bounds.
confidence_interval = 0.99
# False: trim every anime's neighborhood to `neighborhood_size` before matching
# against the recommendee's ratings. True: keep all correlations and trim only
# after the match.
full_neighborhoods = False
# True: rescale delta with an OLS fit on the recommendee's own ratings.
# False: use delta as-is.
perform_regression = True

In [4]:
# Anime metadata; only the id, title and type columns are used later on.
anime = pd.read_csv("AnimeList.csv").loc[:, ["anime_id", "title", "type"]]

In [5]:
# Raw user/anime list entries. Only the username, anime_id and my_score
# columns are read by the cells below.
df = pd.read_csv("UserAnimeList.csv")

In [6]:
# Dataset summary: (distinct users, distinct anime, fraction of the
# user x anime matrix that is filled, number of entries).
n_users = len(df["username"].unique())
n_anime = len(df["anime_id"].unique())
n_entries = len(df)
n_users, n_anime, n_entries / (n_users * n_anime), n_entries

(283045, 14478, 0.01954064606703893, 80076112)

In [7]:
# Keep only the rating columns, dropping entries whose score is 0.
filtered_df = df.loc[df["my_score"] != 0, ["username", "anime_id", "my_score"]]

In [8]:
def read_xml(file, username):
    """Load one user's scored anime from an XML list export.

    Every child element of the XML root is treated as one record. The text of
    its first sub-element (position 0) is read as the anime id and the text of
    its tenth sub-element (position 9) as the score. Records that lack either
    value (too few sub-elements, or an empty element) are skipped, and so are
    records whose score is 0.

    Parameters
    ----------
    file : path to the XML file.
    username : value stored in the ``username`` column of every returned row.

    Returns
    -------
    pandas.DataFrame with columns ``anime_id`` (int), ``my_score`` (int) and
    ``username`` (str) and a fresh 0..n-1 index. The frame is empty when no
    record qualifies.

    Raises
    ------
    OSError if the file cannot be opened, ``xml.etree.ElementTree.ParseError``
    if it is not well-formed XML, ``ValueError`` if an id or score is not an
    integer literal.
    """
    import xml.etree.ElementTree as ET

    # Positions of the wanted fields inside each record.
    id_pos, score_pos = 0, 9

    # Context manager so the handle is closed even if parsing fails.
    with open(file, "r") as handle:
        root = ET.XML(handle.read())

    records = []
    for child in root:
        texts = [subchild.text for subchild in child]
        if len(texts) <= score_pos:
            # Not a full record (e.g. a short header element).
            continue
        if texts[id_pos] is None or texts[score_pos] is None:
            continue
        records.append((texts[id_pos], texts[score_pos]))

    df = pd.DataFrame(records, columns=["anime_id", "my_score"])
    df["username"] = username
    df["anime_id"] = df["anime_id"].astype(int)
    df["my_score"] = df["my_score"].astype(int)
    df["username"] = df["username"].astype(str)
    df = df.loc[lambda x: x["my_score"] != 0]
    df = df.reset_index(drop=True)
    return df


def add_user(full_df, xml_file, username):
    """Return `full_df` with `username`'s rows replaced by those read from `xml_file`.

    Rows of `full_df` whose ``username`` equals `username` are discarded, the
    rows produced by ``read_xml(xml_file, username)`` are appended after the
    remaining ones, and the result gets a fresh 0..n-1 index. `full_df` itself
    is not modified.
    """
    imported_rows = read_xml(xml_file, username)
    is_other_user = full_df["username"] != username
    return pd.concat([full_df.loc[is_other_user], imported_rows], ignore_index=True)

In [9]:
# Replace the recommendee's rows with their exported list. The username and
# profile path are derived from the `recommendee` setting so that changing the
# configuration cannot silently load a different user's profile.
filtered_df = add_user(filtered_df, f"user_profiles/{recommendee}.xml", recommendee)

In [10]:
# Global mean score over every rating kept so far.
average_rating = filtered_df["my_score"].mean()

# Per-user and per-anime mean score, expressed as an offset from the global
# mean. Each is a one-column frame indexed by username / anime_id.
user_bias = (
    filtered_df.groupby("username")["my_score"].mean().rename("user_bias").to_frame()
    - average_rating
)
anime_bias = (
    filtered_df.groupby("anime_id")["my_score"].mean().rename("anime_bias").to_frame()
    - average_rating
)

In [11]:
# Attach both biases to every rating, compute the residual score left after
# removing anime bias, user bias and the global mean, then index by username.
# NOTE: this rebinds filtered_df, so the cell is meant to run once per kernel.
filtered_df = (
    filtered_df.merge(anime_bias, on=["anime_id"])
    .merge(user_bias, on=["username"])
    .assign(
        normalized_score=lambda x: x["my_score"]
        - x["anime_bias"]
        - x["user_bias"]
        - average_rating
    )
    .set_index("username")
    .dropna()
)

In [12]:
# Preview of the normalized ratings table.
filtered_df

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
karthiga,21,9,0.960564,-0.059898,0.605473
karthiga,59,7,0.040203,-0.059898,-0.474166
karthiga,74,7,0.316283,-0.059898,-0.750245
karthiga,120,7,0.309858,-0.059898,-0.743821
karthiga,178,7,-0.227338,-0.059898,-0.206624
...,...,...,...,...,...
temptemptemp,10040,6,-1.636717,-1.493860,1.636717
cinnamoroller,12963,10,-0.798860,2.506140,0.798860
inactiveX,5143,7,-0.652951,-0.493860,0.652951
omgm,5581,5,-1.857496,-2.493860,1.857496


In [13]:
# Load the precomputed item-item correlations.
# NOTE: unpickling can execute arbitrary code; only load files produced by a
# trusted source.
# The context manager closes the file handle once the load finishes.
with open("item_correlations/correlations.pkl", "rb") as corr_file:
    all_corrs = pickle.load(corr_file)

In [14]:
# Similarity is the magnitude of the correlation; rows with any missing value
# are discarded.
all_corrs = all_corrs.assign(similarity=all_corrs["corr"].abs()).dropna()

In [15]:
# Drop every pair that relates an anime to itself.
corrs = all_corrs[
    all_corrs.index.get_level_values("anime_id_x")
    != all_corrs.index.get_level_values("anime_id_y")
]

In [16]:
# Sort each anime's neighbors by ascending similarity. Unless full
# neighborhoods are requested, keep only the last `neighborhood_size` rows,
# i.e. the most similar ones.
kept_rows = slice(None) if full_neighborhoods else slice(-neighborhood_size, None)
corrs = corrs.groupby("anime_id_x").apply(
    lambda x: x.sort_values(by="similarity")[kept_rows]
)

In [17]:
# Remove the first index level (droplevel's default).
# NOTE(review): presumably this is the extra group-key level added by the
# groupby/apply in the previous cell -- confirm. Re-running this cell drops
# one more level each time.
corrs.index = corrs.index.droplevel()

In [18]:
# Preview of the per-anime neighborhoods.
corrs

Unnamed: 0_level_0,Unnamed: 1_level_0,corr,corr_var,size,similarity
anime_id_x,anime_id_y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,239,0.085037,0.000085,11599,0.085037
1,440,0.085474,0.000131,7531,0.085474
1,2236,0.088889,0.000030,33034,0.088889
1,875,0.089690,0.000192,5127,0.089690
1,790,0.089991,0.000040,24767,0.089991
...,...,...,...,...,...
37908,28913,0.207284,0.915913,3,0.207284
37908,28915,0.226825,0.899748,3,0.226825
37908,28911,0.235301,0.892332,3,0.235301
37908,37124,0.267396,0.862111,3,0.267396


In [19]:
# Pair each of the recommendee's ratings (matched on anime_id_y) with the
# anime it is a neighbor of (anime_id_x).
score = filtered_df.loc[recommendee].merge(
    corrs.reset_index("anime_id_x"), left_on="anime_id", right_on="anime_id_y"
)

# Variance of each user's normalized scores; users for which it is undefined
# are dropped.
user_var = (
    pd.DataFrame(filtered_df.groupby("username")["normalized_score"].var())
    .rename({"normalized_score": "user_var"}, axis=1)
    .dropna()
)
# Look the variance up via the `recommendee` setting (rather than a hard-coded
# username) so it always matches the user selected on the first line above.
score["user_var"] = user_var.loc[recommendee].squeeze()

# Variance of each anime's normalized scores, attached to the rated anime.
anime_var = (
    pd.DataFrame(filtered_df.groupby("anime_id")["normalized_score"].var())
    .rename({"normalized_score": "anime_var"}, axis=1)
    .dropna()
)
score = score.merge(anime_var, on="anime_id")

# From here on "anime_id" names the target anime (formerly anime_id_x); the id
# of the rated anime is no longer needed.
score = score.drop("anime_id", axis=1).rename({"anime_id_x": "anime_id"}, axis=1)

# With full neighborhoods the trimming was postponed until now, so that only
# neighbors the recommendee has actually rated compete for the slots.
if full_neighborhoods:
    score = (
        score.groupby("anime_id")
        .apply(lambda x: x.sort_values(by="similarity")[-neighborhood_size:])
        .reset_index(drop=True)
    )

In [20]:
# First rows of the (rated anime, target anime) pairing table.
score.head()

Unnamed: 0,my_score,anime_bias,user_bias,normalized_score,anime_id,corr,corr_var,size,similarity,user_var,anime_var
0,7,0.219362,-1.027193,0.313971,45,0.055748,0.000104,9546,0.055748,2.582855,1.630601
1,7,0.219362,-1.027193,0.313971,123,0.048346,0.000212,4696,0.048346,2.582855,1.630601
2,7,0.219362,-1.027193,0.313971,154,0.034332,0.000147,6792,0.034332,2.582855,1.630601
3,7,0.219362,-1.027193,0.313971,189,0.048751,0.000113,8816,0.048751,2.582855,1.630601
4,7,0.219362,-1.027193,0.313971,223,0.075094,8.6e-05,11440,0.075094,2.582855,1.630601


In [21]:
# Per target anime:
#   deltas  - correlation-weighted mean of the recommendee's normalized scores,
#             normalized by the total absolute correlation
#   weights - total absolute correlation
#   counts  - number of rated neighbors
by_target = score.groupby("anime_id")
deltas = by_target.apply(
    lambda g: np.dot(g["normalized_score"], g["corr"]) / g["corr"].abs().sum()
)
weights = by_target.apply(lambda g: g["corr"].abs().sum())
counts = by_target.size()

In [22]:
# The following formulae are used to compute the variance of the delta. Delta
# is a weighted sum of the form δ = Σ(s_i * w_i) / (Σw_i), where s_i is
# a vector of scores for item i and w_i is the weight for item i.
#
# By linearity, it suffices to compute the variance of each term
# (s_i * w_i) / (Σw_i). We assume that Var(s_i) is the same as the variance
# over the vector s_i (i.e. over all users s_i has rated). We treat w_i as a
# random variable with mean w_i and variance corr['corr_var'].
#
# The variance of (w_i) / (Σw_i) can be estimated with a Taylor approximation.
# See equation 20 of https://www.stat.cmu.edu/~hseltman/files/ratio.pdf. The
# formula for the ratio of two correlated variables R, S is
# Var(R/S) ≈ E[R]^2/E[S]^2 * (Var[R]/E[R]^2 - 2Cov(R,S)/(E[R]E[S]) + Var[S]/E[S]^2)
#
# Lastly we take the product distribution of s_i and (w_i) / (Σw_i).
def correction_factor(x):
    """Per-row multiplier that accounts for uncertainty in the weights.

    `x` is a frame with columns ``corr`` and ``corr_var``. For every row the
    result is

        1 + corr_var / corr^2
          - 2 * corr_var / (sum|corr| * |corr|)
          + sum(corr_var) / (sum|corr|)^2

    where the sums run over all rows of `x`. Returns a Series aligned with `x`.
    """
    weight_var = x["corr_var"]
    abs_weight = x["corr"].abs()
    total_weight = abs_weight.sum()

    own_term = weight_var / (x["corr"] ** 2)
    cross_term = 2 * weight_var / (total_weight * abs_weight)
    total_term = weight_var.sum() / (total_weight ** 2)
    return 1 + own_term - cross_term + total_term


# Variance of delta per target anime: sum over its neighbors of
# user_var * corr^2 * correction_factor, divided by the squared total
# absolute correlation.
delta_var = score.groupby("anime_id").apply(
    lambda x: np.sum(x["user_var"] * x["corr"] ** 2 * correction_factor(x))
    / (x["corr"].abs().sum() ** 2)
)

# if the var < 0, then the ratio distribution approximation failed,
# usually because sample size is too small
delta_var.loc[lambda x: x < 0] = np.inf

# The above is a biased estimator of the variance. To unbias the estimator,
# we need to apply a Bessel-like correction. See the formula in
# (https://stats.stackexchange.com/questions/47325/bias-correction-in-weighted-variance)
# Only anime with more than one neighbor are included: with a single neighbor
# the denominator (sum|corr|)^2 - sum(corr^2) is zero.
bias_correction = (
    score.set_index("anime_id")
    .loc[counts > 1]
    .groupby("anime_id")
    .apply(
        lambda x: (x["corr"].abs().sum() ** 2)
        / (x["corr"].abs().sum() ** 2 - (x["corr"] ** 2).sum())
    )
)
# NOTE(review): anime excluded above presumably end up NaN here through index
# alignment and are removed by a later dropna -- confirm.
delta_var *= bias_correction

In [23]:
# Collect the per-anime quantities into one prediction table.
pred_df = pd.DataFrame()
pred_df["delta"] = deltas
pred_df["weight"] = weights
pred_df["counts"] = counts
pred_df["delta_sem"] = np.sqrt(delta_var)
# Baseline prediction: anime bias + the recommendee's bias + global mean.
pred_df["blp"] = anime_bias + user_bias.loc[recommendee].squeeze() + average_rating
pred_df = pred_df.dropna()

# Shows the recommendee has already scored, joined with their predictions.
# `target` is the part of the actual score the baseline does not explain.
recomendee_seen_shows = filtered_df.loc[recommendee].merge(pred_df, on=["anime_id"])
recomendee_seen_shows["target"] = (
    recomendee_seen_shows["my_score"] - recomendee_seen_shows["blp"]
)
if perform_regression:

    # Fit target = beta * delta (no intercept) on the seen shows, then apply
    # the fitted scale to every candidate.
    model = lm("target ~ delta + 0", recomendee_seen_shows)
    pred_df["score"] = model.predict(pred_df) + pred_df["blp"]
    # Variance of the product beta * delta, treating the two as independent:
    # (Var[d] + E[d]^2) * (Var[b] + E[b]^2) - E[d]^2 * E[b]^2
    pred_df["sem"] = np.sqrt(
        (
            (pred_df["delta_sem"] ** 2 + pred_df["delta"] ** 2)
            * (model.bse["delta"] ** 2 + model.params["delta"] ** 2)
        )
        - pred_df["delta"] ** 2 * model.params["delta"] ** 2
    )
else:
    pred_df["score"] = pred_df["delta"] + pred_df["blp"]
    pred_df["sem"] = pred_df["delta_sem"]


# Two-sided normal interval at the configured confidence level.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
pred_df["score_lower_bound"] = pred_df["score"] - pred_df["sem"] * zscore
pred_df["score_upper_bound"] = pred_df["score"] + pred_df["sem"] * zscore

# Attach title and type, then index by anime_id again.
pred_df = pred_df.merge(anime, on="anime_id")
pred_df = pred_df.set_index("anime_id")

In [24]:
# Sanity check: `target` was built as my_score - blp, which fixes the
# coefficient of blp at 1. Fit blp freely here to confirm that its estimated
# coefficient is close to 1.
print(lm("my_score ~ delta + blp + 0", recomendee_seen_shows).summary())

                                 OLS Regression Results                                
Dep. Variable:               my_score   R-squared (uncentered):                   0.955
Model:                            OLS   Adj. R-squared (uncentered):              0.954
Method:                 Least Squares   F-statistic:                              3418.
Date:                Wed, 28 Apr 2021   Prob (F-statistic):                   5.70e-219
Time:                        19:21:10   Log-Likelihood:                         -583.56
No. Observations:                 327   AIC:                                      1171.
Df Residuals:                     325   BIC:                                      1179.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [25]:
# Sanity check: the 20 best positive-delta predictions, ranked by the lower
# bound of the score, should be shows the user rates highly.
pred_df[pred_df["delta"] > 0].sort_values(
    by="score_lower_bound", ascending=False
).head(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
290,1.130285,3.664641,26,0.488551,6.773893,7.894695,0.500521,6.605437,9.183952,Seikai no Monshou,TV
29893,0.59834,3.416642,23,0.425594,7.129651,7.722971,0.428639,6.618869,8.827073,Aria The Avvenire,Special
4772,0.701163,4.54191,25,0.442143,7.081624,7.776903,0.446538,6.626697,8.92711,Aria The Origination: Sono Choppiri Himitsu no...,Special
1575,0.014877,3.03534,29,0.415301,7.706751,7.721503,0.413999,6.655111,8.787895,Code Geass: Hangyaku no Lelouch,TV
3371,0.900205,1.576788,13,0.527144,7.154531,8.047183,0.533477,6.673038,9.421328,Ginga Eiyuu Densetsu Gaiden,OVA
18617,0.728571,0.821585,10,0.564388,7.422199,8.144658,0.567519,6.682825,9.60649,Girls & Panzer Movie,Movie
396,1.378188,3.594813,23,0.562732,6.813696,8.180321,0.578368,6.690545,9.670097,Seikai no Senki,TV
11979,0.328184,2.438581,28,0.392168,7.401396,7.726826,0.392372,6.716142,8.73751,Mahou Shoujo Madoka★Magica Movie 2: Eien no Mo...,Movie
2563,0.838548,4.750159,27,0.43606,7.076386,7.907899,0.443053,6.766669,9.049128,Aria The OVA: Arietta,OVA
3784,0.209351,3.627105,34,0.381311,7.545625,7.75322,0.380714,6.772564,8.733875,Evangelion: 2.0 You Can (Not) Advance,Movie


In [26]:
# Candidate recommendations: drop everything the recommendee has already
# scored, then drop the listed formats.
excluded_types = ["Movie", "Special", "OVA", "ONA", "Music"]
seen_anime_ids = filtered_df.loc[recommendee].anime_id
new_recs = pred_df.drop(seen_anime_ids, errors="ignore").loc[
    lambda x: ~x["type"].isin(excluded_types)
]

In [27]:
# Top 20 unseen recommendations with a positive delta, ranked by the lower
# bound of the predicted score.
new_recs[new_recs["delta"] > 0].sort_values(
    by="score_lower_bound", ascending=False
).head(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
113,1.075809,0.707219,13,0.547701,6.408509,7.475292,0.556935,6.040722,8.909863,Uchuu no Stellvia,TV
2942,0.765327,1.629399,19,0.412189,6.372702,7.131608,0.418269,6.05422,8.208997,Sketchbook: Full Color&#039;s,TV
36220,1.570682,0.384963,8,0.708493,6.384743,7.942247,0.72427,6.07665,9.807844,Itsudatte Bokura no Koi wa 10 cm Datta.,TV
182,0.307807,1.566413,25,0.343872,6.693356,6.998581,0.344232,6.111898,7.885264,Tenkuu no Escaflowne,TV
165,0.657154,1.254385,21,0.379607,6.475967,7.127608,0.384326,6.137651,8.117565,RahXephon,TV
16706,0.060068,1.407809,18,0.458158,7.269059,7.328623,0.45676,6.152086,8.50516,Kami nomi zo Shiru Sekai: Megami-hen,TV
21,0.415994,0.440726,8,0.628157,7.427231,7.839734,0.627625,6.223078,9.45639,One Piece,TV
7062,0.390294,1.71556,15,0.449438,6.998473,7.385493,0.449797,6.226891,8.544094,Hidamari Sketch x ☆☆☆,TV
11239,0.489917,1.723251,14,0.462356,6.94698,7.432786,0.463614,6.238596,8.626977,Hidamari Sketch x Honeycomb,TV
488,0.485514,1.418199,21,0.377099,6.751288,7.232729,0.379173,6.256044,8.209414,Ichigo Mashimaro,TV


In [28]:
# Persist the full prediction table (written relative to the current working
# directory).
pred_df.to_pickle("item.pkl")

In [30]:
# Persist the recommendee's own normalized ratings alongside the predictions.
filtered_df.loc[recommendee].to_pickle("recommendee.pkl")