In [1]:
import functools
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    # Build the OLS model from the given arguments and return the fitted result.
    # No docstring is written here: functools.wraps copies smf.ols's metadata
    # (including its docstring) onto this wrapper.
    unfitted_model = smf.ols(*args, **kwargs)
    return unfitted_model.fit()

In [2]:
# Username whose ratings are looked up (filtered_df.loc[recommendee]) to build recommendations.
recommendee = "Fro116"
# NOTE(review): this value is reassigned to 10 in a later cell before its first visible use.
neighborhood_size = 8192
# Two-sided confidence level used to derive the z-score for score_lower_bound / score_upper_bound.
confidence_interval = 0.95

In [3]:
# Catalogue metadata: keep only the id plus the title/type columns that are
# later merged onto the prediction table.
anime = pd.read_csv("AnimeList.csv").loc[:, ["anime_id", "title", "type"]]

In [4]:
# Raw per-user rating table; the cells below use its username, anime_id and my_score columns.
df = pd.read_csv("UserAnimeList.csv")

In [5]:
# (distinct usernames, distinct anime ids) in the raw table; dropna=False keeps
# a missing value counted as its own entry.
df["username"].nunique(dropna=False), df["anime_id"].nunique(dropna=False)

(283045, 14478)

In [6]:
# Keep only rows whose my_score is non-zero, restricted to the three columns used downstream.
filtered_df = df.loc[df["my_score"] != 0, ["username", "anime_id", "my_score"]]

In [7]:
def read_xml(file, username):
    """Load one user's scored anime from an XML list file.

    Every child element of the XML root is turned into one record holding the
    text of its sub-elements in document order. Fields are read *by position*:
    sub-element 0 is used as the anime id and sub-element 9 as the score.
    Records lacking either field (for example an element with fewer than ten
    sub-elements) are dropped, and so are records whose score is 0.

    Parameters
    ----------
    file : str or path-like
        Path of the XML file to read.
    username : str
        Value written to the ``username`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``anime_id`` (int), ``my_score`` (int) and ``username``
        (string), with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If positions 0 and 9 do not both exist, i.e. no record has at least
        ten sub-elements.
    """
    import xml.etree.ElementTree as ET

    # Context manager so the file handle is closed deterministically instead
    # of being left for the garbage collector.
    with open(file, "r") as handle:
        root = ET.XML(handle.read())

    # One list of sub-element texts per child of the root; the child's tag
    # becomes the (possibly repeated) column label after transposing.
    data = [[subchild.text for subchild in child] for child in root]
    cols = [child.tag for child in root]
    new_list = pd.DataFrame(data).T
    new_list.columns = cols

    # Rows 0 and 9 are the positional fields described in the docstring;
    # transposing back gives one row per record, and dropna removes records
    # that are too short to have a value at position 9.
    df = new_list.loc[[0, 9]].T.dropna().rename({0: "anime_id", 9: "my_score"}, axis=1)
    df["username"] = username
    df["anime_id"] = df["anime_id"].astype(int)
    df["my_score"] = df["my_score"].astype(int)
    df["username"] = df["username"].astype(str)
    df = df.loc[lambda x: x["my_score"] != 0]
    df = df.reset_index(drop=True)
    return df


def add_user(full_df, xml_file, username):
    """Return ``full_df`` with ``username``'s rows replaced by those read from ``xml_file``.

    Existing rows for ``username`` are removed, the ratings parsed by
    ``read_xml`` are appended, and the combined frame gets a fresh integer index.
    """
    fresh_ratings = read_xml(xml_file, username)
    is_other_user = full_df["username"] != username
    return pd.concat([full_df.loc[is_other_user], fresh_ratings], ignore_index=True)

In [8]:
# Splice the recommendee's own list file into the ratings. Both the profile path
# and the username come from `recommendee`, so the target user is configured in
# exactly one place.
filtered_df = add_user(filtered_df, f"user_profiles/{recommendee}.xml", recommendee)

In [9]:
# Global mean score, then each user's and each anime's mean score expressed as
# an offset from that global mean (one-column frames indexed by the group key).
average_rating = filtered_df["my_score"].mean()
user_bias = (
    filtered_df.groupby("username")["my_score"].mean().to_frame("user_bias")
    - average_rating
)
anime_bias = (
    filtered_df.groupby("anime_id")["my_score"].mean().to_frame("anime_bias")
    - average_rating
)

In [10]:
# Attach both bias terms to every rating and compute the residual that is left
# after removing the anime bias, the user bias and the global mean.
filtered_df = (
    filtered_df.merge(anime_bias, on=["anime_id"])
    .merge(user_bias, on=["username"])
    .assign(
        normalized_score=lambda x: x["my_score"]
        - x["anime_bias"]
        - x["user_bias"]
        - average_rating
    )
    .set_index("username")
    .dropna()
)

In [11]:
# Inspect the rating table, now indexed by username, with the bias columns and normalized_score.
filtered_df

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
karthiga,21,9,0.960564,-0.059898,0.605473
karthiga,59,7,0.040203,-0.059898,-0.474166
karthiga,74,7,0.316283,-0.059898,-0.750245
karthiga,120,7,0.309858,-0.059898,-0.743821
karthiga,178,7,-0.227338,-0.059898,-0.206624
...,...,...,...,...,...
temptemptemp,10040,6,-1.636717,-1.493860,1.636717
cinnamoroller,12963,10,-0.798860,2.506140,0.798860
inactiveX,5143,7,-0.652951,-0.493860,0.652951
omgm,5581,5,-1.857496,-2.493860,1.857496


In [68]:
# Load the precomputed correlation table from disk.
# NOTE: unpickling can execute arbitrary code, so only load this file if it
# comes from a trusted source.
with open("item_correlations/correlations.pkl", "rb") as corr_file:
    all_corrs = pickle.load(corr_file)

In [70]:
# similarity is the magnitude of the correlation (sign ignored); rows with any
# missing value are then discarded.
all_corrs = all_corrs.assign(similarity=all_corrs["corr"].abs()).dropna()

In [81]:
# Pair each anime the recommendee has rated (matched against anime_id_x) with
# the correlation rows for every other anime (anime_id_y).
recommendee_ratings = filtered_df.loc[recommendee]
related_items = recommendee_ratings.merge(
    all_corrs.reset_index("anime_id_y"),
    left_on="anime_id",
    right_on="anime_id_x",
)

In [85]:
# Number of distinct candidate anime reachable from the recommendee's ratings.
related_items["anime_id_y"].nunique(dropna=False)

12333

In [71]:
# NOTE(review): this overrides the neighborhood_size = 8192 assigned in the
# configuration cell near the top; confidence_interval is re-assigned to the
# value it already had. Consider keeping a single configuration cell.
neighborhood_size = 10
confidence_interval = 0.95

In [113]:
# For every candidate anime (anime_id_y) keep the `neighborhood_size` rows with
# the largest similarity. .tail() is used rather than [-neighborhood_size:]
# because a size of 0 would slice [-0:] and keep every row instead of none.
neighborhood = related_items.groupby("anime_id_y").apply(
    lambda x: x.sort_values(by="similarity").tail(neighborhood_size)
)
# errors="ignore": depending on the pandas version the grouping column may
# already be excluded from the frames handed to apply().
neighborhood = neighborhood.drop("anime_id_y", axis=1, errors="ignore")
# Collapse the (anime_id_y, original row) MultiIndex down to anime_id_y only.
neighborhood.index = neighborhood.index.get_level_values("anime_id_y")

In [120]:
# Inspect the selected neighbours, indexed by the candidate anime (anime_id_y).
neighborhood

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score,corr,similarity
anime_id_y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1033,5,0.806017,-1.027193,-2.272683,0.130666,0.130666
1,199,6,1.329722,-1.027193,-1.796388,0.133833,0.133833
1,22135,6,1.002012,-1.027193,-1.468678,0.133843,0.133843
1,437,7,0.710424,-1.027193,-0.177091,0.140756,0.140756
1,7785,6,0.958944,-1.027193,-1.425611,0.144233,0.144233
...,...,...,...,...,...,...,...
37860,34240,7,0.625534,-1.027193,-0.092201,0.010892,0.010892
37860,2167,8,0.752724,-1.027193,0.780610,-0.010974,0.010974
37860,12893,6,0.307927,-1.027193,-0.774594,-0.017593,0.017593
37860,16782,7,0.731517,-1.027193,-0.198184,-0.019830,0.019830


In [116]:
def _weighted_delta(group):
    """Sum of normalized_score * corr over the group, divided by the sum of |corr|."""
    return np.dot(group["normalized_score"], group["corr"]) / group["corr"].abs().sum()


deltas = neighborhood.groupby("anime_id_y").apply(_weighted_delta)

In [None]:
# Display the neighbourhood-based deltas computed in the preceding cell instead
# of re-running the identical group-by a second time.
deltas

In [21]:
# NOTE(review): `corrs` is not defined in any visible cell, so this line fails on
# a fresh Restart & Run All. It is merged on "username" — presumably a table of
# per-user correlations with the recommendee; confirm and restore the cell that
# builds it.
score = (filtered_df.merge(pd.DataFrame(corrs), on="username")).dropna()

In [22]:
# add standard error of the weighted mean
user_var = score.groupby("username")["normalized_score"].var().to_frame("user_var")
score = score.merge(user_var, on="username").dropna()


def _abs_corr_total_squared(group):
    """Square of the summed absolute correlations within one anime's rows."""
    total = group["corr"].abs().sum()
    return total * total


ratings_by_anime = score.groupby("anime_id")
# s2: per-anime sum of user variances weighted by |corr|; b: (sum of |corr|) squared.
s2 = ratings_by_anime.apply(lambda x: np.dot(x["user_var"], x["corr"].abs()))
b = ratings_by_anime.apply(_abs_corr_total_squared)
sem = np.sqrt(s2 / b)

In [23]:
# Inspect the rating table after the corr / similarity / user_var columns were merged in.
score

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score,corr,similarity,user_var
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Ryan_,21,10,0.960564,-0.271638,0.799789,-0.083694,0.083694,1.0
Ryan_,7674,9,0.770314,-0.271638,0.443403,-0.083694,0.083694,1.0
Ryan_,9513,7,0.308555,-0.271638,-0.233605,-0.083694,0.083694,1.0
Ryan_,9863,6,0.537837,-0.271638,-0.774634,-0.083694,0.083694,1.0
Ryan_,19,7,1.180677,-0.271638,-0.617442,-0.083694,0.083694,1.0
...,...,...,...,...,...,...,...,...
MayaMyu,6774,7,0.446355,1.506140,-1.677478,0.088455,0.088455,1.0
seras-fortuna,396,10,0.347029,2.506140,-0.536864,-0.078524,0.078524,1.0
seras-fortuna,4503,10,-0.904091,2.506140,1.398655,-0.078524,0.078524,1.0
seras-fortuna,397,10,0.478340,2.506140,-0.740006,-0.078524,0.078524,1.0


In [24]:
# Per-anime aggregates over the rows of `score`:
#   deltas  - sum(normalized_score * corr) / sum(|corr|)
#   weights - sum(|corr|)
#   counts  - number of contributing rows
# NOTE(review): this rebinds `deltas`, replacing the neighbourhood-based value
# computed in an earlier cell.
deltas = score.groupby("anime_id").apply(
    lambda x: np.dot(x["normalized_score"], x["corr"]) / x["corr"].abs().sum()
)
weights = score.groupby("anime_id").apply(lambda x: x["corr"].abs().sum())
counts = score.groupby("anime_id").size()

# Assemble one row per anime_id; each assignment aligns on the anime_id index.
pred_df = pd.DataFrame()
pred_df["delta"] = deltas
pred_df["weight"] = weights
pred_df["counts"] = counts
pred_df["delta_sem"] = sem
# blp = anime bias + the recommendee's user bias + global mean rating.
# anime_bias is a one-column frame indexed by anime_id; squeeze() turns the
# recommendee's single user_bias entry into a scalar.
pred_df["blp"] = anime_bias + user_bias.loc[recommendee].squeeze() + average_rating
pred_df = pred_df.dropna()

# Fit, on the shows the recommendee has rated, a no-intercept regression of
# (my_score - blp) on delta; the fitted slope rescales delta for this user.
recomendee_seen_shows = filtered_df.loc[recommendee].merge(pred_df, on=["anime_id"])
recomendee_seen_shows["target"] = (
    recomendee_seen_shows["my_score"] - recomendee_seen_shows["blp"]
)
model = lm("target ~ delta + 0", recomendee_seen_shows)
# Predicted score = slope * delta + blp.
pred_df["score"] = model.predict(pred_df) + pred_df["blp"]
# Standard error of slope * delta, computed as
#   sqrt((delta_sem^2 + delta^2) * (bse^2 + slope^2) - delta^2 * slope^2)
# which has the form of the variance of a product of two independent
# quantities — presumably the intended model; confirm.
pred_df["sem"] = np.sqrt(
    (
        (
            pred_df["delta_sem"] * pred_df["delta_sem"]
            + pred_df["delta"] * pred_df["delta"]
        )
        * (
            model.bse["delta"] * model.bse["delta"]
            + model.params["delta"] * model.params["delta"]
        )
    )
    - pred_df["delta"]
    * pred_df["delta"]
    * model.params["delta"]
    * model.params["delta"]
)
# Two-sided normal critical value for the configured confidence level.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)

# account for variance scaling
# NOTE(review): disabled code; `user_stds` is not defined in any visible cell.
# pred_df['score'] *= user_stds.loc[recommendee].squeeze()
# pred_df['sem'] *= user_stds.loc[recommendee].squeeze()

pred_df["score_lower_bound"] = pred_df["score"] - pred_df["sem"] * zscore
pred_df["score_upper_bound"] = pred_df["score"] + pred_df["sem"] * zscore

# Add title/type from the catalogue, then restore anime_id as the index
# (the merge on the anime_id key does not keep it as the index).
pred_df = pred_df.merge(anime, on="anime_id")
pred_df = pred_df.set_index("anime_id")

In [25]:
# Confirm that fixing the blp coefficient at 1 is reasonable: the model above
# regresses (my_score - blp) on delta, which implicitly gives blp a coefficient
# of exactly 1. Here blp gets a free coefficient so the fit can be compared.
print(lm("my_score ~ delta + blp + 0", recomendee_seen_shows).summary())

                                 OLS Regression Results                                
Dep. Variable:               my_score   R-squared (uncentered):                   0.986
Model:                            OLS   Adj. R-squared (uncentered):              0.986
Method:                 Least Squares   F-statistic:                          1.201e+04
Date:                Fri, 23 Apr 2021   Prob (F-statistic):                   1.60e-317
Time:                        14:28:46   Log-Likelihood:                         -409.18
No. Observations:                 344   AIC:                                      822.4
Df Residuals:                     342   BIC:                                      830.0
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [26]:
# Sanity check: the 20 entries with the highest lower bound should be shows
# the user rates highly.
pred_df.sort_values(by="score_lower_bound").tail(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
339,0.268323,129.764094,1540,0.087785,6.81444,7.731486,0.301631,7.140301,8.322671,Serial Experiments Lain,TV
4382,0.49261,176.30427,2092,0.075313,6.114352,7.797944,0.263181,7.282118,8.313769,Suzumiya Haruhi no Yuuutsu (2009),TV
9253,-0.110428,312.870366,3718,0.056535,8.054028,7.676617,0.193702,7.296969,8.056266,Steins;Gate,TV
820,0.215397,38.972568,465,0.160184,7.872021,8.608182,0.548259,7.533614,9.682751,Ginga Eiyuu Densetsu,OVA
2001,0.082904,274.178802,3268,0.060393,7.661604,7.944943,0.206712,7.539795,8.350091,Tengen Toppa Gurren Lagann,TV
849,0.285965,273.450786,3251,0.060473,7.05583,8.033171,0.209169,7.623206,8.443135,Suzumiya Haruhi no Yuuutsu,TV
12467,0.729283,51.894094,626,0.138816,6.198381,8.690847,0.481425,7.747272,9.634423,Nazo no Kanojo X,TV
1689,0.349579,257.658852,3051,0.062298,7.022911,8.217667,0.216478,7.793378,8.641957,Byousoku 5 Centimeter,Movie
3785,0.611084,106.412301,1247,0.09694,6.528636,8.617137,0.338246,7.954187,9.280087,Evangelion: 3.0 You Can (Not) Redo,Movie
227,0.479101,197.49093,2347,0.071158,6.835007,8.47243,0.248979,7.984441,8.960419,FLCL,OVA


In [27]:
# Movies tend to be recaps of TV series
# Drop everything the recommendee has already rated, then exclude these types.
excluded_types = ["Movie", "Special", "OVA", "ONA"]
new_recs = pred_df.drop(filtered_df.loc[recommendee].anime_id, errors="ignore").loc[
    lambda x: ~x["type"].isin(excluded_types)
]

In [28]:
# Predictions restricted to the anime ids the recommendee has rated.
seen_ids = pred_df.index.intersection(filtered_df.loc[recommendee].anime_id)
seen_shows = pred_df.loc[seen_ids]

In [29]:
# Attach the recommendee's actual score to each seen show (aligned on anime_id).
recommendee_scores = filtered_df.loc[recommendee].set_index("anime_id")[["my_score"]]
seen_shows["my_score"] = recommendee_scores

In [31]:
# Mean squared difference between the recommendee's actual and predicted scores.
errors = seen_shows["my_score"] - seen_shows["score"]
mse = errors.dot(errors) / errors.size
print(mse)

0.7201056634268247


In [32]:
# using all data gets you 0.6569931689855164

In [33]:
# Two-sided normal critical value for the configured confidence level; it reads
# `confidence_interval` rather than repeating the literal 0.95.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
print(zscore)
# Interval around each predicted score: score ± zscore * sem.
new_recs["score_lower_bound"] = new_recs["score"] - new_recs["sem"] * zscore
new_recs["score_upper_bound"] = new_recs["score"] + new_recs["sem"] * zscore

1.959963984540054


In [38]:
# Unseen shows whose upper bound exceeds 8 and whose delta is positive; show the
# 20 with the highest lower bound.
promising = new_recs.loc[lambda x: (x["score_upper_bound"] > 8) & (x["delta"] > 0)]
promising.sort_values(by="score_lower_bound").tail(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
12149,0.166111,12.858375,158,0.278873,6.309175,6.876892,0.953774,5.007529,8.746255,AKB0048,TV
369,0.151609,18.219909,219,0.234276,6.107472,6.625626,0.801272,5.055162,8.19609,Boogiepop wa Warawanai: Boogiepop Phantom,TV
593,0.235799,8.999241,106,0.333347,6.555974,7.361863,1.140167,5.127177,9.596549,Mugen no Ryvius,TV
1579,0.073146,13.790245,173,0.269286,6.731344,6.981335,0.920851,5.1765,8.78617,Kiniro no Corda: Primo Passo,TV
3604,0.056552,12.652631,151,0.281132,6.869922,7.063198,0.961341,5.179005,8.947392,Hidamari Sketch x 365,TV
1852,0.075446,20.830903,247,0.219102,6.394334,6.652186,0.749258,5.183667,8.120704,Hidamari Sketch,TV
8726,0.137174,17.048225,207,0.242192,6.37779,6.84661,0.828307,5.223158,8.470062,Soredemo Machi wa Mawatteiru,TV
158,0.188618,13.356078,162,0.273628,6.416937,7.061574,0.935893,5.227258,8.895891,Maria-sama ga Miteru,TV
32526,0.098443,21.04217,254,0.217999,6.370506,6.706953,0.74552,5.24576,8.168145,Love Live! Sunshine!!,TV
634,0.119219,22.771747,271,0.209557,6.273674,6.681129,0.716693,5.276436,8.085823,Koi Kaze,TV
