In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
import functools
import statsmodels.formula.api as smf
@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    # R-style convenience: build the formula-based OLS model and hand it back
    # already fitted. Every argument is forwarded untouched to smf.ols.
    # (Documented with a comment rather than a docstring because
    # functools.wraps copies smf.ols' own docstring onto this wrapper.)
    unfitted_model = smf.ols(*args, **kwargs)
    return unfitted_model.fit()
import scipy.stats as st

In [3]:
# TODO calculate loss metrics

In [128]:
# Notebook settings.
# Username whose ratings drive the similarity search and the predictions.
recommendee = 'Fro116'
# Number of most-similar users kept as the neighbourhood.
# NOTE(review): reassigned (to 20000) in a later cell, In [322].
neighborhood_size = 3000
# Two-sided confidence level used for the score lower/upper bounds.
confidence_interval = 0.95
# NOTE(review): not referenced by any of the visible cells -- confirm whether it is still needed.
min_similarity = 0

In [5]:
# Anime metadata: load the catalogue and keep only the id, title and type columns.
anime = pd.read_csv('AnimeList.csv').loc[:, ['anime_id', 'title', 'type']]

In [6]:
# Raw user list entries; the cells below read its username, anime_id and my_score columns.
df = pd.read_csv('UserAnimeList.csv')

In [7]:
# (distinct users, distinct anime) in the raw table; dropna=False counts a missing value as its own entry.
df['username'].nunique(dropna=False), df['anime_id'].nunique(dropna=False)

(283045, 14478)

In [8]:
# Keep only the three rating columns, and only rows whose score is not 0.
filtered_df = df.loc[df['my_score'] != 0, ['username', 'anime_id', 'my_score']]

In [9]:
def read_xml(file, username):
    """Load one user's ratings from an XML list export into a ratings table.

    Each child element of the XML root is treated as one entry whose
    sub-elements are read by position: the first sub-element (index 0) is
    taken as the anime id and the tenth (index 9) as the user's score.
    Entries with fewer than ten sub-elements, or with no text in either of
    those two fields, are skipped, as are entries whose score is 0.

    Parameters
    ----------
    file : str or path-like
        Path of the XML file to parse.
    username : str
        Name stored in the ``username`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns ``anime_id`` (int), ``my_score`` (int) and ``username`` (str)
        with a fresh 0..n-1 index. The frame is empty (but keeps these
        columns) when the document holds no usable entry.

    Raises
    ------
    ValueError
        If a retained id or score field is not an integer literal.
    """
    import xml.etree.ElementTree as ET

    # NOTE(review): positions 0 and 9 presumably correspond to the anime id
    # and score fields of a MyAnimeList export -- confirm against a real file.
    id_pos, score_pos = 0, 9

    # ET.parse opens the file in binary mode, honours the encoding named in
    # the XML declaration and closes the handle itself, so nothing is leaked
    # and the locale's default encoding is never involved.
    root = ET.parse(file).getroot()

    records = []
    for entry in root:
        fields = [field.text for field in entry]
        if len(fields) <= score_pos:
            # Header-like children without a score slot carry no rating.
            continue
        raw_id, raw_score = fields[id_pos], fields[score_pos]
        if raw_id is None or raw_score is None:
            continue
        records.append((int(raw_id), int(raw_score)))

    # astype keeps integer dtypes even when no record was collected.
    df = pd.DataFrame(records, columns=['anime_id', 'my_score']).astype(int)
    df['username'] = str(username)
    # A score of 0 is excluded, matching the filter applied to the main table.
    df = df.loc[df['my_score'] != 0].reset_index(drop=True)
    return df

def add_user(full_df, xml_file, username):
    """Return ``full_df`` with ``username``'s rows replaced by those in ``xml_file``.

    Rows already stored under ``username`` are removed first, then the ratings
    parsed by ``read_xml`` are appended. The result gets a fresh integer index.
    """
    replacement_rows = read_xml(xml_file, username)
    everyone_else = full_df[full_df['username'] != username]
    return pd.concat([everyone_else, replacement_rows], ignore_index=True)

In [10]:
# Replace the recommendee's rows with the ratings from their own XML export.
# The profile path and username are derived from the `recommendee` setting so
# that changing the setting cannot leave this cell loading a different user.
# NOTE(review): assumes profiles are stored as user_profiles/<username>.xml.
filtered_df = add_user(filtered_df, 'user_profiles/{}.xml'.format(recommendee), recommendee)

In [11]:
# Global mean score, plus each user's and each anime's mean offset from it.
average_rating = filtered_df['my_score'].mean()
user_bias = (
    filtered_df.groupby('username')['my_score'].mean().to_frame('user_bias')
    - average_rating
)
anime_bias = (
    filtered_df.groupby('anime_id')['my_score'].mean().to_frame('anime_bias')
    - average_rating
)

In [12]:
# Attach both bias terms to every rating, subtract them (and the global mean)
# to obtain the normalized score, then index by username and drop incomplete rows.
filtered_df = (
    filtered_df
    .merge(anime_bias, on=['anime_id'])
    .merge(user_bias, on=['username'])
    .assign(
        normalized_score=lambda ratings: (
            ratings['my_score']
            - ratings['anime_bias'] - ratings['user_bias'] - average_rating
        )
    )
    .set_index('username')
    .dropna()
)

In [13]:
# Display the normalized ratings table (one row per rating, indexed by username).
filtered_df

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
karthiga,21,9,0.960564,-0.059898,0.605473
karthiga,59,7,0.040203,-0.059898,-0.474166
karthiga,74,7,0.316283,-0.059898,-0.750245
karthiga,120,7,0.309858,-0.059898,-0.743821
karthiga,178,7,-0.227338,-0.059898,-0.206624
...,...,...,...,...,...
temptemptemp,10040,6,-1.636717,-1.493860,1.636717
cinnamoroller,12963,10,-0.798860,2.506140,0.798860
inactiveX,5143,7,-0.652951,-0.493860,0.652951
omgm,5581,5,-1.857496,-2.493860,1.857496


In [14]:
# Pair each of the recommendee's ratings (_x columns) with every rating of the
# same anime by any user (_y columns, plus that user's `username`).
user_subset = pd.merge(
    filtered_df.loc[[recommendee]],
    filtered_df.reset_index(),
    on='anime_id',
)

In [15]:
# Adjusted-cosine similarity between the recommendee and every other user,
# computed with vectorised group sums instead of a Python lambda per user
# (far fewer Python-level calls, and no reliance on groupby.apply seeing the
# grouping column).

# Numerator: per user, the sum over co-rated anime of
# (recommendee's normalized score) * (that user's normalized score).
adj_cos_corr_numerator = (
    user_subset['normalized_score_x']
    .mul(user_subset['normalized_score_y'])
    .groupby(user_subset['username'])
    .sum()
)

# Denominator: Euclidean norm of each user's full normalized-score vector,
# multiplied by the recommendee's own norm.
adj_cos_corr_denom = np.sqrt(
    filtered_df['normalized_score'].pow(2).groupby(level='username').sum()
)
adj_cos_corr_denom *= adj_cos_corr_denom.loc[recommendee]

# Users sharing no anime with the recommendee are absent from the numerator,
# so the aligned division yields NaN for them; dropna removes those rows.
adj_cos_corrs = (adj_cos_corr_numerator / adj_cos_corr_denom).to_frame('corr').dropna()

In [322]:
# Overrides the values assigned in the configuration cell near the top of the
# notebook: the neighbourhood grows to 20000 users, the confidence level is unchanged.
neighborhood_size = 20000
confidence_interval = 0.95

In [323]:
# Rank users by absolute correlation and keep the `neighborhood_size` strongest.
# The recommendee is removed afterwards; per the original author this is
# technically a no-op for unseen series but is useful when debugging.
corrs = (
    adj_cos_corrs
    .assign(similarity=lambda table: table['corr'].abs())
    .sort_values(by='similarity')
    .dropna()
    .iloc[-neighborhood_size:]
    .drop(recommendee)
)

In [324]:
# Summary statistics of the neighbourhood's absolute correlations.
corrs['similarity'].describe()

count    19999.000000
mean         0.070785
std          0.016481
min          0.053761
25%          0.058769
50%          0.065766
75%          0.077706
max          0.193013
Name: similarity, dtype: float64

In [325]:
# Ratings made by neighbourhood users only, each row carrying that user's corr / similarity.
score = filtered_df.merge(corrs, on='username').dropna()

In [326]:
# Standard error of the correlation-weighted mean, per anime.
# user_var: sample variance of each neighbour's normalized scores.
user_var = score.groupby('username')['normalized_score'].var().to_frame('user_var')
score = score.merge(user_var, on='username').dropna()
# s2: neighbours' variances weighted by |corr|; b: squared sum of |corr|.
# NOTE(review): a textbook variance of a weighted mean weights each variance by
# corr**2 rather than |corr| -- confirm that the linear weighting is intended.
s2 = score.groupby('anime_id').apply(
    lambda ratings: np.dot(ratings['user_var'], ratings['corr'].abs()))
b = score.groupby('anime_id').apply(
    lambda ratings: ratings['corr'].abs().sum()).pipe(lambda total: total * total)
sem = np.sqrt(s2 / b)

In [327]:
# Display the neighbourhood ratings table, now including corr, similarity and user_var.
score

Unnamed: 0_level_0,anime_id,my_score,anime_bias,user_bias,normalized_score,corr,similarity,user_var
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Johnny_FBI,21,9,0.960564,0.10214,0.443436,0.056223,0.056223,1.639109
Johnny_FBI,59,8,0.040203,0.10214,0.363797,0.056223,0.056223,1.639109
Johnny_FBI,853,8,0.889199,0.10214,-0.485199,0.056223,0.056223,1.639109
Johnny_FBI,1698,10,0.998063,0.10214,1.405937,0.056223,0.056223,1.639109
Johnny_FBI,3092,4,0.506021,0.10214,-4.102021,0.056223,0.056223,1.639109
...,...,...,...,...,...,...,...,...
seras-fortuna,398,10,0.318136,2.50614,-0.318136,-0.078524,0.078524,0.417833
joinedforFal,35968,10,0.386392,2.50614,-0.386392,0.056017,0.056017,0.132217
joinedforFal,36028,10,-0.056090,2.50614,0.056090,0.056017,0.056017,0.132217
joinedforFal,36296,10,0.510160,2.50614,-0.510160,0.056017,0.056017,0.132217


In [328]:
# Build per-anime predictions for the recommendee.
# deltas: correlation-weighted mean of the neighbours' normalized scores
#         (signed corr in the numerator, sum of |corr| in the denominator).
# weights: total |corr| behind each anime; counts: number of neighbour ratings.
deltas = score.groupby('anime_id').apply(lambda x: np.dot(x['normalized_score'], x['corr']) / x['corr'].abs().sum()) 
weights = score.groupby('anime_id').apply(lambda x: x['corr'].abs().sum())
counts = score.groupby('anime_id').size()

# Assigning `deltas` first fixes the index (anime_id); later columns align to it.
pred_df = pd.DataFrame()
pred_df['delta'] = deltas 
pred_df['weight'] = weights
pred_df['counts'] = counts
pred_df['delta_sem'] = sem
# blp = anime bias + the recommendee's user bias + global mean rating
# (presumably short for "baseline prediction" -- confirm).
pred_df['blp'] = anime_bias + user_bias.loc[recommendee].squeeze()  + average_rating
# Drop anime for which any of the columns above is missing.
pred_df = pred_df.dropna()
        
# Calibrate on shows the recommendee has rated: regress (my_score - blp) on
# delta with no intercept (the "+ 0" term), then apply the fitted slope to every anime.
recomendee_seen_shows = filtered_df.loc[recommendee].merge(pred_df, on = ['anime_id'])
recomendee_seen_shows['target'] = recomendee_seen_shows['my_score'] - recomendee_seen_shows['blp']
model = lm('target ~ delta + 0', recomendee_seen_shows)
pred_df['score'] = model.predict(pred_df) + pred_df['blp']


# Standard error of slope * delta:
#   sqrt((delta_sem^2 + delta^2) * (bse^2 + coef^2) - delta^2 * coef^2)
# This has the form of the variance of a product of two independent estimates;
# NOTE(review): assumes delta and the fitted slope are independent -- confirm.
pred_df['sem'] = np.sqrt(((pred_df['delta_sem'] * pred_df['delta_sem'] + pred_df['delta'] * pred_df['delta']) * 
                  (model.bse['delta'] * model.bse['delta'] + model.params['delta'] * model.params['delta'])
                 ) - pred_df['delta'] * pred_df['delta'] * model.params['delta'] * model.params['delta'])
# Two-sided normal quantile for the configured confidence level.
zscore = st.norm.ppf(1-(1 - confidence_interval)/2)
pred_df['score_lower_bound'] = (pred_df['score'] - pred_df['sem'] * zscore)
pred_df['score_upper_bound'] = (pred_df['score'] + pred_df['sem'] * zscore)

# Attach title and type from the anime metadata, then index by anime_id again.
pred_df = pred_df.merge(anime,on='anime_id')
pred_df = pred_df.set_index('anime_id')

In [329]:
# Confirm that fixing blp's coefficient at 1 is reasonable: the prediction model
# regresses (my_score - blp) on delta, which implicitly gives blp a coefficient
# of 1. Here blp is fitted freely alongside delta (no intercept) so its
# estimated coefficient can be inspected in the summary.
print(lm('my_score ~ delta + blp + 0', recomendee_seen_shows).summary())

                                 OLS Regression Results                                
Dep. Variable:               my_score   R-squared (uncentered):                   0.987
Model:                            OLS   Adj. R-squared (uncentered):              0.987
Method:                 Least Squares   F-statistic:                          1.340e+04
Date:                Thu, 22 Apr 2021   Prob (F-statistic):                        0.00
Time:                        11:06:34   Log-Likelihood:                         -390.56
No. Observations:                 344   AIC:                                      785.1
Df Residuals:                     342   BIC:                                      792.8
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [330]:
# Sanity check: the 20 highest lower-bound predictions should be shows the user rates highly.
pred_df.sort_values(by='score_lower_bound').tail(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1887,0.385574,355.053987,5115,0.071251,6.734825,7.985976,0.234111,7.527127,8.444825,Lucky☆Star,TV
9253,-0.058475,626.780894,8933,0.051703,8.054028,7.864283,0.167932,7.535142,8.193425,Steins;Gate,TV
4382,0.602554,356.931501,5093,0.069057,6.114352,8.069581,0.231194,7.61645,8.522713,Suzumiya Haruhi no Yuuutsu (2009),TV
820,0.274497,76.3393,1086,0.166014,7.872021,8.762737,0.53954,7.705259,9.820215,Ginga Eiyuu Densetsu,OVA
2001,0.183465,549.843398,7853,0.056291,7.661604,8.256928,0.183543,7.897192,8.616665,Tengen Toppa Gurren Lagann,TV
12467,0.787035,116.635502,1704,0.119283,6.198381,8.752235,0.394198,7.97962,9.524849,Nazo no Kanojo X,TV
1689,0.411163,515.094921,7329,0.057114,7.022911,8.357096,0.189376,7.985926,8.728266,Byousoku 5 Centimeter,Movie
849,0.411954,562.471438,8055,0.055653,7.05583,8.39258,0.184753,8.030472,8.754689,Suzumiya Haruhi no Yuuutsu,TV
2759,0.472118,327.412147,4594,0.073863,7.150698,8.682674,0.243834,8.204768,9.160579,Evangelion: 1.0 You Are (Not) Alone,Movie
3785,0.715846,197.543585,2761,0.095636,6.528636,8.851488,0.317638,8.228929,9.474048,Evangelion: 3.0 You Can (Not) Redo,Movie


In [331]:
# Candidate recommendations: drop everything the recommendee has already rated,
# and exclude Movie / Special / OVA / ONA entries (movies tend to be recaps of TV series).
new_recs = pred_df.drop(filtered_df.loc[recommendee].anime_id, errors='ignore').loc[
    lambda shows: ~shows['type'].isin(['Movie', 'Special', 'OVA', 'ONA'])]

In [332]:
# Predictions restricted to anime the recommendee has already rated.
seen_shows = pred_df.loc[
    pred_df.index.intersection(filtered_df.loc[recommendee, 'anime_id'])
]

In [333]:
# Attach the recommendee's actual score to each seen show (aligned on anime_id).
seen_shows['my_score'] = filtered_df.loc[recommendee].set_index('anime_id')['my_score']

In [334]:
# Mean squared error of the predictions on shows the recommendee has rated.
errors = seen_shows['my_score'].sub(seen_shows['score'])
mse = np.dot(errors, errors) / errors.shape[0]
print(mse)

0.6109711376256705


In [335]:
# Reference point: using all data gets you 0.6569931689855164 (presumably the same MSE metric as the cell above).

In [336]:
# Recompute the score bounds on the candidate recommendations.
# The z-score is derived from the `confidence_interval` setting (as in the
# prediction cell) instead of a hard-coded 0.95, so that changing the setting
# is reflected here as well.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
print(zscore)
new_recs['score_lower_bound'] = (new_recs['score'] - new_recs['sem'] * zscore)
new_recs['score_upper_bound'] = (new_recs['score'] + new_recs['sem'] * zscore)

1.959963984540054


In [337]:
# Final shortlist: candidates whose upper bound exceeds 8 and whose neighbourhood
# delta is positive, showing the 20 with the highest lower bound.
new_recs.loc[
    lambda recs: recs['score_upper_bound'].gt(8) & recs['delta'].gt(0)
].sort_values(by='score_lower_bound').tail(20)

Unnamed: 0_level_0,delta,weight,counts,delta_sem,blp,score,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
23269,0.068718,30.438048,449,0.227932,6.572363,6.795345,0.739955,5.34506,8.24563,Hello!! Kiniro Mosaic,TV
31771,0.170547,32.184525,481,0.233171,6.27845,6.831858,0.757103,5.347963,8.315753,Amanchu!,TV
1454,0.162703,33.753439,484,0.242412,6.370114,6.898069,0.787082,5.355418,8.440721,Kemonozume,TV
325,0.214162,40.755291,607,0.21289,6.039678,6.734614,0.691389,5.379516,8.089712,Peach Girl,TV
5262,0.052473,38.341557,585,0.223692,6.644993,6.815263,0.726177,5.391982,8.238544,Shugo Chara!! Doki,TV
7062,0.070883,21.509949,308,0.282188,6.998473,7.228483,0.916079,5.433001,9.023964,Hidamari Sketch x ☆☆☆,TV
1453,0.105915,18.055503,260,0.311568,7.09993,7.443615,1.01148,5.46115,9.426079,Maison Ikkoku,TV
996,0.011016,40.835429,611,0.214321,6.824428,6.860174,0.695743,5.496542,8.223806,Bishoujo Senshi Sailor Moon: Sailor Stars,TV
1852,0.127874,45.962408,665,0.199703,6.394334,6.809271,0.648397,5.538437,8.080106,Hidamari Sketch,TV
3604,0.093558,26.694466,384,0.255929,6.869922,7.173509,0.830857,5.545059,8.801959,Hidamari Sketch x 365,TV
