In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    return smf.ols(*args, **kwargs).fit()

In [2]:
os.chdir("../data")

In [3]:
confidence_interval = 0.99

In [4]:
deltas = []
sources = ["item.pkl", "user.pkl"]
for i, source in enumerate(sources):
    delta = pickle.load(open(source, "rb"))
    delta = delta[["delta", "delta_sem", "blp"]]
    delta = delta.rename({x: x + f"_{i}" for x in delta.columns}, axis=1)
    deltas.append(delta)
df = pd.concat(deltas, axis=1)
df["blp"] = df["blp_0"]
df = df.drop([f"blp_{i}" for i in range(len(sources))], axis=1)

In [5]:
df

Unnamed: 0_level_0,delta_0,delta_sem_0,delta_1,delta_sem_1,blp
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,-0.503429,0.291880,-0.168239,0.042586,7.652311
5,-0.625288,0.335929,-0.228909,0.060957,7.309965
6,-0.380304,0.369579,-0.363544,0.053938,7.175470
7,-0.076563,0.733353,-0.197148,0.120180,6.258470
8,,,0.099198,0.431528,
...,...,...,...,...,...
37778,0.547533,14.287187,-0.148233,1.680772,5.187092
37814,,,0.308476,1.769349,
37831,,,0.025232,0.391894,
37843,,,-1.603569,1.179988,


In [6]:
truth = pickle.load(open("recommendee.pkl", "rb"))

In [7]:
df["ground_truth"] = truth.set_index("anime_id")[["my_score"]]

In [8]:
# confirm that setting fixing the blp coef to 1 is reasonable
variables = [f"delta_{i}" for i in range(len(sources))]
formula = "ground_truth ~ " + " + ".join(variables + ["blp", "0"])
labelled_data = df.loc[lambda x: ~x.ground_truth.isna()]
lm(formula, labelled_data).summary()

0,1,2,3
Dep. Variable:,ground_truth,R-squared (uncentered):,0.988
Model:,OLS,Adj. R-squared (uncentered):,0.988
Method:,Least Squares,F-statistic:,8855.0
Date:,"Wed, 28 Apr 2021",Prob (F-statistic):,1.87e-310
Time:,20:56:27,Log-Likelihood:,-366.72
No. Observations:,327,AIC:,739.4
Df Residuals:,324,BIC:,750.8
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
delta_0,-0.1071,0.066,-1.617,0.107,-0.237,0.023
delta_1,3.5928,0.120,29.942,0.000,3.357,3.829
blp,1.0340,0.006,159.887,0.000,1.021,1.047

0,1,2,3
Omnibus:,26.706,Durbin-Watson:,1.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,33.24
Skew:,-0.637,Prob(JB):,6.05e-08
Kurtosis:,3.902,Cond. No.,21.0


In [9]:
# get model
df["target"] = df["ground_truth"] - df["blp"]
variables = [f"delta_{i}" for i in range(len(sources))]
formula = "target ~ " + " + ".join(variables + ["0"])
labelled_data = df.loc[lambda x: ~x.ground_truth.isna()]
model = lm(formula, labelled_data)

In [10]:
df["score"] = model.predict(df) + df["blp"]
df['delta'] = df['score'] - df['blp']

In [11]:
model_deltas = df[[f'delta_{i}' for i in range(len(sources))]]
model_vars = pd.DataFrame()
for i in range(len(sources)):
    model_vars[f'model_delta_var_{i}'] = ((df[f"delta_sem_{i}"] ** 2 + df[f"delta_{i}"] ** 2)
            * (model.bse[f"delta_{i}"] ** 2 + model.params[f"delta_{i}"] ** 2)
    ) - df[f"delta_{i}"] ** 2 * model.params[f"delta_{i}"] ** 2
df['sem'] = np.sqrt(np.sum((model_vars.values @ model_deltas.corr().values) * model_vars.values, axis=1))

zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
df["score_lower_bound"] = df["score"] - df["sem"] * zscore
df["score_upper_bound"] = df["score"] + df["sem"] * zscore

In [12]:
anime = pd.read_csv("AnimeList.csv")
anime = anime[["anime_id", "title", "type"]]
df = df.merge(anime, on="anime_id")
df = df.set_index("anime_id")
df = df.loc[lambda x: ~x.score.isna()]

In [13]:
df.loc[lambda x: x['delta'] > 0].sort_values(by='score_lower_bound', ascending=False)[:20]

Unnamed: 0_level_0,delta_0,delta_sem_0,delta_1,delta_sem_1,blp,ground_truth,target,score,delta,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
32,-0.030274,0.3171,0.984038,0.046367,7.32393,10.0,2.67607,10.740885,3.416955,0.041555,10.633847,10.847923,Neon Genesis Evangelion: The End of Evangelion,Movie
2476,-0.192061,0.49376,1.599765,0.055662,5.172089,8.0,2.827911,10.749396,5.577307,0.078596,10.546946,10.951846,School Days,TV
30,0.146865,0.314164,0.759852,0.041185,7.191034,9.0,1.808966,9.802937,2.611903,0.030292,9.72491,9.880964,Neon Genesis Evangelion,TV
3297,0.419889,0.395643,0.650147,0.106418,7.650871,10.0,2.349129,9.839718,2.188847,0.144748,9.466872,10.212564,Aria The Origination,TV
16201,-0.24637,0.316523,1.102173,0.092813,5.714013,8.0,2.285987,9.574365,3.860352,0.123115,9.257241,9.891489,Aku no Hana,TV
3784,0.209351,0.381311,0.506915,0.046992,7.545625,10.0,2.454375,9.27069,1.725065,0.032341,9.187386,9.353995,Evangelion: 2.0 You Can (Not) Advance,Movie
721,-0.363701,0.282833,0.739969,0.129318,7.091929,10.0,2.908071,9.714637,2.622707,0.210638,9.172069,10.257204,Princess Tutu,TV
227,-0.468948,0.267311,0.580381,0.047033,6.835007,9.0,2.164993,8.920773,2.085766,0.032952,8.835894,9.005652,FLCL,OVA
11981,-0.076963,0.324732,0.428406,0.054047,7.409164,10.0,2.590836,8.906714,1.497551,0.039169,8.805822,9.007607,Mahou Shoujo Madoka★Magica Movie 3: Hangyaku n...,Movie
9756,-0.411264,0.259471,0.377928,0.039904,7.447999,10.0,2.552001,8.822734,1.374735,0.022473,8.764847,8.880621,Mahou Shoujo Madoka★Magica,TV


In [14]:
new_recs = df.loc[lambda x: ~x.index.isin(labelled_data.index) & (~x['score'].isna()) & (x['type'] == 'TV')]

In [15]:
new_recs.loc[lambda x: x['delta'] > 0].sort_values(by='score_lower_bound', ascending=False)[:20]

Unnamed: 0_level_0,delta_0,delta_sem_0,delta_1,delta_sem_1,blp,ground_truth,target,score,delta,sem,score_lower_bound,score_upper_bound,title,type
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
33089,-0.065553,0.644602,0.356268,0.136439,6.411954,,,7.657582,1.245627,0.230785,7.063119,8.252044,Kemono Friends,TV
32681,-0.154924,0.383231,0.261898,0.095176,6.378826,,,7.311175,0.932349,0.111784,7.023238,7.599112,Uchuu Patrol Luluco,TV
26,-0.426106,0.310351,0.119637,0.090246,6.513763,,,6.995172,0.481409,0.099676,6.738424,7.251921,Texhnolyze,TV
7062,0.390294,0.449438,0.075471,0.130163,6.998473,,,7.199211,0.200738,0.206592,6.667065,7.731357,Hidamari Sketch x ☆☆☆,TV
1453,0.345939,0.86902,0.151751,0.172103,7.09993,,,7.572103,0.472173,0.365899,6.62961,8.514595,Maison Ikkoku,TV
3604,0.630296,0.422475,0.079667,0.12314,6.869922,,,7.047722,0.1778,0.185379,6.570218,7.525226,Hidamari Sketch x 365,TV
147,-0.190028,0.42281,0.099974,0.073091,6.362681,,,6.739029,0.376348,0.066645,6.567362,6.910697,Kimi ga Nozomu Eien,TV
26165,-0.003605,0.309611,0.278867,0.102497,5.895212,,,6.862767,0.967555,0.128702,6.531251,7.194282,Yuri Kuma Arashi,TV
14131,-0.511202,0.548712,0.028748,0.078637,6.539773,,,6.719308,0.179535,0.078709,6.516568,6.922048,Girls & Panzer,TV
11239,0.489917,0.462356,0.09737,0.149008,6.94698,,,7.208092,0.261112,0.270261,6.511946,7.904239,Hidamari Sketch x Honeycomb,TV
