In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    return smf.ols(*args, **kwargs).fit()

In [2]:
os.chdir("../data/deltas")

In [3]:
confidence_interval = 0.99
delta_pvalue_cutoff = 0.1
use_intercept = False

In [4]:
deltas = []
sources = ["item.pkl", "user.pkl"]
for i, source in enumerate(sources):
    delta = pickle.load(open(source, "rb"))
    delta = delta[["delta", "delta_sem", "blp", "title", "type"]]
    delta = delta.rename({x: x + f"_{i}" for x in delta.columns}, axis=1)
    deltas.append(delta)
df = pd.concat(deltas, axis=1)

common_cols = ["blp", "title", "type"]
for col in common_cols:
    for i in range(len(sources)):
        df[col] = df[f'{col}_{i}'].loc[lambda x: ~x.isna()]
        df = df.drop(f"{col}_{i}", axis=1)
delta_corrs = df[['delta_0', 'delta_1']].corr()
df = df.loc[lambda x: ~x['blp'].isna()]

# fill missing data with reasonable defaults
for i in range(len(sources)):
    df[f'delta_{i}'] = df[f'delta_{i}'].fillna(0)
    df[f'delta_sem_{i}'] = df[f'delta_sem_{i}'].fillna(df[f'delta_sem_{i}'].median())

In [5]:
ground_truth = pickle.load(open("recommendee.pkl", "rb"))
df["ground_truth"] = ground_truth.set_index("anime_id")[["my_score"]]
df["target"] = df["ground_truth"] - df["blp"]

In [6]:
# get preliminary model
delta_cols = [f"delta_{i}" for i in range(len(sources))]
formula = "target ~ " + " + ".join(delta_cols)
if not use_intercept:
    formula += " + 0"
preliminary_model = lm(formula, df)
print(preliminary_model.summary())

                                 OLS Regression Results                                
Dep. Variable:                 target   R-squared (uncentered):                   0.740
Model:                            OLS   Adj. R-squared (uncentered):              0.739
Method:                 Least Squares   F-statistic:                              487.0
Date:                Fri, 30 Apr 2021   Prob (F-statistic):                   8.52e-101
Time:                        14:48:33   Log-Likelihood:                         -429.86
No. Observations:                 344   AIC:                                      863.7
Df Residuals:                     342   BIC:                                      871.4
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [7]:
delta_cols = [
    x
    for x in preliminary_model.pvalues.index
    if "delta" in x and preliminary_model.pvalues[x] < delta_pvalue_cutoff
]
formula = "target ~ " + " + ".join(delta_cols)
if not use_intercept:
    formula += " + 0"
model = lm(formula, df)
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:                 target   R-squared (uncentered):                   0.740
Model:                            OLS   Adj. R-squared (uncentered):              0.739
Method:                 Least Squares   F-statistic:                              487.0
Date:                Fri, 30 Apr 2021   Prob (F-statistic):                   8.52e-101
Time:                        14:48:33   Log-Likelihood:                         -429.86
No. Observations:                 344   AIC:                                      863.7
Df Residuals:                     342   BIC:                                      871.4
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [8]:
df["score"] = model.predict(df) + df["blp"]
df["delta"] = df["score"] - df["blp"]

In [9]:
# compute error bars
model_vars = pd.DataFrame()
for col in delta_cols:
    i = col.split("_")[1]
    model_vars[f"model_delta_var_{i}"] = (
        (df[f"delta_sem_{i}"] ** 2 + df[f"delta_{i}"] ** 2)
        * (model.bse[f"delta_{i}"] ** 2 + model.params[f"delta_{i}"] ** 2)
    ) - df[f"delta_{i}"] ** 2 * model.params[f"delta_{i}"] ** 2
model_stds = np.sqrt(model_vars)

delta_corrs = delta_corrs.loc[lambda x: (x.index.isin(delta_cols)), delta_cols]
delta_variance = np.sum(
    (model_stds.values @ delta_corrs.values) * model_stds.values, axis=1
)
intercept_variance = 0
if "Intercept" in model.bse:
    intercept_variance = model.bse["Intercept"] ** 2
df["sem"] = np.sqrt(delta_variance + intercept_variance)

zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
df["score_lower_bound"] = df["score"] - df["sem"] * zscore
df["score_upper_bound"] = df["score"] + df["sem"] * zscore

In [10]:
cols = (
    ["title", "type", "score", "score_lower_bound", "score_upper_bound", "delta", "sem"]
    + delta_cols
    + ["ground_truth"]
)
df = df[cols]  # + [x for x in df.columns if x not in cols]]

In [11]:
df.sort_values(by="score_lower_bound", ascending=False)[
    :20
]

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,sem,delta_0,delta_1,ground_truth
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
32,Neon Genesis Evangelion: The End of Evangelion,Movie,10.154677,9.56973,10.739624,2.830747,0.227091,-0.030274,0.780101,10.0
2476,School Days,TV,10.380535,9.544128,11.216941,5.208446,0.324714,-0.192061,1.429737,8.0
30,Neon Genesis Evangelion,TV,9.311553,8.799216,9.823889,2.120519,0.198902,0.146865,0.591355,9.0
3784,Evangelion: 2.0 You Can (Not) Advance,Movie,9.108983,8.542397,9.675569,1.563358,0.219963,0.209351,0.440139,10.0
16201,Aku no Hana,TV,9.397192,8.497745,10.296638,3.683179,0.349187,-0.24637,1.006494,8.0
3297,Aria The Origination,TV,9.476427,8.284529,10.668324,1.825555,0.462724,0.419889,0.521179,10.0
11981,Mahou Shoujo Madoka★Magica Movie 3: Hangyaku n...,Movie,8.807902,8.193564,9.42224,1.398738,0.238501,-0.076963,0.382913,10.0
9756,Mahou Shoujo Madoka★Magica,TV,8.550078,8.087686,9.012469,1.102078,0.179512,-0.411264,0.287266,10.0
227,FLCL,OVA,8.570149,8.011109,9.129188,1.735142,0.217033,-0.468948,0.459631,9.0
3785,Evangelion: 3.0 You Can (Not) Redo,Movie,8.61884,7.925719,9.311962,2.090204,0.269087,0.564104,0.600165,9.0


In [12]:
new_recs = df.loc[
    lambda x: x.ground_truth.isna() & (~x["score"].isna()) & (x["type"] == "TV")
]
new_recs = new_recs.drop('ground_truth', axis=1)

In [13]:
new_recs.loc[lambda x: (x["delta"] > 0) & ~(x[delta_cols] < 0).all(axis=1)].sort_values(
    by="score_lower_bound", ascending=False
)[:20]

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,sem,delta_0,delta_1
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
33089,Kemono Friends,TV,7.689167,6.24018,9.138154,1.277212,0.562532,-0.065553,0.349839
32681,Uchuu Patrol Luluco,TV,7.199993,6.193051,8.206936,0.821167,0.39092,-0.154924,0.220282
26,Texhnolyze,TV,6.940036,6.069221,7.810851,0.426273,0.338072,-0.426106,0.100118
26165,Yuri Kuma Arashi,TV,7.127059,6.038556,8.215562,1.231847,0.422583,-0.003605,0.339868
1454,Kemonozume,TV,7.153476,5.847962,8.45899,0.783362,0.506833,-0.671933,0.188562
14131,Girls & Panzer,TV,6.75147,5.83936,7.663579,0.211697,0.354103,-0.511202,0.037387
147,Kimi ga Nozomu Eien,TV,6.604219,5.78052,7.427919,0.241538,0.31978,-0.190028,0.058847
8726,Soredemo Machi wa Mawatteiru,TV,7.019911,5.735173,8.304649,0.642121,0.498767,-1.059693,0.133613
634,Koi Kaze,TV,6.820464,5.701509,7.939419,0.54679,0.434406,-0.08321,0.1475
182,Tenkuu no Escaflowne,TV,6.735623,5.683228,7.788019,0.042268,0.408566,0.307807,0.024339


In [None]:
# TODO cross-validated LR