In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf


# Convenience wrapper: build an OLS model with smf.ols and fit it in one call.
# functools.wraps copies smf.ols' metadata (including its docstring) onto lm,
# so the explanation lives in this comment rather than in a docstring.
@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    unfitted_model = smf.ols(*args, **kwargs)
    return unfitted_model.fit()

In [2]:
# Name interpolated into the data directory path that the notebook chdir's into.
recommendee = "Fro116"
# Two-sided confidence level used for the score lower/upper bounds.
confidence_interval = 0.99
# Prefixes of the per-source delta pickles ("<source>_is.pkl" / "<source>_oos.pkl").
delta_sources = ["item", "user"]
cross_validate = False # if true, train linear model on out of sample data

In [3]:
# Work from this recommendee's delta directory so later cells can open files
# by bare name.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel attempts a second chdir from the new location.
os.chdir(f"../../data/deltas/{recommendee}")

In [4]:
def get_deltas(sources):
    """Load one pickled delta frame per source and join them column-wise.

    Every column of the i-th frame is renamed to ``<column>_<i>`` (``i`` is the
    position of the source in ``sources``) so identically named columns from
    different sources stay distinguishable after concatenation.

    Parameters
    ----------
    sources : iterable of path-like
        Pickle files, each expected to unpickle to a ``pandas.DataFrame``.

    Returns
    -------
    pandas.DataFrame
        The suffixed frames concatenated along ``axis=1``.

    Raises
    ------
    ValueError
        Propagated from ``pd.concat`` when ``sources`` is empty.
    """
    deltas = []
    for i, source in enumerate(sources):
        # The context manager closes the file handle as soon as loading is done
        # instead of leaving it to the garbage collector.
        # NOTE: pickle can execute arbitrary code -- only load trusted files.
        with open(source, "rb") as source_file:
            delta = pickle.load(source_file)
        deltas.append(delta.add_suffix(f"_{i}"))
    return pd.concat(deltas, axis=1)

In [5]:
def clean_data(df):
    """Fill missing delta values in ``df`` in place and return ``df``.

    The number of sources is taken to be the number of columns whose name
    contains ``"delta_var"``. For each source index ``i``:

    * missing ``delta_<i>`` entries become ``0``;
    * missing ``delta_var_<i>`` entries become that column's maximum.

    The frame passed in is modified and the same object is returned.
    """
    # fill missing data with reasonable defaults
    variance_columns = [name for name in df.columns if "delta_var" in name]
    for idx, _ in enumerate(variance_columns):
        value_col = f"delta_{idx}"
        variance_col = f"delta_var_{idx}"
        df[value_col] = df[value_col].fillna(0)
        largest_variance = df[variance_col].max()
        df[variance_col] = df[variance_col].fillna(largest_variance)
    return df

In [6]:
# Fit on the out-of-sample deltas when cross-validating, otherwise on the
# in-sample deltas.
train_df = get_deltas(
    [f"{source}_oos.pkl" for source in delta_sources]
    if cross_validate
    else [f"{source}_is.pkl" for source in delta_sources]
)

In [7]:
# Load the recommendee's pickled frame, attach the training deltas by
# anime_id, and fill missing delta values via clean_data.
# The context manager closes the file handle once loading is done.
# NOTE: pickle can execute arbitrary code -- only load trusted files.
with open("recommendee.pkl", "rb") as recommendee_file:
    labelled_data = pickle.load(recommendee_file)
labelled_data = clean_data(labelled_data.merge(train_df, on="anime_id"))

In [8]:
# get model
# One regressor per delta source: delta_0, delta_1, ...
delta_cols = [f"delta_{i}" for i in range(len(delta_sources))]
# The leading "0 +" removes the intercept term, so normalized_score is fitted
# as a linear combination of the delta columns only.
formula = "normalized_score ~ 0 +" + " + ".join(delta_cols)
model = lm(formula, labelled_data)
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:       normalized_score   R-squared (uncentered):                   0.733
Model:                            OLS   Adj. R-squared (uncentered):              0.731
Method:                 Least Squares   F-statistic:                              511.8
Date:                Sat, 01 May 2021   Prob (F-statistic):                   1.18e-107
Time:                        15:35:00   Log-Likelihood:                         -464.24
No. Observations:                 375   AIC:                                      932.5
Df Residuals:                     373   BIC:                                      940.3
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [9]:
# Prediction frame: always built from the in-sample deltas.
df = get_deltas([f"{source}_is.pkl" for source in delta_sources])
# Correlations between the per-source deltas are taken before clean_data
# fills the missing values.
delta_corrs = df.loc[:, [f"delta_{n}" for n in range(len(delta_sources))]].corr()
df = clean_data(df)

In [10]:
# Load blp.pkl (presumably a per-item baseline prediction -- TODO confirm) and
# add it to the model's predicted delta to obtain the final score.
# The context manager closes the file handle once loading is done.
# NOTE: pickle can execute arbitrary code -- only load trusted files.
with open("blp.pkl", "rb") as blp_file:
    blp = pickle.load(blp_file)
df["blp"] = blp
df["score"] = model.predict(df) + df["blp"]
# The model's contribution on its own, i.e. score minus blp.
df["delta"] = df["score"] - df["blp"]

In [11]:
# compute error bars
# Per-source variance of (coefficient * delta). With delta value d and variance
# v, and coefficient estimate b with standard error se, each column holds
#     (v + d^2) * (se^2 + b^2) - d^2 * b^2
# which has the form of the variance of a product of two independent random
# variables (presumably the intent -- TODO confirm).
model_vars = pd.DataFrame()
for col in delta_cols:
    i = col.split("_")[1]  # source index taken from the "delta_<i>" column name
    model_vars[f"model_delta_var_{i}"] = (
        (df[f"delta_var_{i}"] + df[f"delta_{i}"] ** 2)
        * (model.bse[f"delta_{i}"] ** 2 + model.params[f"delta_{i}"] ** 2)
    ) - df[f"delta_{i}"] ** 2 * model.params[f"delta_{i}"] ** 2
model_stds = np.sqrt(model_vars)

# Keep only the correlation rows/columns for the deltas used in the model.
delta_corrs = delta_corrs.loc[lambda x: (x.index.isin(delta_cols)), delta_cols]
# Row-wise quadratic form s^T C s: combines the per-source standard deviations
# s with the delta correlation matrix C into a single variance per row.
delta_variance = np.sum(
    (model_stds.values @ delta_corrs.values) * model_stds.values, axis=1
)
# Add the intercept's squared standard error when the fitted model has one.
intercept_variance = 0
if "Intercept" in model.bse:
    intercept_variance = model.bse["Intercept"] ** 2
df["std"] = np.sqrt(delta_variance + intercept_variance)

# Two-sided normal critical value for the configured confidence level.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
df["score_lower_bound"] = df["score"] - df["std"] * zscore
df["score_upper_bound"] = df["score"] + df["std"] * zscore

In [12]:
# Attach title/type from AnimeList.csv and index the result by anime_id.
anime = pd.read_csv("../../AnimeList.csv").loc[:, ["anime_id", "title", "type"]]
df = pd.merge(df, anime, on="anime_id").set_index("anime_id")

In [13]:
# reorder the columns: descriptive fields first, then the score with its
# bounds, then the per-source deltas; every other column is dropped.
cols = [
    "title", "type",
    "score", "score_lower_bound", "score_upper_bound",
    "delta", "std",
    *delta_cols,
]
df = df.loc[:, cols]

In [14]:
# TV entries whose anime_id does not appear in labelled_data.
new_recs = df.loc[
    lambda frame: ~frame.index.isin(labelled_data["anime_id"]) & frame["type"].eq("TV")
]

In [15]:
# Top 20 rows by lower confidence bound, keeping rows whose combined delta is
# positive and where not every per-source delta is negative.
df.loc[
    lambda frame: frame["delta"].gt(0) & (~frame[delta_cols].lt(0)).any(axis=1)
].sort_values("score_lower_bound", ascending=False).head(20)

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,std,delta_0,delta_1
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
32,Neon Genesis Evangelion: The End of Evangelion,Movie,10.549572,9.940809,11.158336,3.225646,0.236337,-0.030274,1.096315
2476,School Days,TV,10.441554,9.593728,11.289381,5.269469,0.329147,-0.192061,1.788111
30,Neon Genesis Evangelion,TV,9.645521,9.124882,10.16616,2.454501,0.202125,0.146865,0.837614
3784,Evangelion: 2.0 You Can (Not) Advance,Movie,9.297447,8.744552,9.850342,1.751825,0.214648,0.209351,0.599908
16201,Aku no Hana,TV,9.584481,8.640208,10.528754,3.870472,0.36659,-0.24637,1.311282
3297,Aria The Origination,TV,9.626685,8.357253,10.896117,1.975818,0.492825,0.419889,0.680283
11981,Mahou Shoujo Madoka★Magica Movie 3: Hangyaku n...,Movie,8.945575,8.327338,9.563812,1.536437,0.240015,-0.076963,0.520947
9756,Mahou Shoujo Madoka★Magica,TV,8.745676,8.270098,9.221255,1.297687,0.184631,-0.411264,0.433084
3785,Evangelion: 3.0 You Can (Not) Redo,Movie,8.86848,8.186068,9.550892,2.339847,0.264929,0.564104,0.806955
227,FLCL,OVA,8.75181,8.181556,9.322063,1.916793,0.221386,-0.468948,0.642466


In [16]:
# Same ranking as for df, restricted to the rows in new_recs.
new_recs.loc[
    lambda frame: frame["delta"].gt(0) & (~frame[delta_cols].lt(0)).any(axis=1)
].sort_values("score_lower_bound", ascending=False).head(20)

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,std,delta_0,delta_1
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
33089,Kemono Friends,TV,7.828507,6.285884,9.371131,1.416557,0.598884,-0.065553,0.480408
32681,Uchuu Patrol Luluco,TV,7.32919,6.280691,8.377689,0.950367,0.407053,-0.154924,0.320091
26,Texhnolyze,TV,7.135603,6.193017,8.07819,0.621844,0.365935,-0.426106,0.202959
26165,Yuri Kuma Arashi,TV,7.339487,6.114322,8.564652,1.444279,0.475639,-0.003605,0.491072
14131,Girls & Panzer,TV,6.864429,5.93668,7.792177,0.324659,0.360175,-0.511202,0.100199
593,Mugen no Ryvius,TV,7.655089,5.771968,9.538211,1.099119,0.731074,0.888882,0.391514
147,Kimi ga Nozomu Eien,TV,6.597165,5.770635,7.423695,0.234488,0.320879,-0.190028,0.075947
2403,Kodomo no Jikan (TV),TV,6.940058,5.765743,8.114372,0.86317,0.455898,1.270633,0.318898
1454,Kemonozume,TV,7.184866,5.763358,8.606373,0.814756,0.551864,-0.671933,0.263653
8726,Soredemo Machi wa Mawatteiru,TV,6.946065,5.75053,8.1416,0.568278,0.464136,-1.059693,0.172094


In [17]:
# results from non CV
# (reference table pasted from an earlier run; kept as comments so the cell
# no longer raises a SyntaxError)
#
# title	type	score	score_lower_bound	score_upper_bound	delta	sem	delta_0	delta_1
# anime_id
# 33089	Kemono Friends	TV	7.689167	6.240180	9.138154	1.277212	0.562532	-0.065553	0.349839
# 32681	Uchuu Patrol Luluco	TV	7.199993	6.193051	8.206936	0.821167	0.390920	-0.154924	0.220282
# 26	Texhnolyze	TV	6.940036	6.069221	7.810851	0.426273	0.338072	-0.426106	0.100118
# 26165	Yuri Kuma Arashi	TV	7.127059	6.038556	8.215562	1.231847	0.422583	-0.003605	0.339868
# 1454	Kemonozume	TV	7.153476	5.847962	8.458990	0.783362	0.506833	-0.671933	0.188562
# 14131	Girls & Panzer	TV	6.751470	5.839360	7.663579	0.211697	0.354103	-0.511202	0.037387
# 147	Kimi ga Nozomu Eien	TV	6.604219	5.780520	7.427919	0.241538	0.319780	-0.190028	0.058847
# 8726	Soredemo Machi wa Mawatteiru	TV	7.019911	5.735173	8.304649	0.642121	0.498767	-1.059693	0.133613
# 634	Koi Kaze	TV	6.820464	5.701509	7.939419	0.546790	0.434406	-0.083210	0.147500
# 182	Tenkuu no Escaflowne	TV	6.735623	5.683228	7.788019	0.042268	0.408566	0.307807	0.024339
# 4981	Casshern Sins	TV	6.638886	5.648150	7.629622	0.122676	0.384628	-0.588719	0.009624
# 593	Mugen no Ryvius	TV	7.322752	5.582626	9.062878	0.766778	0.675560	0.888882	0.248242
# 3604	Hidamari Sketch x 365	TV	6.917824	5.561484	8.274163	0.047902	0.526564	0.630296	0.039170
# 2403	Kodomo no Jikan (TV)	TV	6.732618	5.556205	7.909031	0.655727	0.456712	1.270633	0.233305
# 2402	Ashita no Joe	TV	7.732315	5.550491	9.914140	0.746528	0.847038	-0.451633	0.187465
# 1088	Macross	TV	6.971987	5.534449	8.409524	0.036230	0.558087	0.518613	0.031351
# 31771	Amanchu!	TV	6.860630	5.516686	8.204574	0.582180	0.521752	0.660786	0.187898
# 1852	Hidamari Sketch	TV	6.563665	5.469115	7.658215	0.169331	0.424931	0.516729	0.068012
# 1254	Saint Seiya	TV	6.941152	5.459375	8.422929	0.306313	0.575262	0.000000	0.084549
# 15125	Teekyuu	TV	6.912558	5.428617	8.396498	1.495736	0.576102	-1.208408	0.363107