In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    # No docstring here on purpose: functools.wraps copies the metadata
    # (including __doc__) of smf.ols onto this wrapper.
    # Forward every argument to smf.ols, then fit the resulting model so the
    # caller gets the fitted results object in a single call.
    unfitted = smf.ols(*args, **kwargs)
    return unfitted.fit()

In [2]:
# Name of the sub-directory of ../../data/deltas that the notebook works in.
recommendee = "Fro116"
# Two-sided confidence level used when turning the per-row std into
# score_lower_bound / score_upper_bound.
confidence_interval = 0.99
# One pickle is read per entry: "<source>_is.pkl" (and "<source>_oos.pkl" when
# cross_validate is set). Position i in this list becomes the "_i" column suffix.
delta_sources = ["item", "user"]
cross_validate = False  # if True, train the linear model on the "_oos" (out-of-sample) pickles

In [3]:
# Work from this recommendee's delta directory: the following cells open their
# pickles (and "../../AnimeList.csv") relative to it.
# The starting directory is remembered on first execution so that re-running
# this cell (e.g. after changing `recommendee`) resolves the path from the same
# place instead of from the directory a previous run already moved into.
if "_notebook_dir" not in globals():
    _notebook_dir = os.getcwd()
os.chdir(os.path.join(_notebook_dir, "..", "..", "data", "deltas", recommendee))

In [4]:
def get_deltas(sources):
    """Load one pickled delta frame per source and join them column-wise.

    Every column of the i-th frame is given the suffix ``_{i}`` (its position
    in ``sources``), so same-named columns from different sources stay
    distinct after the join.

    Args:
        sources: Iterable of paths to pickle files, each expected to unpickle
            to a pandas DataFrame.

    Returns:
        A DataFrame holding the suffixed columns of all sources side by side
        (``pd.concat(..., axis=1)``, so rows are aligned on the index).

    Raises:
        ValueError: If ``sources`` is empty (raised by ``pd.concat``).
    """
    deltas = []
    for i, source in enumerate(sources):
        # Use a context manager so the file handle is closed right after the
        # load instead of lingering until garbage collection.
        # NOTE: pickle.load can execute arbitrary code; only read trusted files.
        with open(source, "rb") as source_file:
            delta = pickle.load(source_file)
        deltas.append(delta.add_suffix(f"_{i}"))
    return pd.concat(deltas, axis=1)

In [5]:
def clean_data(df):
    """Fill the gaps in the delta / delta_var columns and return the frame.

    The number of sources is taken to be the number of columns whose name
    contains "delta_var". For each source index i, missing ``delta_{i}``
    values become 0 and missing ``delta_var_{i}`` values become the largest
    value present in that column. The frame is modified in place and the
    same object is returned.
    """
    var_columns = [name for name in df.columns if "delta_var" in name]
    for idx in range(len(var_columns)):
        delta_col = f"delta_{idx}"
        var_col = f"delta_var_{idx}"
        # A missing delta is treated as "no adjustment".
        df[delta_col] = df[delta_col].fillna(0)
        # A missing variance is replaced by the column's maximum.
        largest_var = df[var_col].max()
        df[var_col] = df[var_col].fillna(largest_var)
    return df

In [6]:
# Training deltas: the out-of-sample pickles when cross-validating, the
# in-sample ones otherwise.
train_df = get_deltas(
    [f"{x}_oos.pkl" for x in delta_sources]
    if cross_validate
    else [f"{x}_is.pkl" for x in delta_sources]
)
# Pairwise correlation between the per-source delta columns of the training frame.
delta_corrs = train_df[
    [f"delta_{idx}" for idx, _ in enumerate(delta_sources)]
].corr()

In [7]:
# Load the pickled "recommendee.pkl" frame, closing the file as soon as it has
# been read (the bare pickle.load(open(...)) form left the handle open).
# NOTE: pickle.load can execute arbitrary code; only read trusted files.
with open("recommendee.pkl", "rb") as labelled_file:
    labelled_data = pickle.load(labelled_file)
# Attach the training deltas by anime_id (default inner join, so only ids
# present on both sides remain), then fill missing delta values.
labelled_data = clean_data(labelled_data.merge(train_df, on="anime_id"))

In [8]:
# Display the correlation matrix of the per-source delta columns computed
# from the training frame.
delta_corrs

Unnamed: 0,delta_0,delta_1
delta_0,1.0,0.560888
delta_1,0.560888,1.0


In [9]:
# Fit the linear model: normalized_score regressed on one delta column per
# source, with the intercept suppressed by the "0 +" term.
delta_cols = [f"delta_{idx}" for idx, _ in enumerate(delta_sources)]
formula = "normalized_score ~ 0 +" + " + ".join(delta_cols)
model = lm(formula, labelled_data)
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:       normalized_score   R-squared (uncentered):                   0.251
Model:                            OLS   Adj. R-squared (uncentered):              0.247
Method:                 Least Squares   F-statistic:                              62.61
Date:                Sun, 02 May 2021   Prob (F-statistic):                    3.58e-24
Time:                        16:13:43   Log-Likelihood:                         -657.50
No. Observations:                 375   AIC:                                      1319.
Df Residuals:                     373   BIC:                                      1327.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [10]:
# Frame to score: always built from the in-sample ("_is") pickles, whatever
# cross_validate is set to, with missing values filled by clean_data.
df = clean_data(
    get_deltas([f"{source}_is.pkl" for source in delta_sources])
)

In [11]:
# Load the pickled "blp.pkl" object, closing the file as soon as it has been
# read (the bare pickle.load(open(...)) form left the handle open).
# NOTE: pickle.load can execute arbitrary code; only read trusted files.
# NOTE(review): blp looks like a per-title baseline prediction that the
# regression output is added on top of - confirm against the producer of blp.pkl.
with open("blp.pkl", "rb") as blp_file:
    blp = pickle.load(blp_file)
df["blp"] = blp
# Final score = baseline + the linear model's prediction from the deltas.
df["score"] = model.predict(df) + df["blp"]
# The model's contribution on its own (score minus baseline).
df["delta"] = df["score"] - df["blp"]

In [12]:
# compute error bars
#
# For every source i the model contributes coef_i * delta_i to the score.
# Each row's variance for that term is built from the row's delta_i /
# delta_var_i and the fitted coefficient (model.params) with its standard
# error (model.bse):
#     (delta_var + delta^2) * (bse^2 + coef^2) - delta^2 * coef^2
# This has the form of the variance of a product of two independent
# quantities with means (delta, coef) and variances (delta_var, bse^2).
# NOTE(review): assumes delta_var_i is the variance of delta_i and that the
# delta and the coefficient are independent - confirm.
model_vars = pd.DataFrame()
for col in delta_cols:
    # "delta_3" -> "3": recover the source index from the column name.
    i = col.split("_")[1]
    model_vars[f"model_delta_var_{i}"] = (
        (df[f"delta_var_{i}"] + df[f"delta_{i}"] ** 2)
        * (model.bse[f"delta_{i}"] ** 2 + model.params[f"delta_{i}"] ** 2)
    ) - df[f"delta_{i}"] ** 2 * model.params[f"delta_{i}"] ** 2
# Per-row, per-source standard deviations of the model terms.
model_stds = np.sqrt(model_vars)

# Restrict the correlation matrix to the delta columns used by the model
# (rows filtered by membership, columns selected in delta_cols order).
delta_corrs = delta_corrs.loc[lambda x: (x.index.isin(delta_cols)), delta_cols]
# Row-wise quadratic form s^T C s: combines the per-source stds using the
# correlations between the sources' deltas.
delta_variance = np.sum(
    (model_stds.values @ delta_corrs.values) * model_stds.values, axis=1
)
# Add the intercept's squared standard error when the model has an intercept.
intercept_variance = 0
if "Intercept" in model.bse:
    intercept_variance = model.bse["Intercept"] ** 2
df["std"] = np.sqrt(delta_variance + intercept_variance)

# Two-sided normal quantile for the configured confidence level
# (e.g. 0.99 -> the 0.995 quantile), used to scale std into an interval.
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
df["score_lower_bound"] = df["score"] - df["std"] * zscore
df["score_upper_bound"] = df["score"] + df["std"] * zscore

In [13]:
# Bring in the title and type of every anime, keeping only the columns needed.
anime = pd.read_csv("../../AnimeList.csv")[["anime_id", "title", "type"]]
# Join the metadata onto the scored frame, then index the result by anime_id.
df = df.merge(anime, on="anime_id")
df = df.set_index("anime_id")

In [14]:
# Keep only the presentation columns, in display order: metadata first, then
# the score with its bounds, then the per-source deltas.
cols = [
    "title",
    "type",
    "score",
    "score_lower_bound",
    "score_upper_bound",
    "delta",
    "std",
    *delta_cols,
]
df = df[cols]

In [15]:
# Candidates for new recommendations: TV entries whose anime_id does not
# appear in the labelled data.
new_recs = df.loc[
    lambda frame: ~frame.index.isin(labelled_data.anime_id)
    & frame["type"].eq("TV")
]

In [16]:
# Top 20 rows of the full frame by lower confidence bound, keeping rows with a
# positive combined delta where the per-source deltas are not all negative.
df.loc[
    lambda frame: frame["delta"].gt(0) & ~frame[delta_cols].lt(0).all(axis=1)
].sort_values("score_lower_bound", ascending=False).head(20)

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,std,delta_0,delta_1
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3297,Aria The Origination,TV,8.916452,7.517793,10.315111,1.265584,0.542994,0.419889,0.680283
32,Neon Genesis Evangelion: The End of Evangelion,Movie,8.88087,7.472135,10.289605,1.556944,0.546906,-0.030274,1.096315
3784,Evangelion: 2.0 You Can (Not) Advance,Movie,8.551817,7.425428,9.678205,1.006195,0.437292,0.209351,0.599908
30,Neon Genesis Evangelion,TV,8.496619,7.306401,9.686837,1.305599,0.462072,0.146865,0.837614
2001,Tengen Toppa Gurren Lagann,TV,7.813388,7.133966,8.49281,0.151802,0.263768,-0.113173,0.159181
962,Aria The Natural,TV,8.39809,7.123828,9.672351,1.135786,0.4947,0.684754,0.464429
11981,Mahou Shoujo Madoka★Magica Movie 3: Hangyaku n...,Movie,8.106247,7.109308,9.103186,0.697109,0.387036,-0.076963,0.520947
2759,Evangelion: 1.0 You Are (Not) Alone,Movie,8.145816,7.040827,9.250806,0.995122,0.428984,0.357482,0.52194
9756,Mahou Shoujo Madoka★Magica,TV,7.790441,6.975709,8.605173,0.342452,0.316299,-0.411264,0.433084
11979,Mahou Shoujo Madoka★Magica Movie 2: Eien no Mo...,Movie,7.972326,6.974563,8.97009,0.570934,0.387356,0.328184,0.241062


In [18]:
# Top 30 new recommendations by lower confidence bound, keeping rows with a
# positive combined delta where the per-source deltas are not all negative.
new_recs.loc[
    lambda frame: frame["delta"].gt(0) & ~frame[delta_cols].lt(0).all(axis=1)
].sort_values("score_lower_bound", ascending=False).head(30)

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,std,delta_0,delta_1
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
444,Maria-sama ga Miteru: Haru,TV,7.752174,6.237421,9.266927,0.892822,0.588064,1.053056,0.120864
21,One Piece,TV,7.531965,6.158456,8.905475,0.104738,0.53323,0.415994,-0.124566
3604,Hidamari Sketch x 365,TV,7.427833,6.151248,8.704418,0.557915,0.495602,0.630296,0.08869
593,Mugen no Ryvius,TV,7.726183,6.14324,9.309127,1.170213,0.614537,0.888882,0.391514
158,Maria-sama ga Miteru,TV,7.411296,6.04077,8.781821,0.994363,0.532072,1.046183,0.194687
488,Ichigo Mashimaro,TV,7.066046,5.974164,8.157927,0.314761,0.423895,0.485514,-0.011597
3750,Maria-sama ga Miteru 4th,TV,7.641417,5.948315,9.334519,0.629453,0.657304,1.12359,-0.095619
182,Tenkuu no Escaflowne,TV,6.994682,5.942131,8.047234,0.30133,0.408626,0.307807,0.063375
7062,Hidamari Sketch x ☆☆☆,TV,7.233572,5.886412,8.580732,0.235103,0.523001,0.390294,-0.02178
32836,Senki Zesshou Symphogear AXZ,TV,8.606208,5.804269,11.408147,2.241017,1.087781,2.302232,0.465135
