In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf
from tqdm import tqdm

@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    return smf.ols(*args, **kwargs).fit()

In [2]:
recommendee = "taapaye"
confidence_interval = 0.95  # x standard deviations
delta_sources = ["related", "maluserrec"]  # TODO get programmatically
cross_validate = True  # if true, train linear model on out of sample data
renormalize_variance_iters = 10

In [3]:
outdir = f"../../data/recommendations/{recommendee}"
os.chdir(outdir)

## Ensemble signals into a linear model

In [4]:
def get_deltas(sources):
    deltas = []
    for source_filename in sources:
        delta = pickle.load(open(source_filename, "rb"))
        source = source_filename.split(".")[0].split("_")[0]
        delta = delta.rename({x: x + f"_{source}" for x in delta.columns}, axis=1)
        deltas.append(delta)
    return pd.concat(deltas, axis=1)

In [5]:
def clean_data(df):
    # fill missing data with reasonable defaults
    delta_sources = [x.split("_")[-1] for x in df.columns if "delta_var" in x]
    for source in delta_sources:
        df.loc[lambda x: x[f"delta_var_{source}"] == np.inf, f"delta_{source}"] = np.nan
        df.loc[
            lambda x: x[f"delta_var_{source}"] == np.inf, f"delta_var_{source}"
        ] = np.nan

        df[f"delta_{source}"] = df[f"delta_{source}"].fillna(0)
        df[f"delta_var_{source}"] = df[f"delta_var_{source}"].fillna(df[f"delta_var_{source}"].quantile(0.8))
    return df

In [6]:
if cross_validate:
    train_df = get_deltas([f"{x}_loocv.pkl" for x in delta_sources])
else:
    train_df = get_deltas([f"{x}.pkl" for x in delta_sources])
delta_corrs = train_df[[f"delta_{source}" for source in delta_sources]].corr()

In [7]:
labelled_data = pickle.load(open("user_anime_list.pkl", "rb"))
labelled_data = clean_data(labelled_data.merge(train_df, on="anime_id", how="left"))

In [8]:
# get model
delta_cols = [f"delta_{source}" for source in delta_sources]
formula = "score ~ " + " + ".join(delta_cols)
model = lm(formula, labelled_data)
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.117
Model:                            OLS   Adj. R-squared:                  0.113
Method:                 Least Squares   F-statistic:                     26.63
Date:                Sat, 29 May 2021   Prob (F-statistic):           1.38e-11
Time:                        14:46:23   Log-Likelihood:                -652.08
No. Observations:                 405   AIC:                             1310.
Df Residuals:                     402   BIC:                             1322.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.0705      0.145  

In [9]:
df = clean_data(get_deltas([f"{x}.pkl" for x in delta_sources]))

In [10]:
blp = pickle.load(open("baseline_predictor.pkl", "rb"))
df["blp"] = blp["blp"]
df["score"] = model.predict(df) + df["blp"]
df["delta"] = df["score"] - df["blp"]

In [11]:
valid_baseline = ~df['blp'].isna()
df = df.loc[valid_baseline]

## Compute Confidence Intervals

In [12]:
for _ in range(renormalize_variance_iters):
    for source in delta_sources:
        seen_shows = pickle.load(open("user_anime_list.pkl", "rb"))
        seen_shows = seen_shows.set_index("anime_id")
        seen_shows["delta"] = df[f"delta_{source}"]

        single_delta_model = lm("score ~ delta + 0", seen_shows)

        seen_shows["pred_score"] = single_delta_model.predict(df)
        seen_shows["pred_std"] = np.sqrt(
            (df[f"delta_var_{source}"] + df[f"delta_{source}"] ** 2)
            * (
                single_delta_model.bse["delta"] ** 2
                + single_delta_model.params["delta"] ** 2
            )
            - (df[f"delta_{source}"] ** 2 * single_delta_model.params["delta"] ** 2)
        )
        seen_shows = seen_shows.loc[lambda x: x["pred_std"] < np.inf]

        std_mult = (
            (seen_shows["pred_score"] - seen_shows["score"]) / seen_shows["pred_std"]
        ).std()
        df[f"delta_var_{source}"] *= std_mult ** 2

In [13]:
# compute error bars
model_vars = pd.DataFrame()
for col in delta_cols:
    source = col.split("_")[1]
    model_vars[f"model_delta_var_{source}"] = (
        (df[f"delta_var_{source}"] + df[f"delta_{source}"] ** 2)
        * (model.bse[f"delta_{source}"] ** 2 + model.params[f"delta_{source}"] ** 2)
    ) - df[f"delta_{source}"] ** 2 * model.params[f"delta_{source}"] ** 2
model_stds = np.sqrt(model_vars)

delta_corrs = delta_corrs.loc[lambda x: (x.index.isin(delta_cols)), delta_cols]
delta_variance = np.sum(
    (model_stds.values @ delta_corrs.values) * model_stds.values, axis=1
)
intercept_variance = 0
if "Intercept" in model.bse:
    intercept_variance = model.bse["Intercept"] ** 2
df["std"] = np.sqrt(delta_variance + intercept_variance)

In [14]:
for _ in range(renormalize_variance_iters):
    seen_shows = pickle.load(open("user_anime_list.pkl", "rb"))
    seen_shows = seen_shows.set_index("anime_id")
    seen_shows["score"] += df["blp"]
    seen_shows["pred_score"] = df[f"score"]
    seen_shows["pred_std"] = df["std"]

    std_mult = (
        (seen_shows["pred_score"] - seen_shows["score"]) / seen_shows["pred_std"]
    ).std()
    df["std"] *= std_mult

In [15]:
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
df["score_lower_bound"] = df["score"] - df["std"] * zscore
df["score_upper_bound"] = df["score"] + df["std"] * zscore

## Display Recommendations

In [16]:
anime = pd.read_csv("../../cleaned_data/anime.csv")
anime = anime[["anime_id", "title", "medium", "genres"]]
df = df.merge(anime, on="anime_id").set_index("anime_id")

In [17]:
# reorder the columns
cols = [
    "title",
    "medium",
    "score",
    "score_lower_bound",
    "score_upper_bound",
    "delta",
    "std",
] + delta_cols
df = df[cols + [x for x in df.columns if x not in cols]]

In [18]:
related_series = pickle.load(open("../../processed_data/strict_relations_anime_graph.pkl", "rb"))
df = df.merge(related_series, on="anime_id").set_index("anime_id")

In [19]:
new_recs = df.loc[lambda x: ~x.index.isin(labelled_data.anime_id) & (x["medium"] == "tv")]

In [20]:
epsilon = 1e-6
min_bound = epsilon
if "Intercept" in model.params:
    min_bound += model.params["Intercept"]

In [21]:
df.loc[lambda x: x["delta"] > min_bound].sort_values(
    by="score_lower_bound", ascending=False
)[:20]

Unnamed: 0_level_0,title,medium,score,score_lower_bound,score_upper_bound,delta,std,delta_related,delta_maluserrec,delta_var_related,delta_var_maluserrec,blp,genres,series_id
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
35247,Owarimonogatari 2nd Season,tv,8.392508,6.998008,9.787007,0.119918,0.711493,0.096574,0.0,0.308811,0.678813,8.27259,"['Mystery', 'Comedy', 'Supernatural', 'Vampire']",3237
11061,Hunter x Hunter (2011),tv,8.855971,6.945851,10.76609,0.433709,0.974569,1.477738,-0.395013,3.557429,0.443751,8.422262,"['Action', 'Adventure', 'Fantasy', 'Shounen', ...",5328
9756,Mahou Shoujo Madoka★Magica,tv,8.094088,6.827502,9.360674,0.429051,0.646229,1.384329,-0.351656,0.627074,0.399321,7.665037,"['Psychological', 'Drama', 'Magic', 'Thriller']",4843
31757,Kizumonogatari II: Nekketsu-hen,movie,7.992907,6.598407,9.387407,0.119918,0.711493,0.096574,0.0,0.308811,0.678813,7.87299,"['Action', 'Mystery', 'Supernatural', 'Vampire']",3237
10153,Mahou Shoujo Lyrical Nanoha: The Movie 2nd A's,movie,8.066672,6.553954,9.579389,0.519793,0.771809,0.877827,0.0,0.692523,0.678813,7.546878,"['Action', 'Magic', 'Comedy', 'Sci-Fi', 'Drama']",3198
30276,One Punch Man,tv,8.0813,6.543886,9.618714,0.327281,0.784409,1.120678,-0.318321,1.745659,0.405646,7.754019,"['Action', 'Sci-Fi', 'Comedy', 'Parody', 'Supe...",8680
397,Seikai no Senki II,tv,7.97258,6.449696,9.495464,0.832109,0.776996,1.488011,0.0,0.693191,0.678813,7.140471,"['Action', 'Military', 'Romance', 'Sci-Fi', 'S...",223
9989,Ano Hi Mita Hana no Namae wo Bokutachi wa Mada...,tv,8.303008,6.396693,10.209323,0.727733,0.972628,2.024725,-0.380892,3.663029,0.408946,7.575275,"['Slice of Life', 'Supernatural', 'Drama']",4956
38474,Yuru Camp△ Season 2,tv,7.931541,6.382198,9.480884,0.13085,0.790496,0.556769,-0.225682,1.057986,0.609841,7.800691,"['Slice of Life', 'Comedy']",10449
77,Mahou Shoujo Lyrical Nanoha A's,tv,7.568016,6.269816,8.866215,0.212151,0.662359,0.877827,-0.309107,0.692523,0.425075,7.355865,"['Action', 'Comedy', 'Drama', 'Magic', 'Super ...",52


In [22]:
new_recs.loc[lambda x: (x["delta"] > min_bound)].sort_values(
    by="score_lower_bound", ascending=False
).groupby("series_id").first().sort_values(by="score_lower_bound", ascending=False)[:50]

Unnamed: 0_level_0,title,medium,score,score_lower_bound,score_upper_bound,delta,std,delta_related,delta_maluserrec,delta_var_related,delta_var_maluserrec,blp,genres
series_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
93,Hunter x Hunter,tv,8.100671,6.195244,10.006098,0.447179,0.972174,1.477738,-0.381479,3.557429,0.439297,7.653492,"['Action', 'Adventure', 'Super Power', 'Fantas..."
2196,Minami-ke Okaeri,tv,7.407833,5.687173,9.128493,0.53143,0.877904,1.166262,-0.136643,1.063954,0.8251,6.876403,"['Slice of Life', 'Comedy', 'School']"
7541,World Trigger 2nd Season,tv,7.68491,5.558179,9.811641,0.337567,1.085087,0.521804,0.0,4.011786,0.678813,7.347343,"['Action', 'Sci-Fi', 'Supernatural', 'School',..."
12668,Beastars 2nd Season,tv,7.389615,5.518418,9.260813,0.085597,0.95471,0.695482,-0.342488,3.666366,0.387956,7.304019,"['Slice of Life', 'Psychological', 'Drama', 'S..."
52,Mahou Shoujo Lyrical Nanoha ViVid,tv,6.159791,4.893876,7.425706,0.222596,0.645887,0.877827,-0.298612,0.692523,0.393671,5.937194,"['Action', 'Adventure', 'Magic', 'Martial Arts']"
372,Aria the Natural,tv,7.764196,4.709165,10.819226,0.238463,1.558718,0.0,0.168776,3.368336,2.669725,7.525732,"['Sci-Fi', 'Slice of Life', 'Fantasy', 'Shounen']"
3089,Cyborg 009 (1979),tv,6.45749,4.448814,8.466166,0.079813,1.024853,0.01822,0.0,3.248328,0.678813,6.377677,"['Action', 'Adventure', 'Mecha', 'Sci-Fi', 'Sh..."
790,Nekojiru Gekijou Jirujiru Original,tv,6.589117,4.255245,8.92299,1.046698,1.190773,2.11392,-0.106279,4.885957,0.783143,5.542419,"['Action', 'Comedy']"
1655,Great Mazinger,tv,6.227489,4.218813,8.236164,0.079813,1.024853,0.01822,0.0,3.248328,0.678813,6.147676,"['Action', 'Mecha', 'Sci-Fi', 'Shounen']"
2345,Daisuki! Hello Kitty,tv,5.859177,4.158848,7.559505,0.2972,0.86753,0.442938,0.0,1.524338,0.678813,5.561977,"['Fantasy', 'Kids']"


In [23]:
# Inreased serendipity!
new_recs.loc[lambda x: (x["delta_user"] > 0)].sort_values(
    by="score_lower_bound", ascending=False
).groupby("series_id").first().sort_values(by="score_lower_bound", ascending=False)[:50]

KeyError: 'delta_user'