In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf
from tqdm import tqdm

@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    return smf.ols(*args, **kwargs).fit()

In [2]:
recommendee = "taapaye"
confidence_interval = 0.95  # x standard deviations
delta_sources = ["item", "item16", "user", "maluserrec", "related"]  # TODO get programmatically
cross_validate = True  # if true, train linear model on out of sample data
renormalize_variance_iters = 10

In [3]:
outdir = f"../../data/recommendations/{recommendee}"
os.chdir(outdir)

## Ensemble signals into a linear model

In [4]:
def get_deltas(sources):
    deltas = []
    for source_filename in sources:
        delta = pickle.load(open(source_filename, "rb"))
        source = source_filename.split(".")[0].split("_")[0]
        delta = delta.rename({x: x + f"_{source}" for x in delta.columns}, axis=1)
        deltas.append(delta)
    return pd.concat(deltas, axis=1)

In [5]:
def clean_data(df):
    # fill missing data with reasonable defaults
    delta_sources = [x.split("_")[-1] for x in df.columns if "delta_var" in x]
    for source in delta_sources:
        df.loc[lambda x: x[f"delta_var_{source}"] == np.inf, f"delta_{source}"] = np.nan
        df.loc[
            lambda x: x[f"delta_var_{source}"] == np.inf, f"delta_var_{source}"
        ] = np.nan

        df[f"delta_{source}"] = df[f"delta_{source}"].fillna(0)
        df[f"delta_var_{source}"] = df[f"delta_var_{source}"].fillna(df[f"delta_var_{source}"].quantile(0.8))
    return df

In [6]:
if cross_validate:
    train_df = get_deltas([f"{x}_loocv.pkl" for x in delta_sources])
else:
    train_df = get_deltas([f"{x}.pkl" for x in delta_sources])
delta_corrs = train_df[[f"delta_{source}" for source in delta_sources]].corr()

In [7]:
labelled_data = pickle.load(open("user_anime_list.pkl", "rb"))
labelled_data = clean_data(labelled_data.merge(train_df, on="anime_id", how="left"))

In [8]:
# get model
delta_cols = [f"delta_{source}" for source in delta_sources]
formula = "score ~ " + " + ".join(delta_cols)
model = lm(formula, labelled_data)
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  score   R-squared:                       0.203
Model:                            OLS   Adj. R-squared:                  0.192
Method:                 Least Squares   F-statistic:                     17.50
Date:                Sat, 29 May 2021   Prob (F-statistic):           1.93e-15
Time:                        08:35:22   Log-Likelihood:                -551.26
No. Observations:                 349   AIC:                             1115.
Df Residuals:                     343   BIC:                             1138.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.1398      0.109  

In [9]:
df = clean_data(get_deltas([f"{x}.pkl" for x in delta_sources]))

In [10]:
blp = pickle.load(open("baseline_predictor.pkl", "rb"))
df["blp"] = blp["blp"]
df["score"] = model.predict(df) + df["blp"]
df["delta"] = df["score"] - df["blp"]

In [11]:
valid_baseline = ~df['blp'].isna()
df = df.loc[valid_baseline]

## Compute Confidence Intervals

In [12]:
for _ in range(renormalize_variance_iters):
    for source in delta_sources:
        seen_shows = pickle.load(open("user_anime_list.pkl", "rb"))
        seen_shows = seen_shows.set_index("anime_id")
        seen_shows["delta"] = df[f"delta_{source}"]

        single_delta_model = lm("score ~ delta + 0", seen_shows)

        seen_shows["pred_score"] = single_delta_model.predict(df)
        seen_shows["pred_std"] = np.sqrt(
            (df[f"delta_var_{source}"] + df[f"delta_{source}"] ** 2)
            * (
                single_delta_model.bse["delta"] ** 2
                + single_delta_model.params["delta"] ** 2
            )
            - (df[f"delta_{source}"] ** 2 * single_delta_model.params["delta"] ** 2)
        )
        seen_shows = seen_shows.loc[lambda x: x["pred_std"] < np.inf]

        std_mult = (
            (seen_shows["pred_score"] - seen_shows["score"]) / seen_shows["pred_std"]
        ).std()
        df[f"delta_var_{source}"] *= std_mult ** 2

In [13]:
# compute error bars
model_vars = pd.DataFrame()
for col in delta_cols:
    source = col.split("_")[1]
    model_vars[f"model_delta_var_{source}"] = (
        (df[f"delta_var_{source}"] + df[f"delta_{source}"] ** 2)
        * (model.bse[f"delta_{source}"] ** 2 + model.params[f"delta_{source}"] ** 2)
    ) - df[f"delta_{source}"] ** 2 * model.params[f"delta_{source}"] ** 2
model_stds = np.sqrt(model_vars)

delta_corrs = delta_corrs.loc[lambda x: (x.index.isin(delta_cols)), delta_cols]
delta_variance = np.sum(
    (model_stds.values @ delta_corrs.values) * model_stds.values, axis=1
)
intercept_variance = 0
if "Intercept" in model.bse:
    intercept_variance = model.bse["Intercept"] ** 2
df["std"] = np.sqrt(delta_variance + intercept_variance)

In [14]:
for _ in range(renormalize_variance_iters):
    seen_shows = pickle.load(open("user_anime_list.pkl", "rb"))
    seen_shows = seen_shows.set_index("anime_id")
    seen_shows["score"] += df["blp"]
    seen_shows["pred_score"] = df[f"score"]
    seen_shows["pred_std"] = df["std"]

    std_mult = (
        (seen_shows["pred_score"] - seen_shows["score"]) / seen_shows["pred_std"]
    ).std()
    df["std"] *= std_mult

In [15]:
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
df["score_lower_bound"] = df["score"] - df["std"] * zscore
df["score_upper_bound"] = df["score"] + df["std"] * zscore

## Display Recommendations

In [16]:
anime = pd.read_csv("../../cleaned_data/anime.csv")
anime = anime[["anime_id", "title", "type", "genres"]]
df = df.merge(anime, on="anime_id").set_index("anime_id")

In [17]:
# reorder the columns
cols = [
    "title",
    "type",
    "score",
    "score_lower_bound",
    "score_upper_bound",
    "delta",
    "std",
] + delta_cols
df = df[cols + [x for x in df.columns if x not in cols]]

In [18]:
related_series = pickle.load(open("../../processed_data/related_anime_graph.pkl", "rb"))
df = df.merge(related_series, on="anime_id").set_index("anime_id")

In [19]:
new_recs = df.loc[lambda x: ~x.index.isin(labelled_data.anime_id) & (x["type"] == "TV")]

In [20]:
epsilon = 1e-6
min_bound = epsilon
if "Intercept" in model.params:
    min_bound += model.params["Intercept"]

In [21]:
df.loc[lambda x: x["delta"] > min_bound].sort_values(
    by="score_lower_bound", ascending=False
)[:20]

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,std,delta_item,delta_item16,delta_user,delta_maluserrec,delta_related,delta_var_item,delta_var_item16,delta_var_user,delta_var_maluserrec,delta_var_related,blp,genres,series_id
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
9756,Mahou Shoujo Madoka★Magica,TV,8.520119,7.399133,9.641104,0.786356,0.571942,-0.180945,0.424327,0.238463,-0.296815,1.763286,0.169719,0.583851,0.156777,1.064748,0.577888,7.733763,"Drama, Magic, Psychological, Thriller",4026
11977,Mahou Shoujo Madoka★Magica Movie 1: Hajimari n...,Movie,8.638204,7.083808,10.1926,1.17574,0.793074,0.0016,1.086926,0.344723,-0.080459,1.763286,0.50637,1.402858,0.330955,1.127715,0.577888,7.462465,"Psychological, Drama, Magic, Thriller",4026
11981,Mahou Shoujo Madoka★Magica Movie 3: Hangyaku n...,Movie,8.523027,7.027728,10.018326,0.8281,0.762922,-0.210841,0.782247,0.233431,-0.989657,1.763286,0.285154,0.787161,0.243497,6.607238,0.577888,7.694927,"Psychological, Drama, Magic, Thriller",4026
11979,Mahou Shoujo Madoka★Magica Movie 2: Eien no Mo...,Movie,8.784172,6.980043,10.588301,1.097013,0.920491,-0.006584,1.11961,0.338489,-0.783209,1.763286,0.480728,1.264306,0.344165,11.166306,0.577888,7.687159,"Drama, Magic, Psychological, Thriller",4026
2001,Tengen Toppa Gurren Lagann,TV,8.120288,6.979,9.261575,0.17291,0.5823,-0.290416,0.355857,-0.169642,-0.161803,0.760486,0.160788,0.651437,0.117547,1.142753,2.45629,7.947378,"Action, Adventure, Comedy, Mecha, Sci-Fi",1228
1887,Lucky☆Star,TV,8.556521,6.634086,10.478956,1.535936,0.980852,-0.131394,0.485463,0.858264,0.112967,1.754205,0.485134,1.757811,0.684241,0.858343,1.272798,7.020585,"Slice of Life, Comedy, Parody, School",1166
18679,Kill la Kill,TV,7.655339,6.570751,8.739926,0.304012,0.553371,-0.252379,0.349051,-0.006415,-0.424914,0.663338,0.176076,0.85075,0.117784,0.779499,1.150177,7.351327,"Action, Comedy, Super Power, Ecchi, School",5352
10165,Nichijou,TV,7.795575,6.32454,9.26661,0.223606,0.750542,-0.079173,-0.385788,0.018876,0.522423,0.935895,0.200845,1.506013,0.265198,2.331364,2.375245,7.571969,"Slice of Life, Comedy, School",3809
4472,Lucky☆Star: Original na Visual to Animation,OVA,8.649603,6.295813,11.003392,1.462869,1.200935,0.199857,2.004933,0.249025,0.359108,1.754205,1.087538,5.482304,0.693011,1.144802,1.272798,7.186734,"Comedy, Parody, School, Slice of Life",1166
820,Ginga Eiyuu Densetsu,OVA,8.446069,6.196177,10.695961,0.288294,1.147925,-0.164171,-0.244,0.093243,0.209639,0.750089,0.333438,3.187673,1.07496,1.810372,3.477408,8.157775,"Military, Sci-Fi, Space, Drama",553


In [22]:
new_recs.loc[lambda x: (x["delta"] > min_bound)].sort_values(
    by="score_lower_bound", ascending=False
).groupby("series_id").first().sort_values(by="score_lower_bound", ascending=False)[:50]

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,std,delta_item,delta_item16,delta_user,delta_maluserrec,delta_related,delta_var_item,delta_var_item16,delta_var_user,delta_var_maluserrec,delta_var_related,blp,genres
series_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
7187,Prison School,TV,7.254345,6.065596,8.443095,0.167673,0.606516,-0.19883,0.583448,-0.166482,0.028087,0.0,0.18709,0.782211,0.173491,0.535451,2.397548,7.086673,"Comedy, Ecchi, Romance, School, Seinen"
42,Azumanga Daioh,TV,7.627188,5.691436,9.562939,0.430952,0.987647,0.003778,0.722896,0.00388,0.193631,0.0,0.390392,1.73177,0.844012,1.352767,2.397548,7.196236,"Slice of Life, Comedy, School"
8404,Demi-chan wa Kataritai,TV,7.185648,5.633831,8.737466,0.23046,0.791758,-0.335583,1.191519,-0.316381,0.105927,0.0,0.233768,2.36681,0.246658,0.788312,2.397548,6.955188,"Comedy, Vampire, Fantasy, School, Seinen"
7708,New Game!,TV,7.234335,5.484465,8.984205,0.294525,0.892807,-0.19254,1.375773,-0.233192,-0.723338,0.0,0.270614,3.442736,0.325985,1.499241,2.397548,6.93981,"Game, Slice of Life, Comedy"
3268,Kuuchuu Buranko,TV,7.713456,5.328586,10.098325,0.514147,1.216792,-0.186263,0.407349,0.197433,0.128963,0.0,0.581234,1.816337,1.557977,2.065591,2.397548,7.199309,"Comedy, Psychological, Drama, Seinen"
3801,Panty & Stocking with Garterbelt,TV,7.229419,5.319888,9.138949,0.367353,0.974268,-0.022397,0.399533,0.0726,0.025044,0.0,0.383196,3.073299,0.644438,0.74913,2.397548,6.862066,"Action, Comedy, Parody, Supernatural, Ecchi"
7707,Kono Bijutsubu ni wa Mondai ga Aru!,TV,7.116479,5.238333,8.994626,0.516051,0.958256,-0.09103,1.639414,-0.162703,-0.344398,0.0,0.376384,3.447539,0.40808,2.707502,2.397548,6.600428,"Comedy, Romance, School"
8607,Blend S,TV,6.953812,5.231563,8.676062,0.241667,0.878715,-0.362791,0.985364,-0.280652,0.510754,0.0,0.298111,2.272893,0.363897,2.040832,2.397548,6.712145,"Slice of Life, Comedy"
7908,Uchuu Patrol Luluco,TV,7.063373,5.228644,8.898103,0.398784,0.936104,-0.011575,0.116653,0.214501,-0.151781,0.0,0.381431,1.857501,0.671821,1.232793,2.397548,6.664589,"Action, Adventure, Comedy, Space"
1826,Minami-ke Okaeri,TV,7.779192,5.194879,10.363504,0.858352,1.318551,0.428905,1.307538,-0.012111,0.294014,1.341275,1.774241,3.602049,0.975032,5.777493,0.981969,6.92084,"Slice of Life, Comedy, School"


In [23]:
# Inreased serendipity!
new_recs.loc[lambda x: (x["delta_user"] > 0)].sort_values(
    by="score_lower_bound", ascending=False
).groupby("series_id").first().sort_values(by="score_lower_bound", ascending=False)[:50]

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,std,delta_item,delta_item16,delta_user,delta_maluserrec,delta_related,delta_var_item,delta_var_item16,delta_var_user,delta_var_maluserrec,delta_var_related,blp,genres
series_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
42,Azumanga Daioh,TV,7.627188,5.691436,9.562939,0.430952,0.987647,0.003778,0.722896,0.00388,0.193631,0.0,0.390392,1.73177,0.844012,1.352767,2.397548,7.196236,"Slice of Life, Comedy, School"
3268,Kuuchuu Buranko,TV,7.713456,5.328586,10.098325,0.514147,1.216792,-0.186263,0.407349,0.197433,0.128963,0.0,0.581234,1.816337,1.557977,2.065591,2.397548,7.199309,"Comedy, Psychological, Drama, Seinen"
3801,Panty & Stocking with Garterbelt,TV,7.229419,5.319888,9.138949,0.367353,0.974268,-0.022397,0.399533,0.0726,0.025044,0.0,0.383196,3.073299,0.644438,0.74913,2.397548,6.862066,"Action, Comedy, Parody, Supernatural, Ecchi"
7908,Uchuu Patrol Luluco,TV,7.063373,5.228644,8.898103,0.398784,0.936104,-0.011575,0.116653,0.214501,-0.151781,0.0,0.381431,1.857501,0.671821,1.232793,2.397548,6.664589,"Action, Adventure, Comedy, Space"
5284,Tonari no Seki-kun,TV,6.905579,4.863692,8.947466,0.087816,1.041798,-0.049502,-0.245169,0.008166,0.252201,0.0,1.01127,4.752676,0.412719,0.936996,2.397548,6.817763,"Comedy, School, Seinen"
1826,Minami-ke Okawari,TV,7.476436,4.845928,10.106945,0.81692,1.342121,0.405827,1.241569,0.004617,0.0,1.341275,2.189934,3.710381,0.912374,6.721908,0.981969,6.659516,"Comedy, School, Slice of Life"
3776,Soredemo Machi wa Mawatteiru,TV,7.18779,4.774363,9.601216,0.524236,1.231363,0.031221,0.100584,0.285662,0.286352,0.0,0.542051,2.682089,1.59676,0.682714,2.397548,6.663554,"Comedy, Slice of Life"
16,Texhnolyze,TV,6.787076,4.713075,8.861077,-0.012451,1.058183,-0.124228,-0.245199,0.006248,-0.553806,0.0,0.382896,1.371036,1.161852,1.525195,2.397548,6.799527,"Action, Sci-Fi, Psychological, Drama"
2393,Sekirei: Pure Engagement,TV,6.969844,4.675058,9.26463,0.119477,1.170831,0.106501,0.0,0.02368,-0.416876,0.0,0.323453,11.102327,0.441665,1.412812,2.397548,6.850367,"Action, Harem, Comedy, Super Power, Ecchi"
1842,Bamboo Blade,TV,7.183249,4.490852,9.875646,0.441267,1.373697,0.142404,0.667761,0.063539,-0.13556,0.0,0.717878,7.170143,1.331639,1.64905,2.397548,6.741982,"Comedy, School, Seinen, Sports"
