In [1]:
import functools
import os
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.formula.api as smf


@functools.wraps(smf.ols)
def lm(*args, **kwargs):
    return smf.ols(*args, **kwargs).fit()

In [2]:
recommendee = "Fro116"
confidence_interval = 0.50 # 0.9545 # 2 standard deviations
delta_sources = ["item", "user"]  # "item", "user"
cross_validate = True  # if true, train linear model on out of sample data
renormalize_variance = True

In [3]:
outdir = f"../../data/recommendations/{recommendee}"
os.chdir(outdir)

In [4]:
def get_deltas(sources):
    deltas = []
    for source_filename in sources:
        delta = pickle.load(open(source_filename, "rb"))
        source = source_filename.split('.')[0].split('_')[0]
        delta = delta.rename({x: x + f"_{source}" for x in delta.columns}, axis=1)
        deltas.append(delta)
    return pd.concat(deltas, axis=1)

In [5]:
def clean_data(df):
    # fill missing data with reasonable defaults
    delta_sources = [x.split("_")[-1] for x in df.columns if "delta_var" in x]
    for source in delta_sources:
        df.loc[lambda x: x[f'delta_var_{source}'] == np.inf, f'delta_{source}'] = np.nan                
        df.loc[lambda x: x[f'delta_var_{source}'] == np.inf, f'delta_var_{source}'] = np.nan        
        
        df[f"delta_{source}"] = df[f"delta_{source}"].fillna(0)
        df[f"delta_var_{source}"] = df[f"delta_var_{source}"].fillna(df[f"delta_var_{source}"].dropna().median())
    return df

In [6]:
if cross_validate:
    train_df = get_deltas([f"{x}_loocv.pkl" for x in delta_sources])
else:
    train_df = get_deltas([f"{x}.pkl" for x in delta_sources])
delta_corrs = train_df[[f"delta_{source}" for source in delta_sources]].corr()

In [7]:
labelled_data = pickle.load(open("user_anime_list.pkl", "rb"))
labelled_data = clean_data(labelled_data.merge(train_df, on="anime_id"))

In [8]:
# get model
delta_cols = [f"delta_{source}" for source in delta_sources]
formula = "score ~ 0 +" + " + ".join(delta_cols)
model = lm(formula, labelled_data)
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:                  score   R-squared (uncentered):                   0.217
Model:                            OLS   Adj. R-squared (uncentered):              0.213
Method:                 Least Squares   F-statistic:                              48.11
Date:                Sun, 23 May 2021   Prob (F-statistic):                    3.61e-19
Time:                        12:59:20   Log-Likelihood:                         -639.59
No. Observations:                 349   AIC:                                      1283.
Df Residuals:                     347   BIC:                                      1291.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [9]:
df = clean_data(get_deltas([f"{x}.pkl" for x in delta_sources]))

In [10]:
blp = pickle.load(open("baseline_predictor.pkl", "rb"))
df["blp"] = blp["blp"]
df["score"] = model.predict(df) + df["blp"]
df["delta"] = df["score"] - df["blp"]

## renormalize the confidence intervals

In [11]:
if renormalize_variance:
    for source in delta_sources:
        seen_shows = pickle.load(open("user_anime_list.pkl", "rb"))
        seen_shows = seen_shows.set_index("anime_id")
        seen_shows["delta"] = df[f"delta_{source}"]

        single_delta_model = lm("score ~ delta + 0", seen_shows)

        seen_shows["pred_score"] = single_delta_model.predict(df)
        seen_shows["pred_std"] = np.sqrt(
            (df[f"delta_var_{source}"] + df[f"delta_{source}"] ** 2)
            * (
                single_delta_model.bse["delta"] ** 2
                + single_delta_model.params["delta"] ** 2
            )
            - (df[f"delta_{source}"] ** 2 * single_delta_model.params["delta"] ** 2)
        )
        seen_shows = seen_shows.loc[lambda x: x["pred_std"] < np.inf]

        std_mult = (
            (seen_shows["pred_score"] - seen_shows["score"]) / seen_shows["pred_std"]
        ).std()
        df[f"delta_var_{source}"] *= std_mult ** 2

In [12]:
# compute error bars
model_vars = pd.DataFrame()
for col in delta_cols:
    source = col.split("_")[1]
    model_vars[f"model_delta_var_{source}"] = (
        (df[f"delta_var_{source}"] + df[f"delta_{source}"] ** 2)
        * (model.bse[f"delta_{source}"] ** 2 + model.params[f"delta_{source}"] ** 2)
    ) - df[f"delta_{source}"] ** 2 * model.params[f"delta_{source}"] ** 2
model_stds = np.sqrt(model_vars)

delta_corrs = delta_corrs.loc[lambda x: (x.index.isin(delta_cols)), delta_cols]
delta_variance = np.sum(
    (model_stds.values @ delta_corrs.values) * model_stds.values, axis=1
)
intercept_variance = 0
if "Intercept" in model.bse:
    intercept_variance = model.bse["Intercept"] ** 2
df["std"] = np.sqrt(delta_variance + intercept_variance)

In [13]:
if renormalize_variance:
    seen_shows = pickle.load(open("user_anime_list.pkl", "rb"))
    seen_shows = seen_shows.set_index("anime_id")
    seen_shows["score"] += df["blp"]
    seen_shows["pred_score"] = df[f"score"]
    seen_shows["pred_std"] = df["std"]

    seen_shows = seen_shows.loc[lambda x: x["pred_std"] < np.inf]

    std_mult = (
        (seen_shows["pred_score"] - seen_shows["score"]) / seen_shows["pred_std"]
    ).std()
    df["std"] *= std_mult

In [14]:
zscore = st.norm.ppf(1 - (1 - confidence_interval) / 2)
df["score_lower_bound"] = df["score"] - df["std"] * zscore
df["score_upper_bound"] = df["score"] + df["std"] * zscore

In [15]:
anime = pd.read_csv("../../cleaned_data/anime.csv")
anime = anime[["anime_id", "title", "type"]]
df = df.merge(anime, on="anime_id").set_index("anime_id")

In [16]:
# reorder the columns
cols = [
    "title",
    "type",
    "score",
    "score_lower_bound",
    "score_upper_bound",
    "delta",
    "std",
] + delta_cols
df = df[cols + [x for x in df.columns if x not in cols]]

In [17]:
confidence_interval, zscore

(0.5, 0.6744897501960817)

In [18]:
related_series = pickle.load(open("../../processed_data/related_anime_graph.pkl", "rb"))
df = df.merge(related_series, on="anime_id").set_index("anime_id")

In [19]:
new_recs = df.loc[lambda x: ~x.index.isin(labelled_data.anime_id) & (x["type"] == "TV")]

In [20]:
df.loc[lambda x: (x["delta"] > 0)].sort_values(by="score_lower_bound", ascending=False)[
    :20
]

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,std,delta_item,delta_user,delta_var_item,delta_var_user,blp,series_id
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
32,Neon Genesis Evangelion: The End of Evangelion,Movie,8.511379,7.852083,9.170675,1.230053,0.977474,-0.103332,1.092234,0.31111,0.185522,7.281326,20
30,Neon Genesis Evangelion,TV,8.109074,7.521555,8.696592,0.960631,0.871057,-0.062037,0.835813,0.285,0.144081,7.148442,20
3784,Evangelion: 2.0 You Can (Not) Advance,Movie,8.087159,7.492446,8.681871,0.584137,0.881722,-0.106574,0.571642,0.301722,0.183385,7.503022,1701
3297,Aria The Origination,TV,8.480107,7.483882,9.476333,0.871839,1.477006,-0.009118,0.715106,0.349068,1.053962,7.608268,339
11981,Mahou Shoujo Madoka★Magica Movie 3: Hangyaku n...,Movie,7.827709,7.243489,8.411929,0.461148,0.866166,-0.151492,0.513312,0.216984,0.244905,7.36656,4026
820,Ginga Eiyuu Densetsu,OVA,8.208669,7.197527,9.21981,0.379261,1.499121,-0.256765,0.543877,0.316687,1.166584,7.829408,553
9756,Mahou Shoujo Madoka★Magica,TV,7.646732,7.184501,8.108963,0.241336,0.685305,-0.245452,0.421659,0.15085,0.13681,7.405396,4026
2759,Evangelion: 1.0 You Are (Not) Alone,Movie,7.707012,7.122743,8.291282,0.598894,0.866239,-0.021774,0.505513,0.300701,0.176846,7.108118,1701
962,Aria The Natural,TV,7.962002,7.084559,8.839444,0.742297,1.300899,0.11572,0.495139,0.402416,0.679181,7.219704,339
11979,Mahou Shoujo Madoka★Magica Movie 2: Eien no Mo...,Movie,7.653861,7.000668,8.307054,0.295069,0.968426,-0.001944,0.240971,0.251887,0.356013,7.358792,4026


In [21]:
new_recs.loc[lambda x: (x["delta"] > 0)].sort_values(
    by="score_lower_bound", ascending=False
).groupby("series_id").first().sort_values(by="score_lower_bound", ascending=False)[:50]

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,std,delta_item,delta_user,delta_var_item,delta_var_user,blp
series_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1143,Hidamari Sketch x ☆☆☆,TV,7.10998,6.033641,8.186318,0.15411,1.595782,0.026403,0.100608,0.355586,1.366797,6.95587
5698,Gochuumon wa Usagi desu ka??,TV,6.990816,5.971828,8.009804,0.149253,1.510754,0.155319,-0.022046,0.488619,1.011429,6.841564
4084,Love Live! School Idol Project 2nd Season,TV,6.927018,5.9549,7.899135,0.045217,1.441264,0.161711,-0.112264,0.775709,0.60982,6.881801
7708,New Game!!,TV,6.884592,5.936474,7.832711,0.074861,1.405683,0.12598,-0.05533,0.379934,0.925355,6.809731
346,Ichigo Mashimaro,TV,6.793884,5.867244,7.720523,0.085199,1.373838,0.053271,0.020005,0.388178,0.855182,6.708685
427,Mugen no Ryvius,TV,7.169871,5.827136,8.512605,0.6565,1.990741,0.142223,0.401186,0.532712,2.139447,6.513371
130,Macross,TV,7.038905,5.813434,8.264376,0.145752,1.816886,0.045987,0.075798,0.580894,1.612791,6.893153
8062,Kemono Friends,TV,7.089994,5.806529,8.373459,0.720643,1.902868,0.171328,0.426378,0.660027,1.719808,6.369351
4809,Girls & Panzer,TV,6.608053,5.785452,7.430653,0.110883,1.21959,0.031447,0.060922,0.369567,0.604486,6.49717
102,Maria-sama ga Miteru: Haru,TV,7.19609,5.770453,8.621726,0.379337,2.113651,0.271762,0.057228,0.758364,2.214877,6.816752


In [22]:
new_recs.loc[lambda x: (x["delta_item"] < 0) & (x["delta_user"] > 0)].sort_values(
    by="score_lower_bound", ascending=False
).groupby("series_id").first().sort_values(by="score_lower_bound", ascending=False)[:50]

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,std,delta_item,delta_user,delta_var_item,delta_var_user,blp
series_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
124,Tenkuu no Escaflowne,TV,6.601719,5.656515,7.546923,-0.049034,1.401362,-0.0645,0.01965,0.415938,0.876021,6.650753
1143,Hidamari Sketch,TV,6.536194,5.636364,7.436023,0.184463,1.334089,-0.003216,0.152487,0.311484,0.869493,6.351731
16,Texhnolyze,TV,6.440085,5.618848,7.261322,-0.031075,1.217568,-0.271167,0.224523,0.361092,0.601878,6.47116
91,Kimi ga Nozomu Eien,TV,6.371381,5.606,7.136763,0.051304,1.134756,-0.04855,0.086295,0.407514,0.439497,6.320078
51,Mobile Suit Zeta Gundam,TV,7.03584,5.576057,8.495623,0.166883,2.164277,-0.040197,0.172291,0.767065,2.35925,6.868958
7657,Amanchu!,TV,6.474207,5.501505,7.44691,0.238361,1.442131,-0.043518,0.23329,0.305866,1.089396,6.235846
894,Maison Ikkoku,TV,7.017126,5.497447,8.536804,-0.040201,2.253079,-0.091406,0.051587,1.007819,2.341857,7.057327
6441,Yuri Kuma Arashi,TV,6.219358,5.248846,7.189871,0.36675,1.438884,-0.099799,0.389189,0.399629,0.951495,5.852608
7908,Uchuu Patrol Luluco,TV,6.190904,5.230089,7.15172,-0.145318,1.424507,-0.397821,0.248551,0.56983,0.747692,6.336223
5293,Gatchaman Crowds Insight,TV,6.348295,5.175217,7.521373,0.086246,1.739208,-0.027966,0.095665,0.548685,1.457073,6.262049


In [24]:
new_recs.loc[lambda x: (x["delta_item"] > 0) & (x["delta_user"] < 0)].sort_values(
    by="score_lower_bound", ascending=False
).groupby("series_id").first().sort_values(by="score_lower_bound", ascending=False)[:20]

Unnamed: 0_level_0,title,type,score,score_lower_bound,score_upper_bound,delta,std,delta_item,delta_user,delta_var_item,delta_var_user,blp
series_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5698,Gochuumon wa Usagi desu ka??,TV,6.990816,5.971828,8.009804,0.149253,1.510754,0.155319,-0.022046,0.488619,1.011429,6.841564
4084,Love Live! School Idol Project 2nd Season,TV,6.927018,5.9549,7.899135,0.045217,1.441264,0.161711,-0.112264,0.775709,0.60982,6.881801
7708,New Game!!,TV,6.884592,5.936474,7.832711,0.074861,1.405683,0.12598,-0.05533,0.379934,0.925355,6.809731
4245,Yuru Yuri San☆Hai!,TV,6.831396,5.88308,7.779713,-0.084449,1.405976,0.018765,-0.085734,0.472133,0.822926,6.915845
315,Kaleido Star,TV,7.203862,5.739883,8.667841,0.302505,2.170499,0.344943,-0.072442,0.89702,2.213291,6.901357
1842,Bamboo Blade,TV,6.498646,5.525972,7.47132,0.08503,1.442089,0.130664,-0.051401,0.457227,0.908172,6.413616
265,Fate/kaleid liner Prisma☆Illya 3rei!!,TV,6.664283,5.518817,7.80975,0.165868,1.698272,0.165678,-0.018118,0.585879,1.314181,6.498416
102,Maria-sama ga Miteru 4th,TV,7.021502,5.460431,8.582573,0.052137,2.314447,0.209909,-0.151039,0.849062,2.734112,6.969365
792,Bokura ga Ita,TV,6.391356,5.441477,7.341235,-0.016259,1.408293,0.028208,-0.039156,0.563752,0.736468,6.407615
9175,Karakai Jouzu no Takagi-san,TV,6.446765,5.438569,7.45496,-0.147497,1.494753,0.030351,-0.147511,0.600838,0.8604,6.594262
