In [1]:
import os 
from pathlib import Path
from dotenv import load_dotenv
import pickle
import pandas as pd
import torch
from tqdm import tqdm
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Populate os.environ from a local .env file; the next cell reads SAVE_PATH
# from the environment (presumably defined in that file — TODO confirm).
load_dotenv()


True

In [3]:
# Root directory that holds the saved run folders: SAVE_PATH joined onto the
# user's home directory (an absolute SAVE_PATH replaces the home prefix).
# Fail with an explicit message when the variable is missing, because
# Path(None) would otherwise raise an opaque TypeError.
_save_path = os.environ.get("SAVE_PATH")
if _save_path is None:
    raise RuntimeError(
        "SAVE_PATH is not set; define it in the environment or in the .env file"
    )
base_path = Path.home() / Path(_save_path)

In [4]:
# Notebook-wide constants used by the cells below.
# NOTE(review): ALPHA is not referenced in any visible cell — TODO confirm what it controls.
ALPHA = 0.25
# Seeds of the runs to load; each seed is interpolated into the run-directory names.
SEEDS = [5, 7, 46, 53, 77]

In [5]:
def parse_data(data_dict: dict, alg_nam:str, seed: int)->pd.DataFrame:
    """Turn one run's log dictionary into a tidy per-episode DataFrame.

    Args:
        data_dict: Mapping from metric name to its logged values. Values may be
            a 0-d ``torch.Tensor``, a list (possibly containing tensors), or
            anything ``pd.DataFrame`` accepts as a column. The mapping is not
            modified.
        alg_nam: Label written to the ``alg_name`` column of every row.
        seed: Seed written to the ``seed`` column of every row.

    Returns:
        A DataFrame with one column per key of ``data_dict`` plus ``alg_name``
        and ``seed``, and a leading ``episode`` column holding the former row
        index.
    """

    def _to_python(value):
        # Unwrap tensors so pandas stores plain Python numbers.
        if isinstance(value, torch.Tensor):
            return value.item()
        if isinstance(value, list):
            # Convert element by element: safe for empty lists and for lists
            # whose first element is not a tensor.
            return [x.item() if isinstance(x, torch.Tensor) else x for x in value]
        return value

    # Build a converted copy instead of overwriting the caller's dictionary.
    converted = {key: _to_python(value) for key, value in data_dict.items()}

    data_df = pd.DataFrame(converted)
    data_df["alg_name"] = alg_nam
    data_df["seed"] = seed
    # Expose the positional row index as an explicit "episode" column.
    data_df = data_df.reset_index().rename(columns={"index": "episode"})

    return data_df

In [6]:
# load slateq results: one log dictionary per seed, parsed into a DataFrame each
res_df_list = []
for seed in tqdm(SEEDS):
    log_file = base_path / Path(f"serving_slateq_2000_{seed}") / "logs_dict.pickle"
    # Use a context manager so the file handle is closed right after unpickling.
    # NOTE: pickle.load can execute arbitrary code; only read trusted run logs.
    with open(log_file, "rb") as fh:
        slateq_dict = pickle.load(fh)
    res_df = parse_data(slateq_dict, "SlateQ", seed)
    res_df_list.append(res_df)
# concat all results
final_df = pd.concat(res_df_list)

100%|██████████| 5/5 [00:00<00:00, 41.40it/s]


In [7]:
# load wp results: one log dictionary per (seed, knearest) pair
res_df_list = []

for seed in tqdm(SEEDS):
    for knearest in [10,20]:
        log_file = base_path / Path(f"serving_proto_item_2000_{knearest}_{seed}") / "logs_dict.pickle"
        # Use a context manager so the file handle is closed right after unpickling.
        # NOTE: pickle.load can execute arbitrary code; only read trusted run logs.
        with open(log_file, "rb") as fh:
            slateq_dict = pickle.load(fh)
        res_df = parse_data(slateq_dict, f"Item-Wolpertinger {knearest}%", seed)
        res_df_list.append(res_df)
# concat all results
wp_df = pd.concat(res_df_list)
# NOTE: this appends to the existing final_df, so re-running this cell alone
# duplicates the Item-Wolpertinger rows; re-run from the SlateQ loading cell.
final_df = pd.concat([final_df, wp_df])

100%|██████████| 5/5 [00:00<00:00, 12.01it/s]


In [8]:
# load slate-wp results: one log dictionary per (seed, knearest) pair
res_df_list = []

for seed in tqdm(SEEDS):
    for knearest in [10,20]:
        log_file = base_path / Path(f"serving_proto_slate_2000_{knearest}_{seed}") / "logs_dict.pickle"
        # Use a context manager so the file handle is closed right after unpickling.
        # NOTE: pickle.load can execute arbitrary code; only read trusted run logs.
        with open(log_file, "rb") as fh:
            slateq_dict = pickle.load(fh)
        res_df = parse_data(slateq_dict, f"Slate-Wolpertinger {knearest}%", seed)
        res_df_list.append(res_df)
# concat all results
slate_wp_df = pd.concat(res_df_list)
# NOTE: this appends to the existing final_df, so re-running this cell alone
# duplicates the Slate-Wolpertinger rows; re-run from the SlateQ loading cell.
final_df = pd.concat([final_df, slate_wp_df])

100%|██████████| 5/5 [00:00<00:00, 22.04it/s]


In [9]:
# The concatenations above keep each per-run frame's own row labels, so index
# values repeat across runs; replace them with a fresh 0..n-1 index.
final_df = final_df.reset_index(drop=True)

In [10]:
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats import ttest_rel
from scipy.stats import wilcoxon
def stat_test(r1,r2):
    """Print four test results for two paired samples.

    Printed in this order: Levene's test on the two samples, Shapiro-Wilk on
    the element-wise differences ``r1 - r2``, the paired t-test, and the
    Wilcoxon signed-rank test. Nothing is returned.
    """
    variance_check = levene(r1, r2)
    print(variance_check)

    # Shapiro-Wilk is applied to the paired differences, not to the raw samples.
    paired_diffs = []
    for left, right in zip(r1, r2):
        paired_diffs.append(left - right)
    normality_check = shapiro(paired_diffs)
    print(normality_check)

    paired_t = ttest_rel(r1, r2)
    print(paired_t)

    signed_rank = wilcoxon(r1, r2)
    print(signed_rank)

# Statistical tests

### SlateQ | Item-WP20 | Slate-WP20

In [13]:
# SlateQ against the 20% Item- and Slate-Wolpertinger runs.
# Every array below holds one value per seed (the mean over that seed's rows),
# so the paired tests inside stat_test compare the algorithms seed by seed.
print("===== SlateQ | Item-WP20 | Slate-WP20 =====")
# Aggregate once, and only the three metrics used below, instead of repeating
# the same groupby over every column for each array.
per_seed_means = final_df.groupby(["alg_name", "seed"])[
    ["session_length", "ep_cum_reward", "ep_avg_reward"]
].mean()

sq_return = per_seed_means["session_length"]["SlateQ"].values
wp_return = per_seed_means["session_length"]["Item-Wolpertinger 20%"].values
slate_wp_return = per_seed_means["session_length"]["Slate-Wolpertinger 20%"].values
print("===== Return =====")
# ndarray.std() uses ddof=0, i.e. the population standard deviation across seeds.
print("SlateQ",sq_return.mean(), sq_return.std())
print("Item-WP",wp_return.mean(), wp_return.std())
print("Slate-WP",slate_wp_return.mean(), slate_wp_return.std())
print("SlateQ vs Item-WP")
stat_test(sq_return, wp_return)
print("Slate-Wp vs SlateQ")
stat_test(sq_return, slate_wp_return)

print("===== Cum satisfaction =====")
sq_cum_satisfaction = per_seed_means["ep_cum_reward"]["SlateQ"].values
wp_cum_satisfaction = per_seed_means["ep_cum_reward"]["Item-Wolpertinger 20%"].values
slate_wp_cum_satisfaction = per_seed_means["ep_cum_reward"]["Slate-Wolpertinger 20%"].values
print("SlateQ",sq_cum_satisfaction.mean(), sq_cum_satisfaction.std())
print("Item-WP",wp_cum_satisfaction.mean(), wp_cum_satisfaction.std())
print("Slate-WP",slate_wp_cum_satisfaction.mean(), slate_wp_cum_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_cum_satisfaction, wp_cum_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_cum_satisfaction, slate_wp_cum_satisfaction)

print("===== Avg satisfaction =====")
sq_avg_satisfaction = per_seed_means["ep_avg_reward"]["SlateQ"].values
wp_avg_satisfaction = per_seed_means["ep_avg_reward"]["Item-Wolpertinger 20%"].values
slate_wp_avg_satisfaction = per_seed_means["ep_avg_reward"]["Slate-Wolpertinger 20%"].values
print("SlateQ",sq_avg_satisfaction.mean(), sq_avg_satisfaction.std())
print("Item-WP",wp_avg_satisfaction.mean(), wp_avg_satisfaction.std())
print("Slate-WP",slate_wp_avg_satisfaction.mean(), slate_wp_avg_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_avg_satisfaction, wp_avg_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_avg_satisfaction, slate_wp_avg_satisfaction)

===== SlateQ | WP20 =====
===== Return =====
SlateQ 215.53400000000002 1.8768015345262268
Item-WP 201.573 2.853234305135139
Slate-WP 212.327 5.800686683488428
SlateQ vs Item-WP
LeveneResult(statistic=0.31588588543779067, pvalue=0.5894864385725671)
ShapiroResult(statistic=0.7695077061653137, pvalue=0.04466827213764191)
Ttest_relResult(statistic=6.682929681839215, pvalue=0.0026066661371982824)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
Slate-Wp vs SlateQ
LeveneResult(statistic=1.8803948118208547, pvalue=0.20751824461672697)
ShapiroResult(statistic=0.9630462527275085, pvalue=0.829020619392395)
Ttest_relResult(statistic=1.617150469437075, pvalue=0.18115349898992922)
WilcoxonResult(statistic=2.0, pvalue=0.1875)
===== Cum satisfaction =====
SlateQ 12.478878202378748 1.8294516254383966
Item-WP -0.7373401412963867 2.7041389304995125
Slate-WP 9.45061971861124 5.613848506759906
SlateQ vs Item-WP
LeveneResult(statistic=0.2434941959986534, pvalue=0.6349537359554283)
ShapiroResult(statistic=0.752

In [130]:
# SlateQ against the 10% Item- and Slate-Wolpertinger runs.
# Every array below holds one value per seed (the mean over that seed's rows),
# so the paired tests inside stat_test compare the algorithms seed by seed.
print("===== SlateQ | Item-WP10 | Slate-WP10=====")
# Aggregate once, and only the three metrics used below, instead of repeating
# the same groupby over every column for each array.
per_seed_means = final_df.groupby(["alg_name", "seed"])[
    ["session_length", "ep_cum_reward", "ep_avg_reward"]
].mean()

sq_return = per_seed_means["session_length"]["SlateQ"].values
wp_return = per_seed_means["session_length"]["Item-Wolpertinger 10%"].values
slate_wp_return = per_seed_means["session_length"]["Slate-Wolpertinger 10%"].values
print("===== Return =====")
# ndarray.std() uses ddof=0, i.e. the population standard deviation across seeds.
print("SlateQ",sq_return.mean(), sq_return.std())
print("Item-WP",wp_return.mean(), wp_return.std())
print("Slate-WP",slate_wp_return.mean(), slate_wp_return.std())
print("SlateQ vs Item-WP")
stat_test(sq_return, wp_return)
print("Slate-Wp vs SlateQ")
stat_test(sq_return, slate_wp_return)

print("===== Cum satisfaction =====")
sq_cum_satisfaction = per_seed_means["ep_cum_reward"]["SlateQ"].values
wp_cum_satisfaction = per_seed_means["ep_cum_reward"]["Item-Wolpertinger 10%"].values
slate_wp_cum_satisfaction = per_seed_means["ep_cum_reward"]["Slate-Wolpertinger 10%"].values
print("SlateQ",sq_cum_satisfaction.mean(), sq_cum_satisfaction.std())
print("Item-WP",wp_cum_satisfaction.mean(), wp_cum_satisfaction.std())
print("Slate-WP",slate_wp_cum_satisfaction.mean(), slate_wp_cum_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_cum_satisfaction, wp_cum_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_cum_satisfaction, slate_wp_cum_satisfaction)

print("===== Avg satisfaction =====")
sq_avg_satisfaction = per_seed_means["ep_avg_reward"]["SlateQ"].values
wp_avg_satisfaction = per_seed_means["ep_avg_reward"]["Item-Wolpertinger 10%"].values
slate_wp_avg_satisfaction = per_seed_means["ep_avg_reward"]["Slate-Wolpertinger 10%"].values
print("SlateQ",sq_avg_satisfaction.mean(), sq_avg_satisfaction.std())
print("Item-WP",wp_avg_satisfaction.mean(), wp_avg_satisfaction.std())
print("Slate-WP",slate_wp_avg_satisfaction.mean(), slate_wp_avg_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_avg_satisfaction, wp_avg_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_avg_satisfaction, slate_wp_avg_satisfaction)

===== SlateQ | WP10 =====
===== Return =====
SlateQ 244.776 2.7584405014427973
WP 238.06600000000003 7.100988241083074
LeveneResult(statistic=4.27485464665415, pvalue=0.07250207538860465)
ShapiroResult(statistic=0.8594928979873657, pvalue=0.22643327713012695)
Ttest_relResult(statistic=3.0073120782097305, pvalue=0.039655178421421126)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
===== Cum satisfaction =====
SlateQ 40.56651997661591 2.675467144996049
WP 34.07347901815176 6.8623124941032705
LeveneResult(statistic=4.219782751961376, pvalue=0.07402256027078906)
ShapiroResult(statistic=0.8661351799964905, pvalue=0.2511243522167206)
Ttest_relResult(statistic=3.022989778046748, pvalue=0.039048362618222345)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
===== Avg satisfaction =====
SlateQ 0.6184224149733782 0.04614209242149694
WP 0.502226670967182 0.1043726417868731
LeveneResult(statistic=2.716361309072732, pvalue=0.13793688908527466)
ShapiroResult(statistic=0.8940783739089966, pvalue=0.3780272603

In [131]:
# SlateQ against the 5% Item- and Slate-Wolpertinger runs.
# Every array below holds one value per seed (the mean over that seed's rows),
# so the paired tests inside stat_test compare the algorithms seed by seed.
# NOTE(review): the loading cells only read knearest in [10, 20], so the "5%"
# labels are missing from final_df on a fresh kernel and the lookups below
# raise KeyError — load the 5% runs before running this cell.
# The header names the 5% runs this cell actually analyses (it previously said WP10).
print("===== SlateQ | Item-WP5 | Slate-WP5=====")
# Aggregate once, and only the three metrics used below, instead of repeating
# the same groupby over every column for each array.
per_seed_means = final_df.groupby(["alg_name", "seed"])[
    ["session_length", "ep_cum_reward", "ep_avg_reward"]
].mean()

sq_return = per_seed_means["session_length"]["SlateQ"].values
wp_return = per_seed_means["session_length"]["Item-Wolpertinger 5%"].values
slate_wp_return = per_seed_means["session_length"]["Slate-Wolpertinger 5%"].values
print("===== Return =====")
# ndarray.std() uses ddof=0, i.e. the population standard deviation across seeds.
print("SlateQ",sq_return.mean(), sq_return.std())
print("Item-WP",wp_return.mean(), wp_return.std())
print("Slate-WP",slate_wp_return.mean(), slate_wp_return.std())
print("SlateQ vs Item-WP")
stat_test(sq_return, wp_return)
print("Slate-Wp vs SlateQ")
stat_test(sq_return, slate_wp_return)

print("===== Cum satisfaction =====")
sq_cum_satisfaction = per_seed_means["ep_cum_reward"]["SlateQ"].values
wp_cum_satisfaction = per_seed_means["ep_cum_reward"]["Item-Wolpertinger 5%"].values
slate_wp_cum_satisfaction = per_seed_means["ep_cum_reward"]["Slate-Wolpertinger 5%"].values
print("SlateQ",sq_cum_satisfaction.mean(), sq_cum_satisfaction.std())
print("Item-WP",wp_cum_satisfaction.mean(), wp_cum_satisfaction.std())
print("Slate-WP",slate_wp_cum_satisfaction.mean(), slate_wp_cum_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_cum_satisfaction, wp_cum_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_cum_satisfaction, slate_wp_cum_satisfaction)

print("===== Avg satisfaction =====")
sq_avg_satisfaction = per_seed_means["ep_avg_reward"]["SlateQ"].values
wp_avg_satisfaction = per_seed_means["ep_avg_reward"]["Item-Wolpertinger 5%"].values
slate_wp_avg_satisfaction = per_seed_means["ep_avg_reward"]["Slate-Wolpertinger 5%"].values
print("SlateQ",sq_avg_satisfaction.mean(), sq_avg_satisfaction.std())
print("Item-WP",wp_avg_satisfaction.mean(), wp_avg_satisfaction.std())
print("Slate-WP",slate_wp_avg_satisfaction.mean(), slate_wp_avg_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_avg_satisfaction, wp_avg_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_avg_satisfaction, slate_wp_avg_satisfaction)

===== SlateQ | WP5 =====
===== Return =====
SlateQ 244.776 2.7584405014427973
WP 219.079 13.064051209330128
LeveneResult(statistic=1.1508009095625247, pvalue=0.3146721173875861)
ShapiroResult(statistic=0.6741660833358765, pvalue=0.005039647687226534)
Ttest_relResult(statistic=4.724218392061491, pvalue=0.009143038018326185)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
===== Cum satisfaction =====
SlateQ 40.56651997661591 2.675467144996049
WP 16.047076077818872 12.584033893978804
LeveneResult(statistic=1.2092195095870424, pvalue=0.3034733788766488)
ShapiroResult(statistic=0.6851897239685059, pvalue=0.006660476326942444)
Ttest_relResult(statistic=4.6723137344045815, pvalue=0.009501924250765431)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
===== Avg satisfaction =====
SlateQ 0.6184224149733782 0.04614209242149694
WP 0.22396962070278822 0.19863702275602155
LeveneResult(statistic=1.155367369873131, pvalue=0.3137749486304136)
ShapiroResult(statistic=0.6590058207511902, pvalue=0.00339217809960