In [1]:
import os 
from pathlib import Path
from dotenv import load_dotenv
import pickle
import pandas as pd
import torch
from tqdm import tqdm
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where SAVE_PATH (read via os.environ in a later cell) comes from — confirm.
load_dotenv()


True

In [3]:
# Root directory of the saved runs: the SAVE_PATH environment variable joined onto the home directory.
# NOTE(review): os.environ.get returns None when SAVE_PATH is unset, in which case Path(None) raises a TypeError here.
base_path = Path.home()/Path(os.environ.get("SAVE_PATH"))

In [4]:
# Experiment configuration used by the result-loading cells.
# NOTE(review): ALPHA is not referenced by any cell visible in this notebook — confirm it is still needed.
ALPHA = 0.25
# Seeds whose run directories are loaded and compared.
SEEDS = [5, 7, 46, 53, 77]

In [5]:
def parse_data(data_dict: dict, alg_nam: str, seed: int) -> pd.DataFrame:
    """Turn one run's logged metrics into a DataFrame tagged with algorithm and seed.

    Tensor values, whether stored directly or inside a list, are converted to
    plain Python scalars with ``.item()`` (so each tensor must hold exactly one
    element). The conversion works on a copy: ``data_dict`` itself is not
    modified, so the same dict can be parsed again safely.

    Args:
        data_dict: Mapping from metric name to its logged value(s).
        alg_nam: Label written to the ``alg_name`` column of every row.
        seed: Value written to the ``seed`` column of every row.

    Returns:
        A DataFrame with one column per key of ``data_dict`` plus ``alg_name``
        and ``seed``; the former row index is exposed as the ``episode`` column.
    """

    def _to_python(value):
        # Bare tensor -> scalar.
        if isinstance(value, torch.Tensor):
            return value.item()
        # List -> convert each tensor element individually. This also copes with
        # empty lists and with lists whose first element is not a tensor.
        if isinstance(value, list):
            return [x.item() if isinstance(x, torch.Tensor) else x for x in value]
        return value

    # Build a fresh dict instead of overwriting the caller's entries.
    converted = {key: _to_python(value) for key, value in data_dict.items()}

    data_df = pd.DataFrame(converted)
    data_df["alg_name"] = alg_nam
    data_df["seed"] = seed
    # Row position within the run becomes an explicit "episode" column.
    data_df = data_df.reset_index().rename(columns={"index": "episode"})

    return data_df

In [6]:
# load slateq results
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"serving_slateq_2000_{seed}") / "logs_dict.pickle"
    # Open through a context manager so the file handle is closed after each run is read.
    # NOTE: pickle.load can execute arbitrary code — only load logs produced by trusted runs.
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "SlateQ", seed)
    res_df_list.append(res_df)
# concat all results
final_df = pd.concat(res_df_list)

100%|██████████| 5/5 [00:00<00:00, 10.50it/s]


In [7]:
# load wp results
res_df_list = []

for seed in tqdm(SEEDS):
    for knearest in [5,10,20]:
        log_path = base_path / Path(f"serving_proto_item_2000_{knearest}_{seed}") / "logs_dict.pickle"
        # Open through a context manager so the file handle is closed after each run is read.
        # NOTE: pickle.load can execute arbitrary code — only load logs produced by trusted runs.
        with open(log_path, "rb") as log_file:
            # The name slateq_dict is kept from the SlateQ cell; here it holds an Item-Wolpertinger log.
            slateq_dict = pickle.load(log_file)
        res_df = parse_data(slateq_dict, f"Item-Wolpertinger {knearest}%", seed)
        res_df_list.append(res_df)
# concat all results
wp_df = pd.concat(res_df_list)
# NOTE: this appends to the existing final_df, so re-running the cell without
# re-running the SlateQ loading cell first adds the same rows a second time.
final_df = pd.concat([final_df, wp_df])

100%|██████████| 5/5 [00:01<00:00,  4.62it/s]


In [8]:
# load slate wp results
res_df_list = []

for seed in tqdm(SEEDS):
    for knearest in [5,10,20]:
        log_path = base_path / Path(f"serving_proto_slate_2000_{knearest}_{seed}") / "logs_dict.pickle"
        # Open through a context manager so the file handle is closed after each run is read.
        # NOTE: pickle.load can execute arbitrary code — only load logs produced by trusted runs.
        with open(log_path, "rb") as log_file:
            # The name slateq_dict is kept from the SlateQ cell; here it holds a Slate-Wolpertinger log.
            slateq_dict = pickle.load(log_file)
        res_df = parse_data(slateq_dict, f"Slate-Wolpertinger {knearest}%", seed)
        res_df_list.append(res_df)
# concat all results
slate_wp_df = pd.concat(res_df_list)
# NOTE: this appends to the existing final_df, so re-running the cell without
# rebuilding final_df first adds the same rows a second time.
final_df = pd.concat([final_df, slate_wp_df])

100%|██████████| 5/5 [00:01<00:00,  4.62it/s]


In [9]:
# The concatenated per-run frames each carry their own index, so index labels repeat;
# replace them with one fresh 0..N-1 index and discard the old labels.
final_df = final_df.reset_index(drop=True)

In [13]:
# Display the combined results table as the cell output.
final_df

Unnamed: 0,episode,session_length,ep_cum_reward,ep_avg_reward,best_rl_avg_diff,best_avg_avg_diff,cum_normalized,alg_name,seed
0,0,223.0,18.041277,0.311056,0.251696,0.863313,0.552741,SlateQ,5
1,1,204.0,0.507943,0.008466,0.238343,0.661508,0.034301,SlateQ,5
2,2,204.0,2.157166,0.035953,0.333708,0.874238,0.097259,SlateQ,5
3,3,227.0,24.506971,0.415372,0.259116,0.906081,0.615833,SlateQ,5
4,4,214.0,12.542114,0.216243,0.220634,0.879364,0.494975,SlateQ,5
...,...,...,...,...,...,...,...,...,...
195,195,206.5,3.026659,0.055030,0.426174,0.671976,0.114359,Slate-Wolpertinger 20%,77
196,196,206.5,2.828488,0.051427,0.658290,1.043433,0.072461,Slate-Wolpertinger 20%,77
197,197,206.5,2.825793,0.051378,0.449641,0.736655,0.102547,Slate-Wolpertinger 20%,77
198,198,202.5,-0.352704,-0.006532,0.435780,0.629620,-0.015216,Slate-Wolpertinger 20%,77


In [9]:
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats import ttest_rel
from scipy.stats import wilcoxon
def stat_test(r1,r2):
    """Print four SciPy test results comparing the paired samples r1 and r2.

    Printed in order: Levene's test on (r1, r2), the Shapiro-Wilk test on the
    element-wise differences r1 - r2, the paired t-test and the Wilcoxon
    signed-rank test on (r1, r2). Nothing is returned.
    """
    print(levene(r1, r2))

    # Element-wise differences of the pairs; zip stops at the shorter input.
    paired_diffs = []
    for left, right in zip(r1, r2):
        paired_diffs.append(left - right)
    print(shapiro(paired_diffs))

    # Both paired tests take the two samples in the same order.
    for paired_test in (ttest_rel, wilcoxon):
        print(paired_test(r1, r2))

# Statistical tests

### SlateQ | Item-WP20 | Slate-WP20

In [10]:
print("===== SlateQ | Item-WP20 | Slate-WP20 =====")
# Mean of every logged metric per (algorithm, seed), computed once and reused for
# all nine lookups below. groupby sorts its keys by default, so each algorithm's
# per-seed values come out in the same seed order, which the paired tests rely on.
seed_means = final_df.groupby(["alg_name", "seed"]).mean()

# --- Return: per-seed mean session length ---
sq_return = seed_means["session_length"]["SlateQ"].values
wp_return = seed_means["session_length"]["Item-Wolpertinger 20%"].values
slate_wp_return = seed_means["session_length"]["Slate-Wolpertinger 20%"].values
print("===== Return =====")
# .std() on these NumPy arrays uses the default ddof=0 (population standard deviation).
print("SlateQ",sq_return.mean(), sq_return.std())
print("Item-WP",wp_return.mean(), wp_return.std())
print("Slate-WP",slate_wp_return.mean(), slate_wp_return.std())
print("SlateQ vs Item-WP")
stat_test(sq_return, wp_return)
# SlateQ is passed first here too, so the paired t statistic is signed as
# SlateQ minus Slate-Wolpertinger despite the order in the printed label.
print("Slate-Wp vs SlateQ")
stat_test(sq_return, slate_wp_return)

# --- Cumulative satisfaction: per-seed mean of ep_cum_reward ---
print("===== Cum satisfaction =====")
sq_cum_satisfaction = seed_means["ep_cum_reward"]["SlateQ"].values
wp_cum_satisfaction = seed_means["ep_cum_reward"]["Item-Wolpertinger 20%"].values
slate_wp_cum_satisfaction = seed_means["ep_cum_reward"]["Slate-Wolpertinger 20%"].values
print("SlateQ",sq_cum_satisfaction.mean(), sq_cum_satisfaction.std())
print("Item-WP",wp_cum_satisfaction.mean(), wp_cum_satisfaction.std())
print("Slate-WP",slate_wp_cum_satisfaction.mean(), slate_wp_cum_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_cum_satisfaction, wp_cum_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_cum_satisfaction, slate_wp_cum_satisfaction)

# --- Average satisfaction: per-seed mean of ep_avg_reward ---
print("===== Avg satisfaction =====")
sq_avg_satisfaction = seed_means["ep_avg_reward"]["SlateQ"].values
wp_avg_satisfaction = seed_means["ep_avg_reward"]["Item-Wolpertinger 20%"].values
slate_wp_avg_satisfaction = seed_means["ep_avg_reward"]["Slate-Wolpertinger 20%"].values
print("SlateQ",sq_avg_satisfaction.mean(), sq_avg_satisfaction.std())
print("Item-WP",wp_avg_satisfaction.mean(), wp_avg_satisfaction.std())
print("Slate-WP",slate_wp_avg_satisfaction.mean(), slate_wp_avg_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_avg_satisfaction, wp_avg_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_avg_satisfaction, slate_wp_avg_satisfaction)

===== SlateQ | Item-WP20 | Slate-WP20 =====
===== Return =====
SlateQ 214.0375 1.5834298216214013
Item-WP 201.8755 2.6607162569503715
Slate-WP 211.65099999999998 5.965040988962268
SlateQ vs Item-WP
LeveneResult(statistic=0.38309525029713626, pvalue=0.5531636485509502)
ShapiroResult(statistic=0.6662633419036865, pvalue=0.004107433836907148)
Ttest_relResult(statistic=6.248794945171744, pvalue=0.003343727307969354)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
Slate-Wp vs SlateQ
LeveneResult(statistic=2.4135947295168285, pvalue=0.15889095585437663)
ShapiroResult(statistic=0.9152214527130127, pvalue=0.49959444999694824)
Ttest_relResult(statistic=1.0727569864716895, pvalue=0.3437890756465674)
WilcoxonResult(statistic=3.0, pvalue=0.3125)
===== Cum satisfaction =====
SlateQ 11.138902307793497 1.5669716402048743
Item-WP -0.5115765998512506 2.5758825300468557
Slate-WP 8.838447588816283 5.807151682668436
SlateQ vs Item-WP
LeveneResult(statistic=0.3658323251481381, pvalue=0.5620437809223182)
Shapi

In [11]:
print("===== SlateQ | Item-WP10 | Slate-WP10=====")
# Mean of every logged metric per (algorithm, seed), computed once and reused for
# all nine lookups below. groupby sorts its keys by default, so each algorithm's
# per-seed values come out in the same seed order, which the paired tests rely on.
seed_means = final_df.groupby(["alg_name", "seed"]).mean()

# --- Return: per-seed mean session length ---
sq_return = seed_means["session_length"]["SlateQ"].values
wp_return = seed_means["session_length"]["Item-Wolpertinger 10%"].values
slate_wp_return = seed_means["session_length"]["Slate-Wolpertinger 10%"].values
print("===== Return =====")
# .std() on these NumPy arrays uses the default ddof=0 (population standard deviation).
print("SlateQ",sq_return.mean(), sq_return.std())
print("Item-WP",wp_return.mean(), wp_return.std())
print("Slate-WP",slate_wp_return.mean(), slate_wp_return.std())
print("SlateQ vs Item-WP")
stat_test(sq_return, wp_return)
# SlateQ is passed first here too, so the paired t statistic is signed as
# SlateQ minus Slate-Wolpertinger despite the order in the printed label.
print("Slate-Wp vs SlateQ")
stat_test(sq_return, slate_wp_return)

# --- Cumulative satisfaction: per-seed mean of ep_cum_reward ---
print("===== Cum satisfaction =====")
sq_cum_satisfaction = seed_means["ep_cum_reward"]["SlateQ"].values
wp_cum_satisfaction = seed_means["ep_cum_reward"]["Item-Wolpertinger 10%"].values
slate_wp_cum_satisfaction = seed_means["ep_cum_reward"]["Slate-Wolpertinger 10%"].values
print("SlateQ",sq_cum_satisfaction.mean(), sq_cum_satisfaction.std())
print("Item-WP",wp_cum_satisfaction.mean(), wp_cum_satisfaction.std())
print("Slate-WP",slate_wp_cum_satisfaction.mean(), slate_wp_cum_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_cum_satisfaction, wp_cum_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_cum_satisfaction, slate_wp_cum_satisfaction)

# --- Average satisfaction: per-seed mean of ep_avg_reward ---
print("===== Avg satisfaction =====")
sq_avg_satisfaction = seed_means["ep_avg_reward"]["SlateQ"].values
wp_avg_satisfaction = seed_means["ep_avg_reward"]["Item-Wolpertinger 10%"].values
slate_wp_avg_satisfaction = seed_means["ep_avg_reward"]["Slate-Wolpertinger 10%"].values
print("SlateQ",sq_avg_satisfaction.mean(), sq_avg_satisfaction.std())
print("Item-WP",wp_avg_satisfaction.mean(), wp_avg_satisfaction.std())
print("Slate-WP",slate_wp_avg_satisfaction.mean(), slate_wp_avg_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_avg_satisfaction, wp_avg_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_avg_satisfaction, slate_wp_avg_satisfaction)

===== SlateQ | Item-WP10 | Slate-WP10=====
===== Return =====
SlateQ 214.0375 1.5834298216214013
Item-WP 192.7485 8.111544951487359
Slate-WP 212.30700000000002 7.259101425107659
SlateQ vs Item-WP
LeveneResult(statistic=2.1814667371808225, pvalue=0.17792923865304075)
ShapiroResult(statistic=0.8075520396232605, pvalue=0.09323883801698685)
Ttest_relResult(statistic=4.546250718763904, pvalue=0.010447330130068764)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
Slate-Wp vs SlateQ
LeveneResult(statistic=4.608999776658728, pvalue=0.06408817358986581)
ShapiroResult(statistic=0.9012526869773865, pvalue=0.41684791445732117)
Ttest_relResult(statistic=0.5961761655503777, pvalue=0.5831562299632651)
WilcoxonResult(statistic=4.0, pvalue=0.4375)
===== Cum satisfaction =====
SlateQ 11.138902307793497 1.5669716402048743
Item-WP -9.246158276423811 7.755636976120997
Slate-WP 9.471297344103457 7.0146497959363545
SlateQ vs Item-WP
LeveneResult(statistic=2.1444181973404395, pvalue=0.1812478320393184)
ShapiroRes

In [12]:
# This cell compares SlateQ against the 5% Wolpertinger variants, so the header names WP5.
print("===== SlateQ | Item-WP5 | Slate-WP5 =====")
# Mean of every logged metric per (algorithm, seed), computed once and reused for
# all nine lookups below. groupby sorts its keys by default, so each algorithm's
# per-seed values come out in the same seed order, which the paired tests rely on.
seed_means = final_df.groupby(["alg_name", "seed"]).mean()

# --- Return: per-seed mean session length ---
sq_return = seed_means["session_length"]["SlateQ"].values
wp_return = seed_means["session_length"]["Item-Wolpertinger 5%"].values
slate_wp_return = seed_means["session_length"]["Slate-Wolpertinger 5%"].values
print("===== Return =====")
# .std() on these NumPy arrays uses the default ddof=0 (population standard deviation).
print("SlateQ",sq_return.mean(), sq_return.std())
print("Item-WP",wp_return.mean(), wp_return.std())
print("Slate-WP",slate_wp_return.mean(), slate_wp_return.std())
print("SlateQ vs Item-WP")
stat_test(sq_return, wp_return)
# SlateQ is passed first here too, so the paired t statistic is signed as
# SlateQ minus Slate-Wolpertinger despite the order in the printed label.
print("Slate-Wp vs SlateQ")
stat_test(sq_return, slate_wp_return)

# --- Cumulative satisfaction: per-seed mean of ep_cum_reward ---
print("===== Cum satisfaction =====")
sq_cum_satisfaction = seed_means["ep_cum_reward"]["SlateQ"].values
wp_cum_satisfaction = seed_means["ep_cum_reward"]["Item-Wolpertinger 5%"].values
slate_wp_cum_satisfaction = seed_means["ep_cum_reward"]["Slate-Wolpertinger 5%"].values
print("SlateQ",sq_cum_satisfaction.mean(), sq_cum_satisfaction.std())
print("Item-WP",wp_cum_satisfaction.mean(), wp_cum_satisfaction.std())
print("Slate-WP",slate_wp_cum_satisfaction.mean(), slate_wp_cum_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_cum_satisfaction, wp_cum_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_cum_satisfaction, slate_wp_cum_satisfaction)

# --- Average satisfaction: per-seed mean of ep_avg_reward ---
print("===== Avg satisfaction =====")
sq_avg_satisfaction = seed_means["ep_avg_reward"]["SlateQ"].values
wp_avg_satisfaction = seed_means["ep_avg_reward"]["Item-Wolpertinger 5%"].values
slate_wp_avg_satisfaction = seed_means["ep_avg_reward"]["Slate-Wolpertinger 5%"].values
print("SlateQ",sq_avg_satisfaction.mean(), sq_avg_satisfaction.std())
print("Item-WP",wp_avg_satisfaction.mean(), wp_avg_satisfaction.std())
print("Slate-WP",slate_wp_avg_satisfaction.mean(), slate_wp_avg_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_avg_satisfaction, wp_avg_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_avg_satisfaction, slate_wp_avg_satisfaction)

===== SlateQ | Item-WP10 | Slate-WP10=====
===== Return =====
SlateQ 214.0375 1.5834298216214013
Item-WP 186.842 10.897245225285147
Slate-WP 217.07850000000002 6.434852873220952
SlateQ vs Item-WP
LeveneResult(statistic=4.6187302682536435, pvalue=0.06386245672717719)
ShapiroResult(statistic=0.973307192325592, pvalue=0.8960248231887817)
Ttest_relResult(statistic=4.5700657774179865, pvalue=0.010260279293271779)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
Slate-Wp vs SlateQ
LeveneResult(statistic=2.079445567982149, pvalue=0.18727290524258544)
ShapiroResult(statistic=0.9455087184906006, pvalue=0.7050985097885132)
Ttest_relResult(statistic=-1.0520793073358847, pvalue=0.3521199777070371)
WilcoxonResult(statistic=4.0, pvalue=0.4375)
===== Cum satisfaction =====
SlateQ 11.138902307793497 1.5669716402048743
Item-WP -15.07692925256491 10.58290860061259
Slate-WP 14.047510831803084 6.300947686614635
SlateQ vs Item-WP
LeveneResult(statistic=4.579645761311687, pvalue=0.06477532476328247)
ShapiroResu