In [1]:
import os 
from pathlib import Path
from dotenv import load_dotenv
import pickle
import pandas as pd
import torch
from tqdm import tqdm
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a .env file into the process environment; SAVE_PATH is read from os.environ below.
load_dotenv()


True

In [3]:
# Root directory of the saved runs: the SAVE_PATH environment variable, resolved against the home directory.
# os.environ is indexed directly so that a missing variable fails with KeyError('SAVE_PATH')
# instead of the opaque TypeError raised by Path(None).
base_path = Path.home() / os.environ["SAVE_PATH"]

In [4]:
# Experiment configuration.
# NOTE(review): ALPHA is not referenced by any of the cells visible here — confirm it is still needed.
ALPHA = 0.25
# Seeds of the runs to load; each seed is part of the run directory name.
SEEDS = [5, 7, 46, 53, 77]

In [5]:
def parse_data(data_dict: dict, alg_nam:str, seed: int)->pd.DataFrame:
    """Convert one run's logged metrics into a per-episode DataFrame.

    Args:
        data_dict: Mapping from metric name to the logged value(s). A value may be
            a single-element torch tensor, a list (whose elements may be
            single-element tensors), or any object ``pd.DataFrame`` accepts.
            The mapping is NOT modified.
        alg_nam: Label written to the ``alg_name`` column of every row.
        seed: Value written to the ``seed`` column of every row.

    Returns:
        A DataFrame with an ``episode`` column (the former positional index),
        one column per key of ``data_dict``, plus ``alg_name`` and ``seed``.
    """
    def _to_python(value):
        # Tensors are unwrapped with .item(); anything else is passed through.
        return value.item() if isinstance(value, torch.Tensor) else value

    # Build a fresh dict instead of rewriting the caller's dict in place, and
    # convert every list element individually so empty lists and lists that
    # mix tensors with plain numbers are both handled.
    converted = {}
    for key, value in data_dict.items():
        if isinstance(value, list):
            converted[key] = [_to_python(item) for item in value]
        else:
            converted[key] = _to_python(value)

    data_df = pd.DataFrame(converted)
    data_df["alg_name"] = alg_nam
    data_df["seed"] = seed
    # The positional row index becomes the episode number.
    data_df = data_df.reset_index().rename(columns={"index": "episode"})

    return data_df

In [6]:
# load slateq results
res_df_list = []
for seed in tqdm(SEEDS):
    logs_path = base_path / f"serving_slateq_300_{seed}" / "logs_dict.pickle"
    # The context manager closes the file handle once the pickle is read.
    # NOTE: pickle.load can execute arbitrary code — only load log files you produced yourself.
    with open(logs_path, "rb") as logs_file:
        slateq_dict = pickle.load(logs_file)
    res_df = parse_data(slateq_dict, "SlateQ", seed)
    res_df_list.append(res_df)
# concat all results
final_df = pd.concat(res_df_list)

100%|██████████| 5/5 [00:00<00:00, 11.56it/s]


In [7]:
# load wp results
res_df_list = []

for seed in tqdm(SEEDS):
    for knearest in [5,10,20]:
        logs_path = base_path / f"serving_proto_item_300_{knearest}_{seed}" / "logs_dict.pickle"
        # The context manager closes the file handle once the pickle is read.
        # NOTE: pickle.load can execute arbitrary code — only load log files you produced yourself.
        with open(logs_path, "rb") as logs_file:
            slateq_dict = pickle.load(logs_file)
        res_df = parse_data(slateq_dict, f"Item-Wolpertinger {knearest}%", seed)
        res_df_list.append(res_df)
# concat all results
wp_df = pd.concat(res_df_list)
# Drop rows for these algorithms that a previous execution of this cell already
# appended, so re-running the cell does not duplicate them in final_df.
already_loaded = final_df["alg_name"].isin(wp_df["alg_name"].unique())
final_df = pd.concat([final_df[~already_loaded], wp_df])

100%|██████████| 5/5 [00:00<00:00,  7.38it/s]


In [8]:
# load slate-wp results
res_df_list = []

for seed in tqdm(SEEDS):
    for knearest in [5,10,20]:
        logs_path = base_path / f"serving_proto_slate_300_{knearest}_{seed}" / "logs_dict.pickle"
        # The context manager closes the file handle once the pickle is read.
        # NOTE: pickle.load can execute arbitrary code — only load log files you produced yourself.
        with open(logs_path, "rb") as logs_file:
            slateq_dict = pickle.load(logs_file)
        res_df = parse_data(slateq_dict, f"Slate-Wolpertinger {knearest}%", seed)
        res_df_list.append(res_df)
# concat all results
slate_wp_df = pd.concat(res_df_list)
# Drop rows for these algorithms that a previous execution of this cell already
# appended, so re-running the cell does not duplicate them in final_df.
already_loaded = final_df["alg_name"].isin(slate_wp_df["alg_name"].unique())
final_df = pd.concat([final_df[~already_loaded], slate_wp_df])

100%|██████████| 5/5 [00:00<00:00,  7.11it/s]


In [9]:
# The concatenated per-run frames each keep their own 0..n-1 index, so index labels repeat;
# replace them with a single unique positional index (the old index is discarded).
final_df = final_df.reset_index(drop=True)

In [13]:
# Inspect the combined per-episode results of all algorithms and seeds.
final_df

Unnamed: 0,episode,session_length,ep_cum_reward,ep_avg_reward,best_rl_avg_diff,best_avg_avg_diff,cum_normalized,alg_name,seed
0,0,223.0,18.041277,0.311056,0.251696,0.863313,0.552741,SlateQ,5
1,1,204.0,0.507943,0.008466,0.238343,0.661508,0.034301,SlateQ,5
2,2,204.0,2.157166,0.035953,0.333708,0.874238,0.097259,SlateQ,5
3,3,227.0,24.506971,0.415372,0.259116,0.906081,0.615833,SlateQ,5
4,4,214.0,12.542114,0.216243,0.220634,0.879364,0.494975,SlateQ,5
...,...,...,...,...,...,...,...,...,...
195,195,206.5,3.026659,0.055030,0.426174,0.671976,0.114359,Slate-Wolpertinger 20%,77
196,196,206.5,2.828488,0.051427,0.658290,1.043433,0.072461,Slate-Wolpertinger 20%,77
197,197,206.5,2.825793,0.051378,0.449641,0.736655,0.102547,Slate-Wolpertinger 20%,77
198,198,202.5,-0.352704,-0.006532,0.435780,0.629620,-0.015216,Slate-Wolpertinger 20%,77


In [10]:
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats import ttest_rel
from scipy.stats import wilcoxon
def stat_test(r1,r2):
    """Print four statistical checks comparing two paired samples.

    In order, the printed results are: Levene's test on ``r1`` and ``r2``,
    the Shapiro-Wilk test on the element-wise differences ``r1 - r2``,
    the paired t-test, and the Wilcoxon signed-rank test.

    Args:
        r1: First sequence of measurements.
        r2: Second sequence of measurements, paired position-by-position with ``r1``.

    Returns:
        None; every result object is written to stdout.
    """
    variance_check = levene(r1, r2)
    print(variance_check)

    # Element-wise differences of the paired observations.
    paired_diffs = []
    for left, right in zip(r1, r2):
        paired_diffs.append(left - right)
    normality_check = shapiro(paired_diffs)
    print(normality_check)

    paired_t = ttest_rel(r1, r2)
    print(paired_t)

    signed_rank = wilcoxon(r1, r2)
    print(signed_rank)

# Statistical tests

### SlateQ | Item-WP20 | Slate-WP20

In [11]:
print("===== SlateQ | Item-WP20 | Slate-WP20 =====")
# Mean of every metric over episodes for each (algorithm, seed) pair. Computed once
# and reused below instead of re-running the same groupby for every metric/algorithm.
# Selecting an algorithm yields one value per seed, ordered by seed, so the arrays
# passed to the paired tests line up seed-by-seed.
seed_means = final_df.groupby(["alg_name", "seed"]).mean()

# NOTE: ndarray.std() is the population standard deviation (ddof=0).
# NOTE: every "Slate-Wp vs SlateQ" comparison passes SlateQ as the first sample,
# i.e. the tests are run as (SlateQ, Slate-WP) despite the label order.

# --- session length ---
sq_return = seed_means["session_length"]["SlateQ"].values
wp_return = seed_means["session_length"]["Item-Wolpertinger 20%"].values
slate_wp_return = seed_means["session_length"]["Slate-Wolpertinger 20%"].values
print("===== Return =====")
print("SlateQ",sq_return.mean(), sq_return.std())
print("Item-WP",wp_return.mean(), wp_return.std())
print("Slate-WP",slate_wp_return.mean(), slate_wp_return.std())
print("SlateQ vs Item-WP")
stat_test(sq_return, wp_return)
print("Slate-Wp vs SlateQ")
stat_test(sq_return, slate_wp_return)

# --- cumulative episode reward ---
print("===== Cum satisfaction =====")
sq_cum_satisfaction = seed_means["ep_cum_reward"]["SlateQ"].values
wp_cum_satisfaction = seed_means["ep_cum_reward"]["Item-Wolpertinger 20%"].values
slate_wp_cum_satisfaction = seed_means["ep_cum_reward"]["Slate-Wolpertinger 20%"].values
print("SlateQ",sq_cum_satisfaction.mean(), sq_cum_satisfaction.std())
print("Item-WP",wp_cum_satisfaction.mean(), wp_cum_satisfaction.std())
print("Slate-WP",slate_wp_cum_satisfaction.mean(), slate_wp_cum_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_cum_satisfaction, wp_cum_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_cum_satisfaction, slate_wp_cum_satisfaction)

# --- average episode reward ---
print("===== Avg satisfaction =====")
sq_avg_satisfaction = seed_means["ep_avg_reward"]["SlateQ"].values
wp_avg_satisfaction = seed_means["ep_avg_reward"]["Item-Wolpertinger 20%"].values
slate_wp_avg_satisfaction = seed_means["ep_avg_reward"]["Slate-Wolpertinger 20%"].values
print("SlateQ",sq_avg_satisfaction.mean(), sq_avg_satisfaction.std())
print("Item-WP",wp_avg_satisfaction.mean(), wp_avg_satisfaction.std())
print("Slate-WP",slate_wp_avg_satisfaction.mean(), slate_wp_avg_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_avg_satisfaction, wp_avg_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_avg_satisfaction, slate_wp_avg_satisfaction)

===== SlateQ | Item-WP20 | Slate-WP20 =====
===== Return =====
SlateQ 218.058 0.8516607305729172
Item-WP 201.613 6.038137626785269
Slate-WP 219.08100000000005 3.1145600010274315
SlateQ vs Item-WP
LeveneResult(statistic=2.08881304792654, pvalue=0.18638765382818745)
ShapiroResult(statistic=0.8773323893547058, pvalue=0.2973969578742981)
Ttest_relResult(statistic=5.863679535085234, pvalue=0.00422306417581704)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
Slate-Wp vs SlateQ
LeveneResult(statistic=0.7050637560152448, pvalue=0.4254673777312607)
ShapiroResult(statistic=0.7845602035522461, pvalue=0.06028958782553673)
Ttest_relResult(statistic=-0.7526935656945823, pvalue=0.4935071527320136)
WilcoxonResult(statistic=5.0, pvalue=0.625)
===== Cum satisfaction =====
SlateQ 15.032428655415774 0.796045076788641
Item-WP -0.895941579222679 5.920529848880206
Slate-WP 16.08601001775265 3.097191614536211
SlateQ vs Item-WP
LeveneResult(statistic=2.220465933149621, pvalue=0.17452392298507513)
ShapiroResult(st

In [12]:
print("===== SlateQ | Item-WP10 | Slate-WP10=====")
# Mean of every metric over episodes for each (algorithm, seed) pair. Computed once
# and reused below instead of re-running the same groupby for every metric/algorithm.
# Selecting an algorithm yields one value per seed, ordered by seed, so the arrays
# passed to the paired tests line up seed-by-seed.
seed_means = final_df.groupby(["alg_name", "seed"]).mean()

# NOTE: ndarray.std() is the population standard deviation (ddof=0).
# NOTE: every "Slate-Wp vs SlateQ" comparison passes SlateQ as the first sample,
# i.e. the tests are run as (SlateQ, Slate-WP) despite the label order.

# --- session length ---
sq_return = seed_means["session_length"]["SlateQ"].values
wp_return = seed_means["session_length"]["Item-Wolpertinger 10%"].values
slate_wp_return = seed_means["session_length"]["Slate-Wolpertinger 10%"].values
print("===== Return =====")
print("SlateQ",sq_return.mean(), sq_return.std())
print("Item-WP",wp_return.mean(), wp_return.std())
print("Slate-WP",slate_wp_return.mean(), slate_wp_return.std())
print("SlateQ vs Item-WP")
stat_test(sq_return, wp_return)
print("Slate-Wp vs SlateQ")
stat_test(sq_return, slate_wp_return)

# --- cumulative episode reward ---
print("===== Cum satisfaction =====")
sq_cum_satisfaction = seed_means["ep_cum_reward"]["SlateQ"].values
wp_cum_satisfaction = seed_means["ep_cum_reward"]["Item-Wolpertinger 10%"].values
slate_wp_cum_satisfaction = seed_means["ep_cum_reward"]["Slate-Wolpertinger 10%"].values
print("SlateQ",sq_cum_satisfaction.mean(), sq_cum_satisfaction.std())
print("Item-WP",wp_cum_satisfaction.mean(), wp_cum_satisfaction.std())
print("Slate-WP",slate_wp_cum_satisfaction.mean(), slate_wp_cum_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_cum_satisfaction, wp_cum_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_cum_satisfaction, slate_wp_cum_satisfaction)

# --- average episode reward ---
print("===== Avg satisfaction =====")
sq_avg_satisfaction = seed_means["ep_avg_reward"]["SlateQ"].values
wp_avg_satisfaction = seed_means["ep_avg_reward"]["Item-Wolpertinger 10%"].values
slate_wp_avg_satisfaction = seed_means["ep_avg_reward"]["Slate-Wolpertinger 10%"].values
print("SlateQ",sq_avg_satisfaction.mean(), sq_avg_satisfaction.std())
print("Item-WP",wp_avg_satisfaction.mean(), wp_avg_satisfaction.std())
print("Slate-WP",slate_wp_avg_satisfaction.mean(), slate_wp_avg_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_avg_satisfaction, wp_avg_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_avg_satisfaction, slate_wp_avg_satisfaction)

===== SlateQ | Item-WP10 | Slate-WP10=====
===== Return =====
SlateQ 218.058 0.8516607305729172
Item-WP 191.216 4.993246839482297
Slate-WP 220.33800000000002 4.5279648850228496
SlateQ vs Item-WP
LeveneResult(statistic=4.028446696531847, pvalue=0.07963542642921916)
ShapiroResult(statistic=0.9615320563316345, pvalue=0.8186097741127014)
Ttest_relResult(statistic=10.808727999102022, pvalue=0.00041559294307445463)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
Slate-Wp vs SlateQ
LeveneResult(statistic=1.0340863048239948, pvalue=0.33896514068370687)
ShapiroResult(statistic=0.661445140838623, pvalue=0.0036188485100865364)
Ttest_relResult(statistic=-1.1318105613297558, pvalue=0.32096707767059435)
WilcoxonResult(statistic=5.0, pvalue=0.625)
===== Cum satisfaction =====
SlateQ 15.032428655415774 0.796045076788641
Item-WP -10.740671027064323 4.793026808276126
Slate-WP 17.272378032624722 4.370977888035167
SlateQ vs Item-WP
LeveneResult(statistic=3.921877936954553, pvalue=0.08300076531468513)
Shapiro

In [14]:
print("===== SlateQ | Item-WP5 | Slate-WP5=====")
# Mean of every metric over episodes for each (algorithm, seed) pair. Computed once
# and reused below instead of re-running the same groupby for every metric/algorithm.
# Selecting an algorithm yields one value per seed, ordered by seed, so the arrays
# passed to the paired tests line up seed-by-seed.
seed_means = final_df.groupby(["alg_name", "seed"]).mean()

# NOTE: ndarray.std() is the population standard deviation (ddof=0).
# NOTE: every "Slate-Wp vs SlateQ" comparison passes SlateQ as the first sample,
# i.e. the tests are run as (SlateQ, Slate-WP) despite the label order.

# --- session length ---
sq_return = seed_means["session_length"]["SlateQ"].values
wp_return = seed_means["session_length"]["Item-Wolpertinger 5%"].values
slate_wp_return = seed_means["session_length"]["Slate-Wolpertinger 5%"].values
print("===== Return =====")
print("SlateQ",sq_return.mean(), sq_return.std())
print("Item-WP",wp_return.mean(), wp_return.std())
print("Slate-WP",slate_wp_return.mean(), slate_wp_return.std())
print("SlateQ vs Item-WP")
stat_test(sq_return, wp_return)
print("Slate-Wp vs SlateQ")
stat_test(sq_return, slate_wp_return)

# --- cumulative episode reward ---
print("===== Cum satisfaction =====")
sq_cum_satisfaction = seed_means["ep_cum_reward"]["SlateQ"].values
wp_cum_satisfaction = seed_means["ep_cum_reward"]["Item-Wolpertinger 5%"].values
slate_wp_cum_satisfaction = seed_means["ep_cum_reward"]["Slate-Wolpertinger 5%"].values
print("SlateQ",sq_cum_satisfaction.mean(), sq_cum_satisfaction.std())
print("Item-WP",wp_cum_satisfaction.mean(), wp_cum_satisfaction.std())
print("Slate-WP",slate_wp_cum_satisfaction.mean(), slate_wp_cum_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_cum_satisfaction, wp_cum_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_cum_satisfaction, slate_wp_cum_satisfaction)

# --- average episode reward ---
print("===== Avg satisfaction =====")
sq_avg_satisfaction = seed_means["ep_avg_reward"]["SlateQ"].values
wp_avg_satisfaction = seed_means["ep_avg_reward"]["Item-Wolpertinger 5%"].values
slate_wp_avg_satisfaction = seed_means["ep_avg_reward"]["Slate-Wolpertinger 5%"].values
print("SlateQ",sq_avg_satisfaction.mean(), sq_avg_satisfaction.std())
print("Item-WP",wp_avg_satisfaction.mean(), wp_avg_satisfaction.std())
print("Slate-WP",slate_wp_avg_satisfaction.mean(), slate_wp_avg_satisfaction.std())
print("SlateQ vs Item-WP")
stat_test(sq_avg_satisfaction, wp_avg_satisfaction)
print("Slate-Wp vs SlateQ")
stat_test(sq_avg_satisfaction, slate_wp_avg_satisfaction)

===== SlateQ | Item-WP5 | Slate-WP5=====
===== Return =====
SlateQ 218.058 0.8516607305729172
Item-WP 187.905 6.0946353459415485
Slate-WP 212.289 1.1893250186555437
SlateQ vs Item-WP
LeveneResult(statistic=2.2615419814538504, pvalue=0.17103154126043482)
ShapiroResult(statistic=0.9378151893615723, pvalue=0.6505358219146729)
Ttest_relResult(statistic=9.578818952757578, pvalue=0.0006637266801965869)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
Slate-Wp vs SlateQ
LeveneResult(statistic=0.047276151330576034, pvalue=0.8333156667434368)
ShapiroResult(statistic=0.8766244053840637, pvalue=0.29429322481155396)
Ttest_relResult(statistic=10.976014683216833, pvalue=0.00039148399892377956)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
===== Cum satisfaction =====
SlateQ 15.032428655415774 0.796045076788641
Item-WP -14.00834770345688 5.687908250975408
Slate-WP 9.63846852475405 1.2164511284228168
SlateQ vs Item-WP
LeveneResult(statistic=2.2807187915022604, pvalue=0.16943325315166063)
ShapiroResult(stat