In [24]:
import os 
from pathlib import Path
from dotenv import load_dotenv
import pickle
import pandas as pd
import torch
from tqdm import tqdm
import seaborn as sns

In [25]:
# Load variables from a local .env file into the process environment;
# SAVE_PATH is read from os.environ in a later cell.
load_dotenv()


True

In [26]:
# Root directory of the saved experiment runs: SAVE_PATH taken from the
# environment and resolved relative to the user's home directory.
# Indexing os.environ fails fast with KeyError('SAVE_PATH') when the variable
# is missing, instead of an opaque TypeError raised by Path(None).
base_path = Path.home() / os.environ["SAVE_PATH"]

In [27]:
# Experiment configuration used by the loading cells.
# NOTE(review): ALPHA is not referenced by any cell visible here — confirm it is still needed.
ALPHA = 0.25
# Seeds of the runs to load; each seed selects one "..._{seed}" results directory.
SEEDS = [42, 5, 7, 97, 53]

In [28]:
def parse_data(data_dict: dict, alg_nam: str, seed: int) -> pd.DataFrame:
    """Turn one run's logged metrics into a DataFrame tagged with algorithm and seed.

    Args:
        data_dict: Mapping from metric name to a scalar or a list of values.
            Torch tensors (bare, or anywhere inside a list) are converted to
            Python numbers with ``.item()``. The mapping is not modified.
        alg_nam: Label written to the ``alg_name`` column.
        seed: Value written to the ``seed`` column.

    Returns:
        A DataFrame with one column per metric plus ``alg_name`` and ``seed``,
        and a leading ``episode`` column holding the original row position.
    """
    def _to_python(value):
        # Tensors are unwrapped; every other value is passed through untouched.
        return value.item() if isinstance(value, torch.Tensor) else value

    # Build a converted copy so the caller's dictionary keeps its tensors.
    converted = {}
    for key, value in data_dict.items():
        if isinstance(value, list):
            # Element-wise conversion also copes with empty and mixed lists.
            converted[key] = [_to_python(item) for item in value]
        else:
            converted[key] = _to_python(value)

    data_df = pd.DataFrame(converted)
    data_df["alg_name"] = alg_nam
    data_df["seed"] = seed
    # Expose the row position as an explicit "episode" column.
    data_df = data_df.reset_index().rename(columns={"index": "episode"})

    return data_df

In [29]:
# Load the SlateQ logs: one run directory per seed.
res_df_list = []
for seed in tqdm(SEEDS):
    run_dir = base_path / f"test_serving_observed_topic_slateq_2000_{seed}"
    # NOTE: pickle.load can execute arbitrary code; only load logs written by trusted runs.
    # The context manager guarantees the file handle is closed after loading.
    with open(run_dir / "logs_dict.pickle", "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "SlateQ", seed)
    res_df_list.append(res_df)
# concat all results
final_df = pd.concat(res_df_list)

100%|██████████| 5/5 [00:00<00:00, 18.11it/s]


In [30]:
# Load the Slate-Wolpertinger logs: one run directory per (seed, k-nearest percentage).
res_df_list = []

for seed in tqdm(SEEDS):
    for knearest in [20, 10, 5]:
        run_dir = base_path / f"test_wa_{knearest}_serving_observed_topic_slateq_2000_{seed}"
        # NOTE: pickle.load can execute arbitrary code; only load logs written by trusted runs.
        # The context manager guarantees the file handle is closed after loading.
        with open(run_dir / "logs_dict.pickle", "rb") as log_file:
            slateq_dict = pickle.load(log_file)
        res_df = parse_data(slateq_dict, f"Slate-Wolpertinger {knearest}%", seed)
        res_df_list.append(res_df)
# concat all results
wp_df = pd.concat(res_df_list)
# Drop any Wolpertinger rows left in final_df by a previous run of this cell
# before appending, so re-running the cell does not duplicate them.
already_loaded = final_df["alg_name"].isin(wp_df["alg_name"].unique())
final_df = pd.concat([final_df[~already_loaded], wp_df])

100%|██████████| 5/5 [00:00<00:00,  5.65it/s]


In [31]:
# Each per-run frame carries its own 0..n-1 index, so the concatenated frame
# has repeated labels; replace them with one fresh index over all rows.
final_df = final_df.reset_index(drop=True)

In [32]:
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats import ttest_rel
from scipy.stats import wilcoxon
def stat_test(r1, r2):
    """Print four test results for two equally ordered samples.

    In order: Levene on (r1, r2), Shapiro-Wilk on the element-wise
    differences r1 - r2, the paired t-test and the Wilcoxon signed-rank
    test on (r1, r2). Nothing is returned.
    """
    def paired_gaps():
        # Element-wise r1 - r2, truncated to the shorter input by zip.
        gaps = []
        for left, right in zip(r1, r2):
            gaps.append(left - right)
        return gaps

    # Each check is evaluated lazily so every result is printed before the
    # next test is computed.
    checks = (
        lambda: levene(r1, r2),
        lambda: shapiro(paired_gaps()),
        lambda: ttest_rel(r1, r2),
        lambda: wilcoxon(r1, r2),
    )
    for run_check in checks:
        print(run_check())

# Statistical tests

### SlateQ vs. Slate-Wolpertinger 20% (WP20)

In [33]:
print("===== SlateQ | WP20 =====")
# Per-(algorithm, seed) means of the three metrics, aggregated once and reused below.
# Selecting the metric columns before .mean() keeps the aggregation away from
# every other logged column (pandas >= 2.0 raises on non-numeric columns by default).
per_seed_means = final_df.groupby(["alg_name", "seed"])[
    ["session_length", "ep_cum_reward", "ep_avg_reward"]
].mean()
sq_means = per_seed_means.loc["SlateQ"]
wp_means = per_seed_means.loc["Slate-Wolpertinger 20%"]
# stat_test pairs the values position by position, so both algorithms must
# cover the same seeds in the same order.
assert sq_means.index.equals(wp_means.index), "SlateQ and WP runs must cover the same seeds"

# NOTE: ndarray.std() uses ddof=0, i.e. the population standard deviation across seeds.
sq_return = sq_means["session_length"].values
wp_return = wp_means["session_length"].values
print("===== Return =====")
print("SlateQ",sq_return.mean(), sq_return.std())
print("WP",wp_return.mean(), wp_return.std())
stat_test(sq_return, wp_return)
print("===== Cum satisfaction =====")
sq_cum_satisfaction = sq_means["ep_cum_reward"].values
wp_cum_satisfaction = wp_means["ep_cum_reward"].values
print("SlateQ",sq_cum_satisfaction.mean(), sq_cum_satisfaction.std())
print("WP",wp_cum_satisfaction.mean(), wp_cum_satisfaction.std())
stat_test(sq_cum_satisfaction, wp_cum_satisfaction)
print("===== Avg satisfaction =====")
sq_avg_satisfaction = sq_means["ep_avg_reward"].values
wp_avg_satisfaction = wp_means["ep_avg_reward"].values
print("SlateQ",sq_avg_satisfaction.mean(), sq_avg_satisfaction.std())
print("WP",wp_avg_satisfaction.mean(), wp_avg_satisfaction.std())
stat_test(sq_avg_satisfaction, wp_avg_satisfaction)

===== SlateQ | WP20 =====
===== Return =====
SlateQ 246.198 2.4751335317513714
WP 245.04899999999998 4.593331470730145
LeveneResult(statistic=0.43250501442492356, pvalue=0.5292266651357699)
ShapiroResult(statistic=0.8886706233024597, pvalue=0.35046282410621643)
Ttest_relResult(statistic=0.7076558800502115, pvalue=0.5182118022537707)
WilcoxonResult(statistic=6.0, pvalue=0.8125)
===== Cum satisfaction =====
SlateQ 41.9388566827774 2.396559214302558
WP 40.74824743986129 4.438687065586514
LeveneResult(statistic=0.46465003605320154, pvalue=0.5147040366017046)
ShapiroResult(statistic=0.8903428316116333, pvalue=0.35882997512817383)
Ttest_relResult(statistic=0.7458550306934277, pvalue=0.49720054930329843)
WilcoxonResult(statistic=6.0, pvalue=0.8125)
===== Avg satisfaction =====
SlateQ 0.6357181721664965 0.048799222117163796
WP 0.6083089765775949 0.07378319253194625
LeveneResult(statistic=0.21536719728227832, pvalue=0.6549595039825544)
ShapiroResult(statistic=0.8867229223251343, pvalue=0.340893

In [34]:
print("===== SlateQ | WP10 =====")
# Per-(algorithm, seed) means of the three metrics, aggregated once and reused below.
# Selecting the metric columns before .mean() keeps the aggregation away from
# every other logged column (pandas >= 2.0 raises on non-numeric columns by default).
per_seed_means = final_df.groupby(["alg_name", "seed"])[
    ["session_length", "ep_cum_reward", "ep_avg_reward"]
].mean()
sq_means = per_seed_means.loc["SlateQ"]
wp_means = per_seed_means.loc["Slate-Wolpertinger 10%"]
# stat_test pairs the values position by position, so both algorithms must
# cover the same seeds in the same order.
assert sq_means.index.equals(wp_means.index), "SlateQ and WP runs must cover the same seeds"

# NOTE: ndarray.std() uses ddof=0, i.e. the population standard deviation across seeds.
sq_return = sq_means["session_length"].values
wp_return = wp_means["session_length"].values
print("===== Return =====")
print("SlateQ",sq_return.mean(), sq_return.std())
print("WP",wp_return.mean(), wp_return.std())
stat_test(sq_return, wp_return)
print("===== Cum satisfaction =====")
sq_cum_satisfaction = sq_means["ep_cum_reward"].values
wp_cum_satisfaction = wp_means["ep_cum_reward"].values
print("SlateQ",sq_cum_satisfaction.mean(), sq_cum_satisfaction.std())
print("WP",wp_cum_satisfaction.mean(), wp_cum_satisfaction.std())
stat_test(sq_cum_satisfaction, wp_cum_satisfaction)
print("===== Avg satisfaction =====")
sq_avg_satisfaction = sq_means["ep_avg_reward"].values
wp_avg_satisfaction = wp_means["ep_avg_reward"].values
print("SlateQ",sq_avg_satisfaction.mean(), sq_avg_satisfaction.std())
print("WP",wp_avg_satisfaction.mean(), wp_avg_satisfaction.std())
stat_test(sq_avg_satisfaction, wp_avg_satisfaction)

===== SlateQ | WP10 =====
===== Return =====
SlateQ 246.198 2.4751335317513714
WP 241.35100000000003 6.492766282564002
LeveneResult(statistic=1.2387559921678413, pvalue=0.29803295406469404)
ShapiroResult(statistic=0.9123519062995911, pvalue=0.4818522334098816)
Ttest_relResult(statistic=1.8293022638478202, pvalue=0.14134199074467235)
WilcoxonResult(statistic=1.0, pvalue=0.125)
===== Cum satisfaction =====
SlateQ 41.9388566827774 2.396559214302558
WP 37.20725725543499 6.276050482802271
LeveneResult(statistic=1.2746829342485546, pvalue=0.29160535154345973)
ShapiroResult(statistic=0.9004870653152466, pvalue=0.4125833213329315)
Ttest_relResult(statistic=1.8320013891414715, pvalue=0.14089988498894734)
WilcoxonResult(statistic=1.0, pvalue=0.125)
===== Avg satisfaction =====
SlateQ 0.6357181721664965 0.048799222117163796
WP 0.5555506880814209 0.09411378484972549
LeveneResult(statistic=0.8934690742496622, pvalue=0.37220847793425677)
ShapiroResult(statistic=0.9128715991973877, pvalue=0.485038101

In [35]:
print("===== SlateQ | WP5 =====")
# Per-(algorithm, seed) means of the three metrics, aggregated once and reused below.
# Selecting the metric columns before .mean() keeps the aggregation away from
# every other logged column (pandas >= 2.0 raises on non-numeric columns by default).
per_seed_means = final_df.groupby(["alg_name", "seed"])[
    ["session_length", "ep_cum_reward", "ep_avg_reward"]
].mean()
sq_means = per_seed_means.loc["SlateQ"]
wp_means = per_seed_means.loc["Slate-Wolpertinger 5%"]
# stat_test pairs the values position by position, so both algorithms must
# cover the same seeds in the same order.
assert sq_means.index.equals(wp_means.index), "SlateQ and WP runs must cover the same seeds"

# NOTE: ndarray.std() uses ddof=0, i.e. the population standard deviation across seeds.
sq_return = sq_means["session_length"].values
wp_return = wp_means["session_length"].values
print("===== Return =====")
print("SlateQ",sq_return.mean(), sq_return.std())
print("WP",wp_return.mean(), wp_return.std())
stat_test(sq_return, wp_return)
print("===== Cum satisfaction =====")
sq_cum_satisfaction = sq_means["ep_cum_reward"].values
wp_cum_satisfaction = wp_means["ep_cum_reward"].values
print("SlateQ",sq_cum_satisfaction.mean(), sq_cum_satisfaction.std())
print("WP",wp_cum_satisfaction.mean(), wp_cum_satisfaction.std())
stat_test(sq_cum_satisfaction, wp_cum_satisfaction)
print("===== Avg satisfaction =====")
sq_avg_satisfaction = sq_means["ep_avg_reward"].values
wp_avg_satisfaction = wp_means["ep_avg_reward"].values
print("SlateQ",sq_avg_satisfaction.mean(), sq_avg_satisfaction.std())
print("WP",wp_avg_satisfaction.mean(), wp_avg_satisfaction.std())
stat_test(sq_avg_satisfaction, wp_avg_satisfaction)

===== SlateQ | WP5 =====
===== Return =====
SlateQ 246.198 2.4751335317513714
WP 224.26100000000002 16.136361237899955
LeveneResult(statistic=3.975574104021438, pvalue=0.08128261120165467)
ShapiroResult(statistic=0.8993992209434509, pvalue=0.406574010848999)
Ttest_relResult(statistic=2.7760228158017233, pvalue=0.050021610425840114)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
===== Cum satisfaction =====
SlateQ 41.9388566827774 2.396559214302558
WP 20.823199695408345 15.448908770542232
LeveneResult(statistic=3.974057663695129, pvalue=0.08133050012160724)
ShapiroResult(statistic=0.8978506326675415, pvalue=0.3981201648712158)
Ttest_relResult(statistic=2.776032840806059, pvalue=0.05002109727640227)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
===== Avg satisfaction =====
SlateQ 0.6357181721664965 0.048799222117163796
WP 0.30034496704046615 0.24030001274547844
LeveneResult(statistic=2.8441990271700197, pvalue=0.13018881769071353)
ShapiroResult(statistic=0.8817130923271179, pvalue=0.3171462