In [22]:
import os 
from pathlib import Path
from dotenv import load_dotenv
import pickle
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
import matplotlib.ticker as mtick

In [23]:
load_dotenv()

True

In [24]:
# Root folder that holds one sub-directory of logs per (algorithm, seed) run.
# TEST_PATH is read from the environment (populated by load_dotenv() above) and
# joined onto the user's home directory.
test_path = os.environ.get("TEST_PATH")
if test_path is None:
    # Fail with an actionable message instead of the opaque TypeError that
    # Path(None) would raise.
    raise RuntimeError("TEST_PATH is not set; define it in the environment or in the .env file")
base_path = Path.home() / test_path

In [25]:
# Seeds of the runs to load; each seed is part of the run directory name
# (e.g. "slateq_generalist_<seed>").
SEEDS = [1, 3, 7, 9]
# NOTE(review): presumably a rolling-window size for smoothing curves; it is not
# used in the cells visible here — TODO confirm it is still needed.
window = 100

In [26]:
def parse_data(data_dict: dict, alg_nam:str, seed: int)->pd.DataFrame:
    """Convert one run's log dictionary into a per-episode DataFrame.

    Args:
        data_dict: Mapping of column name to logged values. A value that is a
            ``torch.Tensor``, or a non-empty list whose first element is a
            ``torch.Tensor``, is converted to plain Python numbers with
            ``.item()``. The mapping passed in is left untouched.
        alg_nam: Label stored in the ``alg_name`` column of every row.
        seed: Seed stored in the ``seed`` column of every row.

    Returns:
        A DataFrame with an ``episode`` column (the former row index), one
        column per key of ``data_dict``, and the ``alg_name`` / ``seed`` columns.
    """
    # Build a fresh mapping so the caller's dictionary keeps its tensors.
    converted = {}
    for key, value in data_dict.items():
        if isinstance(value, torch.Tensor):
            value = value.item()
        # `value and ...` guards the empty list, where value[0] would raise.
        elif isinstance(value, list) and value and isinstance(value[0], torch.Tensor):
            value = [x.item() for x in value]
        converted[key] = value
    data_df = pd.DataFrame(converted)
    data_df["alg_name"] = alg_nam
    data_df["seed"] = seed
    # The positional row index becomes the episode number.
    data_df = data_df.reset_index().rename(columns={"index": "episode"})
    return data_df

In [27]:
# Load the "slateq_generalist" logs for every seed and start the combined frame.
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / f"slateq_generalist_{seed}" / "logs_dict.pickle"
    # The context manager guarantees the file handle is closed after loading.
    # NOTE: unpickling can execute arbitrary code; only load logs from trusted runs.
    with open(log_path, "rb") as log_file:
        logs_dict = pickle.load(log_file)
    res_df = parse_data(logs_dict, "SlateQ_Gen", seed)
    res_df_list.append(res_df)
# concat all results
final_df = pd.concat(res_df_list)

100%|██████████| 4/4 [00:00<00:00,  7.33it/s]


In [28]:
# Load the "slateq_specialist" logs for every seed and append them to final_df.
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / f"slateq_specialist_{seed}" / "logs_dict.pickle"
    # The context manager guarantees the file handle is closed after loading.
    # NOTE: unpickling can execute arbitrary code; only load logs from trusted runs.
    with open(log_path, "rb") as log_file:
        logs_dict = pickle.load(log_file)
    res_df = parse_data(logs_dict, "SlateQ_Spec", seed)
    res_df_list.append(res_df)
# concat all results
slateq_spec_df = pd.concat(res_df_list)
# NOTE: final_df is rebuilt from its previous value, so running this cell twice
# appends these rows twice.
final_df = pd.concat([final_df, slateq_spec_df])

100%|██████████| 4/4 [00:00<00:00,  5.48it/s]


In [29]:
# Load the "wp_slate_specialist" logs for every seed and append them to final_df.
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / f"wp_slate_specialist_{seed}" / "logs_dict.pickle"
    # The context manager guarantees the file handle is closed after loading.
    # NOTE: unpickling can execute arbitrary code; only load logs from trusted runs.
    with open(log_path, "rb") as log_file:
        logs_dict = pickle.load(log_file)
    res_df = parse_data(logs_dict, "SlateWP_Spec", seed)
    res_df_list.append(res_df)
# concat all results
slatewp_spec_df = pd.concat(res_df_list)
# NOTE: final_df is rebuilt from its previous value, so running this cell twice
# appends these rows twice.
final_df = pd.concat([final_df, slatewp_spec_df])

100%|██████████| 4/4 [00:00<00:00,  7.20it/s]


In [30]:
# Load the "wp_slate_generalist" logs for every seed and append them to final_df.
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / f"wp_slate_generalist_{seed}" / "logs_dict.pickle"
    # The context manager guarantees the file handle is closed after loading.
    # NOTE: unpickling can execute arbitrary code; only load logs from trusted runs.
    with open(log_path, "rb") as log_file:
        logs_dict = pickle.load(log_file)
    res_df = parse_data(logs_dict, "SlateWP_Gen", seed)
    res_df_list.append(res_df)
# concat all results
slatewp_gen_df = pd.concat(res_df_list)
# NOTE: final_df is rebuilt from its previous value, so running this cell twice
# appends these rows twice.
final_df = pd.concat([final_df, slatewp_gen_df])

100%|██████████| 4/4 [00:00<00:00,  7.51it/s]


In [31]:
# Load the "greedy_specialist" logs for every seed and append them to final_df.
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / f"greedy_specialist_{seed}" / "logs_dict.pickle"
    # The context manager guarantees the file handle is closed after loading.
    # NOTE: unpickling can execute arbitrary code; only load logs from trusted runs.
    with open(log_path, "rb") as log_file:
        logs_dict = pickle.load(log_file)
    res_df = parse_data(logs_dict, "Greedy_Spec", seed)
    res_df_list.append(res_df)
# concat all results
greedy_spec_df = pd.concat(res_df_list)
# NOTE: final_df is rebuilt from its previous value, so running this cell twice
# appends these rows twice.
final_df = pd.concat([final_df, greedy_spec_df])

100%|██████████| 4/4 [00:00<00:00,  5.69it/s]


In [32]:
# Load the "greedy_generalist" logs for every seed and append them to final_df.
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / f"greedy_generalist_{seed}" / "logs_dict.pickle"
    # The context manager guarantees the file handle is closed after loading.
    # NOTE: unpickling can execute arbitrary code; only load logs from trusted runs.
    with open(log_path, "rb") as log_file:
        logs_dict = pickle.load(log_file)
    res_df = parse_data(logs_dict, "Greedy_Gen", seed)
    res_df_list.append(res_df)
# concat all results
greedy_gen_df = pd.concat(res_df_list)
# NOTE: final_df is rebuilt from its previous value, so running this cell twice
# appends these rows twice.
final_df = pd.concat([final_df, greedy_gen_df])

100%|██████████| 4/4 [00:00<00:00,  7.32it/s]


In [33]:
# Load the "random_generalist" logs for every seed and append them to final_df.
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / f"random_generalist_{seed}" / "logs_dict.pickle"
    # The context manager guarantees the file handle is closed after loading.
    # NOTE: unpickling can execute arbitrary code; only load logs from trusted runs.
    with open(log_path, "rb") as log_file:
        logs_dict = pickle.load(log_file)
    res_df = parse_data(logs_dict, "Random_Gen", seed)
    res_df_list.append(res_df)
# concat all results
random_gen_df = pd.concat(res_df_list)
# NOTE: final_df is rebuilt from its previous value, so running this cell twice
# appends these rows twice.
final_df = pd.concat([final_df, random_gen_df])

100%|██████████| 4/4 [00:00<00:00,  7.26it/s]


In [34]:
# Load the "random_specialist" logs for every seed and append them to final_df.
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / f"random_specialist_{seed}" / "logs_dict.pickle"
    # The context manager guarantees the file handle is closed after loading.
    # NOTE: unpickling can execute arbitrary code; only load logs from trusted runs.
    with open(log_path, "rb") as log_file:
        logs_dict = pickle.load(log_file)
    res_df = parse_data(logs_dict, "Random_Spec", seed)
    res_df_list.append(res_df)
# concat all results
random_spec_df = pd.concat(res_df_list)
# NOTE: final_df is rebuilt from its previous value, so running this cell twice
# appends these rows twice.
final_df = pd.concat([final_df, random_spec_df])

100%|██████████| 4/4 [00:00<00:00, 11.39it/s]


In [35]:
# Display the combined frame: one row per episode for every (alg_name, seed) run.
final_df

Unnamed: 0,episode,hit_documents,ep_cum_satisfaction,ep_avg_satisfaction,diverse_score,user_satisfaction,relevance,alg_name,seed
0,0,0.333333,2.240666,0.746889,5,0.934161,0.903118,SlateQ_Gen,1
1,1,0.500000,1.580026,0.790013,7,0.933813,0.896881,SlateQ_Gen,1
2,2,0.666667,2.191263,0.730421,7,0.985698,1.000000,SlateQ_Gen,1
3,3,1.000000,2.511098,0.837033,5,0.956170,1.000000,SlateQ_Gen,1
4,4,0.250000,3.040608,0.760152,6,0.965305,0.926073,SlateQ_Gen,1
...,...,...,...,...,...,...,...,...,...
495,495,0.666667,2.446146,0.815382,3,0.884961,0.868938,Random_Spec,9
496,496,0.666667,2.950584,0.491764,5,0.974228,1.000000,Random_Spec,9
497,497,0.500000,1.640778,0.820389,6,0.961247,0.926101,Random_Spec,9
498,498,0.000000,1.615212,0.807606,5,0.984998,0.963644,Random_Spec,9


In [36]:
# The frames were concatenated without ignore_index, so row labels repeat across
# runs; replace them with a single unique 0..N-1 index.
final_df = final_df.reset_index(drop=True)

In [37]:
# Rescale diverse_score by a fixed divisor.
# NOTE(review): 18 is presumably the largest attainable diversity score, which
# would map the column into [0, 1] — TODO confirm against the environment.
DIVERSE_SCORE_DIVISOR = 18
# NOTE: this updates final_df in place, so re-running the cell divides again;
# run it exactly once after the loading cells.
final_df['diverse_score'] /= DIVERSE_SCORE_DIVISOR

In [38]:
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats import ttest_rel
from scipy.stats import wilcoxon
def stat_test(r1,r2):
    """Print four statistical checks comparing two paired samples.

    Printed in this order: Levene's test on ``r1`` and ``r2``, the
    Shapiro-Wilk test on the element-wise differences ``r1 - r2``, the paired
    t-test, and the Wilcoxon signed-rank test. Nothing is returned.

    Note: for very small samples the exact Wilcoxon p-value is coarse; with
    four untied, non-zero paired differences the two-sided p-value cannot
    drop below 0.125.
    """
    # Shapiro-Wilk is applied to the paired differences, not the raw samples.
    paired_diffs = [first - second for first, second in zip(r1, r2)]
    checks = (
        (levene, (r1, r2)),
        (shapiro, (paired_diffs,)),
        (ttest_rel, (r1, r2)),
        (wilcoxon, (r1, r2)),
    )
    for run_check, check_args in checks:
        print(run_check(*check_args))

In [47]:

# Paired, per-seed comparison of two algorithms on three metrics.
# NOTE(review): the printed labels ("RandomGen" / "slateqGen") do not match the
# alg_name keys that are selected ("SlateWP_Gen" / "SlateQ_Gen"). Labels and keys
# are kept exactly as they were; confirm which side is intended before quoting
# these numbers.
label_a, key_a = "RandomGen", "SlateWP_Gen"
label_b, key_b = "slateqGen", "SlateQ_Gen"
# Section title -> final_df column holding that metric.
metric_columns = {
    "Avg_Satisfaction": "ep_avg_satisfaction",
    "Hit Ratio": "hit_documents",
    "Diverse Score": "diverse_score",
}
print("===== RandomGen | SlateQGen =====")
# One mean per (alg_name, seed), computed once for all metrics; groupby sorts
# the seed level, so both value arrays are aligned seed by seed.
per_seed_mean = final_df.groupby(["alg_name", "seed"])[list(metric_columns.values())].mean()
for title, column in metric_columns.items():
    print(f"===== {title} =====")
    values_a = per_seed_mean[column][key_a].values
    values_b = per_seed_mean[column][key_b].values
    # ndarray.std() uses ddof=0, i.e. the population std across seeds.
    print(label_a, values_a.mean(), values_a.std())
    print(label_b, values_b.mean(), values_b.std())
    stat_test(values_a, values_b)

===== RandomGen | SlateQGen =====
===== Avg_Satisfaction =====
RandomGen 0.7331492991484702 0.01125922393715332
slateqGen 0.7433408668027259 0.004013879441680596
LeveneResult(statistic=1.1130764065701808, pvalue=0.3320329174196939)
ShapiroResult(statistic=0.8202669957472594, pvalue=0.1438134660000947)
TtestResult(statistic=-2.43305437586624, pvalue=0.09307561765618898, df=3)
WilcoxonResult(statistic=1.0, pvalue=0.25)
===== Hit Ratio =====
RandomGen 0.16301190580986438 0.011826346131694145
slateqGen 0.26953184276819225 0.014020690500779753
LeveneResult(statistic=0.2771619213667985, pvalue=0.6174528243166244)
ShapiroResult(statistic=0.8720984540928692, pvalue=0.30598587611188666)
TtestResult(statistic=-7.2701819157177745, pvalue=0.00537054785866752, df=3)
WilcoxonResult(statistic=0.0, pvalue=0.125)
===== Diverse Score =====
RandomGen 0.3288888888888889 0.0023791792339293233
slateqGen 0.3424722222222222 0.0011742478548804577
LeveneResult(statistic=0.178711333243283, pvalue=0.6872146623170

In [48]:
# Paired, per-seed comparison of two algorithms on three metrics.
# NOTE(review): the printed labels ("RandomGen" / "GreedyGen") do not match the
# alg_name keys that are selected ("SlateWP_Gen" / "Random_Gen"). Labels and keys
# are kept exactly as they were; confirm which side is intended before quoting
# these numbers.
label_a, key_a = "RandomGen", "SlateWP_Gen"
label_b, key_b = "GreedyGen", "Random_Gen"
# Section title -> final_df column holding that metric.
metric_columns = {
    "Avg_Satisfaction": "ep_avg_satisfaction",
    "Hit Ratio": "hit_documents",
    "Diverse Score": "diverse_score",
}
print("===== RandomGen | GreedyGen =====")
# One mean per (alg_name, seed), computed once for all metrics; groupby sorts
# the seed level, so both value arrays are aligned seed by seed.
per_seed_mean = final_df.groupby(["alg_name", "seed"])[list(metric_columns.values())].mean()
for title, column in metric_columns.items():
    print(f"===== {title} =====")
    values_a = per_seed_mean[column][key_a].values
    values_b = per_seed_mean[column][key_b].values
    # ndarray.std() uses ddof=0, i.e. the population std across seeds.
    print(label_a, values_a.mean(), values_a.std())
    print(label_b, values_b.mean(), values_b.std())
    stat_test(values_a, values_b)

===== RandomGen | GreedyGen =====
===== Avg_Satisfaction =====
RandomGen 0.7331492991484702 0.01125922393715332
GreedyGen 0.7407609611582011 0.010511547004334192
LeveneResult(statistic=0.05176570266935993, pvalue=0.8275735069445337)
ShapiroResult(statistic=0.9722407571006152, pvalue=0.8553066925948043)
TtestResult(statistic=-1.004359862890938, pvalue=0.3892033572528802, df=3)
WilcoxonResult(statistic=3.0, pvalue=0.625)
===== Hit Ratio =====
RandomGen 0.16301190580986438 0.011826346131694145
GreedyGen 0.27128289867565036 0.010187151184775391
LeveneResult(statistic=0.08958117060772731, pvalue=0.7748070748745793)
ShapiroResult(statistic=0.8707849109283736, pvalue=0.30077049717702975)
TtestResult(statistic=-15.69294116877692, pvalue=0.0005623999557759641, df=3)
WilcoxonResult(statistic=0.0, pvalue=0.125)
===== Diverse Score =====
RandomGen 0.3288888888888889 0.0023791792339293233
GreedyGen 0.31652777777777774 0.0029683255869468214
LeveneResult(statistic=0.9698397737982354, pvalue=0.3627508

In [49]:
# Paired, per-seed comparison of two algorithms on three metrics.
# NOTE(review): the printed labels ("RandomGen" / "WPGen") do not match the
# alg_name keys that are selected ("SlateWP_Gen" / "Greedy_Gen"). Labels and keys
# are kept exactly as they were; confirm which side is intended before quoting
# these numbers.
label_a, key_a = "RandomGen", "SlateWP_Gen"
label_b, key_b = "WPGen", "Greedy_Gen"
# Section title -> final_df column holding that metric.
metric_columns = {
    "Avg_Satisfaction": "ep_avg_satisfaction",
    "Hit Ratio": "hit_documents",
    "Diverse Score": "diverse_score",
}
print("===== RandomGen | SlateWPGen =====")
# One mean per (alg_name, seed), computed once for all metrics; groupby sorts
# the seed level, so both value arrays are aligned seed by seed.
per_seed_mean = final_df.groupby(["alg_name", "seed"])[list(metric_columns.values())].mean()
for title, column in metric_columns.items():
    print(f"===== {title} =====")
    values_a = per_seed_mean[column][key_a].values
    values_b = per_seed_mean[column][key_b].values
    # ndarray.std() uses ddof=0, i.e. the population std across seeds.
    print(label_a, values_a.mean(), values_a.std())
    print(label_b, values_b.mean(), values_b.std())
    stat_test(values_a, values_b)


===== RandomGen | SlateWPGen =====
===== Avg_Satisfaction =====
RandomGen 0.7331492991484702 0.01125922393715332
WPGen 0.742077767123701 0.0043026094255041395
LeveneResult(statistic=1.0245999125127816, pvalue=0.350495063326405)
ShapiroResult(statistic=0.9403090407435813, pvalue=0.6562119804523058)
TtestResult(statistic=-2.0113980341971307, pvalue=0.13779699108272245, df=3)
WilcoxonResult(statistic=1.0, pvalue=0.25)
===== Hit Ratio =====
RandomGen 0.16301190580986438 0.011826346131694145
WPGen 0.2558402444701642 0.015737376369610635
LeveneResult(statistic=0.2453721298502776, pvalue=0.6379691854502294)
ShapiroResult(statistic=0.8436141993470602, pvalue=0.20614683915337473)
TtestResult(statistic=-5.893184049921394, pvalue=0.009753027677821977, df=3)
WilcoxonResult(statistic=0.0, pvalue=0.125)
===== Diverse Score =====
RandomGen 0.3288888888888889 0.0023791792339293233
WPGen 0.34391666666666665 0.000640095478989044
LeveneResult(statistic=0.5479349932906069, pvalue=0.48710644125878116)
Shap

In [50]:
# Paired, per-seed comparison of two algorithms on three metrics.
# NOTE(review): the printed labels ("RandomSpec" / "slateqSpec") do not match the
# alg_name keys that are selected ("SlateWP_Spec" / "SlateQ_Spec"). Labels and
# keys are kept exactly as they were; confirm which side is intended before
# quoting these numbers.
label_a, key_a = "RandomSpec", "SlateWP_Spec"
label_b, key_b = "slateqSpec", "SlateQ_Spec"
# Section title -> final_df column holding that metric.
metric_columns = {
    "Avg_Satisfaction": "ep_avg_satisfaction",
    "Hit Ratio": "hit_documents",
    "Diverse Score": "diverse_score",
}
print("===== RandomSpec | SlateQSpec =====")
# One mean per (alg_name, seed), computed once for all metrics; groupby sorts
# the seed level, so both value arrays are aligned seed by seed.
per_seed_mean = final_df.groupby(["alg_name", "seed"])[list(metric_columns.values())].mean()
for title, column in metric_columns.items():
    print(f"===== {title} =====")
    values_a = per_seed_mean[column][key_a].values
    values_b = per_seed_mean[column][key_b].values
    # ndarray.std() uses ddof=0, i.e. the population std across seeds.
    print(label_a, values_a.mean(), values_a.std())
    print(label_b, values_b.mean(), values_b.std())
    stat_test(values_a, values_b)

===== RandomSpec | SlateQSpec =====
===== Avg_Satisfaction =====
RandomSpec 0.7499277144595982 0.0061448907809963415
slateqSpec 0.7608782651536168 0.0069613110825593625
LeveneResult(statistic=4.956719747763589e-05, pvalue=0.9946108702057739)
ShapiroResult(statistic=0.7891824600233532, pvalue=0.08419349379523336)
TtestResult(statistic=-4.301247930243883, pvalue=0.02312239567393783, df=3)
WilcoxonResult(statistic=0.0, pvalue=0.125)
===== Hit Ratio =====
RandomSpec 0.14941229179129006 0.014887689857188002
slateqSpec 0.25085416935011745 0.009269971472765651
LeveneResult(statistic=0.3141544280380546, pvalue=0.5954352699206701)
ShapiroResult(statistic=0.8517489219503263, pvalue=0.2318746508040952)
TtestResult(statistic=-9.92680783260033, pvalue=0.002174699308154693, df=3)
WilcoxonResult(statistic=0.0, pvalue=0.125)
===== Diverse Score =====
RandomSpec 0.3263888888888889 0.004219662967057381
slateqSpec 0.33702777777777776 0.0034429880433522633
LeveneResult(statistic=0.0016775396085742813, pva

In [51]:
# Paired, per-seed comparison of two algorithms on three metrics.
# NOTE(review): the printed labels ("RandomSpec" / "GreedySpec") do not match the
# alg_name keys that are selected ("SlateWP_Spec" / "Random_Spec"). Labels and
# keys are kept exactly as they were; confirm which side is intended before
# quoting these numbers.
label_a, key_a = "RandomSpec", "SlateWP_Spec"
label_b, key_b = "GreedySpec", "Random_Spec"
# Section title -> final_df column holding that metric.
metric_columns = {
    "Avg_Satisfaction": "ep_avg_satisfaction",
    "Hit Ratio": "hit_documents",
    "Diverse Score": "diverse_score",
}
print("===== RandomSpec | GreedySpec =====")
# One mean per (alg_name, seed), computed once for all metrics; groupby sorts
# the seed level, so both value arrays are aligned seed by seed.
per_seed_mean = final_df.groupby(["alg_name", "seed"])[list(metric_columns.values())].mean()
for title, column in metric_columns.items():
    print(f"===== {title} =====")
    values_a = per_seed_mean[column][key_a].values
    values_b = per_seed_mean[column][key_b].values
    # ndarray.std() uses ddof=0, i.e. the population std across seeds.
    print(label_a, values_a.mean(), values_a.std())
    print(label_b, values_b.mean(), values_b.std())
    stat_test(values_a, values_b)

===== RandomSpec | GreedySpec =====
===== Avg_Satisfaction =====
RandomSpec 0.7499277144595982 0.0061448907809963415
GreedySpec 0.7528846669979393 0.006473899260272004
LeveneResult(statistic=0.3904411943880018, pvalue=0.5550701001481236)
ShapiroResult(statistic=0.9911810388865305, pvalue=0.963439970011198)
TtestResult(statistic=-1.2760497382305056, pvalue=0.291767552578006, df=3)
WilcoxonResult(statistic=2.0, pvalue=0.375)
===== Hit Ratio =====
RandomSpec 0.14941229179129006 0.014887689857188002
GreedySpec 0.2243195896372199 0.008464534463789105
LeveneResult(statistic=0.28835510099437506, pvalue=0.6105952422334183)
ShapiroResult(statistic=0.9581723370662022, pvalue=0.7674069261529004)
TtestResult(statistic=-13.368553971859942, pvalue=0.0009047690690808592, df=3)
WilcoxonResult(statistic=0.0, pvalue=0.125)
===== Diverse Score =====
RandomSpec 0.3263888888888889 0.004219662967057381
GreedySpec 0.3104166666666667 0.003719630109049562
LeveneResult(statistic=0.007989564650251298, pvalue=0.9

In [52]:
# Paired, per-seed comparison of two algorithms on three metrics.
# NOTE(review): the printed labels ("RandomSpec" / "WPSpec") do not match the
# alg_name keys that are selected ("SlateWP_Spec" / "Greedy_Spec"). Labels and
# keys are kept exactly as they were; confirm which side is intended before
# quoting these numbers.
label_a, key_a = "RandomSpec", "SlateWP_Spec"
label_b, key_b = "WPSpec", "Greedy_Spec"
# Section title -> final_df column holding that metric.
metric_columns = {
    "Avg_Satisfaction": "ep_avg_satisfaction",
    "Hit Ratio": "hit_documents",
    "Diverse Score": "diverse_score",
}
print("===== RandomSpec | SlateWPSpec =====")
# One mean per (alg_name, seed), computed once for all metrics; groupby sorts
# the seed level, so both value arrays are aligned seed by seed.
per_seed_mean = final_df.groupby(["alg_name", "seed"])[list(metric_columns.values())].mean()
for title, column in metric_columns.items():
    print(f"===== {title} =====")
    values_a = per_seed_mean[column][key_a].values
    values_b = per_seed_mean[column][key_b].values
    # ndarray.std() uses ddof=0, i.e. the population std across seeds.
    print(label_a, values_a.mean(), values_a.std())
    print(label_b, values_b.mean(), values_b.std())
    stat_test(values_a, values_b)

===== RandomSpec | SlateWPSpec =====
===== Avg_Satisfaction =====
RandomSpec 0.7499277144595982 0.0061448907809963415
WPSpec 0.7565885217897593 0.006296898786224263
LeveneResult(statistic=5.23369302808821e-06, pvalue=0.9982488258889094)
ShapiroResult(statistic=0.8549882132573158, pvalue=0.24272782224038153)
TtestResult(statistic=-3.6481069977451455, pvalue=0.035539368837945416, df=3)
WilcoxonResult(statistic=0.0, pvalue=0.125)
===== Hit Ratio =====
RandomSpec 0.14941229179129006 0.014887689857188002
WPSpec 0.23476405919343235 0.006925620970305998
LeveneResult(statistic=0.5333637593236832, pvalue=0.49271440945739625)
ShapiroResult(statistic=0.7381132171101031, pvalue=0.02970379368816965)
TtestResult(statistic=-7.787490140893448, pvalue=0.0044063634830143646, df=3)
WilcoxonResult(statistic=0.0, pvalue=0.125)
===== Diverse Score =====
RandomSpec 0.3263888888888889 0.004219662967057381
WPSpec 0.3391388888888889 0.003510456162542967
LeveneResult(statistic=0.0014516909511255998, pvalue=0.970