In [1]:
import os 
from pathlib import Path
from dotenv import load_dotenv
import pickle
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
import matplotlib.ticker as mtick

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load key=value pairs from a .env file into the process environment;
# TEST_PATH is read back from os.environ when base_path is built below.
load_dotenv()

True

In [3]:
import io
class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that deserialises pickled torch storages onto the CPU.

    Every lookup except ``torch.storage._load_from_bytes`` is delegated to
    ``pickle.Unpickler`` unchanged.  That one loader is swapped for a version
    that calls ``torch.load`` with ``map_location='cpu'``.
    """

    def find_class(self, module, name):
        """Resolve ``module.name`` while unpickling, intercepting the torch storage loader."""
        is_storage_loader = (module, name) == ('torch.storage', '_load_from_bytes')
        if not is_storage_loader:
            return super().find_class(module, name)

        def load_on_cpu(b):
            # Same raw bytes torch would have loaded, but remapped to CPU.
            return torch.load(io.BytesIO(b), map_location='cpu')

        return load_on_cpu

In [4]:
# Root directory containing one "<algorithm>_<seed>" log folder per run.
# TEST_PATH comes from the environment (populated by load_dotenv above) and is
# joined onto the user's home directory.
test_path = os.environ.get("TEST_PATH")
if test_path is None:
    # Path(None) would raise an opaque TypeError; fail with an actionable message instead.
    raise RuntimeError(
        "Environment variable TEST_PATH is not set; define it in the environment or in the .env file."
    )
base_path = Path.home() / Path(test_path)

In [5]:
# Seeds of the independent runs; the loading cells read one log folder per seed.
SEEDS = [5, 42, 97, 33, 99]
# NOTE(review): not referenced in the cells visible here -- presumably a
# smoothing window used by later plotting cells; confirm.
window = 100

In [6]:
def parse_data(data_dict: dict, alg_nam:str, seed: int)->pd.DataFrame:
    """Convert one run's log dictionary into a per-episode DataFrame.

    Args:
        data_dict: Mapping of metric name to logged value(s).  Values may be
            single-element ``torch.Tensor`` objects, lists (whose elements may
            be such tensors), or anything else ``pd.DataFrame`` accepts.  The
            mapping itself is left untouched.
        alg_nam: Label stored in the ``alg_name`` column of every row.
        seed: Seed stored in the ``seed`` column of every row.

    Returns:
        A DataFrame with an ``episode`` column (the former row index), one
        column per key of ``data_dict``, plus ``alg_name`` and ``seed``.
    """
    def _as_python(value):
        # .item() unwraps a single-element tensor into a plain Python number.
        return value.item() if isinstance(value, torch.Tensor) else value

    # Build a converted copy instead of overwriting the caller's dictionary.
    converted = {}
    for key, value in data_dict.items():
        if isinstance(value, list):
            # Element-wise check: safe for empty lists and for tensors that
            # are not in first position.
            converted[key] = [_as_python(item) for item in value]
        else:
            converted[key] = _as_python(value)

    data_df = pd.DataFrame(converted)
    data_df["alg_name"] = alg_nam
    data_df["seed"] = seed
    # The positional row index is the episode number.
    data_df = data_df.reset_index().rename(columns={"index": "episode"})
    return data_df

In [7]:
# Load the per-seed logs of the "slateq_generalist" runs, tagged "SlateQ_Gen".
# This cell (re)creates final_df; the following cells append to it.
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"slateq_generalist_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    # NOTE: unpickling executes arbitrary code -- only load trusted run logs.
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "SlateQ_Gen", seed)
    res_df_list.append(res_df)
# concat all results
final_df = pd.concat(res_df_list)



100%|██████████| 5/5 [00:10<00:00,  2.09s/it]


In [8]:
# Load the per-seed logs of the "slateq_specialist" runs, tagged "SlateQ_Spec".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"slateq_specialist_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "SlateQ_Spec", seed)
    res_df_list.append(res_df)
# concat all results
slateq_spec_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, slateq_spec_df])

100%|██████████| 5/5 [00:07<00:00,  1.59s/it]


In [9]:
# Load the per-seed logs of the "slateq_coldstart" runs, tagged "SlateQ_ColdStart".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"slateq_coldstart_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "SlateQ_ColdStart", seed)
    res_df_list.append(res_df)
# concat all results
slateq_cs_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, slateq_cs_df])

100%|██████████| 5/5 [00:08<00:00,  1.60s/it]


In [10]:
# Load the per-seed logs of the "wp_slate_specialist" runs, tagged "SlateWP_Spec".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"wp_slate_specialist_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "SlateWP_Spec", seed)
    res_df_list.append(res_df)
# concat all results
slatewp_spec_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, slatewp_spec_df])

100%|██████████| 5/5 [00:07<00:00,  1.58s/it]


In [11]:
# Load the per-seed logs of the "wp_slate_generalist" runs, tagged "SlateWP_Gen".
# `os` is already imported in the first cell, so the repeated import was dropped.
# NOTE(review): CUDA_LAUNCH_BLOCKING looks like leftover debugging -- the logs
# are unpickled through CPU_Unpickler below; confirm whether it is still needed.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"wp_slate_generalist_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = CPU_Unpickler(log_file).load()
    res_df = parse_data(slateq_dict, "SlateWP_Gen", seed)
    res_df_list.append(res_df)
# concat all results
slatewp_gen_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, slatewp_gen_df])

100%|██████████| 5/5 [00:00<00:00, 12.13it/s]


In [12]:
# Load the per-seed logs of the "wp_slate_coldstart" runs, tagged "SlateWP_ColdStart".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"wp_slate_coldstart_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "SlateWP_ColdStart", seed)
    res_df_list.append(res_df)
# concat all results
slatewp_cs_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, slatewp_cs_df])

100%|██████████| 5/5 [00:07<00:00,  1.58s/it]


In [13]:
# Load the per-seed logs of the "greedy_coldstart" runs, tagged "Greedy_ColdStart".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"greedy_coldstart_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "Greedy_ColdStart", seed)
    res_df_list.append(res_df)
# concat all results
greedy_cs_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, greedy_cs_df])

100%|██████████| 5/5 [00:00<00:00, 18.51it/s]


In [14]:
# Load the per-seed logs of the "greedy_specialist" runs, tagged "Greedy_Spec".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"greedy_specialist_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "Greedy_Spec", seed)
    res_df_list.append(res_df)
# concat all results
greedy_spec_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, greedy_spec_df])

100%|██████████| 5/5 [00:00<00:00, 12.30it/s]


In [15]:
# Load the per-seed logs of the "greedy_generalist" runs, tagged "Greedy_Gen".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"greedy_generalist_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "Greedy_Gen", seed)
    res_df_list.append(res_df)
# concat all results
greedy_gen_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, greedy_gen_df])

100%|██████████| 5/5 [00:00<00:00, 18.29it/s]


In [16]:
# Load the per-seed logs of the "random_generalist" runs, tagged "Random_Gen".
# `os` is already imported in the first cell, so the repeated import was dropped.
# NOTE(review): CUDA_LAUNCH_BLOCKING looks like leftover debugging -- the logs
# are unpickled through CPU_Unpickler below; confirm whether it is still needed.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"random_generalist_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = CPU_Unpickler(log_file).load()
    res_df = parse_data(slateq_dict, "Random_Gen", seed)
    res_df_list.append(res_df)
# concat all results
random_gen_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, random_gen_df])

100%|██████████| 5/5 [00:00<00:00, 13.31it/s]


In [17]:
# Load the per-seed logs of the "random_specialist" runs, tagged "Random_Spec".
# `os` is already imported in the first cell, so the repeated import was dropped.
# NOTE(review): CUDA_LAUNCH_BLOCKING looks like leftover debugging -- the logs
# are unpickled through CPU_Unpickler below; confirm whether it is still needed.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"random_specialist_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = CPU_Unpickler(log_file).load()
    res_df = parse_data(slateq_dict, "Random_Spec", seed)
    res_df_list.append(res_df)
# concat all results
random_spec_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, random_spec_df])

100%|██████████| 5/5 [00:00<00:00, 18.13it/s]


In [18]:

# Load the per-seed logs of the "random_coldstart" runs, tagged "Random_ColdStart".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"random_coldstart_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = CPU_Unpickler(log_file).load()
    res_df = parse_data(slateq_dict, "Random_ColdStart", seed)
    res_df_list.append(res_df)
# concat all results
random_cs_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, random_cs_df])

100%|██████████| 5/5 [00:00<00:00, 18.13it/s]


In [19]:
# Load the per-seed logs of the "knn_state_coldstart" runs, tagged "KNN_State_ColdStart".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"knn_state_coldstart_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "KNN_State_ColdStart", seed)
    res_df_list.append(res_df)
# concat all results
knn_cs_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, knn_cs_df])

100%|██████████| 5/5 [00:08<00:00,  1.62s/it]


In [20]:
# Load the per-seed logs of the "knn_state_generalist" runs, tagged "KNN_State_Gen".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"knn_state_generalist_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = CPU_Unpickler(log_file).load()
    res_df = parse_data(slateq_dict, "KNN_State_Gen", seed)
    res_df_list.append(res_df)
# concat all results
knn_gen_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, knn_gen_df])

100%|██████████| 5/5 [00:00<00:00, 18.13it/s]


In [21]:
# Load the per-seed logs of the "knn_state_specialist" runs, tagged "KNN_State_Spec".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"knn_state_specialist_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = CPU_Unpickler(log_file).load()
    res_df = parse_data(slateq_dict, "KNN_State_Spec", seed)
    res_df_list.append(res_df)
# concat all results
knn_spec_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, knn_spec_df])

100%|██████████| 5/5 [00:00<00:00, 18.15it/s]


In [22]:
# Load the per-seed logs of the "random_slateq_coldstart" runs, tagged "Random_Slateq_ColdStart".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"random_slateq_coldstart_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "Random_Slateq_ColdStart", seed)
    res_df_list.append(res_df)
# concat all results
random_slateq_cs_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, random_slateq_cs_df])

100%|██████████| 5/5 [00:00<00:00, 12.76it/s]


In [23]:
# Load the per-seed logs of the "random_slateq_generalist" runs, tagged "Random_Slateq_Gen".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"random_slateq_generalist_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "Random_Slateq_Gen", seed)
    res_df_list.append(res_df)
# concat all results
random_slateq_gen_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, random_slateq_gen_df])

100%|██████████| 5/5 [00:00<00:00, 18.27it/s]


In [24]:
# Load the per-seed logs of the "random_slateq_specialist" runs, tagged "Random_Slateq_Spec".
res_df_list = []
for seed in tqdm(SEEDS):
    log_path = base_path / Path(f"random_slateq_specialist_{seed}") / "logs_dict.pickle"
    # Context manager closes the handle (the bare open() leaked it).
    with open(log_path, "rb") as log_file:
        slateq_dict = pickle.load(log_file)
    res_df = parse_data(slateq_dict, "Random_Slateq_Spec", seed)
    res_df_list.append(res_df)
# concat all results
random_slateq_spec_df = pd.concat(res_df_list)
# Appends to final_df: run this cell once per kernel session.
final_df = pd.concat([final_df, random_slateq_spec_df])

100%|██████████| 5/5 [00:00<00:00, 13.32it/s]


In [25]:
# Display the concatenated results of all loaded runs. At this point the row
# index still restarts for every run, because pd.concat kept each frame's own index.
final_df

Unnamed: 0,episode,hit_documents,ep_cum_satisfaction,ep_avg_satisfaction,diverse_score,user_satisfaction,relevance,alg_name,seed
0,0,0.000000,0.923530,0.461765,6,0.815647,0.748012,SlateQ_Gen,5
1,1,0.000000,0.782511,0.260837,7,0.970378,0.948471,SlateQ_Gen,5
2,2,0.000000,1.326244,0.331561,8,0.578357,0.579343,SlateQ_Gen,5
3,3,0.333333,0.887048,0.295683,6,0.992782,1.000000,SlateQ_Gen,5
4,4,0.000000,0.499515,0.249757,7,0.905194,0.901347,SlateQ_Gen,5
...,...,...,...,...,...,...,...,...,...
295,295,0.000000,0.551647,0.551647,6,0.842924,0.842924,Random_Slateq_Spec,99
296,296,0.333333,1.693517,0.564506,5,0.961884,1.000000,Random_Slateq_Spec,99
297,297,0.000000,1.277138,0.638569,6,0.844704,0.826997,Random_Slateq_Spec,99
298,298,0.000000,1.061631,0.530815,7,0.953088,0.954769,Random_Slateq_Spec,99


In [26]:
# Replace the per-run indices carried over by pd.concat with one unique
# 0..N-1 index; the episode number stays available in the "episode" column.
final_df = final_df.reset_index(drop=True)

In [27]:
# Rescale diverse_score by 1/10, in place.
# WARNING: not idempotent -- re-running this cell divides again, so execute it
# exactly once per kernel session.
# NOTE(review): 10 is presumably the maximum possible score (e.g. slate/topic count); confirm.
final_df['diverse_score'] /= 10

In [28]:
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats import ttest_rel
from scipy.stats import wilcoxon
def stat_test(r1,r2):
    """Print four statistical checks comparing two paired samples.

    In order: Levene's test on ``r1`` vs ``r2``, Shapiro-Wilk on the
    element-wise differences ``r1 - r2``, the paired t-test and the Wilcoxon
    signed-rank test.  Results are only printed; nothing is returned.
    """
    variance_check = levene(r1, r2)
    print(variance_check)

    # Pairwise differences, matched by position.
    paired_gaps = []
    for left, right in zip(r1, r2):
        paired_gaps.append(left - right)
    normality_check = shapiro(paired_gaps)
    print(normality_check)

    for paired_test in (ttest_rel, wilcoxon):
        print(paired_test(r1, r2))

**AVERAGE SATISFACTION GENERALISTS**

In [29]:

print("===== AVG SAT GEN =====")
# Mean ep_avg_satisfaction per (algorithm, seed) run, computed once and reused
# instead of repeating the same full-frame groupby for every algorithm.
seed_means = final_df.groupby(["alg_name", "seed"])["ep_avg_satisfaction"].mean()
random_return = seed_means["Random_Gen"].values
slateq_return = seed_means["SlateQ_Gen"].values
greedy_return = seed_means["Greedy_Gen"].values
knn_state_return = seed_means["KNN_State_Gen"].values
slatewp_return = seed_means["SlateWP_Gen"].values
random_slateq_return = seed_means["Random_Slateq_Gen"].values
print("===== Avg_Satisfaction =====")
# Across-seed mean and spread (ndarray.std defaults to ddof=0).
for label, returns in (
    ("RandomGen", random_return),
    ("slateqGen", slateq_return),
    ("greedyGen", greedy_return),
    ("knnGen", knn_state_return),
    ("slatewpGen", slatewp_return),
    ("random_slateqGen", random_slateq_return),
):
    print(label, returns.mean(), returns.std())
# Paired tests of every method against the random baseline; groupby sorts the
# seeds, so position i refers to the same seed in every array.
for challenger in (slateq_return, greedy_return, knn_state_return, slatewp_return, random_slateq_return):
    stat_test(random_return, challenger)

===== AVG SAT GEN =====
===== Avg_Satisfaction =====
RandomGen 0.32887493156859027 0.005853234013571824
slateqGen 0.35561865602588905 0.0030959157696911826
greedyGen 0.3284749030812332 0.010159205854886427
knnGen 0.3341606241191427 0.0014186384741497742
slatewpGen 0.34916605462258055 0.003723806996830134
random_slateqGen 0.3362783153094739 0.002044484391533123
LeveneResult(statistic=0.9063845169781569, pvalue=0.36894973427026384)
ShapiroResult(statistic=0.9884162266403835, pvalue=0.9738561959087487)
TtestResult(statistic=-6.08263016295854, pvalue=0.0036924902217428117, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=1.548663978915369, pvalue=0.24855423678475147)
ShapiroResult(statistic=0.9611235698153668, pvalue=0.8157850907873458)
TtestResult(statistic=0.11207190672929966, pvalue=0.9161652903087776, df=4)
WilcoxonResult(statistic=6.0, pvalue=0.8125)
LeveneResult(statistic=3.415981501361039, pvalue=0.10175149367139241)
ShapiroResult(statistic=0.923467882867942

**HIT RATIO GENERALISTS**

In [30]:

print("===== HIT RATIO GEN =====")
# Mean hit_documents per (algorithm, seed) run, computed once and reused
# instead of repeating the same full-frame groupby for every algorithm.
seed_means = final_df.groupby(["alg_name", "seed"])["hit_documents"].mean()
random_return = seed_means["Random_Gen"].values
slateq_return = seed_means["SlateQ_Gen"].values
greedy_return = seed_means["Greedy_Gen"].values
knn_state_return = seed_means["KNN_State_Gen"].values
slatewp_return = seed_means["SlateWP_Gen"].values
random_slateq_return = seed_means["Random_Slateq_Gen"].values
# Header previously said "Avg_Satisfaction" although this cell reports hit_documents.
print("===== Hit_Ratio =====")
# Across-seed mean and spread (ndarray.std defaults to ddof=0).
for label, returns in (
    ("RandomGen", random_return),
    ("slateqGen", slateq_return),
    ("greedyGen", greedy_return),
    ("knnGen", knn_state_return),
    ("slatewpGen", slatewp_return),
    ("random_slateqGen", random_slateq_return),
):
    print(label, returns.mean(), returns.std())
# Paired tests of every method against the random baseline; groupby sorts the
# seeds, so position i refers to the same seed in every array.
for challenger in (slateq_return, greedy_return, knn_state_return, slatewp_return, random_slateq_return):
    stat_test(random_return, challenger)

===== HIT RATIO GEN =====
===== Avg_Satisfaction =====
RandomGen 0.10170098266998928 0.008119791252449866
slateqGen 0.15747886919975282 0.012705131531934042
greedyGen 0.12084461682538192 0.00843216457869964
knnGen 0.08235783718029657 0.007533764722598866
slatewpGen 0.06076453433434169 0.0031193768111913395
random_slateqGen 0.09167844729622206 0.003274241126504869
LeveneResult(statistic=0.14258691249993335, pvalue=0.7155438951512851)
ShapiroResult(statistic=0.770178154615992, pvalue=0.04528005686242472)
TtestResult(statistic=-5.945017701667561, pvalue=0.004015671831969886, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=0.006887857926109945, pvalue=0.9358960615190215)
ShapiroResult(statistic=0.9521262636598653, pvalue=0.7523672003436188)
TtestResult(statistic=-3.7505797579384472, pvalue=0.019937047290932385, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=0.012068213929735675, pvalue=0.9152298047641767)
ShapiroResult(statistic=0.907471

**SLATE DIVERSITY GEN**


In [31]:
print("===== SLATE DIVERSITY GEN =====")
# Mean diverse_score per (algorithm, seed) run, computed once and reused
# instead of repeating the same full-frame groupby for every algorithm.
seed_means = final_df.groupby(["alg_name", "seed"])["diverse_score"].mean()
random_return = seed_means["Random_Gen"].values
slateq_return = seed_means["SlateQ_Gen"].values
greedy_return = seed_means["Greedy_Gen"].values
knn_state_return = seed_means["KNN_State_Gen"].values
slatewp_return = seed_means["SlateWP_Gen"].values
random_slateq_return = seed_means["Random_Slateq_Gen"].values

# Across-seed mean and spread (ndarray.std defaults to ddof=0).
for label, returns in (
    ("RandomGen", random_return),
    ("slateqGen", slateq_return),
    ("greedyGen", greedy_return),
    ("knnGen", knn_state_return),
    ("slatewpGen", slatewp_return),
    ("random_slateqGen", random_slateq_return),
):
    print(label, returns.mean(), returns.std())
# Paired tests of every method against the random baseline; groupby sorts the
# seeds, so position i refers to the same seed in every array.
for challenger in (slateq_return, greedy_return, knn_state_return, slatewp_return, random_slateq_return):
    stat_test(random_return, challenger)

===== SLATE DIVERSITY GEN =====
RandomGen 0.5354666666666666 0.0040474957964427944
slateqGen 0.6038666666666667 0.005397941994665923
greedyGen 0.5481333333333334 0.006298500703430229
knnGen 0.6073333333333333 0.007697041278598178
slatewpGen 0.6006666666666667 0.008374564665302498
random_slateqGen 0.6047999999999999 0.006270211763214679
LeveneResult(statistic=0.04883931263190023, pvalue=0.8306315265779509)
ShapiroResult(statistic=0.9382705844121709, pvalue=0.6537384603570969)
TtestResult(statistic=-25.722446401927943, pvalue=1.3568720031981161e-05, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=0.8816513761467953, pvalue=0.375229786792289)
ShapiroResult(statistic=0.7992443210473872, pvalue=0.07989224212882653)
TtestResult(statistic=-4.8733971724045055, pvalue=0.008199561752838178, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=1.2680025046963093, pvalue=0.2927851562904429)
ShapiroResult(statistic=0.8281799669940906, pvalue=0.13480588

**AVG SATISFACTION SPEC**

In [32]:
# Title typo ("SATISFCATION") fixed and the user group added.
print("===== AVG SATISFACTION SPEC =====")
# Mean ep_avg_satisfaction per (algorithm, seed) run, computed once and reused
# instead of repeating the same full-frame groupby for every algorithm.
seed_means = final_df.groupby(["alg_name", "seed"])["ep_avg_satisfaction"].mean()
random_return = seed_means["Random_Spec"].values
slateq_return = seed_means["SlateQ_Spec"].values
greedy_return = seed_means["Greedy_Spec"].values
knn_state_return = seed_means["KNN_State_Spec"].values
slatewp_return = seed_means["SlateWP_Spec"].values
random_slateq_return = seed_means["Random_Slateq_Spec"].values

# Across-seed mean and spread (ndarray.std defaults to ddof=0).  The labels
# previously read "...Gen" although the arrays hold the *_Spec runs.
for label, returns in (
    ("RandomSpec", random_return),
    ("slateqSpec", slateq_return),
    ("greedySpec", greedy_return),
    ("knnSpec", knn_state_return),
    ("slatewpSpec", slatewp_return),
    ("random_slateqSpec", random_slateq_return),
):
    print(label, returns.mean(), returns.std())
# Paired tests of every method against the random baseline; groupby sorts the
# seeds, so position i refers to the same seed in every array.
for challenger in (slateq_return, greedy_return, knn_state_return, slatewp_return, random_slateq_return):
    stat_test(random_return, challenger)

===== AVG SATISFCATION =====
RandomGen 0.5669099953894814 0.0037959185091998686
slateqGen 0.5806364796310663 0.0024425011839549495
greedyGen 0.5777852992977326 0.009963698388225774
knnGen 0.5772698127975067 0.0027001458308901667
slatewpGen 0.5729282042061289 0.0015463723917006394
random_slateqGen 0.574853402433296 0.0024794521154702875
LeveneResult(statistic=0.31307872419148014, pvalue=0.5911138231072657)
ShapiroResult(statistic=0.9920795490171023, pvalue=0.9864457324669232)
TtestResult(statistic=-8.894979895572439, pvalue=0.0008827467800670871, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=2.793558007138787, pvalue=0.1331875881260844)
ShapiroResult(statistic=0.761873716733807, pvalue=0.038196272393642566)
TtestResult(statistic=-2.2447240964910806, pvalue=0.08815899664757679, df=4)
WilcoxonResult(statistic=1.0, pvalue=0.125)
LeveneResult(statistic=0.34554343433899426, pvalue=0.5728616056054909)
ShapiroResult(statistic=0.9737773047517181, pvalue=0.89889705472

**HIT RATIO SPEC**


In [33]:
print("===== HIT RATIO SPEC =====")
# Mean hit_documents per (algorithm, seed) run, computed once and reused
# instead of repeating the same full-frame groupby for every algorithm.
seed_means = final_df.groupby(["alg_name", "seed"])["hit_documents"].mean()
random_return = seed_means["Random_Spec"].values
slateq_return = seed_means["SlateQ_Spec"].values
greedy_return = seed_means["Greedy_Spec"].values
knn_state_return = seed_means["KNN_State_Spec"].values
slatewp_return = seed_means["SlateWP_Spec"].values
random_slateq_return = seed_means["Random_Slateq_Spec"].values

# Across-seed mean and spread (ndarray.std defaults to ddof=0).  The labels
# previously read "...Gen" although the arrays hold the *_Spec runs.
for label, returns in (
    ("RandomSpec", random_return),
    ("slateqSpec", slateq_return),
    ("greedySpec", greedy_return),
    ("knnSpec", knn_state_return),
    ("slatewpSpec", slatewp_return),
    ("random_slateqSpec", random_slateq_return),
):
    print(label, returns.mean(), returns.std())
# Paired tests of every method against the random baseline; groupby sorts the
# seeds, so position i refers to the same seed in every array.
for challenger in (slateq_return, greedy_return, knn_state_return, slatewp_return, random_slateq_return):
    stat_test(random_return, challenger)

===== HIT RATIO =====
RandomGen 0.08581905553738275 0.007309226934707466
slateqGen 0.13481849576036137 0.011277529875169385
greedyGen 0.11510984703401725 0.015833434994131527
knnGen 0.08263938049475353 0.006698719103107463
slatewpGen 0.06151506067812442 0.0060452004541142095
random_slateqGen 0.09431745174527167 0.0038077525077763095
LeveneResult(statistic=0.5351489468480437, pvalue=0.48532391283212983)
ShapiroResult(statistic=0.8374417486622433, pvalue=0.15797378061464157)
TtestResult(statistic=-6.772035168647467, pvalue=0.0024810687838333146, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=1.8673417279379616, pvalue=0.20894855675878066)
ShapiroResult(statistic=0.95542507202071, pvalue=0.7758222274349508)
TtestResult(statistic=-2.5953057347853727, pvalue=0.06034562745507531, df=4)
WilcoxonResult(statistic=1.0, pvalue=0.125)
LeveneResult(statistic=0.009935300068996804, pvalue=0.9230541091720963)
ShapiroResult(statistic=0.947677918740116, pvalue=0.72059294561563

**SLATE DIVERSITY SPEC**

In [34]:
print("===== SLATE DIVERSITY SPEC =====")
# Mean diverse_score per (algorithm, seed) run, computed once and reused
# instead of repeating the same full-frame groupby for every algorithm.
seed_means = final_df.groupby(["alg_name", "seed"])["diverse_score"].mean()
random_return = seed_means["Random_Spec"].values
slateq_return = seed_means["SlateQ_Spec"].values
greedy_return = seed_means["Greedy_Spec"].values
knn_state_return = seed_means["KNN_State_Spec"].values
slatewp_return = seed_means["SlateWP_Spec"].values
random_slateq_return = seed_means["Random_Slateq_Spec"].values

# Across-seed mean and spread (ndarray.std defaults to ddof=0).  The labels
# previously read "...Gen" although the arrays hold the *_Spec runs.
for label, returns in (
    ("RandomSpec", random_return),
    ("slateqSpec", slateq_return),
    ("greedySpec", greedy_return),
    ("knnSpec", knn_state_return),
    ("slatewpSpec", slatewp_return),
    ("random_slateqSpec", random_slateq_return),
):
    print(label, returns.mean(), returns.std())
# Paired tests of every method against the random baseline; groupby sorts the
# seeds, so position i refers to the same seed in every array.
for challenger in (slateq_return, greedy_return, knn_state_return, slatewp_return, random_slateq_return):
    stat_test(random_return, challenger)

===== SLATE DIVERSITY SPEC =====
RandomGen 0.5314 0.0051570878948836735
slateqGen 0.6025333333333334 0.007392488680335561
greedyGen 0.6295333333333334 0.01021458652015722
knnGen 0.5767333333333333 0.004841487374764088
slatewpGen 0.5971333333333334 0.005335832747670327
random_slateqGen 0.6048666666666666 0.006605048574133741
LeveneResult(statistic=0.38897058823530917, pvalue=0.5502056470235659)
ShapiroResult(statistic=0.8498774794091558, pvalue=0.19413155973548385)
TtestResult(statistic=-12.776839752826172, pvalue=0.00021623540351174253, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=0.537952430196474, pvalue=0.48421881616341245)
ShapiroResult(statistic=0.8480768505466019, pvalue=0.18851581748191992)
TtestResult(statistic=-13.127169168687152, pvalue=0.00019446856082605974, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=0.03634528016153464, pvalue=0.853553321657925)
ShapiroResult(statistic=0.8753763941789751, pvalue=0.2888816172552506

**AVG SATISFACTION COLDSTART**

In [35]:
# Title typo ("SATISFCATION") fixed and the user group added.
print("===== AVG SATISFACTION COLD START =====")
# Mean ep_avg_satisfaction per (algorithm, seed) run, computed once and reused
# instead of repeating the same full-frame groupby for every algorithm.
seed_means = final_df.groupby(["alg_name", "seed"])["ep_avg_satisfaction"].mean()
random_return = seed_means["Random_ColdStart"].values
slateq_return = seed_means["SlateQ_ColdStart"].values
greedy_return = seed_means["Greedy_ColdStart"].values
knn_state_return = seed_means["KNN_State_ColdStart"].values
slatewp_return = seed_means["SlateWP_ColdStart"].values
random_slateq_return = seed_means["Random_Slateq_ColdStart"].values

# Across-seed mean and spread (ndarray.std defaults to ddof=0).  The labels
# previously read "...Gen" although the arrays hold the *_ColdStart runs.
for label, returns in (
    ("RandomColdStart", random_return),
    ("slateqColdStart", slateq_return),
    ("greedyColdStart", greedy_return),
    ("knnColdStart", knn_state_return),
    ("slatewpColdStart", slatewp_return),
    ("random_slateqColdStart", random_slateq_return),
):
    print(label, returns.mean(), returns.std())
# Paired tests of every method against the random baseline; groupby sorts the
# seeds, so position i refers to the same seed in every array.
for challenger in (slateq_return, greedy_return, knn_state_return, slatewp_return, random_slateq_return):
    stat_test(random_return, challenger)

===== AVG SATISFCATION =====
RandomGen 0.8557303752005101 0.00487806586791665
slateqGen 0.8584959629600247 0.01045361789092417
greedyGen 0.8660079503854116 0.009998290941095866
knnGen 0.8767388913333416 0.004268795729126884
slatewpGen 0.8471174865836899 0.005434518021786598
random_slateqGen 0.8615151694541175 0.0020574838431131493
LeveneResult(statistic=0.825178347635458, pvalue=0.39021463609652807)
ShapiroResult(statistic=0.9459592207308927, pvalue=0.708314361396337)
TtestResult(statistic=-0.8410984603586463, pvalue=0.44763524820963946, df=4)
WilcoxonResult(statistic=4.0, pvalue=0.4375)
LeveneResult(statistic=2.346457629737757, pvalue=0.16410411637205283)
ShapiroResult(statistic=0.9155754077019369, pvalue=0.5018082573840095)
TtestResult(statistic=-2.0095611310991393, pvalue=0.11485642958417672, df=4)
WilcoxonResult(statistic=1.0, pvalue=0.125)
LeveneResult(statistic=0.17709783711080934, pvalue=0.6849567666799278)
ShapiroResult(statistic=0.8870793901801273, pvalue=0.34263038842112703)


**HIT RATIO COLD START**

In [36]:
print("===== HIT RATIO COLD START =====")
# Mean hit_documents per (algorithm, seed) run, computed once and reused
# instead of repeating the same full-frame groupby for every algorithm.
seed_means = final_df.groupby(["alg_name", "seed"])["hit_documents"].mean()
random_return = seed_means["Random_ColdStart"].values
slateq_return = seed_means["SlateQ_ColdStart"].values
greedy_return = seed_means["Greedy_ColdStart"].values
knn_state_return = seed_means["KNN_State_ColdStart"].values
slatewp_return = seed_means["SlateWP_ColdStart"].values
random_slateq_return = seed_means["Random_Slateq_ColdStart"].values

# Across-seed mean and spread (ndarray.std defaults to ddof=0).  The labels
# previously read "...Gen" although the arrays hold the *_ColdStart runs.
for label, returns in (
    ("RandomColdStart", random_return),
    ("slateqColdStart", slateq_return),
    ("greedyColdStart", greedy_return),
    ("knnColdStart", knn_state_return),
    ("slatewpColdStart", slatewp_return),
    ("random_slateqColdStart", random_slateq_return),
):
    print(label, returns.mean(), returns.std())
# Paired tests of every method against the random baseline; groupby sorts the
# seeds, so position i refers to the same seed in every array.
for challenger in (slateq_return, greedy_return, knn_state_return, slatewp_return, random_slateq_return):
    stat_test(random_return, challenger)

===== HIT RATIO COLD START =====
RandomGen 0.08227486854791641 0.016416935066314177
slateqGen 0.12572010670105613 0.009917870760881093
greedyGen 0.1281433873027563 0.01344049687695003
knnGen 0.06048201108972231 0.008246634056148833
slatewpGen 0.04715555592874685 0.005203955458622988
random_slateqGen 0.08158412778377533 0.003705757639931202
LeveneResult(statistic=0.8449445002096807, pvalue=0.38486391855593216)
ShapiroResult(statistic=0.8606831300071035, pvalue=0.2307114424300316)
TtestResult(statistic=-3.399257863521548, pvalue=0.02729598253860911, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=0.13377466519922845, pvalue=0.7240323613485646)
ShapiroResult(statistic=0.8726905386083962, pvalue=0.2774898193674473)
TtestResult(statistic=-3.6686415390942435, pvalue=0.021416726612098837, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=1.4755017575628986, pvalue=0.25911329956675266)
ShapiroResult(statistic=0.9303020231358933, pvalue=0.598431

**SLATE DIVERSITY COLD START**

In [37]:
print("===== SLATE DIVERSITY COLD START =====")
# Mean diverse_score per (algorithm, seed) run, computed once and reused
# instead of repeating the same full-frame groupby for every algorithm.
seed_means = final_df.groupby(["alg_name", "seed"])["diverse_score"].mean()
random_return = seed_means["Random_ColdStart"].values
slateq_return = seed_means["SlateQ_ColdStart"].values
greedy_return = seed_means["Greedy_ColdStart"].values
knn_state_return = seed_means["KNN_State_ColdStart"].values
slatewp_return = seed_means["SlateWP_ColdStart"].values
random_slateq_return = seed_means["Random_Slateq_ColdStart"].values

# Across-seed mean and spread (ndarray.std defaults to ddof=0).  The labels
# previously read "...Gen" although the arrays hold the *_ColdStart runs.
for label, returns in (
    ("RandomColdStart", random_return),
    ("slateqColdStart", slateq_return),
    ("greedyColdStart", greedy_return),
    ("knnColdStart", knn_state_return),
    ("slatewpColdStart", slatewp_return),
    ("random_slateqColdStart", random_slateq_return),
):
    print(label, returns.mean(), returns.std())
# Paired tests of every method against the random baseline; groupby sorts the
# seeds, so position i refers to the same seed in every array.
for challenger in (slateq_return, greedy_return, knn_state_return, slatewp_return, random_slateq_return):
    stat_test(random_return, challenger)

===== SLATE DIVERSITY COLD START =====
RandomGen 0.5307333333333333 0.008736640595153809
slateqGen 0.5993333333333333 0.005838569078882991
greedyGen 0.6312 0.0026633312473917517
knnGen 0.5817999999999999 0.003930507459462332
slatewpGen 0.6047333333333332 0.009262589030911164
random_slateqGen 0.5908 0.009504618760254255
LeveneResult(statistic=0.30717151454364333, pvalue=0.594570542667686)
ShapiroResult(statistic=0.9536413858349718, pvalue=0.763158750560974)
TtestResult(statistic=-30.36336323154087, pvalue=7.008382565411367e-06, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=1.5287338893100881, pvalue=0.2513694788152565)
ShapiroResult(statistic=0.9841586000774194, pvalue=0.9555626789229458)
TtestResult(statistic=-22.38063963421729, pvalue=2.3599511147708045e-05, df=4)
WilcoxonResult(statistic=0.0, pvalue=0.0625)
LeveneResult(statistic=0.9278880479794385, pvalue=0.3636216484991674)
ShapiroResult(statistic=0.985942838389983, pvalue=0.9636579607226265)
TtestResult

In [38]:
# Re-display the combined frame after the index reset and the diverse_score rescaling.
final_df

Unnamed: 0,episode,hit_documents,ep_cum_satisfaction,ep_avg_satisfaction,diverse_score,user_satisfaction,relevance,alg_name,seed
0,0,0.000000,0.923530,0.461765,0.6,0.815647,0.748012,SlateQ_Gen,5
1,1,0.000000,0.782511,0.260837,0.7,0.970378,0.948471,SlateQ_Gen,5
2,2,0.000000,1.326244,0.331561,0.8,0.578357,0.579343,SlateQ_Gen,5
3,3,0.333333,0.887048,0.295683,0.6,0.992782,1.000000,SlateQ_Gen,5
4,4,0.000000,0.499515,0.249757,0.7,0.905194,0.901347,SlateQ_Gen,5
...,...,...,...,...,...,...,...,...,...
26995,295,0.000000,0.551647,0.551647,0.6,0.842924,0.842924,Random_Slateq_Spec,99
26996,296,0.333333,1.693517,0.564506,0.5,0.961884,1.000000,Random_Slateq_Spec,99
26997,297,0.000000,1.277138,0.638569,0.6,0.844704,0.826997,Random_Slateq_Spec,99
26998,298,0.000000,1.061631,0.530815,0.7,0.953088,0.954769,Random_Slateq_Spec,99


In [39]:
# Episode-to-episode spread of ep_avg_satisfaction within each Random_Gen seed.
is_random_gen = final_df["alg_name"] == "Random_Gen"
final_df.loc[is_random_gen].groupby(['seed', 'alg_name'])['ep_avg_satisfaction'].std()

seed  alg_name  
5     Random_Gen    0.153156
33    Random_Gen    0.137615
42    Random_Gen    0.157775
97    Random_Gen    0.125769
99    Random_Gen    0.118723
Name: ep_avg_satisfaction, dtype: float64

In [40]:
# Column dtypes of the combined frame.
final_df.dtypes

episode                  int64
hit_documents          float64
ep_cum_satisfaction    float64
ep_avg_satisfaction    float64
diverse_score          float64
user_satisfaction      float64
relevance              float64
alg_name                object
seed                     int64
dtype: object