In [87]:
import os
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind, mannwhitneyu

# Short aliases for the directory-listing and path-joining helpers used below.
osl = os.listdir
ospj = os.path.join

### get relevant runs data given filters of interest

In [88]:
import re

def natural_sort(l):
    """Return a new list with the strings of `l` in natural ("human") order.

    Each string is split into digit runs and non-digit runs. Digit runs are
    compared as integers and the remaining text is compared lower-cased, so
    "run2" sorts before "run10". The input sequence is not modified.
    """
    def _chunk(token):
        # Digit runs compare numerically; anything else case-insensitively.
        if token.isdigit():
            return int(token)
        return token.lower()

    def _sort_key(item):
        return [_chunk(token) for token in re.split('([0-9]+)', item)]

    return sorted(l, key=_sort_key)

In [97]:
def augment_expstring(base_str, hg=False, hgr=False):
    """Append the optimisation tag to an experiment-name prefix.

    The tag depends on which flags are truthy:
      * neither flag -> "NOOPT"
      * only `hg`    -> "OPT-HG"
      * only `hgr`   -> "OPT-HGR"
      * both flags   -> "OPT-HG-HGR"

    Returns `base_str` with the tag appended.
    """
    if hg and hgr:
        tag = "OPT-HG-HGR"
    elif hg:
        tag = "OPT-HG"
    elif hgr:
        tag = "OPT-HGR"
    else:
        tag = "NOOPT"
    return base_str + tag

def baseline_exp(exp_qualifier = "210h_10hp", hg=False, hgr=False):
    """Build the name filter for the "fcscout" baseline experiment.

    The result is "fcscout_<exp_qualifier>_", followed by the optimisation
    tag produced by `augment_expstring`, followed by a trailing underscore.
    """
    tagged_name = augment_expstring(f"fcscout_{exp_qualifier}_", hg=hg, hgr=hgr)
    return tagged_name + "_"

def gcn_exp(exp_qualifier = "210h_10hp", hg=False, hgr=False, is_global=False):
    """Build the name filter for a "gcnscout" experiment.

    The result is "gcnscout_<exp_qualifier>_", the optimisation tag from
    `augment_expstring`, then "_global" when `is_global` is truthy and
    "_local" otherwise.
    """
    scope_suffix = "_local"
    if is_global:
        scope_suffix = "_global"
    return augment_expstring(f"gcnscout_{exp_qualifier}_", hg=hg, hgr=hgr) + scope_suffix

def gat_exp(exp_qualifier = "210h_10hp", hg=False, hgr=False, is_global=False):
    """Build the name filter for a "gatscout" experiment.

    The result is "gatscout_<exp_qualifier>_", the optimisation tag from
    `augment_expstring`, then "_global" when `is_global` is truthy and
    "_local" otherwise.
    """
    scope_suffix = "_local"
    if is_global:
        scope_suffix = "_global"
    return augment_expstring(f"gatscout_{exp_qualifier}_", hg=hg, hgr=hgr) + scope_suffix

def gt_exp(exp_qualifier = "210h_10hp", hg=False, hgr=False, is_global=False):
    """Build the name filter for a "gtscout" experiment.

    The result is "gtscout_<exp_qualifier>_", the optimisation tag from
    `augment_expstring`, then "_global" when `is_global` is truthy and
    "_local" otherwise.
    """
    scope_suffix = "_local"
    if is_global:
        scope_suffix = "_global"
    return augment_expstring(f"gtscout_{exp_qualifier}_", hg=hg, hgr=hgr) + scope_suffix

def get_exps(exp_qualifier="210h_10hp", baseline_exp_string="210h_10hp", hg=False, hgr=False, is_global=False):
    """Return the four experiment name filters: baseline, GCN, GAT, GT (in that order).

    The baseline uses `baseline_exp_string` as its qualifier; the other three
    use `exp_qualifier` and the `is_global` flag. `hg` / `hgr` are forwarded
    to every builder.
    """
    opt_flags = dict(hg=hg, hgr=hgr)
    exps = [baseline_exp(exp_qualifier=baseline_exp_string, **opt_flags)]
    for build_name in (gcn_exp, gat_exp, gt_exp):
        exps.append(build_name(exp_qualifier=exp_qualifier, is_global=is_global, **opt_flags))
    return exps


# Optimisation flags shared by every experiment filter built below.
hg=False
hgr=False

# One substring filter per experiment, baseline first. get_exps() already
# returns [baseline, gcn, gat, gt] for these arguments, so the list is built
# once instead of being recomputed by hand with the same values.
runs_filters = get_exps(
    exp_qualifier="210h-10hp-400bs-1em4lr-eval",
    baseline_exp_string="210h-10hp-400bs-1em4lr-eval",
    is_global=True,
    hg=hg,
    hgr=hgr,
)
print(runs_filters)

# Directories whose direct children are individual run directories.
maindir = "/home/vchad/ray_results/"
scout_baselines = "/home/vchad/results_aaai/scout_baselines_210h-10hp-400bs-1em4lr/"
scout_gcns = "/home/vchad/results_aaai/scout_gcn_210h-10hp-400bs-1em4lr/"
runs_dirs = [maindir, scout_baselines, scout_gcns]

# Full path of every entry in every runs directory, flattened in directory
# order (a nested comprehension avoids the quadratic sum(lists, [])).
runs = [ospj(runs_dir, x) for runs_dir in runs_dirs for x in osl(runs_dir)]

# Runs with fewer logged rows than this are skipped by max_avg_reward() and
# last_ep_reward(). Other values tried previously: 150, 200.
tot_steps_len = 75
runs = natural_sort(runs)

['fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_', 'gcnscout_210h-10hp-400bs-1em4lr-eval_NOOPT_global', 'gatscout_210h-10hp-400bs-1em4lr-eval_NOOPT_global', 'gtscout_210h-10hp-400bs-1em4lr-eval_NOOPT_global']


In [98]:
# For every experiment filter, keep the run paths that contain it as a substring.
# Result: one list of run paths per entry of runs_filters, in the same order.
all_filters_runs = [
    [run_path for run_path in runs if name_filter in run_path]
    for name_filter in runs_filters
]

In [99]:
# Sanity check: show which run paths matched each filter.
print(all_filters_runs)

[['/home/vchad/results_aaai/scout_baselines_210h-10hp-400bs-1em4lr/fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_SEED0_wchhpgwm', '/home/vchad/results_aaai/scout_baselines_210h-10hp-400bs-1em4lr/fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_SEED1_jha9k0gi', '/home/vchad/results_aaai/scout_baselines_210h-10hp-400bs-1em4lr/fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_SEED2_hf1gqc2q', '/home/vchad/results_aaai/scout_baselines_210h-10hp-400bs-1em4lr/fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_SEED3_x8k_36gy', '/home/vchad/results_aaai/scout_baselines_210h-10hp-400bs-1em4lr/fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_SEED4_6yn2ydfl', '/home/vchad/results_aaai/scout_baselines_210h-10hp-400bs-1em4lr/fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_SEED5_9uv1r9pk', '/home/vchad/results_aaai/scout_baselines_210h-10hp-400bs-1em4lr/fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_SEED6_gkphq9kv', '/home/vchad/results_aaai/scout_baselines_210h-10hp-400bs-1em4lr/fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_SEED7_1tmkm0ua', '/home

# Calculate p-values on a per-starting-location basis

In [100]:
# Check out runs on a per-start-point basis
import json
def get_best_eval_rewards(run):
    """Return the per-episode rewards of the best evaluation logged for a run.

    `result.json` inside the run directory holds one JSON document per line,
    so it is parsed line by line. Blank lines and records without an
    "evaluation" entry are skipped. Among the remaining records, the one with
    the highest evaluation `episode_reward_mean` is selected; the earliest
    record wins ties.

    Args:
        run: path of the run directory that contains `result.json`.

    Returns:
        The value stored under evaluation -> hist_stats -> episode_reward of
        the selected record.

    Raises:
        ValueError: if the file contains no record with an "evaluation" entry.
    """
    run_json_data_file = "result.json"
    results = []
    with open(os.path.join(run, run_json_data_file), "r") as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            result = json.loads(line)
            if "evaluation" in result:
                results.append(result)

    if not results:
        raise ValueError(f"no evaluation results found in {os.path.join(run, run_json_data_file)}")

    # max() returns the first maximal element, matching a stable descending sort.
    best_result_by_eval = max(results, key=lambda x: x["evaluation"]["episode_reward_mean"])
    return best_result_by_eval["evaluation"]["hist_stats"]["episode_reward"]

# all_runs_data shape: (n_experiments_to_compare, n_runs_per_experiment, n_evals_per_run/starting_points)
# Each innermost entry is the reward list of that run's best evaluation.
all_runs_data = [
    [get_best_eval_rewards(run_dir) for run_dir in experiment_runs]
    for experiment_runs in all_filters_runs
]

# get stats for mapped values for a given experiment
def get_stats_by_start(all_runs_by_start, exp_names=None):
    """Summarise each experiment per starting point and test it against the baseline.

    Args:
        all_runs_by_start: torch tensor indexed as
            (experiment, starting point, run). Experiment 0 is the baseline.
        exp_names: one label per experiment, used as the keys of the result.
            Defaults to the notebook-level `runs_filters`.

    Returns:
        dict mapping experiment name -> {starting point index -> {"str": text}},
        where text is "<mean>+/-<1.645 * std> -- p=<p-value>". The p-value
        comes from a one-sided Welch t-test (`equal_var=False`,
        `alternative="less"`) of the baseline values against the experiment
        values at the same starting point.
    """
    if exp_names is None:
        exp_names = runs_filters
    # Loop-invariant settings, hoisted out of the per-starting-point loop.
    ci90z = 1.645
    alternative = "less" # "two-sided" #
    stats = {}
    baselines = all_runs_by_start[0]
    for exp_name, experiment in zip(exp_names, all_runs_by_start):
        stats[exp_name] = {}
        for starting_pt_idx in range(len(experiment)):
            baselines_idx = baselines[starting_pt_idx]
            experiment_idx = experiment[starting_pt_idx]
            # Hand scipy plain numpy arrays, as get_stats_all does.
            ttest_results = ttest_ind(
                baselines_idx.numpy(),
                experiment_idx.numpy(),
                alternative=alternative,
                equal_var=False,
            )
            mu, sigma = torch.mean(experiment_idx), torch.std(experiment_idx)
            # NOTE(review): the +/- term scales the standard deviation, not the
            # standard error of the mean -- confirm this is the intended spread.
            stats[exp_name][starting_pt_idx] = {
                "str": f"{mu}+/-{ci90z*sigma} -- p={ttest_results.pvalue}",
            }
    return stats

import torch
import pprint
# Swap the last two axes so the values of one starting point across all runs
# are contiguous; torch.Tensor() requires every run to have the same number
# of evaluation episodes.
all_runs_by_starting_point = torch.Tensor(all_runs_data).permute(0, 2, 1) # (n_experiments, starting_point, n_runs)
results_by_starting_pt = get_stats_by_start(all_runs_by_starting_point)
# Wide printer so each per-starting-point entry stays on one line.
printer = pprint.PrettyPrinter(width=160)
printer.pprint(results_by_starting_pt)

{'fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_': {0: {'str': '37.400001525878906+/-18.05674171447754 -- p=0.5'},
                                                1: {'str': '97.80000305175781+/-155.14791870117188 -- p=0.5'},
                                                2: {'str': '112.5999984741211+/-210.0727996826172 -- p=0.5'},
                                                3: {'str': '66.5+/-109.34335327148438 -- p=0.5'},
                                                4: {'str': '145.5+/-216.2189483642578 -- p=0.5'},
                                                5: {'str': '67.19999694824219+/-144.94639587402344 -- p=0.5'},
                                                6: {'str': '152.0+/-183.4631805419922 -- p=0.5'},
                                                7: {'str': '78.9000015258789+/-156.7609100341797 -- p=0.5'},
                                                8: {'str': '87.69999694824219+/-177.9999237060547 -- p=0.5'},
                                            

# Calculate p-values on a holistic basis (v1)

In [101]:
# get stats for mapped values for a given experiment
def get_stats_all(all_runs_by_start, exp_names=None):
    """Summarise each experiment over all starting points and runs pooled together.

    Args:
        all_runs_by_start: torch tensor whose first axis indexes experiments
            (experiment 0 is the baseline). All remaining axes are flattened
            into a single sample per experiment.
        exp_names: one label per experiment, used as the keys of the result.
            Defaults to the notebook-level `runs_filters`.

    Returns:
        dict mapping experiment name -> {"str": text}, where text is
        "<mean>+/-<1.645 * std> -- p=<p-value>". The p-value comes from a
        one-sided Welch t-test (`equal_var=False`, `alternative="less"`) of
        the pooled baseline values against the pooled experiment values.
    """
    if exp_names is None:
        exp_names = runs_filters
    ci90z = 1.645
    alternative = "less" # "two-sided"
    # One flat row of values per experiment.
    pooled = all_runs_by_start.reshape(all_runs_by_start.shape[0], -1)
    baselines = pooled[0].numpy()
    stats = {}
    for exp_name, vals in zip(exp_names, pooled):
        mu = torch.mean(vals)
        sigma = torch.std(vals)
        ttest_results = ttest_ind(baselines, vals.numpy(), alternative=alternative, equal_var=False)
        stats[exp_name] = {
            "str": f"{mu}+/-{ci90z*sigma} -- p={ttest_results.pvalue}"
        }
    return stats
# NOTE: the name results_by_starting_pt is reused here, but get_stats_all()
# pools every starting point, so this holds one entry per experiment.
results_by_starting_pt = get_stats_all(all_runs_by_starting_point)
printer = pprint.PrettyPrinter(width=160)
printer.pprint(results_by_starting_pt)

{'fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_': {'str': '96.78888702392578+/-168.14315795898438 -- p=0.5'},
 'gatscout_210h-10hp-400bs-1em4lr-eval_NOOPT_global': {'str': '97.79444122314453+/-171.38015747070312 -- p=0.4632023447484611'},
 'gcnscout_210h-10hp-400bs-1em4lr-eval_NOOPT_global': {'str': '143.8222198486328+/-261.02679443359375 -- p=0.0004657014854765041'},
 'gtscout_210h-10hp-400bs-1em4lr-eval_NOOPT_global': {'str': '113.06666564941406+/-205.84669494628906 -- p=0.08869114009409511'}}


# Calculate p-values on a holistic basis (v2)

In [102]:
# File and columns read from every run directory.
run_data_file = "progress.csv"
# Other reward columns tried: "episode_reward_mean", "ray/tune/episode_reward_mean".
run_data_col = "evaluation/episode_reward_mean"
run_step_col = "timesteps_total"

# Reward series per experiment filter and per run:
# all_runs_data[i][j] is the reward column of run j matched by filter i.
# TODO: parse_exp_json(run)
all_runs_data = [
    [pd.read_csv(ospj(run_dir, run_data_file))[run_data_col] for run_dir in experiment_runs]
    for experiment_runs in all_filters_runs
]

# Timestep series of every entry in `runs`.
# NOTE(review): this covers all discovered runs, not the filtered ones, so its
# indices do not line up with all_runs_data[i] -- confirm before zipping them.
runs_steps = [
    pd.read_csv(ospj(run_dir, run_data_file))[run_step_col]
    for run_dir in runs
]

### given data, run "map" on data to get values suitable for analysis

In [103]:
# finds the first training step when reward exceeds a given value ge
def first_ep_to_val(runs, steps, ge=30):
    """For each run, return the first step at which the value reaches `ge`.

    `runs` and `steps` are iterated in parallel: the i-th entry of `steps`
    supplies the step labels for the i-th entry of `runs`. A run that never
    reaches `ge` contributes -1.
    """
    first_eps = []
    for run_vals, run_steps in zip(runs, steps):
        first_hit = next(
            (step for step, val in zip(run_steps, run_vals) if val >= ge),
            -1,  # sentinel: threshold never reached
        )
        first_eps.append(first_hit)
    return first_eps

# finds max avg reward
# Spawn identifiers for the two zones; their concatenation fixes the default
# number of evaluation episodes used by max_avg_reward().
ZONE1_SPAWNS = [85, 87, 94, 95, 96, 102, 103, 111, 112]
# NOTE(review): 106 appears twice in ZONE2_SPAWNS -- possible typo; confirm.
ZONE2_SPAWNS = [98, 99, 106, 106, 107, 108, 113, 114, 115]
POSSIBLE_SPAWNS = ZONE1_SPAWNS + ZONE2_SPAWNS
def max_avg_reward(runs, steps, N_EVAL_EPISODES = len(POSSIBLE_SPAWNS)):
    """Collect each run's maximum value, repeated `N_EVAL_EPISODES` times.

    Runs with fewer than `tot_steps_len` entries are ignored. `steps` is
    accepted for signature parity with the other map helpers but is unused.

    NOTE(review): repeating each maximum multiplies the sample size seen by
    any downstream significance test -- confirm this is intended.
    """
    max_vals = []
    for run in runs:
        if len(run) >= tot_steps_len:
            max_vals.extend([max(run)] * N_EVAL_EPISODES)
    return max_vals

# returns last reward
def last_ep_reward(runs, steps):
    """Return the final value of every run that has at least `tot_steps_len` entries.

    `steps` is accepted for signature parity with the other map helpers but
    is unused.
    """
    return [run.tolist()[-1] for run in runs if len(run) >= tot_steps_len]

In [104]:
# first_ep_to_val() pairs the i-th run with the i-th step series, so the steps
# must come from exactly the runs behind each filter. runs_steps is indexed
# over every discovered run and would be misaligned, so the step column is
# loaded per filter here.
filter_runs_steps = [
    [pd.read_csv(ospj(run, run_data_file))[run_step_col] for run in filter_runs]
    for filter_runs in all_filters_runs
]
first_ep_to_30 = [
    first_ep_to_val(filter_runs_data, filter_steps)
    for filter_runs_data, filter_steps in zip(all_runs_data, filter_runs_steps)
]
# The two helpers below ignore their `steps` argument, so runs_steps is passed
# through unchanged.
max_reward = [
    max_avg_reward(filter_runs_data, runs_steps)
    for filter_runs_data in all_runs_data
]
last_reward = [
    last_ep_reward(filter_runs_data, runs_steps)
    for filter_runs_data in all_runs_data
]

### given "map" values, run "reduce" to aggregate values and show significance

In [105]:
# pretty print a dictionary
def pretty_print(d, tabs=0, tabsize=4):
    """Print a (possibly nested) dict, one key per line.

    Values that are plain dicts are printed recursively one indentation level
    deeper; every other value is printed on the same line as its key. Each
    level is indented by `tabsize` spaces, starting at level `tabs`.
    """
    indent = " " * tabsize * tabs
    for key, value in d.items():
        if type(value) is dict:
            print(f"{indent}{key}:")
            pretty_print(value, tabs + 1, tabsize)
            continue
        print(f"{indent}{key}: {value}")

# get stats for mapped values for a given experiment
def get_stats(all_runs_vals, firstn=-1, exp_names=None):
    """Summarise mapped values per experiment and test them against the baseline.

    Args:
        all_runs_vals: one sequence of values per experiment; entry 0 is the
            baseline.
        firstn: when positive, only the first `firstn` values of each
            experiment are summarised; otherwise all values are used.
        exp_names: one label per experiment, used as the keys of the result.
            Defaults to the notebook-level `runs_filters`.

    Returns:
        dict mapping experiment name -> {"str": text}, where text is
        "<mean>+/-<1.645 * std> -- p=<p-value>". The p-value comes from a
        one-sided Welch t-test (`equal_var=False`, `alternative="less"`) of
        the baseline values against the experiment values.
    """
    if exp_names is None:
        exp_names = runs_filters
    ci90z = 1.645
    alternative = "less" # "two-sided" #
    stats = {}
    # NOTE(review): the baseline sample is never truncated by `firstn`, only
    # the per-experiment values are -- confirm this asymmetry is intended.
    baselines = all_runs_vals[0]
    for exp_name, vals in zip(exp_names, all_runs_vals):
        if firstn > 0:
            vals = vals[:firstn]
        mu = np.mean(vals)
        sigma = np.std(vals)
        ttest_results = ttest_ind(baselines, vals, alternative=alternative, equal_var=False)
        # Only the formatted summary is stored for each experiment.
        stats[exp_name] = {
            "str": f"{mu}+/-{ci90z*sigma} -- p={ttest_results.pvalue}",
        }
    return stats

In [106]:
# A non-positive firstn makes get_stats() use every value of each experiment;
# a positive one keeps only the first `firstn` values.
firstn = -1
#first_ep_to_30_stats = get_stats(first_ep_to_30, firstn=firstn)
max_reward_stats = get_stats(max_reward, firstn=firstn)
last_reward_stats = get_stats(last_reward, firstn=firstn)

In [107]:
def print_stat(stats, name=""):
    """Print a "<name> stats:" header, the stats dict, then a blank line.

    `name` must be non-empty; the empty default only exists so that a missing
    name trips the assertion.
    """
    assert name != ""
    header = name + " stats:"
    print(header)
    pretty_print(stats)
    print()

# Report the aggregated statistics; the first-episode-to-30 report is disabled.
print_stat(max_reward_stats, "max reward")
#print_stat(first_ep_to_30_stats, "first ep to 30")
print_stat(last_reward_stats, "last reward")


max reward stats:
fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_:
    str: 96.7888888888889+/-46.418283452758324 -- p=0.5
gcnscout_210h-10hp-400bs-1em4lr-eval_NOOPT_global:
    str: 143.82222222222222+/-84.05484597297246 -- p=3.502328414660579e-23
gatscout_210h-10hp-400bs-1em4lr-eval_NOOPT_global:
    str: 97.79444444444442+/-39.67336926235678 -- p=0.35862503572069665
gtscout_210h-10hp-400bs-1em4lr-eval_NOOPT_global:
    str: 113.06666666666666+/-73.79762047522641 -- p=2.560693420473987e-05

last reward stats:
fcscout_210h-10hp-400bs-1em4lr-eval_NOOPT_:
    str: 71.59444444444445+/-34.71950489546124 -- p=0.5
gcnscout_210h-10hp-400bs-1em4lr-eval_NOOPT_global:
    str: 101.2+/-54.162088958730685 -- p=0.018983622981371146
gatscout_210h-10hp-400bs-1em4lr-eval_NOOPT_global:
    str: 69.60555555555555+/-25.61610993107212 -- p=0.5885838285793796
gtscout_210h-10hp-400bs-1em4lr-eval_NOOPT_global:
    str: 87.80555555555554+/-65.59199314776501 -- p=0.1498437653647084

