In [None]:
import numpy as np
import pandas as pd
import sys, copy, os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from utils_v2 import *

# Names yielded by load_data("QUERY") that contain "sparse", in sorted order.
datasets = sorted(name for name in load_data("QUERY") if "sparse" in name)

# Compute metrics on all FSOL variants.

In [None]:
# what model are we working with?
model = "FSOL"

# Collect one record per result file. The DataFrame is built once after the loop instead of
# being grown row-by-row with `.loc`, which re-allocates on every insert and starts from an
# all-object empty frame.
FSOL_records = []

# go thru each of our sparse datasets
for dataset in tqdm(datasets):

    # Files are listed from and read back out of this one directory.
    results_dir = f"results/{model}/{dataset}"

    # get the filenames for this dataset
    fnames = [f for f in sorted(os.listdir(results_dir)) if ".csv" in f]

    # iterate thru each filename
    for fname in fnames:

        # Unpack the "key=value" settings encoded in the filename (the last "_" piece is dropped)
        # and cast to appropriate types. The model tag from the filename gets its own name so it
        # cannot overwrite `model`, which the directory path above depends on.
        fname_model, log2eta, log10lmbda, seed = [s.split("=")[1] for s in fname.split("_")[:-1]]
        log2eta, log10lmbda, seed = float(log2eta), float(log10lmbda), int(seed)
        df = pd.read_csv(f"{results_dir}/{fname}")

        # Shortfall of each test accuracy below the running maximum seen so far.
        test_accs = df["inst_test-set-acc"]
        shortfall = test_accs.cummax() - test_accs

        # l1: mean shortfall; l2: root of the mean (signed) squared shortfall.
        l1 = shortfall.mean()
        l2 = np.sqrt((np.sign(shortfall) * (shortfall ** 2)).mean())

        # add to our logs (the last logged accuracy is recorded as the final one)
        FSOL_records.append([fname_model, dataset, log2eta, log10lmbda, seed,
                             test_accs.values[-1], l1, l2])

# create logs for all of our variants
FSOL_logs = pd.DataFrame(FSOL_records, columns=["model", "dataset", "log2eta", "log10lmbda", "seed",
                                                "final_test-set-acc", "l1-regret", "l2-regret"])

# Save the per-seed rows as a .csv, then immediately average across seeds so that variants can be
# compared. Note the averaged frame still carries a (meaningless) mean-of-`seed` column.
FSOL_logs.to_csv("FSOL_performances.csv", index=False)
FSOL_logs = FSOL_logs.groupby(["model", "dataset", "log2eta", "log10lmbda"]).mean().reset_index()

# Pick FSOL variants that we will use for further experiments.

In [None]:
# determine our threshold (how much best performance can we deviate from to create adverse conditions?)
threshold = 0.025

# One record per dataset; the DataFrame is assembled once after the loop.
FSOL_hparam_records = []

# go thru each of our datasets
for dataset in datasets:

    # All (seed-averaged) FSOL variants logged for this dataset. A boolean mask is used rather
    # than an f-string query so dataset names with quotes cannot break the expression.
    candidates = FSOL_logs[FSOL_logs["dataset"] == dataset]

    # get our highest possible final test accuracy
    top_acc = candidates["final_test-set-acc"].max()

    # Among variants whose final accuracy is within `threshold` of the top accuracy, use the one
    # with the largest l1-regret (i.e. the most oscillation) to test.
    chosen = candidates[candidates["final_test-set-acc"] >= top_acc - threshold]\
        .sort_values(by="l1-regret", ascending=False).iloc[0]

    # Fields are read by column name, so the selection does not depend on column order, skips the
    # averaged `seed` column, and leaves the loop variable `dataset` and the global `model` alone.
    FSOL_hparam_records.append([chosen["model"], chosen["dataset"],
                                chosen["log2eta"], chosen["log10lmbda"],
                                chosen["final_test-set-acc"], chosen["l1-regret"], chosen["l2-regret"]])

# create dataframe of best hyperparameter variants for FSOL
FSOL_hparams = pd.DataFrame(FSOL_hparam_records, columns=["model", "dataset", "log2eta", "log10lmbda",
                                                          "test_acc", "l1_regret", "l2_regret"])

# save our chosen hyperparameters
FSOL_hparams.to_csv("FSOL_hparams.csv", index=False)

# Compute metrics on all PAC variants.

In [None]:
# what model are we working with?
model = "PAC"

# Collect one record per result file. The DataFrame is built once after the loop instead of
# being grown row-by-row with `.loc`, which re-allocates on every insert and starts from an
# all-object empty frame.
PAC_records = []

# go thru each of our sparse datasets
for dataset in tqdm(datasets):

    # Files are listed from and read back out of this one directory.
    results_dir = f"results/{model}/{dataset}"

    # get the filenames for this dataset
    fnames = [f for f in sorted(os.listdir(results_dir)) if ".csv" in f]

    # iterate thru each filename
    for fname in fnames:

        # Unpack the "key=value" settings encoded in the filename (the last "_" piece is dropped)
        # and cast to appropriate types. The model tag from the filename gets its own name so it
        # cannot overwrite `model`, which the directory path above depends on.
        fname_model, log10Cerr, seed = [s.split("=")[1] for s in fname.split("_")[:-1]]
        log10Cerr, seed = float(log10Cerr), int(seed)
        df = pd.read_csv(f"{results_dir}/{fname}")

        # Shortfall of each test accuracy below the running maximum seen so far.
        test_accs = df["inst_test-set-acc"]
        shortfall = test_accs.cummax() - test_accs

        # l1: mean shortfall; l2: root of the mean (signed) squared shortfall.
        l1 = shortfall.mean()
        l2 = np.sqrt((np.sign(shortfall) * (shortfall ** 2)).mean())

        # add to our logs (the last logged accuracy is recorded as the final one)
        PAC_records.append([fname_model, dataset, log10Cerr, seed,
                            test_accs.values[-1], l1, l2])

# create logs for all of our variants
PAC_logs = pd.DataFrame(PAC_records, columns=["model", "dataset", "log10Cerr", "seed",
                                              "final_test-set-acc", "l1-regret", "l2-regret"])

# Save the per-seed rows as a .csv, then immediately average across seeds so that variants can be
# compared. Note the averaged frame still carries a (meaningless) mean-of-`seed` column.
PAC_logs.to_csv("PAC_performances.csv", index=False)
PAC_logs = PAC_logs.groupby(["model", "dataset", "log10Cerr"]).mean().reset_index()

# Pick PAC variants that we will use for further experiments.

In [None]:
# determine our threshold (how much best performance can we deviate from to create adverse conditions?)
threshold = 0.025

# One record per dataset; the DataFrame is assembled once after the loop.
PAC_hparam_records = []

# go thru each of our datasets
for dataset in datasets:

    # All (seed-averaged) PAC variants logged for this dataset. A boolean mask is used rather
    # than an f-string query so dataset names with quotes cannot break the expression.
    candidates = PAC_logs[PAC_logs["dataset"] == dataset]

    # get our highest possible final test accuracy
    top_acc = candidates["final_test-set-acc"].max()

    # Among variants whose final accuracy is within `threshold` of the top accuracy, use the one
    # with the largest l1-regret (i.e. the most oscillation) to test.
    chosen = candidates[candidates["final_test-set-acc"] >= top_acc - threshold]\
        .sort_values(by="l1-regret", ascending=False).iloc[0]

    # Fields are read by column name, so the selection does not depend on column order, skips the
    # averaged `seed` column, and leaves the loop variable `dataset` and the global `model` alone.
    PAC_hparam_records.append([chosen["model"], chosen["dataset"], chosen["log10Cerr"],
                               chosen["final_test-set-acc"], chosen["l1-regret"], chosen["l2-regret"]])

# create dataframe of best hyperparameter variants for PAC
PAC_hparams = pd.DataFrame(PAC_hparam_records, columns=["model", "dataset", "log10Cerr",
                                                        "test_acc", "l1_regret", "l2_regret"])

# save our chosen hyperparameters
PAC_hparams.to_csv("PAC_hparams.csv", index=False)