In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys, copy, os, shutil
from tqdm.notebook import tqdm
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
from matplotlib.ticker import MaxNLocator
from scipy.stats import norm

# Make sure the output directory for figures exists. `exist_ok=True` makes the
# cell safe to re-run, and (unlike a membership test on os.listdir()) raises
# loudly if something that is not a directory already occupies the name.
os.makedirs("figures", exist_ok=True)
    
# Display names for each dataset, keyed by the identifier used in the
# results directory layout. Insertion order is the order in which the
# datasets are iterated further down.
dataset_descs = {
    "avazu-app_binary_sparse": "Avazu (App)",
    "avazu-site_binary_sparse": "Avazu (Site)",
    "criteo_binary_sparse": "Criteo",
    "dexter_binary_sparse": "Dexter",
    "dorothea_binary_sparse": "Dorothea",
    "kdd2010-a_binary_sparse": "KDD2010 (Algebra)",
    "mnist8-4+9_binary_sparse": "MNIST8 (4+9)",
    "news20_binary_sparse": "News20",
    "newsgroups_binary_sparse": "Newsgroups (Binary, CS)",
    "pcmac_binary_sparse": "PCMAC",
    "rcv1_binary_sparse": "RCV1",
    "real-sim_binary_sparse": "Real-Sim",
    "sst2_binary_sparse": "SST-2",
    "url_binary_sparse": "URL",
    "w8a_binary_sparse": "W8A",
    "webspam_binary_sparse": "Webspam",
}

# Generate summary of results as a .csv file.

In [None]:
# Build the master summary table of Top-K results (one row per
# model / dataset / K / seed) and write it to logs/topk_master.csv.

# make sure the directory for the summary logs exists (safe to re-run)
os.makedirs("logs", exist_ok=True)

# Column layout of the master table. After the four identifying columns, each
# averaging variant (WA, WA_VZ, SA, SA_VZ) contributes final test accuracy,
# final test hinge loss, final sparsity and its L1 metric, in that order; the
# last column is the L1 metric of the instantaneous solution.
columns = ["dataset", "model", "K", "seed",
           "fin_test_acc_WA", "fin_test_hinge_WA", "fin_sparsity_WA", "L1_WA",
           "fin_test_acc_WA_VZ", "fin_test_hinge_WA_VZ", "fin_sparsity_WA_VZ", "L1_WA_VZ",
           "fin_test_acc_SA", "fin_test_hinge_SA", "fin_sparsity_SA", "L1_SA",
           "fin_test_acc_SA_VZ", "fin_test_hinge_SA_VZ", "fin_sparsity_SA_VZ", "L1_SA_VZ",
           "L1_inst"]

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame one .loc row at a time copies repeatedly and leaves
# every column with object dtype.
rows = []

for model in ["PAC", "FSOL"]:

    # the tuned hyperparameters depend only on (model, dataset): read the table once per model
    hparams = pd.read_csv(f"base_variants/{model}_hparams.csv")

    for dataset in tqdm(list(dataset_descs.keys())):

        # all result files for this model + dataset
        fnames = sorted(f for f in os.listdir(f"results/TOPK/{model}/{dataset}") if f.endswith(".csv"))
        if not fnames:
            continue

        # best hyperparameters for this dataset -> filename stem of the matching base-model run
        best = hparams.query(f"dataset == '{dataset}'")
        if model == "PAC":
            log10Cerr = int(best[["log10Cerr"]].values[0, 0])
            inst_stem = f"model={model}_log10Cerr={log10Cerr}"
        elif model == "FSOL":
            # NOTE(review): unlike PAC, these values are interpolated exactly as pandas
            # read them (no int cast) -- confirm this matches the on-disk filenames.
            log2eta, log10lmbda = best[["log2eta", "log10lmbda"]].values[0]
            inst_stem = f"model={model}_log2eta={log2eta}_log10lmbda={log10lmbda}"

        for fname in fnames:

            # The filename encodes "<key>=<model>_<key>=<K>_<key>=<seed>_..." (after
            # dropping "_TOPK"). The model field is ignored so the loop variable
            # `model`, which already selected this directory, is not overwritten.
            _fname_model, K, seed = [s.split("=")[1] for s in fname.replace("_TOPK", "").split("_")[:-1]]
            K, seed = int(K), int(seed)

            # logs of the base (instantaneous) model and of this Top-K variant
            logs_inst = pd.read_csv(f"../hparam_tuning/results/{model}/{dataset}/{inst_stem}_seed={seed}_metrics.csv")
            logs = pd.read_csv(f"results/TOPK/{model}/{dataset}/{fname}")

            # running maximum of the instantaneous test accuracy: the reference
            # curve that every L1 metric below is measured against
            inst_test_accs = logs_inst["inst_test-set-acc"]
            cm_inst_test_accs = inst_test_accs.cummax()

            row = [dataset, model, K, seed]

            # loop order must match the column order declared above
            for a_type in ["WA", "SA"]:
                for v_type in ["", "_VZ"]:
                    variant = f"{a_type}{v_type}"

                    # final (last logged) accuracy, hinge loss and sparsity
                    row += list(logs[[f"TOPK_test-set-acc_{variant}",
                                      f"TOPK_test-set-hinge_{variant}",
                                      f"TOPK_sparsity_{variant}"]].iloc[-1].values)

                    # L1 metric: mean gap to the reference curve, skipping the first logged row
                    test_accs = logs[f"TOPK_test-set-acc_{variant}"]
                    row += [(cm_inst_test_accs[1:] - test_accs[1:]).mean()]

            # L1 metric of the instantaneous solution itself
            row += [(cm_inst_test_accs[1:] - inst_test_accs[1:]).mean()]

            rows.append(row)

# assemble the table in one go and persist it
master = pd.DataFrame(rows, columns=columns)
master.to_csv("logs/topk_master.csv", index=False)

# Number of Datasets Where Top-K Beats Base

In [None]:
# For each model, count on how many datasets the K=64 Top-K variants achieve a
# lower L1 metric than the instantaneous baseline (L1_inst).

# average every metric over seeds; this grouping does not depend on the model,
# so it is computed once rather than once per loop iteration
seed_means = master.groupby(["dataset", "model", "K"]).mean().reset_index()

for model in ["PAC", "FSOL"]:

    # what model are we using?
    print(f"Model: {model} (K=64)")

    # per-dataset averages for this model at K=64
    q = seed_means.sort_values(by="L1_SA").query(f"model == '{model}' and K == 64")\
    [["dataset", "model", "K", "L1_SA", "L1_WA", "L1_inst"]]

    # report against the number of datasets actually present in the table
    # instead of a hard-coded 16
    num_datasets = len(q)

    # a variant "beats" the baseline when its L1 is strictly smaller than L1_inst
    num_outperform_SA = ((q.L1_inst - q.L1_SA) > 0).sum()
    print(f"1. Simple-Average Top-K stabilized baseline in {num_outperform_SA} of {num_datasets} datasets.")
    num_outperform_WA = ((q.L1_inst - q.L1_WA) > 0).sum()
    print(f"2. Weighted-Average Top-K stabilized baseline in {num_outperform_WA} of {num_datasets} datasets.")

# Hypothesis Testing on Top-K vs. Base Methods

In [None]:
# Two-sided Wilcoxon signed-rank test (normal approximation) comparing the
# K=64 Top-K variants against the base models on the L1 metric.

# load in our results for base PAC-II + FSOL
base = pd.read_csv("../WRS/logs/baseline_logs.csv")

# seed-averaged tables; loop-invariant, so computed once
table = master.groupby(["dataset", "model", "K"]).mean().reset_index()
base_means = base.groupby(["dataset", "model"]).mean().reset_index()

# print for K=64
print("Wilcoxon Signed-Rank Test for K=64 Top-K Variants on Relative Oracle Performance:")

# go thru all four variants at K=64
for model in ["PAC", "FSOL"]:
    for AS in ["SA", "WA"]:

        # model + top-K (treatment) and base model (control).
        # NOTE(review): the pairing is positional -- it relies on both groupbys
        # ordering rows by dataset and on both tables covering the same datasets.
        m_treat = table.query(f"model == '{model}' and K == 64")[f"L1_{AS}"].values
        m_control = base_means.query(f"model == '{model}'")["L1_inst"].values

        # paired differences; N is the number of pairs actually tested
        d = (m_treat - m_control).astype(float)
        N = len(d)

        # Rank the absolute differences (1 = smallest, ties get the average rank).
        # np.argsort alone returns the sorting permutation, NOT the ranks, which
        # would attach the wrong rank to each difference.
        ranks = pd.Series(np.abs(d)).rank(method="average").values

        # test statistic -- this is TWO-SIDED TEST; zero differences are split
        # evenly between the positive and negative rank sums
        T = np.min([ranks[d > 0].sum() + 0.5*ranks[d == 0].sum(),
                    ranks[d < 0].sum() + 0.5*ranks[d == 0].sum()])

        # normal approximation: E[T] = N(N+1)/4, Var[T] = N(N+1)(2N+1)/24
        z = (T - 0.25*(N*(N+1))) / np.sqrt((1/24) * N * (N+1) * ( (2*N) + 1))

        # T is the smaller rank sum, so z <= 0 and the two-sided p-value is 2 * Phi(z)
        pval = norm.cdf(z) * 2

        print(f"- {model}, {AS}: {pval}")

In [None]:
# Two-sided Wilcoxon signed-rank test (normal approximation) comparing the
# K=64 Top-K variants against the base models on final test accuracy.

# load in our results for base PAC-II + FSOL
base = pd.read_csv("../WRS/logs/baseline_logs.csv")

# seed-averaged tables; loop-invariant, so computed once
table = master.groupby(["dataset", "model", "K"]).mean().reset_index()
base_means = base.groupby(["dataset", "model"]).mean().reset_index()

# print for K=64
print("Wilcoxon Signed-Rank Test for K=64 Top-K Variants on Final Test Accuracy:")

# go thru all four variants at K=64
for model in ["PAC", "FSOL"]:
    for AS in ["SA", "WA"]:

        # model + top-K (treatment) and base model (control).
        # NOTE(review): the pairing is positional -- it relies on both groupbys
        # ordering rows by dataset and on both tables covering the same datasets.
        m_treat = table.query(f"model == '{model}' and K == 64")[f"fin_test_acc_{AS}"].values
        m_control = base_means.query(f"model == '{model}'")["fin_test_acc_inst"].values

        # paired differences; N is the number of pairs actually tested
        d = (m_treat - m_control).astype(float)
        N = len(d)

        # Rank the absolute differences (1 = smallest, ties get the average rank).
        # np.argsort alone returns the sorting permutation, NOT the ranks, which
        # would attach the wrong rank to each difference.
        ranks = pd.Series(np.abs(d)).rank(method="average").values

        # test statistic -- this is TWO-SIDED TEST; zero differences are split
        # evenly between the positive and negative rank sums
        T = np.min([ranks[d > 0].sum() + 0.5*ranks[d == 0].sum(),
                    ranks[d < 0].sum() + 0.5*ranks[d == 0].sum()])

        # normal approximation: E[T] = N(N+1)/4, Var[T] = N(N+1)(2N+1)/24
        z = (T - 0.25*(N*(N+1))) / np.sqrt((1/24) * N * (N+1) * ( (2*N) + 1))

        # T is the smaller rank sum, so z <= 0 and the two-sided p-value is 2 * Phi(z)
        pval = norm.cdf(z) * 2

        print(f"- {model}, {AS}: {pval}")