In [None]:
import sys, copy, os, shutil

import numpy as np
import pandas as pd
from scipy.stats import norm, rankdata

# display names for our datasets, keyed by the identifier used in the logs
dataset_descs = {
    "avazu-app_binary_sparse": "Avazu (App)",
    "avazu-site_binary_sparse": "Avazu (Site)",
    "criteo_binary_sparse": "Criteo",
    "dexter_binary_sparse": "Dexter",
    "dorothea_binary_sparse": "Dorothea",
    "kdd2010-a_binary_sparse": "KDD2010 (Algebra)",
    "mnist8-4+9_binary_sparse": "MNIST8 (4+9)",
    "news20_binary_sparse": "News20",
    "newsgroups_binary_sparse": "Newsgroups (Binary, CS)",
    "pcmac_binary_sparse": "PCMAC",
    "rcv1_binary_sparse": "RCV1",
    "real-sim_binary_sparse": "Real-Sim",
    "sst2_binary_sparse": "SST-2",
    "url_binary_sparse": "URL",
    "w8a_binary_sparse": "W8A",
    "webspam_binary_sparse": "Webspam",
}

# read the two log tables from disk (paths are relative to the notebook's working directory):
# `master` is used downstream as the treatment runs, `base` as the baseline (control) runs
master = pd.read_csv(filepath_or_buffer="logs/master.csv")
base = pd.read_csv(filepath_or_buffer="logs/baseline_logs.csv")

# Hypothesis Testing

In [None]:
# how many datasets do we expect to pair up in every comparison?
N = 16

# print for K=64
print("Wilcoxon Signed-Rank Test for K=64 WRS Variants on Relative Oracle Performance:")

# the per-group means do not depend on the variant being tested, so compute them once.
# numeric_only=True keeps any non-numeric log columns out of the average instead of
# letting them raise on newer pandas releases.
table = master.groupby(["dataset", "model", "weight_scheme", "K"]).mean(numeric_only=True).reset_index()
base_means = base.groupby(["dataset", "model"]).mean(numeric_only=True).reset_index()

# go thru all eight variants (2 models x 2 weight schemes x 2 AS settings) at K=64
for model in ["PAC", "FSOL"]:

    # base model (control), indexed by dataset so it is paired with the treatment by name
    m_control = base_means.query(f"model == '{model}'").set_index("dataset")["L1_inst"]

    for weight_scheme in ["dense", "exp-dense"]:
        for AS in ["SA", "WA"]:

            # model-WRS (treatment), indexed by dataset as well
            m_treat = (table
                       .query(f"model == '{model}' and weight_scheme == '{weight_scheme}' and K == 64")
                       .set_index("dataset")[f"L1_{AS}"])

            # paired differences, aligned on dataset -- this is TWO-SIDED TEST
            d = m_treat - m_control

            # the normal approximation below uses N, so every comparison must contain
            # exactly N matched datasets (an unmatched dataset shows up as NaN here)
            assert len(d) == N and d.notna().all(), (
                f"expected {N} paired datasets for {model}, {weight_scheme}+{AS}; "
                f"got {int(d.notna().sum())} matched out of {len(d)}")
            d = d.to_numpy()

            # rank the absolute differences (1 = smallest); tied values share their average rank.
            # NOTE: np.argsort returns the sorting permutation, not ranks, so it cannot be used here.
            ranks = rankdata(np.abs(d))

            # compute test statistic: smaller of the positive / negative rank sums,
            # with the ranks of zero differences split evenly between the two sides
            zero_share = 0.5 * ranks[d == 0].sum()
            T = np.min([ranks[d > 0].sum() + zero_share,
                        ranks[d < 0].sum() + zero_share])

            # compute normal approximation
            z = (T - 0.25*(N*(N+1))) / np.sqrt((1/24) * N * (N+1) * ( (2*N) + 1))

            # get our p-value using normal approximation (T is the smaller sum, so z <= 0)
            pval = norm.cdf(z) * 2

            print(f"- {model}, {weight_scheme}+{AS}: {pval}")

In [None]:
# how many datasets do we expect to pair up in every comparison?
N = 16

# print for K=64
print("Wilcoxon Signed-Rank Test for K=64 WRS Variants on Final Test Accuracy:")

# the per-group means do not depend on the variant being tested, so compute them once.
# numeric_only=True keeps any non-numeric log columns out of the average instead of
# letting them raise on newer pandas releases.
table = master.groupby(["dataset", "model", "weight_scheme", "K"]).mean(numeric_only=True).reset_index()
base_means = base.groupby(["dataset", "model"]).mean(numeric_only=True).reset_index()

# go thru all eight variants (2 models x 2 weight schemes x 2 AS settings) at K=64
for model in ["PAC", "FSOL"]:

    # base model (control) error = 1 - accuracy, indexed by dataset so it is paired by name
    m_control = 1 - base_means.query(f"model == '{model}'").set_index("dataset")["fin_test_acc_inst"]

    for weight_scheme in ["dense", "exp-dense"]:
        for AS in ["SA", "WA"]:

            # model-WRS (treatment) error = 1 - accuracy, indexed by dataset as well
            m_treat = 1 - (table
                           .query(f"model == '{model}' and weight_scheme == '{weight_scheme}' and K == 64")
                           .set_index("dataset")[f"fin_test_acc_{AS}"])

            # paired differences, aligned on dataset -- this is TWO-SIDED TEST
            d = m_treat - m_control

            # the normal approximation below uses N, so every comparison must contain
            # exactly N matched datasets (an unmatched dataset shows up as NaN here)
            assert len(d) == N and d.notna().all(), (
                f"expected {N} paired datasets for {model}, {weight_scheme}+{AS}; "
                f"got {int(d.notna().sum())} matched out of {len(d)}")
            d = d.to_numpy()

            # rank the absolute differences (1 = smallest); tied values share their average rank.
            # NOTE: np.argsort returns the sorting permutation, not ranks, so it cannot be used here.
            ranks = rankdata(np.abs(d))

            # compute test statistic: smaller of the positive / negative rank sums,
            # with the ranks of zero differences split evenly between the two sides
            zero_share = 0.5 * ranks[d == 0].sum()
            T = np.min([ranks[d > 0].sum() + zero_share,
                        ranks[d < 0].sum() + zero_share])

            # compute normal approximation
            z = (T - 0.25*(N*(N+1))) / np.sqrt((1/24) * N * (N+1) * ( (2*N) + 1))

            # get our p-value using normal approximation (T is the smaller sum, so z <= 0)
            pval = norm.cdf(z) * 2

            print(f"- {model}, {weight_scheme}+{AS}: {pval}")