In [None]:
import numpy as np
import sys
sys.path.append('../tools')
from utils import get_shaps
from utils import classify_noisy_labels, compute_f1_score
from scipy.stats import sem
import seaborn as sns

%load_ext autoreload
%autoreload 2

In [None]:
# Dataset settings. `dataset_name` and `num_datapoints` are interpolated into
# the result-file names loaded further down; `task` is not referenced again
# in this notebook.
task = "classification"
dataset_name = "gaussian"
num_datapoints = 100

In [None]:
# Model / evaluation settings.
# `num_samples` is deliberately not assigned here: the sampling-configuration
# cell sets the value (20000) that appears in the result-file names, and a
# second, different value in this cell was overwritten before any use (and
# would have produced wrong file names if the cells were run out of order).
model_name = 'SVC'     # part of the result-file names and of the figure name
metric = 'accuracy'    # NOTE: reassigned to 'data' in the scoring cell below
seed = 2022            # not referenced again in this notebook
repeat_num = 10        # number of repetitions scored per method

In [None]:
# Sampling settings. `num_samples` and `num_bootstrap` are interpolated into
# the result-file names; `xi` is not referenced again in this notebook.
num_samples = 20_000
num_bootstrap = 25
xi = 0.001

# Sampling methods to compare; the order fixes the row order of every
# per-method result built below.
methods = [
    "random",
    "stratified",
    "owen",
    "Sobol",
    "kernel",
    "active-0",
    "active-2",
    "active-5",
    "active-100",
]

In [None]:
# Directory (relative to this notebook) holding the saved .npy result files.
path = "../experiment_data/noisy_label_detection"

In [None]:
"""
Test out all sampling methods for data shapley
"""

# Result kinds stored per method, in the order they appear in each tuple.
result_kinds = ('all_mcs', 'all_afs', 'all_min_afs', 'flipped_index')

res_data_shap = []

for method in methods:
    # All files of one method share this tail; only the result kind differs.
    file_tail = f"data_shap_{method}_{dataset_name}_{model_name}_{num_datapoints}_{num_samples}_{num_bootstrap}.npy"
    # NOTE(security): allow_pickle=True unpickles the file contents, which can
    # execute arbitrary code -- only load files produced by a trusted run.
    loaded = [
        np.load(f"{path}/noisy_label_{kind}_{file_tail}", allow_pickle=True)
        for kind in result_kinds
    ]
    # Tuple layout: (method, all_mcs, all_afs, all_min_afs, flipped_index).
    res_data_shap.append((method, *loaded))

In [None]:
# Add the mean and standard error (over axis 0) of `all_min_afs` to every
# Data Shapley entry, turning each 5-tuple into a 7-tuple in place.
for i, item in enumerate(res_data_shap):
    # Entries are 5-tuples as loaded, or 7-tuples if this cell has already
    # run. The starred name absorbs the previously inserted mean/SEM (if any),
    # so re-running the cell no longer fails with an unpacking ValueError.
    method, all_mcs, all_afs, all_min_afs, *_stale_stats, flipped_index = item
    all_min_afs = np.asarray(all_min_afs)
    all_min_afs_mean = np.mean(all_min_afs, axis=0)
    all_min_afs_sem = sem(all_min_afs, axis=0)
    # Layout expected by the scoring cell:
    # (method, all_mcs, all_afs, all_min_afs, mean, sem, flipped_index).
    res_data_shap[i] = (method, all_mcs, all_afs, all_min_afs, all_min_afs_mean, all_min_afs_sem, flipped_index)

In [None]:
"""
Test out all sampling methods for beta shapley
"""

# Result kinds stored per method, in the order they appear in each tuple.
result_kinds = ('all_mcs', 'all_afs', 'all_min_afs', 'flipped_index')

res_beta_shap = []

for method in methods:
    # All files of one method share this tail; only the result kind differs.
    file_tail = f"beta_shap_{method}_{dataset_name}_{model_name}_{num_datapoints}_{num_samples}_{num_bootstrap}.npy"
    # NOTE(security): allow_pickle=True unpickles the file contents, which can
    # execute arbitrary code -- only load files produced by a trusted run.
    loaded = [
        np.load(f"{path}/noisy_label_{kind}_{file_tail}", allow_pickle=True)
        for kind in result_kinds
    ]
    # Tuple layout: (method, all_mcs, all_afs, all_min_afs, flipped_index).
    res_beta_shap.append((method, *loaded))

In [None]:
# Add the mean and standard error (over axis 0) of `all_min_afs` to every
# Beta Shapley entry, turning each 5-tuple into a 7-tuple in place.
for i, item in enumerate(res_beta_shap):
    # Entries are 5-tuples as loaded, or 7-tuples if this cell has already
    # run. The starred name absorbs the previously inserted mean/SEM (if any),
    # so re-running the cell no longer fails with an unpacking ValueError.
    method, all_mcs, all_afs, all_min_afs, *_stale_stats, flipped_index = item
    all_min_afs = np.asarray(all_min_afs)
    all_min_afs_mean = np.mean(all_min_afs, axis=0)
    all_min_afs_sem = sem(all_min_afs, axis=0)
    # Layout expected by the scoring cell:
    # (method, all_mcs, all_afs, all_min_afs, mean, sem, flipped_index).
    res_beta_shap[i] = (method, all_mcs, all_afs, all_min_afs, all_min_afs_mean, all_min_afs_sem, flipped_index)

In [None]:
# plotting
# Put the parent directory on the import path so the `vol_utils` package can
# be resolved; this has to run before the import on the next line.
sys.path.append('../')
from vol_utils.utils import set_up_plotting
# `plt` is whatever set_up_plotting() returns; the plotting cell below calls
# figure/xticks/ylabel/savefig on it (presumably matplotlib.pyplot -- confirm
# in vol_utils.utils).
plt = set_up_plotting()

In [None]:
# Choose which family of results to score: 'data' selects the Data Shapley
# results, any other value selects the Beta Shapley results. The same value is
# later used in the saved figure's file name.
metric = 'data'

if metric == 'data':
    res_shap = res_data_shap
else:
    res_shap = res_beta_shap

# scores[j] collects one F1 score per repetition for the j-th method.
# NOTE(review): the loop runs `repeat_num` times while the files are keyed by
# `num_bootstrap` -- confirm `all_mcs` really holds `repeat_num` runs.
# NOTE(review): `flipped_index` is passed whole for every repetition --
# confirm the flipped labels are shared across repetitions.
scores = [[] for _ in res_shap]
for rep in range(repeat_num):
    for method_scores, entry in zip(scores, res_shap):
        # Entries must be the 7-tuples produced by the summary cells.
        _method, marginals, _afs, _min_afs, _afs_mean, _afs_sem, flipped = entry
        shapley_values = np.asarray(get_shaps(marginals[rep]))
        predicted_noisy = classify_noisy_labels(shapley_values)
        # Element 2 of compute_f1_score's result is taken as the F1 score.
        f1 = compute_f1_score(flipped, predicted_noisy)[2]
        method_scores.append(f1)

In [None]:
# Stack the per-method score lists into a single array (one row per method)
# and display its shape as a sanity check.
scores = np.asarray(scores)
np.shape(scores)

In [None]:
# remove kernelSHAP
# Rows of `scores` follow the order of `methods`, so pick the rows to keep by
# method name instead of hard-coding positions that silently go stale when
# `methods` changes.
keep_rows = [row for row, name in enumerate(methods) if name != 'kernel']
no_kernel_scores = scores[keep_rows]

In [None]:
# Box plot of the per-method F1 scores; kernelSHAP is left out when
# `no_kernel` is set.
no_kernel = True
if no_kernel:
    plot_scores = no_kernel_scores
else:
    plot_scores = scores

# One colour and one tick label per method. 'C4' sits at kernelSHAP's position,
# so dropping it lets the remaining methods keep the colours they have in the
# full plot.
palette = [
    colour
    for colour in ['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8']
    if not (no_kernel and colour == 'C4')
]
xlabels = [
    label
    for label in ['MC', 'stratified', 'Owen', 'Sobol', 'kernel', r'Ours ($\alpha$ = 0)', r'Ours ($\alpha$ = 2)', r'Ours ($\alpha$ = 5)', r'Ours ($\alpha$ = 100)']
    if not (no_kernel and label == 'kernel')
]

plt.figure(figsize=(8, 6))
sns.set_style(style='white')
# Transpose so that each method becomes one box.
sns.boxplot(data=plot_scores.T, showfliers=False, palette=palette)
plt.xticks(range(len(xlabels)), xlabels, rotation=-90)
plt.ylabel("F1 Score")
plt.savefig("../figs/f1_score_{}_shap_{}_{}.pdf".format(metric, dataset_name, model_name), format='pdf', dpi=300, bbox_inches='tight')

In [None]:
# Median and interquartile bounds (25th / 75th percentile) of the kernelSHAP
# F1 scores, shown as a (median, q25, q75) tuple.
kernel_scores = scores[methods.index('kernel')]
lower_q, upper_q = (np.percentile(kernel_scores, q) for q in (25, 75))
np.median(kernel_scores), lower_q, upper_q