In [None]:
import numpy as np
from scipy.stats import sem
import sys
sys.path.append('../tools')
from utils import get_shaps
import seaborn as sns

%load_ext autoreload
%autoreload 2

In [None]:
# Experiment identifiers.
# `task` is not referenced by the other cells visible in this notebook.
task = 'classification'
# Interpolated into every result/figure file name used below.
dataset_name = 'gaussian'
# Interpolated into file names; later cells also use it as the split index
# between the two halves of each Shapley vector (each training example is duplicated).
num_datapoints = 50

In [None]:
# Interpolated into every result/figure file name used below.
model_name = 'logistic'
# NOTE(review): this value is overwritten with 'data' by the min-FS plotting cell further down.
metric = 'accuracy'
# Not referenced by the other cells visible in this notebook.
seed = 2022
# Number of runs per method that the threshold-count and sum-ratio cells iterate over.
repeat_num = 10

In [None]:
# num_samples / num_bootstrap select the per-method result files (they are part of the file names).
num_samples = 2000
num_bootstrap = 20
# true_num_bootstrap / true_num_samples select the large 'random' runs that are
# loaded later to approximate the true Shapley values.
true_num_bootstrap = 50
true_num_samples = 20000
# Used twice below: eps2 = eps1 * xi in the threshold cells, and as the additive
# smoothing term inside get_sum_ratio.
xi = 1e-3
# Sampling methods whose results are loaded; the list position of each method
# also picks its linestyle and colour ('C{i}') in the plots.
methods = ['random', 'stratified', 'owen', 'Sobol', 'kernel', 'active-0', 'active-2', 'active-5', 'active-100']

In [None]:
# Directory (relative to this notebook) that holds the saved .npy result files.
path = '../experiment_data/symmetry'

In [None]:
"""
Test out all sampling methods for data shapley
"""

# One entry per sampling method: (method, all_mcs, all_afs, all_min_afs).
res_data_shap = []

for method in methods:
    # Every file of a method shares this descriptor; only the array kind differs.
    run_tag = f"{method}_{dataset_name}_{model_name}_{num_datapoints}_{num_samples}_{num_bootstrap}"
    loaded = tuple(
        np.load(f"{path}/small_all_{kind}_data_shap_{run_tag}.npy", allow_pickle=True)
        for kind in ("mcs", "afs", "min_afs")
    )
    res_data_shap.append((method, *loaded))


In [None]:
# Extend every data-Shapley entry with the mean and standard error of its
# all_min_afs array, both taken along axis 0 (presumably the run axis — TODO confirm).
for i, item in enumerate(res_data_shap):
    # Slice instead of unpacking the whole tuple: after the first execution each
    # entry already has six fields, and a plain 4-name unpack would then raise,
    # making the cell impossible to re-run without reloading the data.
    method, all_mcs, all_afs, all_min_afs = item[:4]
    all_min_afs = np.asarray(all_min_afs)
    all_min_afs_mean = np.mean(all_min_afs, axis=0)
    all_min_afs_sem = sem(all_min_afs, axis=0)
    res_data_shap[i] = (method, all_mcs, all_afs, all_min_afs, all_min_afs_mean, all_min_afs_sem)

In [None]:
"""
Test out all sampling methods for beta shapley
"""
# One entry per sampling method: (method, all_mcs, all_afs, all_min_afs).
res_beta_shap = []
for method in methods:
    # Every file of a method shares this descriptor; only the array kind differs.
    run_tag = f"{method}_{dataset_name}_{model_name}_{num_datapoints}_{num_samples}_{num_bootstrap}"
    loaded = tuple(
        np.load(f"{path}/small_all_{kind}_beta_shap_{run_tag}.npy", allow_pickle=True)
        for kind in ("mcs", "afs", "min_afs")
    )
    res_beta_shap.append((method, *loaded))

In [None]:
# Extend every beta-Shapley entry with the mean and standard error of its
# all_min_afs array, both taken along axis 0 (presumably the run axis — TODO confirm).
for i, item in enumerate(res_beta_shap):
    # Slice instead of unpacking the whole tuple: after the first execution each
    # entry already has six fields, and a plain 4-name unpack would then raise,
    # making the cell impossible to re-run without reloading the data.
    method, all_mcs, all_afs, all_min_afs = item[:4]
    all_min_afs = np.asarray(all_min_afs)
    all_min_afs_mean = np.mean(all_min_afs, axis=0)
    # The standard deviation that used to be computed here was never stored or
    # read, so only the statistics that end up in the entry are calculated.
    all_min_afs_sem = sem(all_min_afs, axis=0)
    res_beta_shap[i] = (method, all_mcs, all_afs, all_min_afs, all_min_afs_mean, all_min_afs_sem)

In [None]:
# One matplotlib linestyle per entry of `methods` (nine of each); the plotting
# cells index this list with the method's position.
linestyles = ['solid', 'dashed', 'dotted', 'dashdot', (0, (1, 10)), (0, (1, 1)), (0, (5, 1)), (0, (5, 5)), (0, (3,1,1,1,))]

In [None]:
# Choose the result family to draw: 'data' selects res_data_shap, anything else res_beta_shap.
metric = 'data'
if metric == 'data':
    res_shap = res_data_shap
else:
    res_shap = res_beta_shap

sys.path.append('../')
from vol_utils.utils import set_up_plotting
plt = set_up_plotting()
plt.figure(figsize=(8,6))

# Legend names for methods whose raw identifier is not shown verbatim.
display_names = {'owen': 'Owen', 'random': 'MC'}
for idx, (name, _, _, _, curve_mean, curve_sem) in enumerate(res_shap):
    if name == 'kernel':
        # kernel results are left out of this figure
        continue
    if name.startswith('active'):
        alpha_value = int(name.split('-')[-1])
        label = rf'Ours ($\alpha$ = {alpha_value})'
    else:
        label = display_names.get(name, name)
    colour = f'C{idx}'
    steps = np.arange(curve_mean.shape[0])
    plt.plot(curve_mean, label=label, linestyle=linestyles[idx], c=colour)
    # Shaded band: mean +/- one standard error.
    plt.fill_between(steps, curve_mean - curve_sem, curve_mean + curve_sem, alpha=0.3, color=colour)

plt.xlabel("Number of Evaluations")
plt.ylabel("min FS")
# Tick every 500 steps along the length of the first method's mean curve.
plt.xticks(np.arange(0, res_shap[0][4].shape[0], 500))
plt.legend(loc='upper left')
plt.savefig(f"../figs/symmetry_{metric}_shap_{dataset_name}_{model_name}_{num_datapoints}.pdf", format='pdf', dpi=300, bbox_inches='tight')

In [None]:
"""
Load mcs from random method on large samples (to approximate the true Shapley values)
"""
# Both reference files share one run descriptor; only data/beta differs in the name.
true_run_tag = f"random_{dataset_name}_{model_name}_{num_datapoints}_{true_num_samples}_{true_num_bootstrap}"
true_data_mcs = np.load(f"{path}/small_all_mcs_data_shap_{true_run_tag}.npy", allow_pickle=True)
true_beta_mcs = np.load(f"{path}/small_all_mcs_beta_shap_{true_run_tag}.npy", allow_pickle=True)


In [None]:
# Average get_shaps() over all large-sample runs, separately for data and beta Shapley.
n = num_datapoints * 2 # duplicate each training example

true_data_shaps = np.zeros(n)
num_data_runs = len(true_data_mcs)
for run_mcs in true_data_mcs:
    # Add each run's share one at a time (running mean as a sum of value / count).
    true_data_shaps += np.asarray(get_shaps(run_mcs)) / num_data_runs

true_beta_shaps = np.zeros(n)
num_beta_runs = len(true_beta_mcs)
for run_mcs in true_beta_mcs:
    true_beta_shaps += np.asarray(get_shaps(run_mcs)) / num_beta_runs

# sanity check
print(true_data_shaps.shape)

In [None]:
# Fold the two halves of each reference vector together: entry k becomes the
# average of position k and position k + n.
n = num_datapoints
front, back = slice(None, n), slice(n, None)
true_shaps_data = (true_data_shaps[front] + true_data_shaps[back]) / 2
true_shaps_beta = (true_beta_shaps[front] + true_beta_shaps[back]) / 2


In [None]:
# eps1 is swept over ratio / multiplier for every integer ratio in
# [int(eps1_range[0] * multiplier), int(eps1_range[1] * multiplier)),
# i.e. in steps of 1 / multiplier with the upper bound excluded.
eps1_range = (0.02, 0.25)
multiplier = 50

In [None]:
# Data Shapley: for every method, run and eps1, count how many positions k have
# |estimate[k] - estimate[k + n]| above the eps1-dependent threshold.
all_counts = [[] for _ in methods]
mean_counts = []
sem_counts = []


def get_over_threshold_count(first, second, i):
    """Return how many entries of |first - second| exceed the current threshold.

    The threshold is eps1 * (normalised true_shaps_data) + eps2, with eps1 and
    eps2 read from the enclosing sweep at call time. `i` is accepted but unused.
    """
    reference = true_shaps_data / sum(true_shaps_data) # normalize shaps
    gap = abs(first - second)
    return sum(gap > reference * eps1 + eps2)


ratio_lo = int(eps1_range[0] * multiplier)
ratio_hi = int(eps1_range[1] * multiplier)
for ratio in range(ratio_lo, ratio_hi):
    eps1 = ratio / multiplier
    eps2 = eps1 * xi

    counts = [[] for _ in methods]
    for method_idx, entry in enumerate(res_data_shap):
        _, mcs_runs, _, _, _, _ = entry
        for run in range(repeat_num):
            estimate = np.asarray(get_shaps(mcs_runs[run]))
            estimate /= sum(estimate) # normalize shaps
            counts[method_idx].append(
                get_over_threshold_count(estimate[:n], estimate[n:], method_idx)
            )
    for method_idx, per_run in enumerate(counts):
        all_counts[method_idx].append(per_run)

# Per method: rows become runs, columns eps1 values; then aggregate over runs.
for method_idx, per_method in enumerate(all_counts):
    per_method = np.asarray(per_method).T
    all_counts[method_idx] = per_method
    mean_counts.append(per_method.mean(axis=0))
    sem_counts.append(sem(per_method, axis=0))

In [None]:
# Beta Shapley: for every method, run and eps1, count how many positions k have
# |estimate[k] - estimate[k + n]| above the eps1-dependent threshold.
all_counts_beta_shap = [[] for _ in range(len(methods))]
mean_counts_beta_shap = []
sem_counts_beta_shap = []

for ratio in range(int(eps1_range[0] * multiplier), int(eps1_range[1] * multiplier)):
    eps1 = ratio / multiplier
    eps2 = eps1 * xi

    # NOTE: this rebinds the name also used by the data-Shapley cell; after this
    # cell runs, the helper compares against the beta reference values.
    def get_over_threshold_count(first, second, i):
        """Return how many entries of |first - second| exceed the current threshold.

        The threshold is eps1 * (normalised true_shaps_beta) + eps2 for the eps1
        of the current sweep step. `i` is accepted but unused.
        """
        means = true_shaps_beta / sum(true_shaps_beta) # normalize shaps
        diff = abs(first - second)
        thres = means * eps1 + eps2
        return sum(diff > thres)

    counts = [[] for _ in range(len(methods))]
    # Iterate the beta-Shapley results. This loop previously walked
    # res_data_shap, so the "beta" statistics were computed from data-Shapley
    # estimates measured against beta-Shapley reference values.
    for i, item in enumerate(res_beta_shap):
        method, all_mcs, _, _, _, _ = item
        for j in range(repeat_num):
            shaps_active = np.asarray(get_shaps(all_mcs[j]))
            shaps_active /= sum(shaps_active) # normalize shaps
            first = shaps_active[:n]
            second = shaps_active[n:]
            count = get_over_threshold_count(first, second, i)
            counts[i].append(count)
    for i, count in enumerate(counts):
        all_counts_beta_shap[i].append(count)

# Per method: rows become runs, columns eps1 values; then aggregate over runs.
for i, count in enumerate(all_counts_beta_shap):
    all_counts_beta_shap[i] = np.asarray(count).T
    mean_counts_beta_shap.append(all_counts_beta_shap[i].mean(axis=0))
    sem_counts_beta_shap.append(sem(all_counts_beta_shap[i], axis=0))

In [None]:
# Data Shapley: proportion of positions over the threshold as a function of eps1.
plt.figure(figsize=(8,6))
n = num_datapoints

# Integer ratio grid of the sweep (eps1 = ratio / multiplier); entry k of every
# mean/sem curve belongs to ratios[k].
ratios = np.arange(int(eps1_range[0]*multiplier), int(eps1_range[1]*multiplier))
x = ratios[::2]  # label every second grid point
x_ticks = x / multiplier

plt.xticks(x, x_ticks)

for i in range(len(methods)):
    method = methods[i]
    if method == 'owen':
        method = 'Owen' # capitalize
    if method == 'random':
        method = 'MC'
    if method == 'kernel':
        continue
    if method.startswith('active'):
        alpha = int(method.split('-')[-1])
        method = rf'Ours ($\alpha$ = {alpha})'
    # Plot against the ratio grid itself. Without explicit x values matplotlib
    # places the points at 0..len-1 while the ticks sit at the ratio values, so
    # every tick label was shifted by one grid step relative to its data point.
    plt.plot(ratios, mean_counts[i] / n, label = method, linestyle=linestyles[i], c=f'C{i}')
    # Shaded band: mean +/- one standard error, as a proportion of n.
    plt.fill_between(ratios, (mean_counts[i] - sem_counts[i]) / n,
        (mean_counts[i] + sem_counts[i]) / n, alpha=0.3, color=f'C{i}')
plt.xlabel(r'$\epsilon_1$')
plt.ylabel(r"avg. prop. $|\varphi_i - \varphi_j| > t$")
plt.legend(loc='upper left')
plt.savefig(f"../figs/threshold_ratios_data_shap_{dataset_name}_{model_name}_{num_datapoints}.pdf", format='pdf', dpi=300, bbox_inches='tight')

In [None]:
# Beta Shapley: proportion of positions over the threshold as a function of eps1.
plt.figure(figsize=(8,6))
n = num_datapoints

# Integer ratio grid of the sweep (eps1 = ratio / multiplier); entry k of every
# mean/sem curve belongs to ratios[k].
ratios = np.arange(int(eps1_range[0]*multiplier), int(eps1_range[1]*multiplier))
x = ratios[::2]  # label every second grid point
x_ticks = x / multiplier

plt.xticks(x, x_ticks)

for i in range(len(methods)):
    method = methods[i]
    if method == 'owen':
        method = 'Owen' # capitalize
    if method == 'random':
        method = 'MC'
    if method == 'kernel':
        continue
    if method.startswith('active'):
        alpha = int(method.split('-')[-1])
        method = rf'Ours ($\alpha$ = {alpha})'
    # Plot against the ratio grid itself. Without explicit x values matplotlib
    # places the points at 0..len-1 while the ticks sit at the ratio values, so
    # every tick label was shifted by one grid step relative to its data point.
    plt.plot(ratios, mean_counts_beta_shap[i] / n, label = method, linestyle=linestyles[i], c=f'C{i}')
    # Shaded band: mean +/- one standard error, as a proportion of n.
    plt.fill_between(ratios, (mean_counts_beta_shap[i] - sem_counts_beta_shap[i]) / n,
        (mean_counts_beta_shap[i] + sem_counts_beta_shap[i]) / n, alpha=0.3, color=f'C{i}')
plt.xlabel(r'$\epsilon_1$')
plt.ylabel(r"avg. prop. $|\varphi_i - \varphi_j| > t$")
plt.legend()
plt.savefig(f"../figs/threshold_ratios_beta_shap_{dataset_name}_{model_name}_{num_datapoints}.pdf", format='pdf', dpi=300, bbox_inches='tight')

In [None]:
"""
Consider sum of ratio
"""
def get_sum_ratio(first, second, smoothing=None):
    """Sum, over paired entries, the larger of the two smoothed magnitude ratios.

    For each position k the term is
    max((|first[k]| + s) / (|second[k]| + s), (|second[k]| + s) / (|first[k]| + s)),
    so every term is >= 1 and equals 1 when the two magnitudes agree.

    Args:
        first: sequence of numbers.
        second: sequence of numbers with the same length as `first`.
        smoothing: additive term `s`; when None the notebook-level `xi` is used,
            which is the behaviour of the original two-argument call.

    Returns:
        The sum of the per-position ratios (0 for empty input).

    Raises:
        ValueError: if the two sequences differ in length. A mismatch was
            previously either silently truncated or surfaced as an IndexError.
    """
    if smoothing is None:
        smoothing = xi
    if len(first) != len(second):
        raise ValueError(
            f"first and second must have the same length, got {len(first)} and {len(second)}"
        )
    sum_ratio = 0
    for a, b in zip(first, second):
        a_mag = abs(a) + smoothing
        b_mag = abs(b) + smoothing
        sum_ratio += max(a_mag / b_mag, b_mag / a_mag)
    return sum_ratio

# Per method and run: sum ratio between the two halves of the normalised Shapley
# estimate, first for the data-Shapley results and then for the beta-Shapley ones.
sum_ratios = [[] for _ in methods]
sum_ratios_beta_shap = [[] for _ in methods]

for results, collected in ((res_data_shap, sum_ratios), (res_beta_shap, sum_ratios_beta_shap)):
    for method_idx, entry in enumerate(results):
        _, mcs_runs, _, _, _, _ = entry
        n = num_datapoints
        for run in range(repeat_num):
            estimate = np.asarray(get_shaps(mcs_runs[run]))
            estimate /= sum(estimate) # normalize shapley value
            collected[method_idx].append(get_sum_ratio(estimate[:n], estimate[n:]))

In [None]:
# Box plot of the log sum ratios for the data-Shapley results, one box per method
# with the 'kernel' row removed.
sum_ratios = np.asarray(sum_ratios)
kernel_index = methods.index('kernel')
deleted_sum_ratios = np.delete(sum_ratios, kernel_index, axis=0)

plt.figure(figsize=(8,6))
sns.set_style(style='white')

# Colours skip 'C4' (the list position of 'kernel'), matching the remaining rows.
box_colours = ['C0', 'C1', 'C2', 'C3', 'C5', 'C6', 'C7', 'C8']
log_ratios = np.log(deleted_sum_ratios).T
sns.boxplot(data=log_ratios, showfliers=False, palette=box_colours)

xlabels=['MC', 'strat.', 'Owen', 'Sobol', r'$\alpha=0$', r'$\alpha=2$', r'$\alpha=5$', r'$\alpha=100$']
tick_positions = list(range(len(xlabels)))
plt.xticks(tick_positions, xlabels, rotation=20)
plt.ylabel("Log Sum Ratio")
plt.savefig(f"../figs/log_sum_ratio_{dataset_name}_{model_name}_{num_datapoints}_boxplot.pdf", format='pdf', dpi=300, bbox_inches='tight')