In [8]:
import os
import re
import pandas as pd
import numpy as np
from itertools import product
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from tensorboard.backend.event_processing import event_accumulator
import pickle

In [9]:
def parse_tensorboard(path, scalars = None):
    """returns a dictionary of pandas dataframes for each requested scalar"""
    ea = event_accumulator.EventAccumulator(
        path,
        size_guidance={event_accumulator.SCALARS: 0},
    )
    _absorb_print = ea.Reload()
    # make sure the scalars are in the event accumulator tags
    # assert all(
    #     s in ea.Tags()["scalars"] for s in scalars
    # ), "some scalars were not found in the event accumulator"
    res = {}
    if scalars is None:
        scalars = ea.Tags()["scalars"]
    for s in scalars:
        try:
            res[s] = pd.DataFrame(ea.Scalars(s))
        except:
            continue
    return res

In [10]:
def get_scalar_stats(
    exp_results,
    strategy = "max", # max, min, last, argmin, argmax
    idx_results = None,
    return_stats = True
):

    if strategy == "argmin" or strategy == "argmax":
        
        assert idx_results is not None
        get_fn_idx = lambda x: (x.argmax() if strategy == "argmax" else x.argmin())
        idx_list = [get_fn_idx(x["value"]) for x in idx_results]
        res = np.array([x["value"].iloc[i] for i, x in zip(idx_list, exp_results)])

    else:

        if strategy == "max":
            get_fn = lambda x: x.max()
        elif strategy == "min":
            get_fn = lambda x: x.max()
        elif strategy == "last":
            get_fn = lambda x: x.iloc[-1]
        else:
            raise Exception(f"strategy {strategy} undefined")
        res = np.array([get_fn(x["value"]) for x in exp_results])

    if return_stats:
        return np.array([res.mean(), res.std()])
    else:
        return res


def get_scalar_stats_wrapper_max(results, exp_name, scalar):
    exp_results = results[exp_name][scalar]
    return get_scalar_stats(exp_results, strategy="max")   

def get_scalar_stats_wrapper_last(results, exp_name, scalar):
    exp_results = results[exp_name][scalar]
    return get_scalar_stats(exp_results, strategy="last")

def get_scalar_stats_wrapper_last_approx(results, exp_name, scalar):
    matches = [s for s in results[exp_name].keys() if scalar in s]
    exp_results = results[exp_name][matches[0]]
    return get_scalar_stats(exp_results, strategy="last") 

def get_scalar_stats_wrapper_argmin(results, exp_name, scalar, scalar_idx):
    exp_results = results[exp_name][scalar]
    idx_results = results[exp_name][scalar_idx]
    return get_scalar_stats(exp_results, strategy="argmin", idx_results=idx_results)    

In [11]:
scalars = [
    "val/acc_task",
    "val/loss_task",
    "val/balanced_acc_adv_attack",
    "val/loss_adv_attack",
    "zero_ratio"
]
# scalars = [
#     'val/acc_task_eval',
#     'val/loss_task_eval',
#     'val/balanced_acc_adv_attack_gender',
#     'val/loss_adv_attack_gender',
#     'val/balanced_acc_adv_attack_age',
#     'val/loss_adv_attack_age'
# ]

In [12]:
model_name = "bert_uncased_L-4_H-256_A-4" # "bert-base-uncased" # "bert-base-uncased"  # "bert_uncased_L-4_H-256_A-4"
folder = "/home/deepak/sparse_transformers/logs_hatespeech"
experiment_names = set([re.sub(r"(?<=seed)[\d]+", "{}", n) for n in os.listdir(folder)]) # remove seed suffix
results = {}
for f in os.listdir(folder):
    exp_name = re.sub(r"(?<=seed)[\d]+", "{}", f)
    filepath = os.path.join(folder, f)
    df = parse_tensorboard(filepath)
    try:
        results[exp_name].append(df)
    except KeyError:
        results[exp_name] = [df]

results = {k.replace("-" + model_name, "").replace("-seed{}", ""):v for k,v in results.items()}

In [None]:
# with open("results.pkl", "wb") as f:
#     pickle.dump(results, f)

In [None]:
# with open("results.pkl", "rb") as f:
#     results = pickle.load(f)

In [13]:
results_prep = {}
for exp, data in results.items():
    results_prep[exp] = {}
    for res_seed in data:
        for k, v in res_seed.items():
            for s in scalars:
                if s in k:
                    try:
                        results_prep[exp][k].append(v)
                    except:
                        results_prep[exp][k] = [v]

In [14]:
for s in results_prep.keys():
    print(s)

adverserial-baseline-64-2e-05-weighted_loss_prot-dialect
adverserial-Adp-64-2e-05-weighted_loss_prot-dialect
adverserial-Adp_fusion-64-2e-05-weighted_loss_prot-dialect
adverserial-Adp_prot-64-2e-05-weighted_loss_prot-dialect
task-Adp-64-2e-05
task-baseline-64-2e-05


In [8]:
# remove cp_init
results_prep = {k.replace("only_adv_attack_", ""):v for k,v in results_prep.items()}

In [100]:
# remove cp_init
# results_prep = {k.replace("-cp_init", "").replace("-freeze_task_head_cp", ""):v for k,v in results_prep.items()}

In [101]:
# drop no_cooldown experiments
# results_prep = {k:v for k,v in results_prep.items() if "no_cooldown" not in k}

In [9]:
# sort values
results_prep = {k:results_prep[k] for k in sorted(results_prep.keys())}

In [10]:
list(results_prep.keys())

['adv_0.05_additive_seed{}', 'mod_0.05_additive_seed{}']

In [11]:
set([k for d in results_prep.values() for k in d.keys()])

{'val/acc_task_eval',
 'val/balanced_acc_adv_attack_age',
 'val/balanced_acc_adv_attack_gender',
 'val/loss_adv_attack_age',
 'val/loss_adv_attack_gender',
 'val/loss_task_eval'}

In [12]:
adv_keys = {
    "pan16": ["gender", "age", "all"],
    "bios": ["gender", "all"],
    "hatespeech": ["dialect", "all"]
}

key_map = {
    'task-baseline': [
        'val/acc_task',
        ['val/balanced_acc_adv_attack_task_emb_target_key_{}', 'val/loss_adv_attack_task_emb_target_key_{}']
    ],
    'task-diff_pruning': [
        'val/acc_task',
        ['val/balanced_acc_adv_attack_task_emb_target_key_{}', 'val/loss_adv_attack_task_emb_target_key_{}'],
        'train/zero_ratio_adv'
    ],
    'adverserial-baseline': [
        'val/acc_task_debiased',
        ['val/balanced_acc_adv_attack_task_emb_target_key_{}', 'val/loss_adv_attack_task_emb_target_key_{}'],
        ['val/balanced_acc_adv_attack_adv_emb_{}_target_key_{}', 'val/loss_adv_attack_adv_emb_{}_target_key_{}'],
    ],
    'adverserial-diff_pruning': [
        'val/acc_task_debiased',
        ['val/balanced_acc_adv_attack_task_emb_target_key_{}', 'val/loss_adv_attack_task_emb_target_key_{}'],
        ['val/balanced_acc_adv_attack_adv_emb_{}_target_key_{}', 'val/loss_adv_attack_adv_emb_{}_target_key_{}'],
        'train/zero_ratio_adv'
    ],
    
}


keys_merged_masks = [
    ['val/acc_task_eval', 'val/loss_task_eval'],
    ['val/balanced_acc_adv_attack_gender', 'val/loss_adv_attack_gender'],
    ['val/balanced_acc_adv_attack_age', 'val/loss_adv_attack_age']
]


keys_modular_pan16 = [
    'val/acc_task',
    'val/acc_task_debiased_gender',
    'val/acc_task_debiased_age',
    ['val/balanced_acc_adv_attack_task_emb_target_key_gender', 'val/loss_adv_attack_task_emb_target_key_gender'],
    ['val/balanced_acc_adv_attack_task_emb_target_key_age', 'val/loss_adv_attack_task_emb_target_key_age'],
    ['val/balanced_acc_adv_attack_adv_emb_gender_target_key_gender', 'val/loss_adv_attack_adv_emb_gender_target_key_gender'],
    ['val/balanced_acc_adv_attack_adv_emb_gender_target_key_age', 'val/loss_adv_attack_adv_emb_gender_target_key_age'],
    ['val/balanced_acc_adv_attack_adv_emb_age_target_key_gender', 'val/loss_adv_attack_adv_emb_age_target_key_gender'],
    ['val/balanced_acc_adv_attack_adv_emb_age_target_key_age', 'val/loss_adv_attack_adv_emb_age_target_key_age'],
    'train/zero_ratio_adv_gender',
    'train/zero_ratio_adv_age'
]

keys_modular_pan16_adv_merged = [
    'val/acc_task',
    'val/acc_task_debiased',
    ['val/balanced_acc_adv_attack_task_emb_target_key_gender', 'val/loss_adv_attack_task_emb_target_key_gender'],
    ['val/balanced_acc_adv_attack_task_emb_target_key_age', 'val/loss_adv_attack_task_emb_target_key_age'],
    ['val/balanced_acc_adv_attack_adv_emb_all_target_key_gender', 'val/loss_adv_attack_adv_emb_all_target_key_gender'],
    ['val/balanced_acc_adv_attack_adv_emb_all_target_key_age', 'val/loss_adv_attack_adv_emb_all_target_key_age'],
    'train/zero_ratio_adv',
]


keys_modular_bios = [
    'val/acc_task',
    'val/acc_task_debiased_gender',
    ['val/balanced_acc_adv_attack_task_emb_target_key_gender', 'val/loss_adv_attack_task_emb_target_key_gender'],
    ['val/balanced_acc_adv_attack_adv_emb_gender_target_key_gender', 'val/loss_adv_attack_adv_emb_gender_target_key_gender'],
    'train/zero_ratio_adv_gender'
]


keys_modular_hatespeech = [
    'val/acc_task',
    'val/acc_task_debiased_dialect',
    ['val/balanced_acc_adv_attack_task_emb_target_key_dialect', 'val/loss_adv_attack_task_emb_target_key_dialect'],
    ['val/balanced_acc_adv_attack_adv_emb_dialect_target_key_dialect', 'val/loss_adv_attack_adv_emb_dialect_target_key_dialect'],
    'train/zero_ratio_adv_dialect'
]

In [13]:
def fill_key_map(ds):
    key_map_ = {}
    for k,v in key_map.items():
        tmp = [v[0]] + [[y.format(x) for y in v[1]] for x in adv_keys[ds]]
        if "adv" in k:
            for a, b in product(adv_keys[ds], adv_keys[ds]):
                tmp.append([y.format(a, b) for y in v[2]])
        if "diff" in k:
            tmp.append(v[-1])
        key_map_[k] = tmp
    return key_map_

In [14]:
# pan16 standard
key_map_pan16 = fill_key_map("pan16")
for exp, data in results_prep.items():
    print(exp)
    for exp_str, keys in key_map_pan16.items():
        if exp_str in exp:
            for k in keys:
                try:
                    check = k in data.keys()
                    k_name = k
                    fn = lambda results_prep, exp: get_scalar_stats_wrapper_last(results_prep, exp, k)
                except TypeError:
                    check = k[0] in data.keys()
                    k_name = k[0]
                    fn = lambda results_prep, exp: get_scalar_stats_wrapper_argmin(results_prep, exp, *k)
                if check:
                    res = fn(results_prep, exp)
                    print(f"{k_name}: {res[0]:.3f} +- {res[1]:.3f}")
    print("\n")

                #if check:
        #         acc_task = get_scalar_stats_wrapper_last(results_prep, exp, keys[0])
        # bacc_g = get_scalar_stats_wrapper_argmin(results_prep, exp, *keys[1])
        # bacc_a = get_scalar_stats_wrapper_argmin(results_prep, exp, *keys[2])
        # print(exp)
        # print(f"acc task: {acc_task[0]:.3f} +- {acc_task[1]:.3f}")
        # print(f"bacc attack gender: {bacc_g[0]:.3f} +- {bacc_g[1]:.3f}")
        # print(f"bacc attack age: {bacc_a[0]:.3f} +- {bacc_a[1]:.3f}")
        # if "diff" in exp:
        #     zero_ratio = get_scalar_stats_wrapper_last_approx(results_prep, exp, "zero_ratio")
        #     print(f"zero ratio: {zero_ratio[0]:.3f} +- {zero_ratio[1]:.3f}")
        # print("\n")
    

adv_0.05_additive_seed{}


mod_0.05_additive_seed{}




In [82]:
# bios and hatespeech
km = fill_key_map("bios")
for exp, data in results_prep.items():
    print(exp)
    for exp_str, keys in km.items():
        if exp_str in exp:
            for k in keys:
                try:
                    check = k in data.keys()
                    k_name = k
                    fn = lambda results_prep, exp: get_scalar_stats_wrapper_last(results_prep, exp, k)
                except TypeError:
                    check = k[0] in data.keys()
                    k_name = k[0]
                    fn = lambda results_prep, exp: get_scalar_stats_wrapper_argmin(results_prep, exp, *k)
                if check:
                    res = fn(results_prep, exp)
                    print(f"{k_name}: {res[0]:.3f} +- {res[1]:.3f}")
    print("\n")

adverserial-diff_pruning_0.01-64-2e-05-sp_pen1.25e-07-cp_init-weighted_loss_prot-gender
val/acc_task_debiased: 0.845 +- 0.001
val/balanced_acc_adv_attack_adv_emb_all_target_key_gender: 0.616 +- 0.007
train/zero_ratio_adv: 0.990 +- 0.000




In [94]:
# pan16 modular
for exp in results_prep.keys():
    print(exp)

    if "adv_merged" in exp:
        km = keys_modular_pan16_adv_merged
        acc_task = get_scalar_stats_wrapper_last(results_prep, exp, km[0])
        acc_task_deb = get_scalar_stats_wrapper_last(results_prep, exp, km[1])
        bacc_g_task_emb = get_scalar_stats_wrapper_argmin(results_prep, exp, *km[2])
        bacc_a_task_emb = get_scalar_stats_wrapper_argmin(results_prep, exp, *km[3])
        bacc_g_adv_emb = get_scalar_stats_wrapper_argmin(results_prep, exp, *km[4])
        bacc_a_adv_emb = get_scalar_stats_wrapper_argmin(results_prep, exp, *km[5])
        zero_ratio = get_scalar_stats_wrapper_last(results_prep, exp, km[6])
        print(f"acc task: {acc_task[0]:.3f} +- {acc_task[1]:.3f}")
        print(f"acc task debiased: {acc_task_deb[0]:.3f} +- {acc_task_deb[1]:.3f}")
        print(f"bacc attack gender - task emb: {bacc_g_task_emb[0]:.3f} +- {bacc_g_task_emb[1]:.3f}")
        print(f"bacc attack age - task emb: {bacc_a_task_emb[0]:.3f} +- {bacc_a_task_emb[1]:.3f}")
        print(f"bacc attack gender - adv emb: {bacc_g_adv_emb[0]:.3f} +- {bacc_g_adv_emb[1]:.3f}")
        print(f"bacc attack age - adv emb: {bacc_a_adv_emb[0]:.3f} +- {bacc_a_adv_emb[1]:.3f}")
        print(f"zero ratio: {zero_ratio[0]:.3f} +- {zero_ratio[1]:.3f}")
    else:
        km = keys_modular_pan16
        acc_task = get_scalar_stats_wrapper_last(results_prep, exp, km[0])
        acc_task_g = get_scalar_stats_wrapper_last(results_prep, exp, km[1])
        acc_task_a = get_scalar_stats_wrapper_last(results_prep, exp, km[2])
        bacc_g_task_emb = get_scalar_stats_wrapper_argmin(results_prep, exp, *km[3])
        bacc_a_task_emb = get_scalar_stats_wrapper_argmin(results_prep, exp, *km[4])
        bacc_g_gender_emb = get_scalar_stats_wrapper_argmin(results_prep, exp, *km[5])
        bacc_a_gender_emb = get_scalar_stats_wrapper_argmin(results_prep, exp, *km[6])
        bacc_g_age_emb = get_scalar_stats_wrapper_argmin(results_prep, exp, *km[7])
        bacc_a_age_emb = get_scalar_stats_wrapper_argmin(results_prep, exp, *km[8])
        zero_ratio_g = get_scalar_stats_wrapper_last(results_prep, exp, km[9])
        zero_ratio_a = get_scalar_stats_wrapper_last(results_prep, exp, km[10])
        print(f"acc task: {acc_task[0]:.3f} +- {acc_task[1]:.3f}")
        print(f"acc task debiased gender: {acc_task_g[0]:.3f} +- {acc_task_g[1]:.3f}")
        print(f"acc task debiased age: {acc_task_a[0]:.3f} +- {acc_task_a[1]:.3f}")
        print(f"bacc attack gender - task emb: {bacc_g_task_emb[0]:.3f} +- {bacc_g_task_emb[1]:.3f}")
        print(f"bacc attack age - task emb: {bacc_a_task_emb[0]:.3f} +- {bacc_a_task_emb[1]:.3f}")
        print(f"bacc attack gender - g emb: {bacc_g_gender_emb[0]:.3f} +- {bacc_g_gender_emb[1]:.3f}")
        print(f"bacc attack age - g emb: {bacc_a_gender_emb[0]:.3f} +- {bacc_a_gender_emb[1]:.3f}")
        print(f"bacc attack gender - a emb: {bacc_g_age_emb[0]:.3f} +- {bacc_g_age_emb[1]:.3f}")
        print(f"bacc attack age - a emb: {bacc_a_age_emb[0]:.3f} +- {bacc_a_age_emb[1]:.3f}")
        print(f"zero ratio - g emb: {zero_ratio_g[0]:.3f} +- {zero_ratio_g[1]:.3f}")
        print(f"zero ratio - a emb: {zero_ratio_a[0]:.3f} +- {zero_ratio_a[1]:.3f}")
        
    print("\n")

modular-diff_pruning_0.01-64-2e-05-sp_pen1.25e-07-weighted_loss_prot-gender


KeyError: 'val/acc_task_debiased_age'

In [15]:
# pan16 merged masks
for exp in results_prep.keys():
    acc_task = get_scalar_stats_wrapper_argmin(results_prep, exp, *keys_merged_masks[0])
    bacc_gender = get_scalar_stats_wrapper_argmin(results_prep, exp, *keys_merged_masks[1])
    bacc_age = get_scalar_stats_wrapper_argmin(results_prep, exp, *keys_merged_masks[2])
    print(exp)
    print(f"acc task: {acc_task[0]:.3f} +- {acc_task[1]:.3f}")
    print(f"bacc attack gender: {bacc_gender[0]:.3f} +- {bacc_gender[1]:.3f}")
    print(f"bacc attack age: {bacc_age[0]:.3f} +- {bacc_age[1]:.3f}")
    print("\n")

adv_0.05_additive_seed{}
acc task: 0.910 +- 0.004
bacc attack gender: 0.593 +- 0.006
bacc attack age: 0.373 +- 0.012


mod_0.05_additive_seed{}
acc task: 0.914 +- 0.002
bacc attack gender: 0.607 +- 0.010
bacc attack age: 0.396 +- 0.016




In [14]:
key_map

{'task-baseline': ['val/acc_task',
  ['val/balanced_acc_adv_attack_task_emb_target_key_{}',
   'val/loss_adv_attack_task_emb_target_key_{}']],
 'task-diff_pruning': ['val/acc_task',
  ['val/balanced_acc_adv_attack_task_emb_target_key_{}',
   'val/loss_adv_attack_task_emb_target_key_{}'],
  'train/zero_ratio_adv'],
 'adverserial-baseline': ['val/acc_task_debiased',
  ['val/balanced_acc_adv_attack_task_emb_target_key_{}',
   'val/loss_adv_attack_task_emb_target_key_{}'],
  ['val/balanced_acc_adv_attack_adv_emb_{}_target_key_{}',
   'val/loss_adv_attack_adv_emb_{}_target_key_{}']],
 'adverserial-diff_pruning': ['val/acc_task_debiased',
  ['val/balanced_acc_adv_attack_task_emb_target_key_{}',
   'val/loss_adv_attack_task_emb_target_key_{}'],
  ['val/balanced_acc_adv_attack_adv_emb_{}_target_key_{}',
   'val/loss_adv_attack_adv_emb_{}_target_key_{}'],
  'train/zero_ratio_adv']}

In [106]:
# bios or hatespeech
ds = "hatespeech"
pa = "gender" if ds=="bios" else "dialect"
for exp, keys in km.items():
    if exp in results_prep:
        acc_task = get_scalar_stats_wrapper_last(results_prep, exp, keys[0])
        acc_d = get_scalar_stats_wrapper_argmin(results_prep, exp, *keys[1])
        print(exp)
        print(f"acc task: {acc_task[0]:.3f} +- {acc_task[1]:.3f}")
        print(f"bacc attack {pa}: {acc_d[0]:.3f} +- {acc_d[1]:.3f}")
        if "diff" in exp:
            zero_ratio = get_scalar_stats_wrapper_last_approx(results_prep, exp, "zero_ratio")
            print(f"zero ratio: {zero_ratio[0]:.3f} +- {zero_ratio[1]:.3f}")
        print("\n")
    

AttributeError: 'list' object has no attribute 'items'

In [107]:
# bios or hatespeech modular
# pan16 modular
ds = "hatespeech"
km = keys_modular_bios if ds=="bios" else keys_modular_hatespeech
for exp in results_prep.keys():
    acc_task = get_scalar_stats_wrapper_last(results_prep, exp, km[0])
    acc_task_debiased = get_scalar_stats_wrapper_last(results_prep, exp, km[1])
    bacc_prot_task_emb = get_scalar_stats_wrapper_argmin(results_prep, exp, *km[2])
    bacc_prot_prot_emb = get_scalar_stats_wrapper_argmin(results_prep, exp, *km[3])
    zero_ratio_prot_emb = get_scalar_stats_wrapper_last(results_prep, exp, km[4])
    print(exp)
    print(f"acc task: {acc_task[0]:.3f} +- {acc_task[1]:.3f}")
    print(f"acc task debiased: {acc_task_debiased[0]:.3f} +- {acc_task_debiased[1]:.3f}")
    print(f"bacc attack - task emb: {bacc_prot_task_emb[0]:.3f} +- {bacc_prot_task_emb[1]:.3f}")
    print(f"bacc attack - protected emb: {bacc_prot_prot_emb[0]:.3f} +- {bacc_prot_prot_emb[1]:.3f}")
    print(f"zero ratio - protected emb: {zero_ratio_prot_emb[0]:.3f} +- {zero_ratio_prot_emb[1]:.3f}")
    print("\n")

modular-diff_pruning_0.01-64-2e-05-sp_pen1.25e-07-weighted_loss_prot-dialect
acc task: 0.814 +- 0.005
acc task debiased: 0.812 +- 0.006
bacc attack - task emb: 0.872 +- 0.013
bacc attack - protected emb: 0.754 +- 0.037
zero ratio - protected emb: 0.998 +- 0.000


