In [2]:
import sys
sys.path.insert(0,'..')

import yaml
import os
from yaml import Loader as Loader
from pathlib import Path
import pandas as pd
import numpy as np
import json
from sklearn.metrics import roc_auc_score

from analyze_results import (
    extract_result,
    aggregate_runs,
    from_model_outputs_calc_rcc_auc,
    format_results2,
    improvement_over_baseline,
    from_model_outputs_calc_pr_auc,
    from_model_outputs_calc_rpp,
    from_model_outputs_calc_roc_auc,
    from_model_outputs_calc_arc_auc
)

from utils.utils_wandb import init_wandb, wandb
from ue4nlp.ue_scores import *

In [3]:
def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return from_model_outputs_calc_arc_auc
    if metric_type == "roc-auc":
        return from_model_outputs_calc_roc_auc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc

    elif metric_type == "pr-auc":
        return from_model_outputs_calc_pr_auc

    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp

    else:
        raise ValueError("Wrong metric type!")


def get_one_table(runs_dir, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], baseline=None, methods=None):
    default_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
        #"var.ratio": var_ratio,
        #"sampled_entropy": mean_entropy,
    }
    
    if methods is None:
        methods = default_methods

    table = []
    for metric_type in metric_types:
        metric = choose_metric(metric_type=metric_type)

        agg_res = aggregate_runs(
            runs_dir, methods=methods, metric=metric
        )

        if agg_res.empty:
            print("Broken\n")
            continue

        if metric_type == "rcc-auc":
            final_score = format_results2(agg_res, percents=False)
        elif metric_type == "rpp":
            final_score = format_results2(agg_res, percents=True)
        elif metric_type == "accuracy":
            final_score = format_results2(agg_res, percents=True)
        elif metric_type == "ece":
            final_score = format_results2(agg_res, percents=True)
        elif metric_type == "sce":
            final_score = format_results2(agg_res, percents=True)
        else:
            final_score = improvement_over_baseline(agg_res, baseline_col="max_prob", baseline=baseline, metric=metric_type, percents=True, subtract=True)
        table.append(final_score)
    res_table = pd.concat(table, axis=1)
    res_table.columns = metric_types
    # fix for rcc-auc and rpp
    if 'baseline (max_prob)' not in res_table.index:
        res_table.loc['baseline (max_prob)'] = 0
    for metric in ['rcc-auc', 'rpp']:
        try:
            res_table[metric].loc['baseline (max_prob)'] = res_table[metric].loc['max_prob']
        except:
            pass
    try:
        res_table = res_table.drop(['max_prob', 'count'])
    except:
        res_table = res_table.drop(['max_prob'])
    return res_table


def collect_tables(run_dirs, names, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], baseline=None, methods=None):
    all_tables = []
    for run_dir, name in zip(run_dirs, names):
        buf_table = get_one_table(run_dir, metric_types, baseline, methods)
        #print(buf_table)
        # add name to index
        indices = [(name, ind) for ind in list(buf_table.index)]
        baseline_name = 'baseline|'+'|'.join(name.split('|')[1:])
        buf_table.loc[baseline_name] = buf_table.loc['baseline (max_prob)']
        # add reindex
        indices = indices + [(baseline_name, 'max_prob')]
        
        index = pd.MultiIndex.from_tuples(indices, names=['Method', 'UE Score'])
        buf_table.index = index
        buf_table.drop((name, 'baseline (max_prob)'), inplace=True)
        # add buf_table to final_table
        all_tables.append(buf_table)
    return pd.concat(all_tables)


def collect_datasets(runs_dirs, names, dataset_names, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], baselines={}, methods=None):
    all_tables = []
    for run_dir, dataset_name in zip(runs_dirs, dataset_names):
        #try:
        dataset_table = collect_tables(run_dir, names, metric_types, baselines.get(dataset_name, None), methods=methods)
        columns = pd.MultiIndex.from_tuples([(dataset_name, ind) for ind in list(dataset_table.columns)])
        dataset_table.columns = columns
        all_tables.append(dataset_table)
        #except:
        #    print(f'empty dir {run_dir}')
    return pd.concat(all_tables, axis=1)

In [3]:
import os 

default_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw']
dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
raw_baselines = {}
for ds_fname, ds_name in zip(dataset_fnames, dataset_names):
    #model_series_dir = f'../workdir/run_glue_for_model_series/electra-raw/{ds_fname}/0.0/ddpp_dpp_0.3_20/'
    model_series_dir = f'../workdir/final_res_det/run_glue_for_model_series/{ds_fname}_electra_raw_no_sn/mc_mahalanobis/'
    table = []
    for metric_type in metric_types:
        metric = choose_metric(metric_type=metric_type)

        agg_res = aggregate_runs(
            model_series_dir, methods=default_methods, metric=metric
        )

        mean_res = agg_res.mean(axis=0)
        final_results = mean_res.T
        table.append(final_results.loc[['max_prob']])
    res_table = pd.concat(table, axis=1)
    res_table.columns = metric_types
    raw_baselines[ds_name] = res_table#
    #raw_baselines[ds_name]={k:v for k,v in zip(res_table.columns.values.tolist(), res_table.values[0].tolist())}

FileNotFoundError: [Errno 2] No such file or directory: '../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_raw_no_sn/mc_mahalanobis/'

# NUQ

In [None]:
import os 


def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"#from_model_outputs_calc_arc_auc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")

        
metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
nuq_total = lambda x: np.squeeze(x[2], axis=-1)
agg_methods = {
    "nuq_aleatoric": nuq_aleatoric,
    "nuq_epistemic": nuq_epistemic,
    "nuq_total": nuq_total,
}

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['nuq']#'nuq',
regs = ['raw', 'reg', 'metric']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['sn', 'no_sn']:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|last|{reg}_{sn}']
            for name in dataset_fnames:
                model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra_{reg}_{sn}/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines, methods=agg_methods)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_nuq = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

/home/user/uncertainty-estimation//workdir/run_glue_for_model_series_sn/electra_raw_sn/mrpc/0.0/nuq
/home/user/uncertainty-estimation//workdir/run_glue_for_model_series_sn/electra_raw_sn/cola/0.0/nuq
/home/user/uncertainty-estimation//workdir/run_glue_for_model_series_sn/electra_raw_sn/sst2/0.0/nuq
/home/user/uncertainty-estimation//workdir/run_glue_for_model_series_sn/electra_raw_no_sn/mrpc/0.0/nuq
/home/user/uncertainty-estimation//workdir/run_glue_for_model_series_sn/electra_raw_no_sn/cola/0.0/nuq
/home/user/uncertainty-estimation//workdir/run_glue_for_model_series_sn/electra_raw_no_sn/sst2/0.0/nuq
/home/user/uncertainty-estimation//workdir/run_glue_for_model_series_sn/electra_reg_sn/mrpc/0.0/nuq
/home/user/uncertainty-estimation//workdir/run_glue_for_model_series_sn/electra_reg_sn/cola/0.0/nuq
/home/user/uncertainty-estimation//workdir/run_glue_for_model_series_sn/electra_reg_sn/sst2/0.0/nuq
/home/user/uncertainty-estimation//workdir/run_glue_for_model_series_sn/electra_reg_no_sn/m

In [None]:
def preproc_regs(x):
    reg = x.split('|')[-1].split('_')[0]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw' or reg == 'Deep Ensemble':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'DE'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif x == 'mahalanobis':
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MD'

table_nuq = pd.concat([pd.concat(tables), pd.concat(baselines).iloc[[4,5,2,3,0,1]]])
table_nuq = table_nuq.reset_index()
table_nuq['Reg. Type'] = table_nuq.Method.apply(lambda x: preproc_regs(x))
table_nuq['Method'] = table_nuq.Method.apply(lambda x: preproc_method(x))
table_nuq['UE Score'] = table_nuq['UE Score'].apply(lambda x: preproc_ue(x))
table_nuq = table_nuq[list(table_nuq.columns[:1]) + list(table_nuq.columns[-1:]) + list(table_nuq.columns[1:-1])].reset_index(drop=True)

In [None]:
table_nuq

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,NUQ SN,-,aleatoric,0.86±0.39,13.12±0.95,1.87±0.10,1.77±0.32,44.77±2.69,2.12±0.07,0.33±0.28,12.38±1.44,0.81±0.08
1,NUQ SN,-,epistemic,0.61±0.49,14.94±0.70,2.15±0.11,1.30±0.41,49.94±3.57,2.57±0.21,0.28±0.30,13.15±1.15,0.86±0.08
2,NUQ SN,-,total,0.77±0.44,13.86±0.74,1.96±0.10,1.51±0.37,46.69±3.14,2.36±0.19,0.32±0.29,12.53±1.02,0.82±0.07
3,NUQ,-,aleatoric,0.35±0.20,13.56±0.57,1.83±0.12,0.83±0.24,44.68±3.50,2.05±0.09,0.41±0.25,12.05±2.10,0.80±0.12
4,NUQ,-,epistemic,0.05±0.23,15.49±0.61,2.15±0.14,0.32±0.15,49.69±1.51,2.59±0.12,0.38±0.24,12.13±1.33,0.81±0.09
5,NUQ,-,total,0.27±0.22,14.22±0.53,1.93±0.14,0.42±0.20,47.69±1.82,2.45±0.11,0.40±0.21,11.75±1.72,0.79±0.10
6,NUQ SN,CER,aleatoric,0.82±0.52,13.19±0.63,1.77±0.07,1.09±0.22,51.70±5.19,2.37±0.13,0.15±0.21,12.24±1.61,0.82±0.09
7,NUQ SN,CER,epistemic,0.59±0.57,14.67±0.65,2.03±0.09,0.64±0.18,55.56±4.22,2.84±0.22,0.11±0.21,13.16±1.77,0.87±0.10
8,NUQ SN,CER,total,0.76±0.56,13.63±0.72,1.84±0.11,0.75±0.18,53.92±3.88,2.73±0.21,0.12±0.19,12.49±1.99,0.83±0.11
9,NUQ,CER,aleatoric,0.47±0.20,14.52±2.07,1.95±0.27,0.41±0.18,43.48±3.01,2.00±0.13,0.68±0.32,11.27±1.81,0.78±0.13


# Determenistic methods

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis', 'mc_mahalanobis']#'nuq',
regs = ['raw','reg']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['True']:#['sn', 'no_sn']:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|last|{reg}_{sn}']
            for name in dataset_fnames:
                #model_series_dir = f'../workdir/final_res/run_glue_for_model_series/{name}_electra_{reg}_{sn}/{method}/'\
                model_series_dir = f'/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-{reg}-True/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_det_sn = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-raw-True/mrpc/0.0/mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-raw-True/cola/0.0/mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-raw-True/sst2/0.0/mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-reg-True/mrpc/0.0/mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-reg-True/cola/0.0/mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-reg-True/sst2/0.0/mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-raw-True/mrpc/0.0/mc_mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-raw-True/cola/0.0/mc_mahalanobis
/mnt/users/avazhentsev/unc

In [None]:
table_det_sn = table_det_sn.iloc[[0,1,3,5,6]].reset_index()
table_det_sn.Method = ['MD SN (ours)']*2+['SMD SN (ours)']*2+['SR SN']
table_det_sn['Reg. Type'] = ['-', 'CER', '-', 'CER', '-']

In [None]:
table_det_sn = table_det_sn[list(table_det_sn.columns[[0,-1]]) + list(table_det_sn.columns[list(range(2,11))])]

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']#'nuq',
regs = ['reg', 'raw']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['sn', 'no_sn']:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|last|{reg}_{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/final_res_det/run_glue_for_model_series/{name}_electra_{reg}_{sn}/{method}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_det = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_reg_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_reg_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_electra_reg_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_reg_no_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_reg_no_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_electra_reg_no_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_raw_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_raw_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_electra_raw_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_raw_no_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_raw_no_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_elect

In [None]:
table_det

Unnamed: 0_level_0,Unnamed: 1_level_0,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
mahalanobis|last|reg_sn,mahalanobis_distance,1.05±0.19,12.30±1.44,1.63±0.18,0.22±0.10,52.66±3.11,2.34±0.11,-0.07±0.21,14.93±2.21,1.03±0.14
mahalanobis|last|reg_no_sn,mahalanobis_distance,1.15±0.21,11.42±1.33,1.58±0.17,0.59±0.21,43.39±3.64,2.04±0.20,0.24±0.25,12.90±3.55,0.87±0.23
mahalanobis|last|raw_sn,mahalanobis_distance,0.85±0.20,13.57±1.40,1.84±0.20,0.56±0.07,43.41±1.81,2.05±0.07,0.18±0.25,12.98±2.22,0.88±0.14
mahalanobis|last|raw_no_sn,mahalanobis_distance,0.92±0.23,13.21±1.68,1.75±0.23,0.67±0.06,41.63±1.44,1.96±0.06,0.34±0.44,13.01±2.88,0.89±0.19
baseline|raw_sn,max_prob,91.93±0.51,27.08±5.47,2.86±0.56,90.61±0.25,95.49±10.90,4.06±0.26,93.94±0.30,15.84±4.45,1.06±0.29
baseline|raw_no_sn,max_prob,92.08±0.71,23.28±8.35,2.68±0.68,92.04±0.14,59.04±8.17,2.63±0.17,93.80±0.41,18.07±6.11,1.23±0.41


In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['sngp']
regs = ['raw']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        run_dirs = []
        names = [f'{method}|last|{reg}']
        for name in dataset_fnames:
            if name == 'mrpc':
                model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}-sngp-correct-hp/{name}/0.0/'
            else:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}-sngp/{name}/0.0/'
            print(model_series_dir)
            run_dirs.append([model_series_dir])
        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
        baselines.append(res_df.iloc[-1:])
        tables.append(res_df.iloc[:-1])
table_sngp = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra-raw-sngp-correct-hp/mrpc/0.0/
../workdir/run_glue_for_model_series/electra-raw-sngp/cola/0.0/
../workdir/run_glue_for_model_series/electra-raw-sngp/sst2/0.0/


In [None]:
table_sngp

Unnamed: 0_level_0,Unnamed: 1_level_0,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
sngp|last|raw,sngp,0.70±0.39,14.84±2.80,2.06±0.40,-0.02±0.06,51.87±2.38,2.64±0.05,1.45±0.32,13.88±1.63,0.94±0.10
baseline|raw,max_prob,92.22±0.24,18.26±2.40,2.62±0.22,91.31±0.09,64.62±4.29,3.41±0.09,92.64±0.37,45.53±10.95,2.40±0.39


# MC-Mahalanobis

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mc_mahalanobis']#'nuq',
regs = ['reg', 'raw']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['sn', 'no_sn']:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|last|{reg}_{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/final_res_det/run_glue_for_model_series/{name}_electra_{reg}_{sn}/{method}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_mc_det = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_reg_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_reg_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_electra_reg_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_reg_no_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_reg_no_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_electra_reg_no_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_raw_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_raw_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_electra_raw_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_raw_no_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_raw_no_sn/mc_mahalanobis/
../workdir/final_res_det/run

In [None]:
table_mc_det

Unnamed: 0_level_0,Unnamed: 1_level_0,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
mc_mahalanobis|last|reg_sn,mahalanobis_distance,1.05±0.19,12.30±1.42,1.63±0.18,0.21±0.09,52.70±3.15,2.34±0.11,-0.11±0.23,15.52±2.42,1.07±0.15
mc_mahalanobis|last|reg_sn,sampled_mahalanobis_distance,1.23±0.26,11.74±2.37,1.43±0.24,-0.56±0.13,68.14±4.92,3.15±0.10,-0.05±0.07,16.54±4.37,1.02±0.13
mc_mahalanobis|last|reg_no_sn,mahalanobis_distance,1.15±0.21,11.42±1.33,1.58±0.17,0.59±0.21,43.39±3.64,2.04±0.20,0.18±0.27,14.05±4.21,0.94±0.28
mc_mahalanobis|last|reg_no_sn,sampled_mahalanobis_distance,1.24±0.34,11.38±2.76,1.49±0.29,0.20±0.41,51.32±7.44,2.43±0.39,0.13±0.21,16.32±3.58,0.99±0.19
mc_mahalanobis|last|raw_sn,mahalanobis_distance,0.84±0.21,13.57±1.39,1.84±0.20,0.56±0.07,43.41±1.81,2.05±0.07,0.14±0.25,13.63±2.36,0.93±0.15
mc_mahalanobis|last|raw_sn,sampled_mahalanobis_distance,0.87±0.13,14.51±1.17,1.84±0.14,0.12±0.13,50.95±2.66,2.51±0.10,0.13±0.25,14.70±1.95,0.94±0.08
mc_mahalanobis|last|raw_no_sn,mahalanobis_distance,0.92±0.23,13.21±1.68,1.75±0.23,0.67±0.06,41.63±1.44,1.96±0.06,0.27±0.44,14.07±3.23,0.96±0.22
mc_mahalanobis|last|raw_no_sn,sampled_mahalanobis_distance,1.10±0.25,13.36±2.21,1.57±0.23,0.30±0.13,47.82±3.71,2.35±0.15,0.22±0.43,17.38±3.72,1.02±0.14
baseline|raw_sn,max_prob,91.93±0.51,27.08±5.47,2.86±0.56,90.61±0.25,95.49±10.90,4.06±0.26,93.94±0.30,15.84±4.45,1.06±0.29
baseline|raw_no_sn,max_prob,92.08±0.71,23.28±8.35,2.68±0.68,92.04±0.14,59.04±8.17,2.63±0.17,93.80±0.41,18.07±6.11,1.23±0.41


In [None]:
det_res = pd.concat([table_det.iloc[[3,1,2,0]], table_mc_det.iloc[[7,3,5,1]], table_sngp.iloc[:1], pd.concat(baselines).iloc[[2,0,1,3]]])

In [None]:
det_res = det_res.reset_index()
det_res.Method = ['MD']*2+['MD SN (ours)']*2+['SMD (ours)']*2+['SMD SN (ours)']*2+['SNGP']+['SR SN']*2+['SR']*2
det_res['Reg. Type'] = ['-', 'CER']*4+['-']+['-','CER','CER','-']

In [None]:
det_res = det_res[list(det_res.columns[[0,-1]]) + list(det_res.columns[list(range(2,11))])]

In [None]:
det_res#.iloc[:2]

Unnamed: 0_level_0,Method,Reg. Type,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MD,-,0.92±0.23,13.21±1.68,1.75±0.23,0.67±0.06,41.63±1.44,1.96±0.06,0.34±0.44,13.01±2.88,0.89±0.19
1,MD,CER,1.15±0.21,11.42±1.33,1.58±0.17,0.59±0.21,43.39±3.64,2.04±0.20,0.24±0.25,12.90±3.55,0.87±0.23
2,MD SN (ours),-,0.85±0.20,13.57±1.40,1.84±0.20,0.56±0.07,43.41±1.81,2.05±0.07,0.18±0.25,12.98±2.22,0.88±0.14
3,MD SN (ours),CER,1.05±0.19,12.30±1.44,1.63±0.18,0.22±0.10,52.66±3.11,2.34±0.11,-0.07±0.21,14.93±2.21,1.03±0.14
4,SMD (ours),-,1.10±0.25,13.36±2.21,1.57±0.23,0.30±0.13,47.82±3.71,2.35±0.15,0.22±0.43,17.38±3.72,1.02±0.14
5,SMD (ours),CER,1.24±0.34,11.38±2.76,1.49±0.29,0.20±0.41,51.32±7.44,2.43±0.39,0.13±0.21,16.32±3.58,0.99±0.19
6,SMD SN (ours),-,0.87±0.13,14.51±1.17,1.84±0.14,0.12±0.13,50.95±2.66,2.51±0.10,0.13±0.25,14.70±1.95,0.94±0.08
7,SMD SN (ours),CER,1.23±0.26,11.74±2.37,1.43±0.24,-0.56±0.13,68.14±4.92,3.15±0.10,-0.05±0.07,16.54±4.37,1.02±0.13
8,SNGP,-,0.70±0.39,14.84±2.80,2.06±0.40,-0.02±0.06,51.87±2.38,2.64±0.05,1.45±0.32,13.88±1.63,0.94±0.10
9,SR SN,-,91.93±0.51,27.08±5.47,2.86±0.56,90.61±0.25,95.49±10.90,4.06±0.26,93.94±0.30,15.84±4.45,1.06±0.29


In [None]:
tab_res_det = pd.concat([det_res.iloc[[0,1]], table_det_sn.iloc[[0,1]], det_res.iloc[[4,5,6,7,8]], table_det_sn.iloc[[-1]], det_res.iloc[-3:]])

In [None]:
tab_res_det

Unnamed: 0_level_0,Method,Reg. Type,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MD,-,0.92±0.23,13.21±1.68,1.75±0.23,0.67±0.06,41.63±1.44,1.96±0.06,0.34±0.44,13.01±2.88,0.89±0.19
1,MD,CER,1.15±0.21,11.42±1.33,1.58±0.17,0.59±0.21,43.39±3.64,2.04±0.20,0.24±0.25,12.90±3.55,0.87±0.23
0,MD SN (ours),-,0.86±0.28,13.61±2.07,1.80±0.18,0.70±0.09,40.42±2.30,1.96±0.12,0.29±0.28,12.16±1.93,0.83±0.11
1,MD SN (ours),CER,0.79±0.14,14.57±1.49,1.93±0.07,0.77±0.08,39.51±2.61,1.87±0.07,0.30±0.32,10.89±1.25,0.75±0.06
4,SMD (ours),-,1.10±0.25,13.36±2.21,1.57±0.23,0.30±0.13,47.82±3.71,2.35±0.15,0.22±0.43,17.38±3.72,1.02±0.14
5,SMD (ours),CER,1.24±0.34,11.38±2.76,1.49±0.29,0.20±0.41,51.32±7.44,2.43±0.39,0.13±0.21,16.32±3.58,0.99±0.19
6,SMD SN (ours),-,0.87±0.13,14.51±1.17,1.84±0.14,0.12±0.13,50.95±2.66,2.51±0.10,0.13±0.25,14.70±1.95,0.94±0.08
7,SMD SN (ours),CER,1.23±0.26,11.74±2.37,1.43±0.24,-0.56±0.13,68.14±4.92,3.15±0.10,-0.05±0.07,16.54±4.37,1.02±0.13
8,SNGP,-,0.70±0.39,14.84±2.80,2.06±0.40,-0.02±0.06,51.87±2.38,2.64±0.05,1.45±0.32,13.88±1.63,0.94±0.10
4,SR SN,-,92.60±0.38,18.72±6.81,2.17±0.40,91.19±0.32,79.92±11.25,3.53±0.33,93.90±0.27,17.83±4.26,1.11±0.28


In [None]:
print(str(tab_res_det.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{lllllllllll}
\toprule
       Method & Reg. Type & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST-2} \\
              & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
           MD &         - &           0.92$\pm$0.23 &  13.21$\pm$1.68 &  1.75$\pm$0.23 &           0.67$\pm$0.13 &   41.63$\pm$1.44 &  1.96$\pm$0.06 &           0.34$\pm$0.44 &  13.01$\pm$2.88 &  0.89$\pm$0.19 \\
           MD &       CER &           1.15$\pm$0.21 &  11.42$\pm$1.33 &  1.58$\pm$0.17 &           0.53$\pm$0.32 &   43.39$\pm$3.64 &  2.04$\pm$0.20 &           0.24$\pm$0.25 &  12.90$\pm$3.55 &  0.87$\pm$0.23 \\
 MD SN (ours) &         - &           0.86$\pm$0.28 &  13.61$\pm$2.07 &  1.80$\pm$0.18 &           1.54$\pm$0.32 &   40.42$\pm$2.30 &  1.96$\pm$0.12 &           0.29$\pm$0.28 &  12.16$\pm$1.93 &  0.83$\pm$0.11 \\
 MD SN (ours) &       CER &           

# MC-Dropout 

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mc_all']#, 'mc_last']
regs = ['raw', 'reg']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for reg in regs:
    run_dirs = []
    #layer = method.split('_')[-1]
    names = [f'mc|{reg}']
    print(names)
    for name in dataset_fnames:
        model_series_dir = f'../workdir/final_res/run_mc_all/{name}_electra_{reg}_no_sn/mc_all/'
        print(model_series_dir)
        run_dirs.append([model_series_dir])
    try:
        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
        baselines.append(res_df.iloc[-1:])
        tables.append(res_df.iloc[:-1])
    except:
        print('skip')
        pass
table_mc = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])
table_mc = table_mc.reset_index()

['mc|raw']
../workdir/final_res/run_mc_all/mrpc_electra_raw_no_sn/mc_all/
../workdir/final_res/run_mc_all/cola_electra_raw_no_sn/mc_all/
../workdir/final_res/run_mc_all/sst2_electra_raw_no_sn/mc_all/
['mc|reg']
../workdir/final_res/run_mc_all/mrpc_electra_reg_no_sn/mc_all/
../workdir/final_res/run_mc_all/cola_electra_reg_no_sn/mc_all/
../workdir/final_res/run_mc_all/sst2_electra_reg_no_sn/mc_all/


In [None]:
def preproc_regs(x):
    regs = x.split('|')    
    return '-' if (regs[-1]=='raw' or regs[-1] == '') else 'CER'
        
table_mc['Reg. Type'] = table_mc.Method.apply(lambda x: preproc_regs(x))
table_mc['Dropout Layers'] = table_mc['Method'].apply(lambda x: x.split('|')[1] if 'baseline' not in x else '-')
table_mc['Method'] = table_mc['Method'].apply(lambda x: x.split('|')[0].upper() if 'baseline' not in x else x.split('|')[0])
table_mc = table_mc[list(table_mc.columns[:1]) + list(table_mc.columns[-2:]) + list(table_mc.columns[1:-2])]

In [None]:
table_mc['UE Score'] = ['BALD', 'SMP', 'PV', 'VR']*2+['MP']*2
table_mc['Reg. Type'] = list(table_mc['Reg. Type'].iloc[:-2].values) + ['-', 'CER']

In [None]:
table_mc[list(table_mc.columns[:2]) + list(table_mc.columns[3:])]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MC,-,BALD,1.00±0.13,14.99±1.47,1.66±0.17,0.45±0.13,48.13±3.96,2.21±0.13,0.37±0.32,13.59±3.84,0.86±0.14
1,MC,-,SMP,1.02±0.14,15.00±3.15,1.64±0.21,0.54±0.10,46.58±3.67,2.12±0.09,0.35±0.29,13.12±3.27,0.88±0.17
2,MC,-,PV,1.01±0.13,14.66±1.56,1.65±0.17,0.47±0.12,47.15±3.59,2.18±0.11,0.36±0.29,13.47±3.94,0.86±0.15
3,MC,-,VR,0.07±0.44,24.75±5.93,2.94±0.44,-0.42±0.15,69.07±6.33,2.96±0.13,-0.30±0.45,27.87±4.82,1.66±0.27
4,MC,CER,BALD,1.17±0.29,12.47±2.64,1.59±0.26,0.34±0.24,49.94±6.41,2.27±0.21,0.28±0.19,14.00±2.64,0.84±0.15
5,MC,CER,SMP,1.04±0.32,13.25±3.21,1.70±0.32,0.46±0.25,46.02±5.32,2.17±0.24,0.25±0.12,13.79±3.08,0.88±0.19
6,MC,CER,PV,1.12±0.27,12.62±2.51,1.61±0.26,0.37±0.23,48.39±5.93,2.23±0.21,0.27±0.17,14.13±2.71,0.85±0.16
7,MC,CER,VR,-0.18±0.22,25.39±2.42,3.27±0.23,-0.41±0.26,78.09±12.21,3.27±0.24,-0.50±0.46,29.23±5.85,1.65±0.32
8,baseline,-,MP,92.08±0.71,23.28±8.35,2.68±0.68,92.04±0.14,59.04±8.17,2.63±0.17,93.80±0.41,18.07±6.11,1.23±0.41
9,baseline,CER,MP,92.63±0.41,17.17±4.12,2.21±0.41,92.09±0.45,54.04±10.18,2.57±0.46,93.90±0.24,16.68±2.92,1.11±0.24


In [None]:
print(str(table_mc[list(table_mc.columns[:2]) + list(table_mc.columns[3:])].iloc[:8].to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
Method & Reg. Type & UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST-2} \\
       & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
    MC &         - &     BALD &           1.00$\pm$0.13 &  14.99$\pm$1.47 &  1.66$\pm$0.17 &           0.45$\pm$0.13 &   48.13$\pm$3.96 &  2.21$\pm$0.13 &           0.37$\pm$0.32 &  13.59$\pm$3.84 &  0.86$\pm$0.14 \\
    MC &         - &      SMP &           1.02$\pm$0.14 &  15.00$\pm$3.15 &  1.64$\pm$0.21 &           0.54$\pm$0.10 &   46.58$\pm$3.67 &  2.12$\pm$0.09 &           0.35$\pm$0.29 &  13.12$\pm$3.27 &  0.88$\pm$0.17 \\
    MC &         - &       PV &           1.01$\pm$0.13 &  14.66$\pm$1.56 &  1.65$\pm$0.17 &           0.47$\pm$0.12 &   47.15$\pm$3.59 &  2.18$\pm$0.11 &           0.36$\pm$0.29 &  13.47$\pm$3.94 &  0.86$\pm$0.15 \\
    MC &         - &       V

# MC-DPP all

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.5, 0.6]
comsizes = [20, 50]

dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for max_frac in max_fracs:
        for cs in comsizes:
            for reg in regs:
                run_dirs = []
                names = [f'ddpp_{method}|{max_frac}|{cs}']
                for name in dataset_fnames:
                    model_series_dir = f'../workdir/run_glue_for_model_series/electra-raw/{name}/0.0/{method}_{max_frac}_{cs}'
                    #print(model_series_dir)
                    run_dirs.append([model_series_dir])
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print(f'Not exists one of this dirs: {run_dirs}')

Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/ddpp_dpp_0.5_20']
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/ddpp_dpp_0.6_20']
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/ddpp_dpp_0.6_50']
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/ddpp_ood_0.6_20']
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/ddpp_ood_0.6_50']


In [None]:
table_dpp = pd.concat([pd.concat(tables), pd.concat(baselines[-1:])])

In [None]:
table_dpp

Unnamed: 0_level_0,Unnamed: 1_level_0,CoLA,CoLA,CoLA,MRPC,MRPC,MRPC,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rcc-auc,rejection-curve-auc,rpp,rcc-auc,rejection-curve-auc,rpp,rcc-auc,rejection-curve-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
ddpp_ddpp_dpp|0.3|20,bald,111.11±16.72,-0.05±0.12,2.98±0.26,56.43±22.61,-0.43±0.60,3.34±1.13,32.64±8.36,-0.13±0.20,1.36±0.32
ddpp_ddpp_dpp|0.3|20,sampled_max_prob,109.98±10.93,-0.02±0.08,2.93±0.22,48.79±22.51,-0.05±0.10,2.96±1.13,30.13±6.30,-0.06±0.14,1.29±0.27
ddpp_ddpp_dpp|0.3|20,variance,111.56±13.93,-0.05±0.09,2.97±0.24,54.27±21.51,-0.33±0.37,3.22±1.08,31.95±6.84,-0.09±0.13,1.33±0.26
ddpp_ddpp_dpp|0.3|50,bald,104.60±6.92,0.03±0.12,2.89±0.14,43.00±23.97,0.16±0.57,2.77±1.24,32.85±10.20,-0.06±0.19,1.30±0.34
ddpp_ddpp_dpp|0.3|50,sampled_max_prob,107.48±9.28,0.02±0.02,2.90±0.18,46.69±24.13,0.03±0.09,2.90±1.16,28.95±7.81,-0.02±0.12,1.24±0.28
ddpp_ddpp_dpp|0.3|50,variance,104.12±7.89,0.05±0.09,2.87±0.15,45.31±23.25,0.10±0.41,2.83±1.17,32.23±10.85,-0.06±0.16,1.30±0.33
ddpp_ddpp_dpp|0.4|20,bald,108.61±11.38,-0.06±0.06,2.98±0.24,49.17±23.52,-0.11±0.43,3.02±1.22,36.45±14.41,-0.21±0.28,1.44±0.46
ddpp_ddpp_dpp|0.4|20,sampled_max_prob,109.39±13.93,-0.03±0.10,2.95±0.27,48.31±22.84,-0.08±0.12,2.96±1.12,28.13±6.61,0.01±0.15,1.22±0.26
ddpp_ddpp_dpp|0.4|20,variance,108.27±11.59,-0.02±0.07,2.95±0.23,50.08±23.19,-0.15±0.34,3.06±1.16,34.06±12.32,-0.11±0.22,1.35±0.36
ddpp_ddpp_dpp|0.4|50,bald,108.04±7.63,-0.00±0.10,2.93±0.16,44.15±22.13,0.11±0.68,2.79±1.20,33.22±11.59,-0.08±0.22,1.32±0.36


In [None]:
table_dpp.sort_values(by=('SST-2', 'rcc-auc')).iloc[:50]

Unnamed: 0_level_0,Unnamed: 1_level_0,CoLA,CoLA,CoLA,MRPC,MRPC,MRPC,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rcc-auc,rejection-curve-auc,rpp,rcc-auc,rejection-curve-auc,rpp,rcc-auc,rejection-curve-auc,rpp
ddpp_ddpp_dpp|0.5|50,sampled_max_prob,108.98±8.57,0.02±0.06,2.91±0.18,46.52±23.47,0.02±0.06,2.89±1.12,25.95±4.20,0.03±0.06,1.16±0.20
ddpp_ddpp_dpp|0.4|20,sampled_max_prob,109.39±13.93,-0.03±0.10,2.95±0.27,48.31±22.84,-0.08±0.12,2.96±1.12,28.13±6.61,0.01±0.15,1.22±0.26
ddpp_ddpp_dpp|0.4|50,sampled_max_prob,108.99±9.82,0.01±0.05,2.92±0.19,45.47±22.77,0.04±0.23,2.86±1.14,28.15±4.37,0.02±0.05,1.21±0.17
ddpp_ddpp_dpp|0.3|50,sampled_max_prob,107.48±9.28,0.02±0.02,2.90±0.18,46.69±24.13,0.03±0.09,2.90±1.16,28.95±7.81,-0.02±0.12,1.24±0.28
ddpp_ddpp_dpp|0.3|20,sampled_max_prob,109.98±10.93,-0.02±0.08,2.93±0.22,48.79±22.51,-0.05±0.10,2.96±1.13,30.13±6.30,-0.06±0.14,1.29±0.27
ddpp_ddpp_dpp|0.4|50,variance,107.11±7.33,0.03±0.10,2.90±0.15,44.30±21.54,0.12±0.61,2.79±1.17,30.81±8.46,-0.02±0.15,1.25±0.28
ddpp_ddpp_dpp|0.3|20,variance,111.56±13.93,-0.05±0.09,2.97±0.24,54.27±21.51,-0.33±0.37,3.22±1.08,31.95±6.84,-0.09±0.13,1.33±0.26
ddpp_ddpp_dpp|0.5|50,variance,106.94±9.41,0.03±0.06,2.89±0.18,43.74±21.31,0.15±0.32,2.77±1.01,32.14±13.13,-0.08±0.08,1.26±0.33
ddpp_ddpp_dpp|0.3|50,variance,104.12±7.89,0.05±0.09,2.87±0.15,45.31±23.25,0.10±0.41,2.83±1.17,32.23±10.85,-0.06±0.16,1.30±0.33
ddpp_ddpp_dpp|0.3|20,bald,111.11±16.72,-0.05±0.12,2.98±0.26,56.43±22.61,-0.43±0.60,3.34±1.13,32.64±8.36,-0.13±0.20,1.36±0.32


In [None]:
{'MRPC': {'ddpp_ood' : 0.6, 'ddpp_dpp' : 0.5},
'CoLA': {'ddpp_ood' : 0.5, 'ddpp_dpp' : 0.6},
'SST-2': {'ddpp_ood' : 0.5, 'ddpp_dpp' : 0.5}}

{'MRPC': {'ddpp_ood': 0.6, 'ddpp_dpp': 0.5},
 'CoLA': {'ddpp_ood': 0.5, 'ddpp_dpp': 0.6},
 'SST-2': {'ddpp_ood': 0.4, 'ddpp_dpp': 0.5}}

# MC-DPP calibration

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['False', 'True']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]

max_fracs_dicts = {'mrpc': {'False': 0.6, 'True': 0.6}, 
                   'cola': {'False': 0.6, 'True': 0.6}, 
                   'sst2': {'False': 0.3, 'True': 0.3}}

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for val_subsample in val_subsamples:
        for reg in regs:
            run_dirs = []
            dpp_type = 'with_ood' if method=='True' else 'on_masks'
            names = [f'DPP_{dpp_type}|{val_subsample}|{reg}']
            for name in dataset_fnames:
                max_frac = max_fracs_dicts[name][method]
                model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
                model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
                model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_dpp_cal = pd.concat([pd.concat(tables), pd.concat(baselines[-2:-1])])

../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/False/0.6/2021-10-02/03-46-27/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/False/0.6/2021-10-03/03-10-07/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/False/0.3/2021-10-03/10-23-36/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.1/dpp/False/0.6/2021-10-02/03-22-18/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/dpp/False/0.6/2021-10-03/02-29-18/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.1/dpp/False/0.3/2021-10-03/10-02-10/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/True/0.6/2021-10-02/05-02-54/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/True/0.6/2021-10-03/05-28-09/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/True/0.3/2021-10-03/11-51-10/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.1/dpp/True/0.6/2021-10-02/04-33-03/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/dpp/True/0.6/202

In [None]:
def preproc_cal(x):
    if 'baseline' in x:
        return '-'
    ds = x.split('|')[1]
    if ds == '0.1':
        return 'val.'
    return 'train'
        
table_dpp_cal = table_dpp_cal.reset_index()
table_dpp_cal['Calibr. Dataset'] = table_dpp_cal.Method.apply(lambda x: preproc_cal(x))
table_dpp_cal['Method'] = table_dpp_cal['Method'].apply(lambda x: x.split('|')[0])
table_dpp_cal = table_dpp_cal[list(table_dpp_cal.columns[:1]) + list(table_dpp_cal.columns[-1:]) + list(table_dpp_cal.columns[1:-1])]

In [None]:
table_dpp_cal

Unnamed: 0_level_0,Method,Calibr. Dataset,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DPP_on_masks,train,bald,-0.05±0.16,15.99±0.98,2.03±0.18,-0.11±0.39,51.39±9.34,2.54±0.37,-0.01±0.19,19.37±3.13,1.15±0.18
1,DPP_on_masks,train,sampled_max_prob,0.04±0.18,14.42±1.06,1.92±0.20,0.00±0.31,48.92±7.28,2.43±0.28,0.01±0.22,17.14±3.35,1.11±0.23
2,DPP_on_masks,train,variance,0.02±0.22,14.76±1.48,1.94±0.25,-0.04±0.32,49.79±8.13,2.47±0.32,-0.03±0.19,19.03±2.89,1.15±0.19
3,DPP_on_masks,val.,bald,-0.69±1.05,25.97±18.99,2.60±1.04,-0.44±0.26,59.58±7.91,2.86±0.20,0.09±0.23,15.88±3.32,1.07±0.23
4,DPP_on_masks,val.,sampled_max_prob,-0.02±0.45,15.18±3.32,1.92±0.36,-0.20±0.14,53.21±3.52,2.61±0.18,-0.01±0.20,17.46±3.05,1.18±0.22
5,DPP_on_masks,val.,variance,-0.06±0.44,15.36±3.35,1.95±0.35,-0.26±0.11,54.27±4.05,2.66±0.14,0.02±0.20,16.84±2.86,1.14±0.19
6,DPP_with_ood,train,bald,-0.31±0.51,20.21±6.49,2.30±0.48,0.04±0.30,51.21±10.43,2.38±0.30,-0.37±0.16,25.45±3.33,1.50±0.16
7,DPP_with_ood,train,sampled_max_prob,-0.05±0.16,15.22±0.86,2.04±0.19,0.15±0.27,48.60±8.17,2.27±0.24,0.05±0.24,17.60±5.34,1.08±0.24
8,DPP_with_ood,train,variance,-0.11±0.22,16.91±3.15,2.09±0.20,0.10±0.28,49.57±9.21,2.30±0.26,-0.24±0.26,22.98±4.39,1.37±0.24
9,DPP_with_ood,val.,bald,-1.82±1.11,37.60±17.51,3.75±1.07,-0.31±0.44,65.64±16.01,2.73±0.42,-0.42±0.07,27.17±3.71,1.57±0.08


In [None]:
print(str(table_dpp_cal.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
       Method & Calibr. Dataset &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
              & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
 DPP\_on\_masks &           train &              bald &          -0.05$\pm$0.16 &   15.99$\pm$0.98 &  2.03$\pm$0.18 &          -0.11$\pm$0.39 &   51.39$\pm$9.34 &  2.54$\pm$0.37 &          -0.01$\pm$0.19 &  19.37$\pm$3.13 &  1.15$\pm$0.18 \\
 DPP\_on\_masks &           train &  sampled\_max\_prob &           0.04$\pm$0.18 &   14.42$\pm$1.06 &  1.92$\pm$0.20 &           0.00$\pm$0.31 &   48.92$\pm$7.28 &  2.43$\pm$0.28 &           0.01$\pm$0.22 &  17.14$\pm$3.35 &  1.11$\pm$0.23 \\
 DPP\_on\_masks &           train &          variance &           0.02$\pm$0.22 &   14.76$\pm$1.48 &  1.94$\pm$0.25 &          -0.04$\pm$0.32 &   49.79$\pm$8

# MC-DPP regs

In [None]:
table_dpp.sort_values(by=('SST2 (10%)', 'rcc-auc')).iloc[:50]

NameError: name 'table_dpp' is not defined

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['False', 'True']
regs = ['reg']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]

max_fracs_dicts = {'mrpc': {'False': 0.3, 'True': 0.6}, 
                   'cola': {'False': 0.6, 'True': 0.3}, 
                   'sst2': {'False': 0.4, 'True': 0.6}}
val_subsamples_dicts = {'mrpc': {'False': 0.0, 'True': 0.0}, 
                        'cola': {'False': 0.0, 'True': 0.0}, 
                        'sst2': {'False': 0.0, 'True': 0.0}}

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        run_dirs = []
        dpp_type = 'with_ood' if method=='True' else 'on_masks'
        for name in dataset_fnames:      
            val_subsample = val_subsamples_dicts[name][method]
            max_frac = max_fracs_dicts[name][method]
            names = [f'DPP_{dpp_type}|{val_subsample}|{reg}']
            model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
            model_series_dir += os.listdir(model_series_dir)[-1]
            model_series_dir += f'/{os.listdir(model_series_dir)[-1]}/'
            print(model_series_dir)
            run_dirs.append([model_series_dir])
        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
        baselines.append(res_df.iloc[-1:])
        tables.append(res_df.iloc[:-1])

../workdir/run_glue_for_model_series/electra-reg/mrpc/0.0/dpp/False/0.3/2021-10-01/16-30-33/
../workdir/run_glue_for_model_series/electra-reg/cola/0.0/dpp/False/0.6/2021-10-02/22-25-22/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.0/dpp/False/0.4/2021-10-03/13-24-49/
../workdir/run_glue_for_model_series/electra-reg/mrpc/0.0/dpp/True/0.6/2021-10-02/02-26-43/
../workdir/run_glue_for_model_series/electra-reg/cola/0.0/dpp/True/0.3/2021-10-02/08-14-31/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.0/dpp/True/0.6/2021-10-03/21-04-44/


In [None]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

table_dpp_reg = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])
table_dpp_reg = table_dpp_reg.reset_index()
table_dpp_reg['Reg. Type'] = table_dpp_reg.Method.apply(lambda x: preproc_regs(x))
table_dpp_reg['Method'] = table_dpp_reg['Method'].apply(lambda x: x.split('|')[0])
table_dpp_reg = table_dpp_reg[list(table_dpp_reg.columns[:1]) + list(table_dpp_reg.columns[-1:]) + list(table_dpp_reg.columns[1:-1])]

In [None]:
table_dpp_reg

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DPP_on_masks,reg,bald,0.36±0.33,12.06±1.94,1.62±0.34,-0.36±0.34,54.66±6.57,2.77±0.37,0.17±0.13,14.86±1.94,0.97±0.13
1,DPP_on_masks,reg,sampled_max_prob,0.27±0.15,12.53±0.98,1.70±0.22,-0.17±0.29,51.43±5.55,2.57±0.27,0.23±0.13,13.23±1.86,0.91±0.12
2,DPP_on_masks,reg,variance,0.28±0.23,12.32±1.49,1.66±0.28,-0.22±0.31,52.46±5.81,2.63±0.30,0.20±0.10,13.71±1.31,0.93±0.09
3,DPP_with_ood,reg,bald,0.19±0.43,14.62±5.18,1.76±0.41,-0.18±0.19,52.10±3.75,2.60±0.18,0.12±0.25,16.52±4.27,1.04±0.26
4,DPP_with_ood,reg,sampled_max_prob,0.14±0.15,13.39±0.73,1.84±0.18,0.10±0.26,47.20±4.88,2.32±0.25,0.29±0.11,12.10±1.59,0.85±0.10
5,DPP_with_ood,reg,variance,0.34±0.14,12.35±0.69,1.66±0.22,0.00±0.21,48.90±4.38,2.41±0.20,0.26±0.10,13.22±2.01,0.88±0.09
6,baseline,reg,max_prob,93.01±0.16,12.75±0.96,1.74±0.22,92.08±0.30,51.20±5.73,2.56±0.28,94.13±0.13,12.97±1.73,0.90±0.11
7,baseline,reg,max_prob,93.01±0.16,12.75±0.96,1.74±0.22,92.08±0.30,51.20±5.73,2.56±0.28,94.13±0.13,12.97±1.73,0.90±0.11


In [None]:
print(str(table_dpp_reg.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
       Method & Reg. Type &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
              & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
 DPP\_on\_masks &       reg &              bald &           0.36$\pm$0.33 &  12.06$\pm$1.94 &  1.62$\pm$0.34 &          -0.36$\pm$0.34 &  54.66$\pm$6.57 &  2.77$\pm$0.37 &           0.17$\pm$0.13 &  14.86$\pm$1.94 &  0.97$\pm$0.13 \\
 DPP\_on\_masks &       reg &  sampled\_max\_prob &           0.27$\pm$0.15 &  12.53$\pm$0.98 &  1.70$\pm$0.22 &          -0.17$\pm$0.29 &  51.43$\pm$5.55 &  2.57$\pm$0.27 &           0.23$\pm$0.13 &  13.23$\pm$1.86 &  0.91$\pm$0.12 \\
 DPP\_on\_masks &       reg &          variance &           0.28$\pm$0.23 &  12.32$\pm$1.49 &  1.66$\pm$0.28 &          -0.22$\pm$0.31 &  52.46$\pm$5.81 &  2.63$\pm$0.30 &          

# Raw DPP

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
regs = ['reg', 'raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]


dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for max_frac in max_fracs:
    for val_subsample in val_subsamples:
        for reg in regs:
            run_dirs = []
            names = [f'dpp|{max_frac}|{val_subsample}|{reg}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/raw_dpp/{max_frac}/'
                model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
                model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_dpp = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra-reg/mrpc/0.0/raw_dpp/0.3/2021-10-01/16-36-07/
../workdir/run_glue_for_model_series/electra-reg/cola/0.0/raw_dpp/0.3/2021-10-02/04-14-06/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.0/raw_dpp/0.3/2021-10-03/02-09-45/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/raw_dpp/0.3/2021-10-01/18-07-08/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/raw_dpp/0.3/2021-10-02/07-18-23/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/raw_dpp/0.3/2021-10-03/04-38-20/
../workdir/run_glue_for_model_series/electra-reg/mrpc/0.1/raw_dpp/0.3/2021-10-01/16-14-13/
../workdir/run_glue_for_model_series/electra-reg/cola/0.1/raw_dpp/0.3/2021-10-02/03-37-17/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.1/raw_dpp/0.3/2021-10-03/01-42-58/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.1/raw_dpp/0.3/2021-10-01/17-47-49/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/raw_dpp/0.3/2021-10-02/06-46-16/

In [None]:
table_dpp.sort_values(by=('SST2 (10%)', 'rcc-auc')).iloc[:50]

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]

max_fracs_dicts = {'mrpc': 0.3, 
                   'cola': 0.4, 
                   'sst2': 0.6}
val_subsamples_dicts = {'mrpc': 0.0, 
                        'cola': 0.1, 
                        'sst2': 0.0}

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for reg in regs:
    run_dirs = []
    for name in dataset_fnames:      
        val_subsample = val_subsamples_dicts[name]
        max_frac = max_fracs_dicts[name]
        names = [f'DPP|{val_subsample}|{reg}']
        model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
        model_series_dir += os.listdir(model_series_dir)[-1]
        model_series_dir += f'/{os.listdir(model_series_dir)[-1]}/'
        print(model_series_dir)
        run_dirs.append([model_series_dir])
    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
    baselines.append(res_df.iloc[-1:])
    tables.append(res_df.iloc[:-1])

../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/True/0.3/2021-10-01/19-33-07/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/dpp/True/0.4/2021-10-02/19-28-25/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/True/0.6/2021-10-03/23-59-01/


In [None]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

table_raw_dpp = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])
table_raw_dpp = table_raw_dpp.reset_index()
table_raw_dpp['Method'] = table_raw_dpp['Method'].apply(lambda x: x.split('|')[0])
#table_raw_dpp = table_raw_dpp[list(table_raw_dpp.columns[:1]) + list(table_raw_dpp.columns[-1:]) + list(table_raw_dpp.columns[1:-1])]

In [None]:
table_raw_dpp

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DPP,bald,-0.31±0.52,20.57±6.22,2.28±0.51,-0.31±0.45,65.82±14.59,2.73±0.40,-0.19±0.29,21.60±6.41,1.33±0.29
1,DPP,sampled_max_prob,-0.08±0.17,15.65±1.56,2.07±0.22,-0.01±0.17,55.23±8.15,2.41±0.19,0.04±0.23,17.06±4.21,1.10±0.22
2,DPP,variance,0.01±0.40,15.95±5.33,1.98±0.43,-0.06±0.24,58.42±11.42,2.48±0.21,-0.01±0.20,18.76±5.07,1.15±0.22
3,baseline,max_prob,92.78±0.21,15.03±2.09,1.97±0.22,92.02±0.17,53.17±3.70,2.62±0.20,93.89±0.21,17.10±3.29,1.13±0.23


In [None]:
print(str(table_raw_dpp.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{lllllllllll}
\toprule
   Method &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
          & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
      DPP &              bald &          -0.31$\pm$0.52 &  20.57$\pm$6.22 &  2.28$\pm$0.51 &          -0.31$\pm$0.45 &  65.82$\pm$14.59 &  2.73$\pm$0.40 &          -0.19$\pm$0.29 &  21.60$\pm$6.41 &  1.33$\pm$0.29 \\
      DPP &  sampled\_max\_prob &          -0.08$\pm$0.17 &  15.65$\pm$1.56 &  2.07$\pm$0.22 &          -0.01$\pm$0.17 &   55.23$\pm$8.15 &  2.41$\pm$0.19 &           0.04$\pm$0.23 &  17.06$\pm$4.21 &  1.10$\pm$0.22 \\
      DPP &          variance &           0.01$\pm$0.40 &  15.95$\pm$5.33 &  1.98$\pm$0.43 &          -0.06$\pm$0.24 &  58.42$\pm$11.42 &  2.48$\pm$0.21 &          -0.01$\pm$0.20 &  18.76$\pm$5.07 &  1.15$\pm$0.22 \\
 baseline &       

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['False', 'True']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]

max_fracs_dicts = {'mrpc': {'False': 0.6, 'True': 0.6}, 
                   'cola': {'False': 0.6, 'True': 0.6}, 
                   'sst2': {'False': 0.3, 'True': 0.3}}
val_subsamples_dicts = {'mrpc': {'False': 0.0, 'True': 0.0}, 
                        'cola': {'False': 0.0, 'True': 0.0}, 
                        'sst2': {'False': 0.0, 'True': 0.1}}

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []


for method in methods:
    for reg in regs:
        run_dirs = []
        dpp_type = 'with_ood' if method=='True' else 'on_masks'
        for name in dataset_fnames:      
            val_subsample = val_subsamples_dicts[name][method]
            max_frac = max_fracs_dicts[name][method]
            names = [f'DPP_{dpp_type}|{val_subsample}|{reg}']
            model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
            model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
            model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/'
            print(model_series_dir)
            run_dirs.append([model_series_dir])
        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
        baselines.append(res_df.iloc[-1:])
        tables.append(res_df.iloc[:-1])

../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/False/0.6/2021-10-02/03-46-27/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/False/0.6/2021-10-03/03-10-07/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/False/0.3/2021-10-03/10-23-36/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/True/0.6/2021-10-02/05-02-54/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/True/0.6/2021-10-03/05-28-09/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.1/dpp/True/0.3/2021-10-03/11-22-29/


In [None]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

table_dpp_2 = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])
table_dpp_2 = table_dpp_2.reset_index()
table_dpp_2['Method'] = table_dpp_2['Method'].apply(lambda x: x.split('|')[0])

In [None]:
table_dpp_2

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DPP_on_masks,bald,-0.05±0.16,15.99±0.98,2.03±0.18,-0.11±0.39,51.39±9.34,2.54±0.37,-0.01±0.19,19.37±3.13,1.15±0.18
1,DPP_on_masks,sampled_max_prob,0.04±0.18,14.42±1.06,1.92±0.20,0.00±0.31,48.92±7.28,2.43±0.28,0.01±0.22,17.14±3.35,1.11±0.23
2,DPP_on_masks,variance,0.02±0.22,14.76±1.48,1.94±0.25,-0.04±0.32,49.79±8.13,2.47±0.32,-0.03±0.19,19.03±2.89,1.15±0.19
3,DPP_with_ood,bald,-0.31±0.51,20.21±6.49,2.30±0.48,0.04±0.30,51.21±10.43,2.38±0.30,-0.42±0.07,27.17±3.71,1.57±0.08
4,DPP_with_ood,sampled_max_prob,-0.05±0.16,15.22±0.86,2.04±0.19,0.15±0.27,48.60±8.17,2.27±0.24,0.21±0.19,13.76±2.39,0.94±0.20
5,DPP_with_ood,variance,-0.11±0.22,16.91±3.15,2.09±0.20,0.10±0.28,49.57±9.21,2.30±0.26,-0.17±0.12,22.83±4.46,1.32±0.12
6,baseline,max_prob,92.78±0.21,15.03±2.09,1.97±0.22,92.23±0.32,48.81±7.14,2.43±0.28,93.89±0.21,17.10±3.29,1.13±0.23
7,baseline,max_prob,92.78±0.21,15.03±2.09,1.97±0.22,92.23±0.32,48.81±7.14,2.43±0.28,93.86±0.21,17.31±2.91,1.19±0.22


In [None]:
print(str(table_dpp_2.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{lllllllllll}
\toprule
       Method &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
              & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
 DPP\_on\_masks &              bald &          -0.05$\pm$0.16 &  15.99$\pm$0.98 &  2.03$\pm$0.18 &          -0.11$\pm$0.39 &   51.39$\pm$9.34 &  2.54$\pm$0.37 &          -0.01$\pm$0.19 &  19.37$\pm$3.13 &  1.15$\pm$0.18 \\
 DPP\_on\_masks &  sampled\_max\_prob &           0.04$\pm$0.18 &  14.42$\pm$1.06 &  1.92$\pm$0.20 &           0.00$\pm$0.31 &   48.92$\pm$7.28 &  2.43$\pm$0.28 &           0.01$\pm$0.22 &  17.14$\pm$3.35 &  1.11$\pm$0.23 \\
 DPP\_on\_masks &          variance &           0.02$\pm$0.22 &  14.76$\pm$1.48 &  1.94$\pm$0.25 &          -0.04$\pm$0.32 &   49.79$\pm$8.13 &  2.47$\pm$0.32 &          -0.03$\pm$0.19 &  19.03$\pm$2.89 &  1.15$\pm$

# Ensemble

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']

dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
dataset_to_time = {'mrpc':'17-25-06', 'cola': '17-34-50', 'sst2': '17-45-49'}
run_dirs = []
for name in dataset_fnames:      
    names = [f'Deep Ensemble']
    time = dataset_to_time[name]
    model_series_dir = f'/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-12/{time}/final_results/'
    print(model_series_dir)
    run_dirs.append([model_series_dir])
ens_tab = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)

/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-12/17-25-06/final_results/
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-12/17-34-50/final_results/
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-12/17-45-49/final_results/


In [None]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

ens_tab = ens_tab.reset_index()
ens_tab['Reg. Type'] = '-'
ens_tab['Method'] = ens_tab['Method'].apply(lambda x: x.split('|')[0])
ens_tab['UE Score'] = ['BALD', 'SMP', 'PV', 'VR', 'MP']
ens_tab = ens_tab[list(ens_tab.columns[:1]) + list(ens_tab.columns[-1:]) + list(ens_tab.columns[1:-1])]

In [None]:
ens_tab

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,Deep Ensemble,-,BALD,0.29±0.19,26.61±4.14,2.51±0.29,-0.40±0.36,73.43±10.75,2.96±0.35,0.37±0.27,21.29±3.65,1.02±0.09
1,Deep Ensemble,-,SMP,0.82±0.30,16.48±4.34,1.96±0.34,-0.03±0.19,55.72±4.96,2.60±0.20,0.50±0.25,13.43±1.84,0.87±0.08
2,Deep Ensemble,-,PV,0.40±0.22,25.56±5.96,2.41±0.31,-0.30±0.32,68.49±8.94,2.87±0.30,0.41±0.24,17.88±2.59,0.97±0.07
3,Deep Ensemble,-,VR,0.06±0.33,24.98±1.55,3.04±0.46,-1.45±0.29,114.57±7.79,4.02±0.19,-0.33±0.30,27.82±3.65,1.74±0.20
4,baseline,-,MP,92.21±0.59,21.23±8.59,2.57±0.64,91.50±0.37,64.03±7.96,3.07±0.38,93.66±0.22,20.70±3.03,1.39±0.23


In [None]:
print(str(ens_tab.iloc[:-1].to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
        Method & Reg. Type & UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST-2} \\
               & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
 Deep Ensemble &         - &     BALD &           0.29$\pm$0.19 &  26.61$\pm$4.14 &  2.51$\pm$0.29 &          -0.40$\pm$0.36 &  73.43$\pm$10.75 &  2.96$\pm$0.35 &           0.37$\pm$0.27 &  21.29$\pm$3.65 &  1.02$\pm$0.09 \\
 Deep Ensemble &         - &      SMP &           0.82$\pm$0.30 &  16.48$\pm$4.34 &  1.96$\pm$0.34 &          -0.03$\pm$0.19 &   55.72$\pm$4.96 &  2.60$\pm$0.20 &           0.50$\pm$0.25 &  13.43$\pm$1.84 &  0.87$\pm$0.08 \\
 Deep Ensemble &         - &       PV &           0.40$\pm$0.22 &  25.56$\pm$5.96 &  2.41$\pm$0.31 &          -0.30$\pm$0.32 &   68.49$\pm$8.94 &  2.87$\pm$0.30 &           0.41$\pm$0.24 &  17.88$\pm$2.59 &  0.97$

# DE + mahalanobis

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']

dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
dataset_to_time = {'mrpc': '12-56-53', 'cola': '13-08-59', 'sst2': '13-44-40'}
run_dirs = []
for name in dataset_fnames:      
    names = [f'Deep Ensemble']
    time = dataset_to_time[name]
    model_series_dir = f'/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-19/{time}/final_results/'
    print(model_series_dir)
    run_dirs.append([model_series_dir])
ens_tab = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)

/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-19/12-56-53/final_results/
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-19/13-08-59/final_results/
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-19/13-44-40/final_results/


In [None]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

ens_tab = ens_tab.reset_index()
ens_tab['Reg. Type'] = '-'
ens_tab['UE Score'] = ['MD', 'MD', 'MP']
ens_tab['Method'] = ens_tab['Method'].apply(lambda x: x.split('|')[0])
ens_tab = ens_tab[list(ens_tab.columns[:1]) + list(ens_tab.columns[-1:]) + list(ens_tab.columns[1:-1])]

In [None]:
ens_tab['Method'] = 'DE+'+ens_tab['UE Score']
ens_tab = ens_tab.drop(columns=['UE Score'])

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [None]:
tab_res_det = tab_res_det.reset_index(drop=True)

In [None]:
pd.concat([tab_res_det.iloc[:8], ens_tab.iloc[1:2], det_res.iloc[8:]]).reset_index(drop=True)

Unnamed: 0_level_0,Method,Reg. Type,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MD,-,0.92±0.23,13.21±1.68,1.75±0.23,0.67±0.06,41.63±1.44,1.96±0.06,0.34±0.44,13.01±2.88,0.89±0.19
1,MD,CER,1.15±0.21,11.42±1.33,1.58±0.17,0.59±0.21,43.39±3.64,2.04±0.20,0.24±0.25,12.90±3.55,0.87±0.23
2,MD SN (ours),-,0.86±0.28,13.61±2.07,1.80±0.18,0.70±0.09,40.42±2.30,1.96±0.12,0.29±0.28,12.16±1.93,0.83±0.11
3,MD SN (ours),CER,0.79±0.14,14.57±1.49,1.93±0.07,0.77±0.08,39.51±2.61,1.87±0.07,0.30±0.32,10.89±1.25,0.75±0.06
4,SMD (ours),-,1.10±0.25,13.36±2.21,1.57±0.23,0.30±0.13,47.82±3.71,2.35±0.15,0.22±0.43,17.38±3.72,1.02±0.14
5,SMD (ours),CER,1.24±0.34,11.38±2.76,1.49±0.29,0.20±0.41,51.32±7.44,2.43±0.39,0.13±0.21,16.32±3.58,0.99±0.19
6,SMD SN (ours),-,0.87±0.13,14.51±1.17,1.84±0.14,0.12±0.13,50.95±2.66,2.51±0.10,0.13±0.25,14.70±1.95,0.94±0.08
7,SMD SN (ours),CER,1.23±0.26,11.74±2.37,1.43±0.24,-0.56±0.13,68.14±4.92,3.15±0.10,-0.05±0.07,16.54±4.37,1.02±0.13
8,DE+MD,-,0.85±0.01,13.16±0.19,1.82±0.27,0.33±0.13,46.43±2.14,2.22±0.12,0.75±0.26,8.63±0.68,0.62±0.03
9,SNGP,-,0.70±0.39,14.84±2.80,2.06±0.40,-0.02±0.06,51.87±2.38,2.64±0.05,1.45±0.32,13.88±1.63,0.94±0.10


In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['sngp']
regs = ['raw']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
ridge_factors = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]
momentums = [0.999, 0.99, 0.9]
        
for method in methods:
    for reg in regs:
        for ridge_factor in ridge_factors:
            for momentum in momentums:
                run_dirs = []
                names = [f'{method}|{ridge_factor}_{momentum}']
                for name in dataset_fnames:
                    model_series_dir = f'../workdir/run_glue_for_model_series/electra-raw-sngp/{name}/{ridge_factor}_{momentum}_0.0'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={})
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    continue
table_sngp = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra-raw-sngp/mrpc/1e-05_0.999_0.0
../workdir/run_glue_for_model_series/electra-raw-sngp/cola/1e-05_0.999_0.0
../workdir/run_glue_for_model_series/electra-raw-sngp/sst2/1e-05_0.999_0.0
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw-sngp/sst2/1e-05_0.999_0.0']
../workdir/run_glue_for_model_series/electra-raw-sngp/mrpc/1e-05_0.99_0.0
../workdir/run_glue_for_model_series/electra-raw-sngp/cola/1e-05_0.99_0.0
../workdir/run_glue_for_model_series/electra-raw-sngp/sst2/1e-05_0.99_0.0
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw-sngp/mrpc/1e-05_0.99_0.0']
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw-sngp/cola/1e-05_0.99_0.0']
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw-sngp/sst2/1e-05_0.99_0.0']
../workdir/run_glue_for_model_series/electra-raw-sngp/mrpc/1e-05_0.9_0.0
../workdir/run_glue_for_model_s

In [None]:
table_sngp

Unnamed: 0_level_0,Unnamed: 1_level_0,CoLA,CoLA,CoLA,MRPC,MRPC,MRPC,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rcc-auc,rejection-curve-auc,rpp,rcc-auc,rejection-curve-auc,rpp,rcc-auc,rejection-curve-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
sngp|1e-05_0.999,sngp,85.60±1.62,0.53±0.07,2.56±0.08,35.52±5.81,0.67±0.15,2.52±0.32,,,
sngp|0.0001_0.999,sngp,85.84±1.75,0.54±0.06,2.56±0.09,35.40±5.52,0.65±0.18,2.52±0.31,27.64±3.49,1.08±0.21,1.13±0.12
sngp|0.001_0.999,sngp,85.83±1.62,0.54±0.06,2.56±0.08,36.71±6.10,0.59±0.18,2.58±0.32,28.94±3.75,1.05±0.23,1.16±0.11
sngp|0.01_0.999,sngp,87.54±2.57,0.52±0.08,2.57±0.08,39.98±7.28,0.41±0.15,2.77±0.37,33.16±4.76,0.92±0.22,1.29±0.19
sngp|0.1_0.999,sngp,92.91±4.48,0.42±0.10,2.67±0.10,44.43±9.01,0.06±0.17,3.14±0.44,41.92±8.71,0.65±0.22,1.57±0.26
sngp|1_0.999,sngp,101.36±6.60,0.16±0.12,2.95±0.13,61.49±14.72,-0.84±0.34,4.07±0.51,53.88±16.24,0.28±0.32,1.95±0.42
baseline|,max_prob,103.60±5.14,91.66±0.07,3.14±0.08,44.10±8.67,91.39±0.44,3.19±0.38,62.72±17.12,92.82±0.29,2.23±0.29
baseline|,max_prob,103.60±5.14,91.66±0.07,3.14±0.08,44.10±8.67,91.39±0.44,3.19±0.38,62.72±17.12,92.82±0.29,2.23±0.29


In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['sngp']
regs = ['raw']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []

for method in methods:
    for reg in regs:
        run_dirs = []
        names = [f'{method}']
        for name in dataset_fnames:
            model_series_dir = f'../workdir/run_glue_for_model_series/electra-raw-sngp-correct-hp/{name}/0.0'
            print(model_series_dir)
            run_dirs.append([model_series_dir])
        try:
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={})
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
        except:
            continue
table_sngp_new = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra-raw-sngp-correct-hp/mrpc/0.0
../workdir/run_glue_for_model_series/electra-raw-sngp-correct-hp/cola/0.0
../workdir/run_glue_for_model_series/electra-raw-sngp-correct-hp/sst2/0.0
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw-sngp-correct-hp/sst2/0.0']


In [None]:
table_sngp_new

Unnamed: 0_level_0,Unnamed: 1_level_0,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
sngp,sngp,0.56±0.29,14.84±2.80,2.06±0.40,0.54±nan,54.13±nan,2.73±nan
baseline|,max_prob,92.22±0.24,18.26±2.40,2.62±0.22,91.41±nan,61.27±nan,3.37±nan


In [None]:
table_sngp

Unnamed: 0_level_0,Unnamed: 1_level_0,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
sngp|last|raw,sngp,0.58±0.39,15.78±3.30,2.19±0.43,-0.02±0.06,51.87±2.38,2.64±0.05,1.45±0.32,13.88±1.63,0.94±0.10
baseline|raw,max_prob,92.24±0.26,17.87±2.46,2.62±0.24,91.31±0.09,64.62±4.29,3.41±0.09,92.64±0.37,45.53±10.95,2.40±0.39


# Combine all

In [None]:
table_dpp_reg['Dropout Layers'] = 'last'
table_det['Dropout Layers'] = '-'

In [None]:
table_dpp_reg

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%),Dropout Layers
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,Unnamed: 13_level_1
0,DPP_on_masks,reg,bald,0.36±0.33,12.06±1.94,1.62±0.34,-0.36±0.34,54.66±6.57,2.77±0.37,0.17±0.13,14.86±1.94,0.97±0.13,last
1,DPP_on_masks,reg,sampled_max_prob,0.27±0.15,12.53±0.98,1.70±0.22,-0.17±0.29,51.43±5.55,2.57±0.27,0.23±0.13,13.23±1.86,0.91±0.12,last
2,DPP_on_masks,reg,variance,0.28±0.23,12.32±1.49,1.66±0.28,-0.22±0.31,52.46±5.81,2.63±0.30,0.20±0.10,13.71±1.31,0.93±0.09,last
3,DPP_with_ood,reg,bald,0.19±0.43,14.62±5.18,1.76±0.41,-0.18±0.19,52.10±3.75,2.60±0.18,0.12±0.25,16.52±4.27,1.04±0.26,last
4,DPP_with_ood,reg,sampled_max_prob,0.14±0.15,13.39±0.73,1.84±0.18,0.10±0.26,47.20±4.88,2.32±0.25,0.29±0.11,12.10±1.59,0.85±0.10,last
5,DPP_with_ood,reg,variance,0.34±0.14,12.35±0.69,1.66±0.22,0.00±0.21,48.90±4.38,2.41±0.20,0.26±0.10,13.22±2.01,0.88±0.09,last
6,baseline,reg,max_prob,93.01±0.16,12.75±0.96,1.74±0.22,92.08±0.30,51.20±5.73,2.56±0.28,94.13±0.13,12.97±1.73,0.90±0.11,last
7,baseline,reg,max_prob,93.01±0.16,12.75±0.96,1.74±0.22,92.08±0.30,51.20±5.73,2.56±0.28,94.13±0.13,12.97±1.73,0.90±0.11,last


In [None]:
res = pd.concat([table_mc.iloc[[0,1,2]], table_dpp_reg.iloc[:-2], table_det.iloc[[9,10,11,15]], table_mc_det.iloc[[3]], ens_tab.iloc[:-1], table_dpp_reg.iloc[-2:]])
res = res[table_mc.columns].reset_index(drop=True)

In [None]:
res

Unnamed: 0_level_0,Method,Reg. Type,Dropout Layers,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MC,raw,all,bald,0.59±0.11,11.78±0.77,1.41±0.12,0.17±0.22,47.55±5.41,2.28±0.17,0.28±0.10,13.51±2.03,0.85±0.11
1,MC,raw,all,sampled_max_prob,0.53±0.13,11.60±1.21,1.45±0.14,0.30±0.18,43.51±4.19,2.15±0.15,0.22±0.12,13.99±1.69,0.91±0.12
2,MC,raw,all,variance,0.60±0.12,11.66±0.70,1.41±0.12,0.23±0.23,45.97±4.97,2.24±0.17,0.28±0.10,13.24±1.58,0.85±0.11
3,DPP_on_masks,reg,last,bald,0.36±0.33,12.06±1.94,1.62±0.34,-0.36±0.34,54.66±6.57,2.77±0.37,0.17±0.13,14.86±1.94,0.97±0.13
4,DPP_on_masks,reg,last,sampled_max_prob,0.27±0.15,12.53±0.98,1.70±0.22,-0.17±0.29,51.43±5.55,2.57±0.27,0.23±0.13,13.23±1.86,0.91±0.12
5,DPP_on_masks,reg,last,variance,0.28±0.23,12.32±1.49,1.66±0.28,-0.22±0.31,52.46±5.81,2.63±0.30,0.20±0.10,13.71±1.31,0.93±0.09
6,DPP_with_ood,reg,last,bald,0.19±0.43,14.62±5.18,1.76±0.41,-0.18±0.19,52.10±3.75,2.60±0.18,0.12±0.25,16.52±4.27,1.04±0.26
7,DPP_with_ood,reg,last,sampled_max_prob,0.14±0.15,13.39±0.73,1.84±0.18,0.10±0.26,47.20±4.88,2.32±0.25,0.29±0.11,12.10±1.59,0.85±0.10
8,DPP_with_ood,reg,last,variance,0.34±0.14,12.35±0.69,1.66±0.22,0.00±0.21,48.90±4.38,2.41±0.20,0.26±0.10,13.22±2.01,0.88±0.09
9,NUQ,spectral_norm,,epistemic,0.43±0.19,11.62±1.23,1.63±0.17,0.30±0.10,43.13±2.81,2.14±0.05,0.36±0.07,10.88±0.80,0.77±0.09


In [None]:
print(str(res.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{lllllllllllll}
\toprule
         Method &      Reg. Type & Dropout Layers &                      UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
                & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
             MC &            raw &            all &                          bald &           0.59$\pm$0.11 &  11.78$\pm$0.77 &  1.41$\pm$0.12 &           0.17$\pm$0.22 &  47.55$\pm$5.41 &  2.28$\pm$0.17 &           0.28$\pm$0.10 &  13.51$\pm$2.03 &  0.85$\pm$0.11 \\
             MC &            raw &            all &              sampled\_max\_prob &           0.53$\pm$0.13 &  11.60$\pm$1.21 &  1.45$\pm$0.14 &           0.30$\pm$0.18 &  43.51$\pm$4.19 &  2.15$\pm$0.15 &           0.22$\pm$0.12 &  13.99$\pm$1.69 &  0.91$\pm$0.12 \\
             MC &            raw &            all &                      v

# SNGP HP table

In [None]:
import os

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
regs = ['raw']
methods = ['sngp']
dataset_names = ['20newsgroups']
dataset_fnames = ['20newsgroups']

ridge_factors = [1e-3, 1e-2, 1e-1, 1]
momentums = [0.9999]#, 0.99, 0.9]

names = []
tables = []
baselines = []

for method in methods:
    for ridge_factor in ridge_factors:
            for momentum in momentums:
                run_dirs = []
                names = [f'{method}|{ridge_factor}_{momentum}_0.0']
                for name in dataset_fnames:
                    model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/20newsgroups/{ridge_factor}_{momentum}_0.0/'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                #try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={})
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
                #except:
                #    print(f'Not exists one of this dirs: {run_dirs}')

/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/20newsgroups/0.001_0.9999_0.0/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/20newsgroups/0.01_0.9999_0.0/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/20newsgroups/0.1_0.9999_0.0/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/20newsgroups/1_0.9999_0.0/


In [None]:
table_sngp = pd.concat([pd.concat(tables), pd.concat(baselines[-1:])])

In [None]:
table_sngp

Unnamed: 0_level_0,Unnamed: 1_level_0,20newsgroups,20newsgroups,20newsgroups
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
sngp|0.001_0.9999_0.0,sngp,-3.95±1.24,2234.43±322.95,8.66±1.13
sngp|0.01_0.9999_0.0,sngp,-5.52±1.54,2528.77±326.38,10.29±1.17
sngp|0.1_0.9999_0.0,sngp,-7.64±2.03,2844.78±394.70,12.44±1.62
sngp|1_0.9999_0.0,sngp,-8.99±2.45,3059.90±492.84,13.77±2.01
baseline|1_0.9999_0.0,max_prob,87.65±0.48,983.92±75.64,4.65±0.44


# DPP All 20ng

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.5, 0.6]
comsizes = [50]

dataset_names = ['20newsgroups']
dataset_fnames = ['20newsgroups']
names = []
tables = []
baselines = []
for method in methods:
    for max_frac in max_fracs:
        for cs in comsizes:
            for reg in regs:
                run_dirs = []
                names = [f'{method}|{max_frac}|{cs}']
                for name in dataset_fnames:
                    model_series_dir = f'../workdir/run_glue_for_model_series/electra_raw_no_sn/{name}/0.0/{method}_{max_frac}_50/results/'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print(f'Not exists one of this dirs: {run_dirs}')

../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_dpp_0.3_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_dpp_0.4_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_dpp_0.5_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_dpp_0.6_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_ood_0.3_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_ood_0.4_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_ood_0.5_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_ood_0.6_50/results/


In [None]:
table_dpp = pd.concat([pd.concat(tables), pd.concat(baselines[-1:])])

In [None]:
table_dpp.sort_values(by= ('20newsgroups',             'rcc-auc'))

Unnamed: 0_level_0,Unnamed: 1_level_0,20newsgroups,20newsgroups,20newsgroups
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ddpp_dpp|0.3|50,var.ratio,-2.97±0.17,1093.25±23.79,5.80±0.19
ddpp_ood|0.3|50,var.ratio,-3.05±0.15,1104.95±22.85,5.86±0.20
ddpp_dpp|0.4|50,var.ratio,-3.16±0.14,1123.48±18.32,6.00±0.19
ddpp_ood|0.4|50,var.ratio,-3.20±0.13,1130.50±16.95,6.02±0.22
ddpp_dpp|0.5|50,var.ratio,-3.32±0.15,1146.23±17.34,6.14±0.19
ddpp_ood|0.5|50,var.ratio,-3.32±0.16,1149.10±19.71,6.16±0.16
ddpp_dpp|0.6|50,var.ratio,-3.44±0.13,1165.61±20.28,6.26±0.21
ddpp_ood|0.6|50,var.ratio,-3.45±0.13,1169.64±16.75,6.27±0.18
baseline|50,max_prob,91.38±0.18,473.46±28.89,2.82±0.16
ddpp_dpp|0.6|50,sampled_max_prob,-0.01±0.11,475.87±24.75,2.83±0.15


# 20 ng

In [None]:
import os 

default_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw']
dataset_names = ['20newsgroups']
dataset_fnames = ['20newsgroups']
names = []
tables = []
raw_baselines = {}
for ds_fname, ds_name in zip(dataset_fnames, dataset_names):
    model_series_dir = f'../workdir/run_glue_for_model_series/electra_raw_no_sn/{ds_fname}/0.0/ddpp_dpp_best/'
    table = []
    for metric_type in metric_types:
        metric = choose_metric(metric_type=metric_type)

        agg_res = aggregate_runs(
            model_series_dir, methods=default_methods, metric=metric
        )

        mean_res = agg_res.mean(axis=0)
        final_results = mean_res.T
        table.append(final_results.loc[['max_prob']])
    res_table = pd.concat(table, axis=1)
    res_table.columns = metric_types
    raw_baselines[ds_name] = res_table#
    #raw_baselines[ds_name]={k:v for k,v in zip(res_table.columns.values.tolist(), res_table.values[0].tolist())}

FileNotFoundError: [Errno 2] No such file or directory: '../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_dpp_best/'

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['metric', 'reg', 'raw']
dataset_names = ['20newsgroups']
dataset_fnames = ['20newsgroups']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['sn', 'no_sn']:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}_{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass
table_det = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra_metric_sn/20newsgroups/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_sn/20newsgroups/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_no_sn/20newsgroups/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_sn/20newsgroups/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/mahalanobis


In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp_best', 'ddpp_ood_best', 'mc_all']
regs = ['metric', 'reg', 'raw']
dataset_names = ['20newsgroups']
dataset_fnames = ['20newsgroups']
names = []
tables = []
#baselines = []
for method in methods:
    for reg in regs:
        for sn in ['no_sn']:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}_{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
                #baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass
table_mc = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/ddpp_dpp_best
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/ddpp_dpp_best']
pass
../workdir/run_glue_for_model_series/electra_reg_no_sn/20newsgroups/0.0/ddpp_dpp_best
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_dpp_best
../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/ddpp_ood_best
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/ddpp_ood_best']
pass
../workdir/run_glue_for_model_series/electra_reg_no_sn/20newsgroups/0.0/ddpp_ood_best
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_ood_best
../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/mc_all
../workdir/run_glue_for_model_series/electra_reg_no_sn/20newsgroups/0.0/mc_all
../workdir/run_glue_for_model_series/electra_raw_no_s

In [None]:
import os

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
regs = ['raw']
methods = ['sngp']
dataset_names = ['20newsgroups']
dataset_fnames = ['20newsgroups']

ridge_factors = [1e-3]
momentums = [0.9999]

names = []
tables = []
baselines = []

for method in methods:
    for ridge_factor in ridge_factors:
            for momentum in momentums:
                run_dirs = []
                names = [f'{method}|{ridge_factor}_{momentum}_0.0']
                for name in dataset_fnames:
                    model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp-correct-hp/20newsgroups/{ridge_factor}_{momentum}_0.0/'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                #try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={})
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
                #except:
                #    print(f'Not exists one of this dirs: {run_dirs}')

/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp-correct-hp/20newsgroups/0.001_0.9999_0.0/


In [None]:
def preproc_regs(x):
    return '-'
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SNGP'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif x == 'mahalanobis':
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    return 'sngp'

table_sngp = pd.concat([pd.concat(tables), pd.concat(baselines[-1:])]).reset_index()
table_sngp['Reg. Type'] = table_sngp.Method.apply(lambda x: preproc_regs(x))
table_sngp['Method'] = table_sngp.Method.apply(lambda x: preproc_method(x))
table_sngp['UE Score'] = table_sngp['UE Score'].apply(lambda x: preproc_ue(x))

In [None]:
table_sngp

Unnamed: 0_level_0,Method,UE Score,20newsgroups,20newsgroups,20newsgroups,Reg. Type
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,Unnamed: 6_level_1
0,SNGP,sngp,-2.14±0.47,1551.42±332.67,6.39±0.63,-
1,SR SN,sngp,89.35±0.34,799.33±52.98,4.13±0.25,-


In [None]:
table_sngp.to_dict()

{('Method', ''): {0: 'SNGP', 1: 'SR SN'},
 ('UE Score', ''): {0: 'sngp', 1: 'sngp'},
 ('20newsgroups', 'rejection-curve-auc'): {0: '-2.14±0.47', 1: '89.35±0.34'},
 ('20newsgroups', 'rcc-auc'): {0: '1551.42±332.67', 1: '799.33±52.98'},
 ('20newsgroups', 'rpp'): {0: '6.39±0.63', 1: '4.13±0.25'},
 ('Reg. Type', ''): {0: '-', 1: '-'}}

In [None]:
res_table = pd.concat([table_mc.iloc[:-2], table_det.iloc[:-2], pd.concat(baselines[-6:])]).reset_index(drop=False)

NameError: name 'pd' is not defined

In [None]:
res_table = res_table[res_table['UE Score']!='var.ratio']

In [None]:
def preproc_regs(x):
    reg = x.split('|')[-1].split('_')[0]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif x == 'mahalanobis':
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    return 'MD'

res_table['Reg. Type'] = res_table.Method.apply(lambda x: preproc_regs(x))
res_table['Method'] = res_table.Method.apply(lambda x: preproc_method(x))
res_table['UE Score'] = res_table['UE Score'].apply(lambda x: preproc_ue(x))

In [None]:
res_table = res_table[list(res_table.columns[:1]) + list(res_table.columns[-1:]) + list(res_table.columns[1:-1])].reset_index(drop=True)

In [None]:
res_table

Unnamed: 0_level_0,Method,Reg. Type,UE Score,20newsgroups,20newsgroups,20newsgroups
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
0,DDPP (+DPP) (ours),CER,BALD,-0.11±0.22,463.87±35.71,2.78±0.22
1,DDPP (+DPP) (ours),CER,SMP,-0.03±0.16,443.63±30.31,2.71±0.17
2,DDPP (+DPP) (ours),CER,PV,-0.15±0.19,466.33±28.88,2.82±0.19
3,DDPP (+DPP) (ours),-,BALD,-0.03±0.18,441.21±22.50,2.64±0.17
4,DDPP (+DPP) (ours),-,SMP,-0.01±0.16,438.45±30.94,2.63±0.16
5,DDPP (+DPP) (ours),-,PV,-0.01±0.19,436.32±26.77,2.63±0.17
6,DDPP (+OOD) (ours),CER,BALD,-0.15±0.16,465.83±25.44,2.81±0.15
7,DDPP (+OOD) (ours),CER,SMP,-0.15±0.20,467.87±37.65,2.82±0.18
8,DDPP (+OOD) (ours),CER,PV,-0.17±0.20,469.24±30.78,2.83±0.18
9,DDPP (+OOD) (ours),-,BALD,-0.14±0.33,459.79±47.69,2.76±0.33


In [None]:
res_table.iloc[list(range(12)) + list(range(27, 33))]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,20newsgroups,20newsgroups,20newsgroups
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
0,DDPP (+DPP) (ours),CER,BALD,-0.11±0.22,463.87±35.71,2.78±0.22
1,DDPP (+DPP) (ours),CER,SMP,-0.03±0.16,443.63±30.31,2.71±0.17
2,DDPP (+DPP) (ours),CER,PV,-0.15±0.19,466.33±28.88,2.82±0.19
3,DDPP (+DPP) (ours),-,BALD,-0.03±0.18,441.21±22.50,2.64±0.17
4,DDPP (+DPP) (ours),-,SMP,-0.01±0.16,438.45±30.94,2.63±0.16
5,DDPP (+DPP) (ours),-,PV,-0.01±0.19,436.32±26.77,2.63±0.17
6,DDPP (+OOD) (ours),CER,BALD,-0.15±0.16,465.83±25.44,2.81±0.15
7,DDPP (+OOD) (ours),CER,SMP,-0.15±0.20,467.87±37.65,2.82±0.18
8,DDPP (+OOD) (ours),CER,PV,-0.17±0.20,469.24±30.78,2.83±0.18
9,DDPP (+OOD) (ours),-,BALD,-0.14±0.33,459.79±47.69,2.76±0.33


In [None]:
res_table.iloc[list(range(12,33))]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,20newsgroups,20newsgroups,20newsgroups
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
12,MC dropout,metric,BALD,-0.04±0.12,497.47±73.40,2.71±0.27
13,MC dropout,metric,SMP,-0.06±0.16,493.36±95.29,2.73±0.33
14,MC dropout,metric,PV,-0.05±0.14,495.59±87.65,2.74±0.30
15,MC dropout,CER,BALD,0.47±0.09,367.30±17.41,2.18±0.09
16,MC dropout,CER,SMP,0.46±0.10,368.76±20.08,2.19±0.10
17,MC dropout,CER,PV,0.44±0.09,369.50±19.59,2.21±0.09
18,MC dropout,-,BALD,0.47±0.08,364.66±10.97,2.15±0.08
19,MC dropout,-,SMP,0.48±0.09,364.41±15.67,2.14±0.09
20,MC dropout,-,PV,0.46±0.09,363.92±12.14,2.16±0.08
21,MD SN (ours),metric,MD,0.61±0.08,351.46±12.24,2.00±0.08


In [None]:
def calc_rejection_table(
    probabilities,
    labels,
    sampled_probabilities,
    model_answers,
    methods,
    ratio_list=None,
):
    if ratio_list is None:
        ratio_list = [0.0, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]

    predictions = np.argmax(probabilities, axis=-1)
    errors = (labels != predictions).astype("uint8")

    model_ues = 1 - np.max(probabilities, axis=1)
    sorted_indexes_model = np.argsort(-model_ues)

    results = {}
    model_scores = [
        get_score_ratio(sorted_indexes_model, model_answers, labels, ratio, drop=True)
        for ratio in ratio_list
    ]
    results["max_prob"] = model_scores

    for name, method_function in methods.items():
        ue_scores = method_function(sampled_probabilities)

        ensemble_answers = np.asarray(sampled_probabilities).mean(1).argmax(-1)
        sorted_indexes_ensemble = np.argsort(-ue_scores)

        ens_scores = [
            get_score_ratio(
                sorted_indexes_ensemble, ensemble_answers, labels, ratio, drop=True
            )
            for ratio in ratio_list
        ]
        results[name] = ens_scores
    results = pd.DataFrame(results).T
    results.columns = [f"{int(ratio*100)}%" for ratio in ratio_list]
    return results


def extract_result_arc_tab(time_dir, methods, metric="roc-auc", oos=False):
    with open(Path(time_dir) / "dev_inference.json") as f:
        model_outputs = json.loads(f.read())

    return calc_rejection_table(
        np.asarray(model_outputs["probabilities"]),
        np.asarray(model_outputs["true_labels"]),
        np.asarray(model_outputs["sampled_probabilities"]).transpose(1, 0, 2),
        np.asarray(model_outputs["answers"]),
        methods=methods,
        # ratio_list=[0.0, 0.1, 0.2, 0.3]
    )


def extract_result_arc_tab_de(model_path, methods, metric="roc-auc"):
    probs = []
    for run_seed in os.listdir(model_path):
        run_dir = model_path / run_seed

        try:
            with open(Path(run_dir) / "dev_inference.json") as f:
                model_outputs = json.loads(f.read())

            probs.append(np.asarray(model_outputs["probabilities"]))

        except FileNotFoundError:
            pass
        except:
            continue

    return calc_rejection_table(
        np.asarray(model_outputs["probabilities"]),
        np.asarray(model_outputs["true_labels"]),
        np.asarray(np.asarray(probs).transpose(1, 0, 2)),
        np.asarray(model_outputs["answers"]),
        methods=methods,
        # ratio_list=[0.0, 0.1, 0.2, 0.3]
    )


def aggregate_runs_rejection_table(data_path, methods, de=False):
    results = []
    for model_seed in os.listdir(data_path):
        try:
            model_seed_int = int(model_seed)
        except:
            continue

        model_path = Path(data_path) / model_seed

        if de:
            results.append(extract_result_arc_tab_de(model_path, methods))
            continue

        for run_seed in os.listdir(model_path):
            run_dir = model_path / run_seed
            try:
                results.append(extract_result_arc_tab(run_dir, methods=methods))
            except FileNotFoundError:
                pass
            except:
                continue

    results = pd.concat(results, axis=0)

    return results

In [None]:
path = '../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/ddpp_ood_best/results/'
default_methods = {
        "bald": bald,
        "entropy": mean_entropy,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
}

In [None]:
aggregate_runs_rejection_table(path, default_methods)

FileNotFoundError: [Errno 2] No such file or directory: '../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/ddpp_ood_best/results/'

# DeBERTA

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.5, 0.6]
comsizes = [50]

dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for max_frac in max_fracs:
        for cs in comsizes:
            for reg in regs:
                run_dirs = []
                names = [f'ddpp_{method}|{max_frac}|{cs}']
                for name in dataset_fnames:
                    model_series_dir = f'../workdir/run_glue_for_model_series_dpp_hp/deberta_{reg}_no_sn/{name}/0.0/{method}_{max_frac}_{cs}/'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print(f'Not exists one of this dirs: {run_dirs}')

../workdir/run_glue_for_model_series_dpp_hp/deberta_raw_no_sn/mrpc/0.0/ddpp_dpp_0.3_50/
../workdir/run_glue_for_model_series_dpp_hp/deberta_raw_no_sn/cola/0.0/ddpp_dpp_0.3_50/
../workdir/run_glue_for_model_series_dpp_hp/deberta_raw_no_sn/sst2/0.0/ddpp_dpp_0.3_50/
../workdir/run_glue_for_model_series_dpp_hp/deberta_raw_no_sn/mrpc/0.0/ddpp_dpp_0.4_50/
../workdir/run_glue_for_model_series_dpp_hp/deberta_raw_no_sn/cola/0.0/ddpp_dpp_0.4_50/
../workdir/run_glue_for_model_series_dpp_hp/deberta_raw_no_sn/sst2/0.0/ddpp_dpp_0.4_50/
../workdir/run_glue_for_model_series_dpp_hp/deberta_raw_no_sn/mrpc/0.0/ddpp_dpp_0.5_50/
../workdir/run_glue_for_model_series_dpp_hp/deberta_raw_no_sn/cola/0.0/ddpp_dpp_0.5_50/
../workdir/run_glue_for_model_series_dpp_hp/deberta_raw_no_sn/sst2/0.0/ddpp_dpp_0.5_50/
../workdir/run_glue_for_model_series_dpp_hp/deberta_raw_no_sn/mrpc/0.0/ddpp_dpp_0.6_50/
../workdir/run_glue_for_model_series_dpp_hp/deberta_raw_no_sn/cola/0.0/ddpp_dpp_0.6_50/
../workdir/run_glue_for_model_se

In [None]:
table_dpp = pd.concat([pd.concat(tables), pd.concat(baselines[-1:])])

In [None]:
table_dpp.sort_values(by= ('SST-2', 'rcc-auc'))

Unnamed: 0_level_0,Unnamed: 1_level_0,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
ddpp_ddpp_dpp|0.3|50,sampled_max_prob,-0.21±0.19,54.88±15.60,3.03±0.59,-0.20±0.10,135.35±9.14,3.70±0.15,0.07±0.06,36.74±5.44,1.48±0.26
ddpp_ddpp_dpp|0.3|50,bald,-0.24±0.31,56.43±16.19,3.09±0.57,-0.22±0.14,136.40±9.88,3.73±0.17,0.01±0.18,37.76±5.31,1.54±0.25
baseline|50,max_prob,91.76±0.48,48.05±14.96,2.83±0.50,91.05±0.16,129.15±7.33,3.49±0.15,93.42±0.32,37.91±6.20,1.54±0.29
ddpp_ddpp_dpp|0.5|50,bald,-0.64±0.65,62.74±15.32,3.46±0.62,-0.11±0.08,133.78±9.54,3.62±0.12,0.06±0.07,37.94±7.78,1.49±0.28
ddpp_ddpp_dpp|0.5|50,variance,-0.60±0.65,62.27±15.35,3.42±0.61,-0.10±0.09,133.13±9.54,3.61±0.12,0.05±0.03,38.06±7.22,1.50±0.28
ddpp_ddpp_dpp|0.6|50,sampled_max_prob,-0.19±0.31,55.24±13.72,3.03±0.50,-0.04±0.05,131.57±8.32,3.55±0.17,0.02±0.12,38.06±7.47,1.53±0.33
ddpp_ddpp_dpp|0.3|50,variance,-0.19±0.30,55.16±16.55,3.03±0.60,-0.21±0.14,136.25±9.34,3.72±0.16,-0.00±0.16,38.22±5.77,1.56±0.26
ddpp_ddpp_dpp|0.5|50,sampled_max_prob,-0.41±0.40,59.24±14.41,3.25±0.56,-0.07±0.07,132.96±8.87,3.58±0.14,0.01±0.05,38.51±7.22,1.54±0.31
ddpp_ddpp_dpp|0.6|50,bald,-0.12±0.59,53.32±13.00,2.96±0.49,-0.10±0.07,132.85±9.34,3.60±0.17,-0.00±0.21,38.69±9.54,1.53±0.37
ddpp_ddpp_dpp|0.6|50,variance,-0.06±0.53,51.46±15.07,2.90±0.55,-0.08±0.07,132.21±9.34,3.58±0.18,-0.00±0.18,38.83±9.24,1.55±0.37


In [None]:
{'MPRC': {'ddpp_ood': 0.6, 'ddpp_dpp': 0.6},
 'CoLA': {'ddpp_ood': 0.6, 'ddpp_dpp': 0.6}
 'SST-2': {'ddpp_ood': 0.6, 'ddpp_dpp': 0.3}}

# DeBERTA ALL

In [None]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")

def print_data(x):
    print(x.shape)
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_ood', 'ddpp_dpp', 'mc_all', 'mahalanobis', 'nuq', 'mc_mahalanobis']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            if sn == 'sn' and method not in ['nuq', 'mahalanobis', 'mc_mahalanobis']:
                continue
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}_{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/deberta_{reg}_{sn}/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            agg_func = choose_agg_func(method)
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass

../workdir/run_glue_for_model_series/deberta_raw_no_sn/mrpc/0.0/ddpp_ood
../workdir/run_glue_for_model_series/deberta_raw_no_sn/cola/0.0/ddpp_ood
../workdir/run_glue_for_model_series/deberta_raw_no_sn/sst2/0.0/ddpp_ood
../workdir/run_glue_for_model_series/deberta_reg_no_sn/mrpc/0.0/ddpp_ood
../workdir/run_glue_for_model_series/deberta_reg_no_sn/cola/0.0/ddpp_ood
../workdir/run_glue_for_model_series/deberta_reg_no_sn/sst2/0.0/ddpp_ood
../workdir/run_glue_for_model_series/deberta_metric_no_sn/mrpc/0.0/ddpp_ood
../workdir/run_glue_for_model_series/deberta_metric_no_sn/cola/0.0/ddpp_ood
../workdir/run_glue_for_model_series/deberta_metric_no_sn/sst2/0.0/ddpp_ood
../workdir/run_glue_for_model_series/deberta_raw_no_sn/mrpc/0.0/ddpp_dpp
../workdir/run_glue_for_model_series/deberta_raw_no_sn/cola/0.0/ddpp_dpp
../workdir/run_glue_for_model_series/deberta_raw_no_sn/sst2/0.0/ddpp_dpp
../workdir/run_glue_for_model_series/deberta_reg_no_sn/mrpc/0.0/ddpp_dpp
../workdir/run_glue_for_model_series/deber

In [None]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']

dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
run_dirs = []
for name in dataset_fnames:      
    names = [f'Deep Ensemble|raw_no_sn']
    model_series_dir = f'../workdir/run_glue_for_ensemble_series/deberta/{name}/final_results/'
    print(model_series_dir)
    run_dirs.append([model_series_dir])
ens_tab = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={})

../workdir/run_glue_for_ensemble_series/deberta/mrpc/final_results/
../workdir/run_glue_for_ensemble_series/deberta/cola/final_results/
../workdir/run_glue_for_ensemble_series/deberta/sst2/final_results/


In [None]:
table_all = pd.concat([pd.concat(tables), ens_tab, pd.concat(baselines[-6:])]).reset_index()

In [None]:
def preproc_regs(x):
    reg = x.split('|')[-1].split('_')[0]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1].split('_', 1)[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'mc_mahalanobis' and not 'no_sn' in sn:
        return 'SMD SN (ours)'
    elif method == 'mc_mahalanobis':
        return 'SMD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'Deep Ensemble'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MP'

table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_regs(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [None]:
table_all.iloc[:18]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DDPP (+OOD) (ours),-,BALD,0.05±0.45,21.06±4.31,2.57±0.48,0.05±0.28,78.68±8.22,3.28±0.22,-0.19±0.19,22.97±3.58,1.40±0.19
1,DDPP (+OOD) (ours),-,SMP,0.13±0.30,19.94±3.98,2.52±0.44,0.03±0.29,77.92±7.60,3.28±0.22,0.05±0.23,18.61±4.51,1.15±0.25
2,DDPP (+OOD) (ours),-,PV,0.12±0.43,20.63±4.40,2.54±0.45,0.02±0.30,79.29±7.83,3.30±0.22,-0.10±0.17,21.56±3.37,1.31±0.18
3,DDPP (+OOD) (ours),CER,BALD,-0.46±0.35,20.60±5.15,2.57±0.23,-0.35±0.27,88.64±7.29,3.55±0.25,-0.39±0.32,23.10±5.51,1.44±0.31
4,DDPP (+OOD) (ours),CER,SMP,-0.22±0.36,18.44±4.85,2.36±0.32,-0.32±0.19,88.18±6.02,3.52±0.22,-0.14±0.38,19.45±6.77,1.18±0.38
5,DDPP (+OOD) (ours),CER,PV,-0.38±0.31,19.79±4.63,2.51±0.20,-0.36±0.23,88.98±6.54,3.55±0.24,-0.29±0.32,21.91±5.92,1.35±0.33
6,DDPP (+OOD) (ours),metric,BALD,-0.06±0.09,39.88±4.92,3.98±0.29,-1.03±0.29,116.62±8.03,4.53±0.25,-0.09±0.38,26.54±3.82,1.63±0.24
7,DDPP (+OOD) (ours),metric,SMP,-0.07±0.09,40.17±5.32,3.99±0.30,-0.98±0.27,115.79±7.88,4.49±0.23,0.35±0.11,23.12±7.98,1.19±0.31
8,DDPP (+OOD) (ours),metric,PV,-0.07±0.10,39.91±5.00,3.99±0.30,-1.02±0.29,116.75±8.01,4.53±0.25,0.23±0.37,23.15±2.87,1.32±0.24
9,DDPP (+DPP) (ours),-,BALD,0.16±0.38,20.76±4.91,2.49±0.39,-0.22±0.15,73.31±4.42,3.54±0.17,0.03±0.19,18.15±3.45,1.18±0.23


In [None]:
table_all.iloc[18:27]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
18,MC dropout,-,BALD,0.49±0.35,19.88±6.47,2.20±0.53,0.63±0.16,59.29±1.82,2.76±0.07,0.32±0.13,13.93±2.94,0.89±0.16
19,MC dropout,-,SMP,0.37±0.26,20.83±4.81,2.30±0.50,0.67±0.17,56.95±2.67,2.69±0.11,0.27±0.09,14.25±2.14,0.95±0.16
20,MC dropout,-,PV,0.47±0.35,20.27±6.63,2.22±0.56,0.65±0.15,58.45±2.01,2.74±0.08,0.32±0.12,13.81±2.54,0.89±0.15
21,MC dropout,CER,BALD,0.48±0.30,13.62±2.69,1.68±0.19,0.64±0.22,58.55±4.94,2.70±0.19,0.25±0.20,11.91±2.38,0.80±0.16
22,MC dropout,CER,SMP,0.43±0.24,13.68±3.34,1.68±0.21,0.70±0.19,56.48±3.52,2.63±0.18,0.24±0.16,11.49±1.94,0.81±0.13
23,MC dropout,CER,PV,0.48±0.25,13.56±2.93,1.68±0.20,0.66±0.23,57.47±4.34,2.67±0.19,0.27±0.19,11.62±2.14,0.80±0.15
24,MC dropout,metric,BALD,1.85±0.47,17.73±3.29,2.04±0.36,0.06±0.52,86.19±14.47,3.48±0.41,0.51±0.29,17.28±2.18,1.02±0.10
25,MC dropout,metric,SMP,1.85±0.42,16.94±2.80,2.02±0.34,0.16±0.52,82.16±15.01,3.38±0.41,0.53±0.29,16.99±3.28,1.01±0.13
26,MC dropout,metric,PV,1.85±0.45,17.61±3.31,2.04±0.35,0.11±0.52,84.91±14.52,3.44±0.41,0.53±0.28,16.75±2.61,1.00±0.11


In [None]:
table_all.iloc[27:51]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
27,MD SN (ours),-,MD,0.76±0.59,12.28±0.81,1.73±0.12,0.39±0.19,54.40±4.49,2.66±0.20,0.33±0.25,12.46±2.12,0.90±0.15
28,MD,-,MD,0.44±0.17,16.56±3.77,2.21±0.40,0.52±0.10,58.02±4.92,2.80±0.20,0.32±0.18,13.00±3.41,0.89±0.22
29,MD SN (ours),CER,MD,0.14±0.14,13.10±1.74,1.87±0.27,0.31±0.07,54.46±4.71,2.56±0.25,0.25±0.21,10.50±2.22,0.77±0.18
30,MD,CER,MD,0.33±0.20,13.14±2.11,1.83±0.26,0.45±0.18,58.41±3.94,2.75±0.16,0.20±0.25,11.85±1.69,0.86±0.14
31,MD SN (ours),metric,MD,1.14±0.31,13.17±2.77,1.83±0.33,0.28±0.20,68.60±4.54,3.22±0.17,0.39±0.20,13.55±1.96,0.95±0.13
32,MD,metric,MD,2.04±0.23,13.35±1.40,1.87±0.18,0.74±0.27,58.64±6.04,2.77±0.27,0.43±0.39,17.19±1.44,1.10±0.11
33,NUQ SN,-,aleatoric,0.62±0.65,12.89±0.87,1.85±0.17,0.26±0.23,63.77±5.60,2.78±0.11,0.31±0.23,12.87±1.48,0.89±0.10
34,NUQ SN,-,epistemic,0.29±0.47,14.75±0.60,2.18±0.13,-0.23±0.23,64.44±5.35,3.25±0.25,0.24±0.22,14.12±2.07,0.96±0.14
35,NUQ SN,-,total,0.42±0.60,14.15±0.88,2.07±0.16,0.01±0.28,61.28±6.62,3.03±0.37,0.30±0.22,13.49±1.65,0.92±0.12
36,NUQ,-,aleatoric,0.32±0.24,17.68±4.85,2.33±0.57,0.33±0.26,69.26±8.05,2.99±0.21,0.30±0.13,13.40±3.39,0.91±0.21


In [None]:
table_all.iloc[51:]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
51,SMD SN (ours),-,SMD,-0.11±0.59,18.99±2.71,2.61±0.22,-1.01±0.30,85.64±16.18,4.10±0.46,-1.15±0.35,41.71±10.59,2.40±0.39
52,SMD,-,SMD,-0.49±0.39,29.52±7.92,3.16±0.63,-0.44±0.19,88.67±12.66,3.80±0.21,-0.87±0.42,35.13±10.09,2.10±0.52
53,SMD SN (ours),CER,SMD,-0.67±0.35,19.47±2.86,2.67±0.35,-0.74±0.14,77.23±9.08,3.65±0.32,-0.97±0.38,32.40±10.29,2.03±0.40
54,SMD,CER,SMD,-0.11±0.44,17.25±3.16,2.29±0.35,-0.21±0.37,74.77±4.51,3.45±0.14,-1.67±0.47,49.76±9.36,2.79±0.29
55,SMD SN (ours),metric,SMD,-0.82±0.42,34.91±3.95,3.82±0.41,-1.40±0.40,123.81±13.43,4.99±0.18,-2.99±0.40,108.87±13.58,4.42±0.40
56,SMD,metric,SMD,-0.37±0.84,37.74±7.46,4.32±0.73,-1.66±0.63,125.59±15.66,5.26±0.62,-2.58±0.70,101.85±20.60,4.21±0.59
57,Deep Ensemble,-,BALD,0.62±0.26,20.56±3.24,2.14±0.28,0.50±0.37,86.64±19.56,3.41±0.29,0.37±0.13,12.89±2.34,0.77±0.03
58,Deep Ensemble,-,SMP,1.09±0.33,13.57±2.14,1.69±0.22,0.89±0.29,64.43±3.39,3.05±0.18,0.36±0.20,11.32±0.73,0.76±0.07
59,Deep Ensemble,-,PV,0.77±0.29,18.71±2.85,2.02±0.26,0.61±0.34,81.47±15.27,3.32±0.26,0.38±0.15,11.90±1.14,0.75±0.04
60,SR (baseline),-,MP,92.07±0.40,23.59±7.09,2.66±0.44,90.44±0.22,82.36±4.89,3.88±0.19,93.91±0.12,17.12±2.09,1.13±0.10


In [None]:
table_all.to_csv('../../deberta_all.csv', header=True, index=False)

In [None]:
table_all.iloc[list(range(6))+list(range(18, 24))+list(range(27, 39))+list(range(57, 67))]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DDPP (+OOD) (ours),-,BALD,0.05±0.45,21.06±4.31,2.57±0.48,0.05±0.28,78.68±8.22,3.28±0.22,-0.19±0.19,22.97±3.58,1.40±0.19
1,DDPP (+OOD) (ours),-,SMP,0.13±0.30,19.94±3.98,2.52±0.44,0.03±0.29,77.92±7.60,3.28±0.22,0.05±0.23,18.61±4.51,1.15±0.25
2,DDPP (+OOD) (ours),-,PV,0.12±0.43,20.63±4.40,2.54±0.45,0.02±0.30,79.29±7.83,3.30±0.22,-0.10±0.17,21.56±3.37,1.31±0.18
3,DDPP (+OOD) (ours),CER,BALD,-0.46±0.35,20.60±5.15,2.57±0.23,-0.35±0.27,88.64±7.29,3.55±0.25,-0.39±0.32,23.10±5.51,1.44±0.31
4,DDPP (+OOD) (ours),CER,SMP,-0.22±0.36,18.44±4.85,2.36±0.32,-0.32±0.19,88.18±6.02,3.52±0.22,-0.14±0.38,19.45±6.77,1.18±0.38
5,DDPP (+OOD) (ours),CER,PV,-0.38±0.31,19.79±4.63,2.51±0.20,-0.36±0.23,88.98±6.54,3.55±0.24,-0.29±0.32,21.91±5.92,1.35±0.33
18,MC dropout,-,BALD,0.49±0.35,19.88±6.47,2.20±0.53,0.63±0.16,59.29±1.82,2.76±0.07,0.32±0.13,13.93±2.94,0.89±0.16
19,MC dropout,-,SMP,0.37±0.26,20.83±4.81,2.30±0.50,0.67±0.17,56.95±2.67,2.69±0.11,0.27±0.09,14.25±2.14,0.95±0.16
20,MC dropout,-,PV,0.47±0.35,20.27±6.63,2.22±0.56,0.65±0.15,58.45±2.01,2.74±0.08,0.32±0.12,13.81±2.54,0.89±0.15
21,MC dropout,CER,BALD,0.48±0.30,13.62±2.69,1.68±0.19,0.64±0.22,58.55±4.94,2.70±0.19,0.25±0.20,11.91±2.38,0.80±0.16


In [34]:
import os 
from sklearn.metrics import accuracy_score

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    results["mahalanobis_distance"] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")

def print_data(x):
    print(x.shape)
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['accuracy', "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for use_sn_params in spectralnorm + ['default']:
                for eval_init in ['eval_init', 'no_eval_init']:
                    for head in ['', '_raw_head']:
                        run_dirs = []
                        name_sn = ''
                        names = [f'{method}_{reg}|{sn}|{use_sn_params}|{eval_init}{head}']
                        for name in dataset_fnames:
                            model_series_dir = f'../workdir/run_glue_for_model_series_tmp/electra_{reg}_{sn}_{use_sn_params}_{eval_init}{head}/{name}/0.0/{method}'
                            print(model_series_dir)
                            run_dirs.append([model_series_dir])
                        agg_func = choose_agg_func(method)
                        try:
                            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                            baselines.append(res_df.iloc[-1:])
                            tables.append(res_df.iloc[:-1])
                        except:
                            print('pass')
                            pass

../workdir/run_glue_for_model_series_tmp/electra_raw_sn_sn_eval_init/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp/electra_raw_sn_sn_eval_init/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp/electra_raw_sn_sn_eval_init/sst2/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp/electra_raw_sn_sn_eval_init_raw_head/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp/electra_raw_sn_sn_eval_init_raw_head/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp/electra_raw_sn_sn_eval_init_raw_head/sst2/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp/electra_raw_sn_sn_no_eval_init/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp/electra_raw_sn_sn_no_eval_init/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp/electra_raw_sn_sn_no_eval_init/sst2/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp/electra_raw_sn_sn_no_eval_init_raw_head/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp/electr

In [35]:
table_all = pd.concat([pd.concat(tables)]).reset_index()

In [36]:
def preproc_params(x):
    sn_params = x.split('|')[2]
    if sn_params == 'no_sn':
        return 'Optimal NO SN'
    elif sn_params == 'default':
        return 'default'
    else:
        return 'Optimal SN'
    
def preproc_method(x):
    method = x.split('|')[0].split('_')[0]
    sn = x.split('|')[1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_reg(x):
    reg = x.split('|')[0].split('_')[-1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'metric':
        return 'metric'
    return '-'

def preproc_method(x):
    method = x.split('|')[0].split('_')[0]
    sn = x.split('|')[1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MP'

def preproc_eval_init(x):
    sn = x.split('|')[1]
    eval_init = x.split('|')[-1]
    if sn == 'no_sn':
        return '-'
    elif sn == 'sn' and 'no_eval_init' in eval_init and 'raw_head' in eval_init:
        return '-'
    elif sn == 'sn' and 'no_eval_init' in eval_init:
        return 'NO SN init'
    elif sn == 'sn' and 'eval_init' in eval_init:
        return 'SN init'
    return '-'

def preproc_eval_head(x):
    sn = x.split('|')[1]
    head = x.split('|')[-1]
    if sn == 'no_sn':
        return '-'
    elif sn == 'sn' and 'raw_head' in head:
        return 'Raw HEAD'
    elif sn == 'sn' and 'raw_head' not in head:
        return 'SN HEAD'
    return '-'

def is_correct(x):
    sn = x.split('|')[1]
    head = x.split('|')[-1]
    sn_params = x.split('|')[2]
    if sn == 'no_sn':
        return '+'
    elif sn == 'sn' and 'raw_head' not in head and 'no_eval_init' not in head and sn_params=='sn':
        return '+'
    return '-'

table_all['Train HP'] = table_all.Method.apply(lambda x: preproc_params(x))
table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_reg(x))
#table_all['Eval. Init'] = table_all.Method.apply(lambda x: preproc_eval_init(x))
#table_all['Eval. Head'] = table_all.Method.apply(lambda x: preproc_eval_head(x))
#table_all['Correct Eval'] = table_all.Method.apply(lambda x: is_correct(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
#table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [37]:
table_all.iloc[[0, 3, 6, 7, 8] + list(range(9, 14))]

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2,Train HP,Reg. Type
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp,Unnamed: 12_level_1,Unnamed: 13_level_1
0,MD SN (ours),MD,87.54±0.63,12.32±1.03,1.65±0.16,86.37±0.37,44.77±2.63,2.09±0.13,92.47±1.02,13.00±2.04,0.87±0.12,Optimal SN,-
3,MD SN (ours),MD,88.36±0.74,13.97±2.02,1.91±0.31,86.58±0.52,41.47±1.94,1.97±0.10,93.08±0.27,11.99±2.04,0.83±0.14,Optimal NO SN,-
6,MD SN (ours),MD,88.52±0.68,11.13±1.85,1.54±0.26,86.26±0.82,40.81±2.02,2.00±0.07,93.04±0.94,11.38±1.44,0.80±0.11,default,-
7,MD,MD,88.85±0.77,13.35±2.01,1.87±0.28,86.35±0.48,41.90±1.49,1.95±0.10,93.04±0.36,11.89±2.20,0.82±0.15,Optimal NO SN,-
8,MD,MD,88.60±0.69,12.40±2.06,1.71±0.27,86.32±0.74,40.90±2.42,2.01±0.07,93.00±0.64,12.34±1.76,0.85±0.13,default,-
9,MD SN (ours),MD,87.66±0.90,12.86±0.84,1.71±0.15,86.24±0.34,52.11±3.17,2.40±0.11,92.87±0.57,14.32±0.95,0.97±0.04,Optimal SN,CER
10,MD SN (ours),MD,88.52±0.88,14.86±5.16,1.86±0.26,86.64±0.48,39.54±1.64,1.94±0.11,93.21±0.55,12.02±1.36,0.83±0.07,Optimal NO SN,CER
11,MD SN (ours),MD,89.01±1.01,13.48±1.59,1.90±0.27,85.78±0.51,43.80±4.91,2.14±0.24,92.97±0.76,12.59±1.98,0.87±0.09,default,CER
12,MD,MD,88.36±0.83,13.30±3.42,1.71±0.24,86.51±0.84,40.30±3.86,1.97±0.14,93.62±0.67,11.12±1.62,0.80±0.11,Optimal NO SN,CER
13,MD,MD,87.75±0.69,13.50±2.60,1.81±0.41,85.89±0.45,46.41±5.14,2.32±0.24,92.39±0.72,14.17±2.13,0.95±0.11,default,CER


In [12]:
import os 
from sklearn.metrics import accuracy_score

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    results["mahalanobis_distance"] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")

def print_data(x):
    print(x.shape)
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['accuracy', "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for use_sn_params in spectralnorm + ['default']:
                for eval_init in ['eval_init', 'no_eval_init']:
                    for head in ['', '_raw_head']:
                        run_dirs = []
                        name_sn = ''
                        names = [f'{method}_{reg}|{sn}|{use_sn_params}|{eval_init}{head}']
                        for name in dataset_fnames:
                            model_series_dir = f'../workdir/run_glue_for_model_series_tmp_new/electra_{reg}_{sn}_{use_sn_params}_{eval_init}{head}/{name}/0.0/{method}'
                            print(model_series_dir)
                            run_dirs.append([model_series_dir])
                        agg_func = choose_agg_func(method)
                        try:
                            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                            baselines.append(res_df.iloc[-1:])
                            tables.append(res_df.iloc[:-1])
                        except:
                            print('pass')
                            pass

../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn_eval_init/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn_eval_init/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn_eval_init/sst2/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn_eval_init_raw_head/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn_eval_init_raw_head/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn_eval_init_raw_head/sst2/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn_no_eval_init/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn_no_eval_init/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn_no_eval_init/sst2/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn_no_eval_init_raw_head/mrpc/0.0/mahalanobis
..

In [13]:
table_all = pd.concat([pd.concat(tables)]).reset_index()

In [14]:
table_all['Train HP'] = table_all.Method.apply(lambda x: preproc_params(x))
table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_reg(x))
#table_all['Eval. Init'] = table_all.Method.apply(lambda x: preproc_eval_init(x))
#table_all['Eval. Head'] = table_all.Method.apply(lambda x: preproc_eval_head(x))
#table_all['Correct Eval'] = table_all.Method.apply(lambda x: is_correct(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
#table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [15]:
table_all

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2,Train HP,Reg. Type
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp,Unnamed: 12_level_1,Unnamed: 13_level_1
0,MD SN (ours),MD,87.95±0.63,13.64±1.22,1.82±0.15,86.64±0.45,42.38±2.10,1.99±0.05,92.91±0.75,12.74±2.47,0.87±0.19,Optimal SN,-
1,MD SN (ours),MD,88.03±1.73,13.58±2.32,1.87±0.22,86.63±0.23,41.86±1.94,1.99±0.09,92.79±0.93,13.70±1.66,0.95±0.09,Optimal NO SN,-
2,MD SN (ours),MD,88.52±0.68,11.13±1.85,1.54±0.26,86.26±0.82,40.81±2.02,2.00±0.07,93.04±0.94,11.38±1.44,0.80±0.11,default,-
3,MD,MD,88.52±1.52,12.34±3.23,1.66±0.41,86.45±0.14,42.41±1.50,1.99±0.10,92.81±0.85,13.38±1.34,0.94±0.09,Optimal NO SN,-
4,MD,MD,88.60±0.69,12.40±2.06,1.71±0.27,86.32±0.74,40.90±2.42,2.01±0.07,93.00±0.64,12.34±1.76,0.85±0.13,default,-
5,MD SN (ours),MD,87.91±0.90,12.64±1.26,1.69±0.26,86.47±0.65,45.05±1.35,2.13±0.09,93.31±0.30,10.31±1.96,0.73±0.15,Optimal SN,CER
6,MD SN (ours),MD,87.83±0.63,12.38±2.00,1.65±0.36,86.50±0.32,42.40±2.13,2.03±0.11,93.54±0.54,11.67±1.53,0.83±0.13,Optimal NO SN,CER
7,MD SN (ours),MD,89.01±1.01,13.48±1.59,1.90±0.27,85.78±0.51,43.80±4.91,2.14±0.24,92.97±0.76,12.59±1.98,0.87±0.09,default,CER
8,MD,MD,88.60±0.51,12.85±1.66,1.76±0.23,86.42±0.29,42.62±2.27,2.00±0.10,93.37±0.44,11.56±2.06,0.81±0.17,Optimal NO SN,CER
9,MD,MD,87.75±0.69,13.50±2.60,1.81±0.41,85.89±0.45,46.41±5.14,2.32±0.24,92.39±0.72,14.17±2.13,0.95±0.11,default,CER


In [32]:
import os 
from sklearn.metrics import accuracy_score

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    results["mahalanobis_distance"] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")

def print_data(x):
    print(x.shape)
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['accuracy', "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for use_sn_params in spectralnorm:
                run_dirs = []
                name_sn = ''
                names = [f'{method}_{reg}|{sn}|{use_sn_params}']
                for name in dataset_fnames:
                    model_series_dir = f'../workdir/run_glue_for_model_series_tmp_new/electra_{reg}_{sn}_{use_sn_params}/{name}/0.0/{method}'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                agg_func = choose_agg_func(method)
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print('pass')
                    pass

../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_sn/sst2/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_no_sn/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_no_sn/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_sn_no_sn/sst2/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_no_sn_sn/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_no_sn_sn/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_no_sn_sn/sst2/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp_new/electra_raw_no_sn_no_sn/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/electra_raw_no_sn_no_sn/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_

In [33]:
table_all = pd.concat([pd.concat(tables)]).reset_index()

In [34]:
def preproc_params(x):
    sn_params = x.split('|')[2]
    if sn_params == 'no_sn':
        return 'Optimal NO SN'
    elif sn_params == 'default':
        return 'default'
    else:
        return 'Optimal SN'
    
def preproc_method(x):
    method = x.split('|')[0].split('_')[0]
    sn = x.split('|')[1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_reg(x):
    reg = x.split('|')[0].split('_')[-1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'metric':
        return 'metric'
    return '-'

def preproc_method(x):
    method = x.split('|')[0].split('_')[0]
    sn = x.split('|')[1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MP'

def preproc_eval_init(x):
    sn = x.split('|')[1]
    eval_init = x.split('|')[-1]
    if sn == 'no_sn':
        return '-'
    elif sn == 'sn' and 'no_eval_init' in eval_init and 'raw_head' in eval_init:
        return '-'
    elif sn == 'sn' and 'no_eval_init' in eval_init:
        return 'NO SN init'
    elif sn == 'sn' and 'eval_init' in eval_init:
        return 'SN init'
    return '-'

def preproc_eval_head(x):
    sn = x.split('|')[1]
    head = x.split('|')[-1]
    if sn == 'no_sn':
        return '-'
    elif sn == 'sn' and 'raw_head' in head:
        return 'Raw HEAD'
    elif sn == 'sn' and 'raw_head' not in head:
        return 'SN HEAD'
    return '-'

def is_correct(x):
    sn = x.split('|')[1]
    head = x.split('|')[-1]
    sn_params = x.split('|')[2]
    if sn == 'no_sn':
        return '+'
    elif sn == 'sn' and 'raw_head' not in head and 'no_eval_init' not in head and sn_params=='sn':
        return '+'
    return '-'

table_all['Train HP'] = table_all.Method.apply(lambda x: preproc_params(x))
table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_reg(x))
#table_all['Eval. Init'] = table_all.Method.apply(lambda x: preproc_eval_init(x))
#table_all['Eval. Head'] = table_all.Method.apply(lambda x: preproc_eval_head(x))
#table_all['Correct Eval'] = table_all.Method.apply(lambda x: is_correct(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
#table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [35]:
table_all

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2,Train HP,Reg. Type
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp,Unnamed: 12_level_1,Unnamed: 13_level_1
0,MD SN (ours),MD,88.19±1.16,13.70±2.44,1.86±0.26,86.77±0.55,38.46±2.78,1.87±0.11,93.44±0.72,12.77±1.83,0.90±0.12,Optimal SN,-
1,MD SN (ours),MD,88.28±0.78,12.20±1.41,1.66±0.18,86.83±0.81,41.17±1.53,1.97±0.06,93.00±0.58,11.07±1.37,0.78±0.10,Optimal NO SN,-
2,MD,MD,88.36±1.12,12.28±1.82,1.66±0.18,86.86±0.76,41.11±1.96,1.98±0.06,93.37±0.62,11.61±2.26,0.81±0.13,Optimal NO SN,-
3,MD SN (ours),MD,88.40±0.40,13.20±1.43,1.80±0.24,86.43±0.34,41.63±1.32,1.97±0.04,93.04±0.50,11.51±2.19,0.82±0.14,Optimal SN,CER
4,MD SN (ours),MD,88.11±0.60,13.24±1.34,1.80±0.23,86.34±0.33,41.03±1.02,1.95±0.08,93.25±0.53,11.28±2.07,0.78±0.16,Optimal NO SN,CER
5,MD,MD,88.15±0.51,13.22±1.18,1.82±0.18,86.31±0.51,42.51±2.07,1.97±0.10,93.35±0.41,11.22±0.92,0.78±0.08,Optimal NO SN,CER
6,MD SN (ours),MD,88.11±1.68,12.47±2.52,1.66±0.28,86.86±0.56,39.35±2.89,1.90±0.09,93.64±0.61,11.11±2.02,0.80±0.13,Optimal SN,metric
7,MD SN (ours),MD,88.11±0.77,12.61±1.73,1.72±0.23,86.56±0.57,38.52±2.49,1.89±0.19,93.56±0.65,11.48±2.76,0.78±0.12,Optimal NO SN,metric
8,MD,MD,88.60±0.65,12.13±1.67,1.67±0.18,86.45±0.37,39.19±2.87,1.94±0.17,93.62±0.41,11.33±1.62,0.80±0.10,Optimal NO SN,metric


In [16]:
import os 
from sklearn.metrics import accuracy_score

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    results["mahalanobis_distance"] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")

def print_data(x):
    print(x.shape)
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['accuracy', "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['MRPC']#, 'CoLA', 'SST-2']
dataset_fnames = ['mrpc']#, 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for use_sn_params in spectralnorm + ['default']:
                for eval_init in ['eval_init', 'no_eval_init']:
                    for head in ['', '_raw_head']:
                        run_dirs = []
                        name_sn = ''
                        names = [f'{method}_{reg}|{sn}|{use_sn_params}|{eval_init}{head}']
                        for name in dataset_fnames:
                            model_series_dir = f'../workdir/run_glue_for_model_series_tmp_new_rcc_auc/electra_{reg}_{sn}_{use_sn_params}_{eval_init}{head}/{name}/0.0/{method}'
                            print(model_series_dir)
                            run_dirs.append([model_series_dir])
                        agg_func = choose_agg_func(method)
                        try:
                            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                            baselines.append(res_df.iloc[-1:])
                            tables.append(res_df.iloc[:-1])
                        except:
                            print('pass')
                            pass

../workdir/run_glue_for_model_series_tmp_new_rcc_auc/electra_raw_sn_sn_eval_init/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new_rcc_auc/electra_raw_sn_sn_eval_init_raw_head/mrpc/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp_new_rcc_auc/electra_raw_sn_sn_no_eval_init/mrpc/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp_new_rcc_auc/electra_raw_sn_sn_no_eval_init_raw_head/mrpc/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp_new_rcc_auc/electra_raw_sn_no_sn_eval_init/mrpc/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp_new_rcc_auc/electra_raw_sn_no_sn_eval_init_raw_head/mrpc/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp_new_rcc_auc/electra_raw_sn_no_sn_no_eval_init/mrpc/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp_new_rcc_auc/electra_raw_sn_no_sn_no_eval_init_raw_head/mrpc/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp_new_rcc_auc/electra_raw_sn_default_eval_init/mrp

In [18]:
table_all = pd.concat([pd.concat(tables)]).reset_index()

In [19]:
table_all['Train HP'] = table_all.Method.apply(lambda x: preproc_params(x))
table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_reg(x))
#table_all['Eval. Init'] = table_all.Method.apply(lambda x: preproc_eval_init(x))
#table_all['Eval. Head'] = table_all.Method.apply(lambda x: preproc_eval_head(x))
#table_all['Correct Eval'] = table_all.Method.apply(lambda x: is_correct(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
#table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [20]:
table_all

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,Train HP,Reg. Type
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,accuracy,rcc-auc,rpp,Unnamed: 6_level_1,Unnamed: 7_level_1
0,MD SN (ours),MD,87.75±0.58,16.34±0.86,2.24±0.10,Optimal SN,-


# DEBERTA

In [38]:
import os 
from sklearn.metrics import accuracy_score

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    results["mahalanobis_distance"] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")

def print_data(x):
    print(x.shape)
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['accuracy', "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for use_sn_params in spectralnorm + ['default']:
                for eval_init in ['eval_init', 'no_eval_init']:
                    for head in ['', '_raw_head']:
                        run_dirs = []
                        name_sn = ''
                        names = [f'{method}_{reg}|{sn}|{use_sn_params}|{eval_init}{head}']
                        for name in dataset_fnames:
                            model_series_dir = f'../workdir/run_glue_for_model_series_tmp_new/deberta_{reg}_{sn}_{use_sn_params}_{eval_init}{head}/{name}/0.0/{method}'
                            print(model_series_dir)
                            run_dirs.append([model_series_dir])
                        agg_func = choose_agg_func(method)
                        try:
                            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                            baselines.append(res_df.iloc[-1:])
                            tables.append(res_df.iloc[:-1])
                        except:
                            print('pass')
                            pass

../workdir/run_glue_for_model_series_tmp_new/deberta_raw_sn_sn_eval_init/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/deberta_raw_sn_sn_eval_init/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/deberta_raw_sn_sn_eval_init/sst2/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/deberta_raw_sn_sn_eval_init_raw_head/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/deberta_raw_sn_sn_eval_init_raw_head/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/deberta_raw_sn_sn_eval_init_raw_head/sst2/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp_new/deberta_raw_sn_sn_no_eval_init/mrpc/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/deberta_raw_sn_sn_no_eval_init/cola/0.0/mahalanobis
../workdir/run_glue_for_model_series_tmp_new/deberta_raw_sn_sn_no_eval_init/sst2/0.0/mahalanobis
pass
../workdir/run_glue_for_model_series_tmp_new/deberta_raw_sn_sn_no_eval_init_raw_head/mrpc/0.0/mahalanobis
..

In [39]:
table_all = pd.concat([pd.concat(tables)]).reset_index()

In [40]:
table_all['Train HP'] = table_all.Method.apply(lambda x: preproc_params(x))
table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_reg(x))
#table_all['Eval. Init'] = table_all.Method.apply(lambda x: preproc_eval_init(x))
#table_all['Eval. Head'] = table_all.Method.apply(lambda x: preproc_eval_head(x))
#table_all['Correct Eval'] = table_all.Method.apply(lambda x: is_correct(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
#table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [41]:
table_all

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2,Train HP,Reg. Type
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp,Unnamed: 12_level_1,Unnamed: 13_level_1
0,MD SN (ours),MD,89.22±0.82,12.37±0.92,1.78±0.16,84.55±0.35,55.23±5.17,2.72±0.28,93.33±0.44,11.04±2.19,0.78±0.17,Optimal SN,-
1,MD SN (ours),MD,85.29±8.33,28.91±34.34,2.94±2.20,84.34±0.70,54.34±4.78,2.62±0.24,93.16±0.41,13.40±2.41,0.94±0.17,Optimal NO SN,-
2,MD SN (ours),MD,88.77±0.33,15.36±1.88,2.14±0.29,83.65±0.72,58.85±5.47,2.73±0.23,92.49±0.55,14.46±1.57,1.00±0.10,default,-
3,MD,MD,88.58±1.26,15.19±3.82,2.05±0.41,84.53±0.55,55.30±4.70,2.68±0.19,92.93±0.52,12.51±1.97,0.86±0.13,Optimal NO SN,-
4,MD,MD,84.72±8.10,29.50±33.85,3.06±2.30,81.18±5.97,106.12±110.09,4.21±3.31,92.11±0.64,14.82±2.63,1.00±0.17,default,-
5,MD SN (ours),MD,90.28±0.46,12.63±1.47,1.86±0.25,84.76±0.44,53.18±2.84,2.61±0.12,93.69±0.38,11.35±1.53,0.83±0.13,Optimal SN,CER
6,MD SN (ours),MD,89.75±0.81,12.51±1.19,1.80±0.16,84.15±0.53,58.13±4.24,2.70±0.15,93.27±0.41,12.35±2.21,0.88±0.15,Optimal NO SN,CER
7,MD SN (ours),MD,84.68±8.10,34.08±38.16,3.77±3.10,78.80±7.51,156.44±138.77,5.92±4.11,83.28±16.48,92.57±149.45,3.50±4.02,default,CER
8,MD,MD,89.05±0.59,13.48±1.24,1.88±0.19,84.23±0.48,57.78±3.86,2.73±0.15,93.50±0.59,11.67±1.56,0.85±0.11,Optimal NO SN,CER
9,MD,MD,80.35±9.31,52.54±48.81,4.74±3.59,79.90±5.49,128.42±102.31,5.05±2.99,92.20±0.34,15.90±2.51,1.07±0.17,default,CER


# QQP

## DPP HP

In [53]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']
spectralnorm = ['no_sn']
dataset_names = ['QQP', 'Amazon']
dataset_fnames = ['qqp', 'amazon']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for max_frac in [0.3, 0.4, 0.5, 0.6]:
            for sn in spectralnorm:
                run_dirs = []
                name_sn = ''
                names = [f'{method}_{max_frac}|{reg}|{sn}']
                for name in dataset_fnames:
                    model_series_dir = f'../workdir/run_glue_for_model_series_dpp_hp/electra_{reg}_{sn}/{name}/0.0/{method}_{max_frac}_50'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                agg_func = choose_agg_func(method)
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print('pass')
                    pass

../workdir/run_glue_for_model_series_dpp_hp/electra_raw_no_sn/qqp/0.0/ddpp_dpp_0.3_50
../workdir/run_glue_for_model_series_dpp_hp/electra_raw_no_sn/amazon/0.0/ddpp_dpp_0.3_50
../workdir/run_glue_for_model_series_dpp_hp/electra_raw_no_sn/qqp/0.0/ddpp_dpp_0.4_50
../workdir/run_glue_for_model_series_dpp_hp/electra_raw_no_sn/amazon/0.0/ddpp_dpp_0.4_50
../workdir/run_glue_for_model_series_dpp_hp/electra_raw_no_sn/qqp/0.0/ddpp_dpp_0.5_50
../workdir/run_glue_for_model_series_dpp_hp/electra_raw_no_sn/amazon/0.0/ddpp_dpp_0.5_50
../workdir/run_glue_for_model_series_dpp_hp/electra_raw_no_sn/qqp/0.0/ddpp_dpp_0.6_50
../workdir/run_glue_for_model_series_dpp_hp/electra_raw_no_sn/amazon/0.0/ddpp_dpp_0.6_50
../workdir/run_glue_for_model_series_dpp_hp/electra_raw_no_sn/qqp/0.0/ddpp_ood_0.3_50
../workdir/run_glue_for_model_series_dpp_hp/electra_raw_no_sn/amazon/0.0/ddpp_ood_0.3_50
../workdir/run_glue_for_model_series_dpp_hp/electra_raw_no_sn/qqp/0.0/ddpp_ood_0.4_50
../workdir/run_glue_for_model_series_dp

In [54]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-1:])]).reset_index()

In [59]:
table_all.sort_values(('QQP', 'rpp'))

Unnamed: 0_level_0,Method,UE Score,QQP,QQP,QQP,Amazon,Amazon,Amazon
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
19,ddpp_ood_0.5|raw|no_sn,sampled_max_prob,0.07±0.04,258.61±17.13,1.93±0.11,-0.24±0.11,531.68±18.06,4.91±0.20
16,ddpp_ood_0.4|raw|no_sn,sampled_max_prob,0.07±0.05,258.33±16.81,1.93±0.11,-0.31±0.14,535.92±20.73,4.98±0.22
22,ddpp_ood_0.6|raw|no_sn,sampled_max_prob,0.07±0.03,259.44±17.50,1.94±0.12,-0.19±0.06,529.26±17.38,4.88±0.19
23,ddpp_ood_0.6|raw|no_sn,variance,0.06±0.07,261.23±16.19,1.95±0.10,-0.67±0.31,561.97±30.23,5.38±0.39
13,ddpp_ood_0.3|raw|no_sn,sampled_max_prob,0.06±0.06,260.59±14.76,1.95±0.10,-0.33±0.14,537.84±20.48,5.00±0.22
20,ddpp_ood_0.5|raw|no_sn,variance,0.06±0.06,261.85±18.25,1.95±0.12,-0.61±0.18,557.29±18.70,5.31±0.19
17,ddpp_ood_0.4|raw|no_sn,variance,0.06±0.06,262.30±17.76,1.96±0.11,-0.65±0.16,560.96±19.71,5.35±0.23
11,ddpp_dpp_0.6|raw|no_sn,variance,0.04±0.05,263.47±17.11,1.97±0.11,-0.29±0.15,535.13±18.62,5.00±0.19
14,ddpp_ood_0.3|raw|no_sn,variance,0.03±0.06,268.74±16.21,1.98±0.09,-0.71±0.30,565.95±27.03,5.41±0.36
8,ddpp_dpp_0.5|raw|no_sn,variance,0.03±0.09,265.37±19.01,1.98±0.13,-0.30±0.12,535.19±14.37,5.00±0.17


In [None]:
{'amazon': {'ddpp_dpp': 0.4, 'ddpp_ood': 0.6},
'qqp': {'ddpp_dpp': 0.6, 'ddpp_ood': 0.5}}

# QQP NUQ HP

In [84]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['nuq']
regs = ['raw']
spectralnorm = ['no_sn']
dataset_names = ['QQP']
dataset_fnames = ['qqp']
names = []
tables = []
baselines = []

for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for n_neighbors in [20, 30, 40, 50, 60]:
                for log_pN in [0, -20, -40]:
                    for n_samples in [3, 5]:
                        for n_folds in [10, 20]:
                            for n_points in [10, 50]:
                                run_dirs = []
                                name_sn = ''
                                names = [f'{method}|classification_{n_points}_{n_folds}_{n_samples}|{n_neighbors}|{log_pN}']
                                for name in dataset_fnames:
                                    model_series_dir = f'../workdir/run_glue_for_model_series_nuq_hp/electra_{reg}_{sn}/{name}/classification_{n_neighbors}_{log_pN}_{n_points}_{n_folds}_{n_samples}/{method}'
                                    print(model_series_dir)
                                    run_dirs.append([model_series_dir])
                                agg_func = choose_agg_func(method)
                                try:
                                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                                    baselines.append(res_df.iloc[-1:])
                                    tables.append(res_df.iloc[:-1])
                                except:
                                    print('pass')
                                    pass

../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/qqp/classification_20_0_10_10_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/qqp/classification_20_0_50_10_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/qqp/classification_20_0_10_20_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/qqp/classification_20_0_50_20_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/qqp/classification_20_0_10_10_5/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/qqp/classification_20_0_50_10_5/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/qqp/classification_20_0_10_20_5/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/qqp/classification_20_0_50_20_5/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/qqp/classification_20_-20_10_10_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/qqp/classification_20_-20_50_10_3/nuq
../workdir/run_g

In [85]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-1:])]).reset_index()

In [86]:
table_all.sort_values(('QQP', 'rpp')).head(30)

Unnamed: 0_level_0,Method,UE Score,QQP,QQP,QQP
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp
149,nuq|classification_50_10_3|40|0,nuq_total,-0.01±0.06,266.18±15.47,1.96±0.09
148,nuq|classification_50_10_3|40|0,nuq_epistemic,-0.04±0.03,264.53±14.33,1.97±0.10
292,nuq|classification_50_10_3|60|0,nuq_epistemic,-0.01±0.06,268.89±16.03,1.99±0.11
293,nuq|classification_50_10_3|60|0,nuq_total,-0.02±0.06,277.28±16.11,2.00±0.10
298,nuq|classification_50_20_3|60|0,nuq_epistemic,0.01±0.04,270.69±16.34,2.00±0.11
304,nuq|classification_50_10_5|60|0,nuq_epistemic,0.01±0.05,270.76±15.82,2.00±0.11
310,nuq|classification_50_20_5|60|0,nuq_epistemic,-0.01±0.05,272.37±12.66,2.01±0.10
227,nuq|classification_50_20_3|50|0,nuq_total,-0.01±0.04,276.98±15.65,2.01±0.11
307,nuq|classification_10_20_5|60|0,nuq_epistemic,-0.00±0.04,271.38±17.42,2.01±0.12
166,nuq|classification_50_20_5|40|0,nuq_epistemic,-0.04±0.02,271.94±16.42,2.01±0.12


# QQP full

In [102]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best'  or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['ddpp_ood', 'ddpp_dpp', 'mc', 'mahalanobis', 'nuq', 'nuq_best1']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['QQP']
dataset_fnames = ['qqp']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}|{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            agg_func = choose_agg_func(method)
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass

../workdir/run_glue_for_model_series/electra_raw_sn/qqp/0.0/ddpp_ood
pass
../workdir/run_glue_for_model_series/electra_raw_no_sn/qqp/0.0/ddpp_ood
../workdir/run_glue_for_model_series/electra_reg_sn/qqp/0.0/ddpp_ood
pass
../workdir/run_glue_for_model_series/electra_reg_no_sn/qqp/0.0/ddpp_ood
../workdir/run_glue_for_model_series/electra_metric_sn/qqp/0.0/ddpp_ood
pass
../workdir/run_glue_for_model_series/electra_metric_no_sn/qqp/0.0/ddpp_ood
../workdir/run_glue_for_model_series/electra_raw_sn/qqp/0.0/ddpp_dpp
pass
../workdir/run_glue_for_model_series/electra_raw_no_sn/qqp/0.0/ddpp_dpp
../workdir/run_glue_for_model_series/electra_reg_sn/qqp/0.0/ddpp_dpp
pass
../workdir/run_glue_for_model_series/electra_reg_no_sn/qqp/0.0/ddpp_dpp
../workdir/run_glue_for_model_series/electra_metric_sn/qqp/0.0/ddpp_dpp
pass
../workdir/run_glue_for_model_series/electra_metric_no_sn/qqp/0.0/ddpp_dpp
../workdir/run_glue_for_model_series/electra_raw_sn/qqp/0.0/mc
pass
../workdir/run_glue_for_model_series/electra

In [103]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [104]:
def preproc_regs(x):
    reg = x.split('|')[1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'mc_mahalanobis' and not 'no_sn' in sn:
        return 'SMD SN (ours)'
    elif method == 'mc_mahalanobis':
        return 'SMD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    
    elif method == 'nuq_best1' and not 'no_sn' in sn:
        return 'Best NUQ SN'
    elif method == 'nuq_best1':
        return 'Best NUQ'
    
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'Deep Ensemble'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MP'

table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_regs(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [105]:
table_all.iloc[27:]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,QQP,QQP,QQP
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
27,MD SN (ours),-,MD,-1.29±0.37,2785.02±296.44,3.23±0.36
28,MD,-,MD,-1.64±0.49,3053.33±385.59,3.57±0.47
29,MD SN (ours),CER,MD,-0.65±0.18,2505.31±177.56,2.76±0.18
30,MD,CER,MD,-1.06±0.17,2852.74±187.94,3.09±0.18
31,MD SN (ours),metric,MD,-0.37±0.15,2448.81±118.84,2.66±0.12
32,MD,metric,MD,-0.43±0.15,2150.88±128.10,2.48±0.12
33,NUQ SN,-,aleatoric,-0.20±0.13,1770.55±102.53,2.11±0.12
34,NUQ SN,-,epistemic,-0.07±0.02,1457.39±34.82,1.99±0.05
35,NUQ SN,-,total,-0.07±0.05,1482.90±82.25,1.99±0.05
36,NUQ,-,aleatoric,-0.20±0.07,1752.35±82.96,2.09±0.04


# Amazon

In [29]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['ddpp_ood', 'ddpp_dpp', 'mc', 'mahalanobis', 'nuq']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['Amazon']
dataset_fnames = ['amazon']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}|{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            agg_func = choose_agg_func(method)
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass

../workdir/run_glue_for_model_series/electra_raw_sn/amazon/0.0/ddpp_ood
pass
../workdir/run_glue_for_model_series/electra_raw_no_sn/amazon/0.0/ddpp_ood
../workdir/run_glue_for_model_series/electra_reg_sn/amazon/0.0/ddpp_ood
pass
../workdir/run_glue_for_model_series/electra_reg_no_sn/amazon/0.0/ddpp_ood
../workdir/run_glue_for_model_series/electra_metric_sn/amazon/0.0/ddpp_ood
pass
../workdir/run_glue_for_model_series/electra_metric_no_sn/amazon/0.0/ddpp_ood
../workdir/run_glue_for_model_series/electra_raw_sn/amazon/0.0/ddpp_dpp
pass
../workdir/run_glue_for_model_series/electra_raw_no_sn/amazon/0.0/ddpp_dpp
../workdir/run_glue_for_model_series/electra_reg_sn/amazon/0.0/ddpp_dpp
pass
../workdir/run_glue_for_model_series/electra_reg_no_sn/amazon/0.0/ddpp_dpp
../workdir/run_glue_for_model_series/electra_metric_sn/amazon/0.0/ddpp_dpp
pass
../workdir/run_glue_for_model_series/electra_metric_no_sn/amazon/0.0/ddpp_dpp
../workdir/run_glue_for_model_series/electra_raw_sn/amazon/0.0/mc
pass
../wo

In [30]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [31]:
def preproc_regs(x):
    reg = x.split('|')[1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'mc_mahalanobis' and not 'no_sn' in sn:
        return 'SMD SN (ours)'
    elif method == 'mc_mahalanobis':
        return 'SMD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'Deep Ensemble'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MP'

table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_regs(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [32]:
table_all

Unnamed: 0_level_0,Method,Reg. Type,UE Score,Amazon,Amazon,Amazon
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
0,DDPP (+OOD) (ours),-,BALD,-0.63±0.25,3946.30±149.99,5.27±0.31
1,DDPP (+OOD) (ours),-,SMP,-0.30±0.06,3789.98±44.35,4.94±0.10
2,DDPP (+OOD) (ours),-,PV,-0.76±0.20,4024.96±114.23,5.42±0.24
3,DDPP (+OOD) (ours),CER,BALD,-0.63±0.29,3945.67±119.34,5.26±0.29
4,DDPP (+OOD) (ours),CER,SMP,-0.27±0.12,3776.15±48.94,4.91±0.09
5,DDPP (+OOD) (ours),CER,PV,-0.77±0.29,4020.17±127.41,5.43±0.28
6,DDPP (+OOD) (ours),metric,BALD,-0.44±0.25,3843.60±97.02,5.06±0.24
7,DDPP (+OOD) (ours),metric,SMP,-0.24±0.06,3758.19±36.34,4.86±0.03
8,DDPP (+OOD) (ours),metric,PV,-0.59±0.21,3927.44±76.06,5.23±0.19
9,DDPP (+DPP) (ours),-,BALD,-0.30±0.15,3800.56±101.85,4.95±0.19


# Amazon, 1%

In [7]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['mahalanobis', 'nuq']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['Amazon']
dataset_fnames = ['amazon']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}|{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.0_0.01/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            agg_func = choose_agg_func(method)
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass

../workdir/run_glue_for_model_series/electra_raw_sn/amazon/0.0_0.01/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_no_sn/amazon/0.0_0.01/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_sn/amazon/0.0_0.01/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_no_sn/amazon/0.0_0.01/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_sn/amazon/0.0_0.01/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_no_sn/amazon/0.0_0.01/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_sn/amazon/0.0_0.01/nuq
../workdir/run_glue_for_model_series/electra_raw_no_sn/amazon/0.0_0.01/nuq
../workdir/run_glue_for_model_series/electra_reg_sn/amazon/0.0_0.01/nuq
../workdir/run_glue_for_model_series/electra_reg_no_sn/amazon/0.0_0.01/nuq
../workdir/run_glue_for_model_series/electra_metric_sn/amazon/0.0_0.01/nuq
../workdir/run_glue_for_model_series/electra_metric_no_sn/amazon/0.0_0.01/nuq


In [8]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [9]:
def preproc_regs(x):
    reg = x.split('|')[1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'mc_mahalanobis' and not 'no_sn' in sn:
        return 'SMD SN (ours)'
    elif method == 'mc_mahalanobis':
        return 'SMD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'Deep Ensemble'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MP'

table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_regs(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [10]:
table_all

Unnamed: 0_level_0,Method,Reg. Type,UE Score,Amazon,Amazon,Amazon
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
0,MD SN (ours),-,MD,-5.24±1.11,9645.11±1517.53,9.74±1.03
1,MD,-,MD,-5.32±1.28,9599.70±1498.44,9.83±1.17
2,MD SN (ours),CER,MD,-6.26±1.36,10739.10±1712.66,10.70±1.41
3,MD,CER,MD,-7.22±1.63,11571.07±1716.87,11.66±1.70
4,MD SN (ours),metric,MD,-7.09±1.15,11766.61±1408.78,11.73±1.13
5,MD,metric,MD,-5.29±1.24,9696.91±1651.54,9.79±1.16
6,NUQ SN,-,aleatoric,-3.64±1.62,6548.75±1329.30,8.08±1.51
7,NUQ SN,-,epistemic,-0.16±0.07,4408.77±134.86,4.53±0.15
8,NUQ SN,-,total,-0.76±0.49,4658.36±286.87,5.17±0.40
9,NUQ,-,aleatoric,-3.60±1.63,6500.35±1267.04,8.06±1.50


# SST-5 

In [94]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['mahalanobis', 'nuq', 'nuq_best', 'nuq_best1']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['SST-5']
dataset_fnames = ['sst5']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}|{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.0_0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            agg_func = choose_agg_func(method)
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass

../workdir/run_glue_for_model_series/electra_raw_sn/sst5/0.0_0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_no_sn/sst5/0.0_0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_sn/sst5/0.0_0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_no_sn/sst5/0.0_0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_sn/sst5/0.0_0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_no_sn/sst5/0.0_0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_sn/sst5/0.0_0.0/nuq
../workdir/run_glue_for_model_series/electra_raw_no_sn/sst5/0.0_0.0/nuq
../workdir/run_glue_for_model_series/electra_reg_sn/sst5/0.0_0.0/nuq
../workdir/run_glue_for_model_series/electra_reg_no_sn/sst5/0.0_0.0/nuq
../workdir/run_glue_for_model_series/electra_metric_sn/sst5/0.0_0.0/nuq
../workdir/run_glue_for_model_series/electra_metric_no_sn/sst5/0.0_0.0/nuq
../workdir/run_glue_for_model_series/electra_raw_sn/sst5/0.0_0.0/nuq_best
../workdir/run_glue_

In [95]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [96]:
def preproc_regs(x):
    reg = x.split('|')[1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'mc_mahalanobis' and not 'no_sn' in sn:
        return 'SMD SN (ours)'
    elif method == 'mc_mahalanobis':
        return 'SMD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    
    elif method == 'nuq_best' and not 'no_sn' in sn:
        return 'Best NUQ SN'
    elif method == 'nuq_best':
        return 'Best NUQ'
    
    elif method == 'nuq_best1' and not 'no_sn' in sn:
        return 'Best1 NUQ SN'
    elif method == 'nuq_best1':
        return 'Best1 NUQ'
    
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'Deep Ensemble'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MP'

table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_regs(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [97]:
table_all.sort_values(('SST-5', 'rpp'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,Reg. Type,UE Score,SST-5,SST-5,SST-5
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
54,Best1 NUQ SN,metric,aleatoric,0.48±0.47,439.64±14.46,10.36±0.35
51,Best1 NUQ,CER,aleatoric,0.45±0.19,441.54±14.82,10.51±0.33
42,Best1 NUQ SN,-,aleatoric,0.38±0.33,448.86±16.83,10.55±0.56
48,Best1 NUQ SN,CER,aleatoric,0.19±0.28,450.36±11.97,10.56±0.48
56,Best1 NUQ SN,metric,total,0.34±0.47,450.42±12.39,10.56±0.36
...,...,...,...,...,...,...
46,Best1 NUQ,-,epistemic,-0.33±0.15,483.75±10.15,11.31±0.23
29,Best NUQ,-,total,-0.33±0.17,484.58±10.70,11.32±0.22
28,Best NUQ,-,epistemic,-0.39±0.19,488.75±9.96,11.37±0.21
11,NUQ,-,total,-0.37±0.12,485.66±7.94,11.39±0.21


In [98]:
table_all.iloc[[0,1,2,3,4,5,15,16,17,54,55,56,60,61,62,63,64,65]]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,SST-5,SST-5,SST-5
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
0,MD SN (ours),-,MD,0.12±0.35,463.88±7.44,10.84±0.12
1,MD,-,MD,0.30±0.29,462.00±6.77,10.69±0.20
2,MD SN (ours),CER,MD,0.02±0.30,463.94±7.96,10.78±0.19
3,MD,CER,MD,0.24±0.38,457.05±11.54,10.76±0.22
4,MD SN (ours),metric,MD,0.12±0.28,461.98±9.16,10.77±0.22
5,MD,metric,MD,-0.36±0.67,468.72±17.34,10.97±0.45
15,NUQ,CER,aleatoric,-0.01±0.12,462.83±15.13,10.97±0.27
16,NUQ,CER,epistemic,-0.28±0.19,474.22±17.08,11.27±0.33
17,NUQ,CER,total,-0.17±0.16,469.72±16.50,11.16±0.30
54,Best1 NUQ SN,metric,aleatoric,0.48±0.47,439.64±14.46,10.36±0.35


# SST-5 NUQ HP

In [6]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['nuq']
regs = ['raw']
spectralnorm = ['no_sn']
dataset_names = ['SST-5']
dataset_fnames = ['sst5']
names = []
tables = []
baselines = []

for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for tune_bandwidth in ['classification', 'isj', 'scott']:
                for n_neighbors in [20, 30, 40, 50, 60]:
                    for log_pN in [0, -20, -40]:
                        run_dirs = []
                        name_sn = ''
                        names = [f'{method}|{tune_bandwidth}|{n_neighbors}|{log_pN}']
                        for name in dataset_fnames:
                            model_series_dir = f'../workdir/run_glue_for_model_series_nuq_hp/electra_{reg}_{sn}/{name}/{tune_bandwidth}_{n_neighbors}_{log_pN}/{method}'
                            print(model_series_dir)
                            run_dirs.append([model_series_dir])
                        agg_func = choose_agg_func(method)
                        try:
                            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                            baselines.append(res_df.iloc[-1:])
                            tables.append(res_df.iloc[:-1])
                        except:
                            print('pass')
                            pass

../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_0/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_-20/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_-40/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_30_0/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_30_-20/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_30_-40/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_40_0/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_40_-20/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_40_-40/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_50_0/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classificat

In [7]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-1:])]).reset_index()

In [8]:
table_all.sort_values(('SST-5', 'rpp'))

Unnamed: 0_level_0,Method,UE Score,SST-5,SST-5,SST-5
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp
135,baseline|scott|60|-40,max_prob,77.76±0.13,615.23±7.65,10.12±0.14
36,nuq|classification|60|0,nuq_aleatoric,-0.45±0.20,656.45±24.45,10.55±0.26
39,nuq|classification|60|-20,nuq_aleatoric,-0.48±0.17,658.94±24.65,10.58±0.27
3,nuq|classification|20|-20,nuq_aleatoric,-0.50±0.15,645.53±9.80,10.59±0.15
42,nuq|classification|60|-40,nuq_aleatoric,-0.52±0.21,665.78±26.70,10.61±0.33
...,...,...,...,...,...
70,nuq|isj|40|-40,nuq_epistemic,-2.23±0.32,717.14±10.49,12.38±0.08
69,nuq|isj|40|-40,nuq_aleatoric,-2.23±0.32,717.14±10.49,12.38±0.08
68,nuq|isj|40|-20,nuq_total,-2.23±0.32,717.14±10.49,12.38±0.08
66,nuq|isj|40|-20,nuq_aleatoric,-2.23±0.32,717.14±10.49,12.38±0.08


In [29]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['nuq']
regs = ['raw']
spectralnorm = ['no_sn']
dataset_names = ['SST-5']
dataset_fnames = ['sst5']
names = []
tables = []
baselines = []

for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for n_neighbors in [20, 30, 40, 50, 60]:
                for log_pN in [0, -20, -40]:
                    for n_samples in [3, 5]:
                        for n_folds in [10, 20]:
                            for n_points in [10, 50]:
                                run_dirs = []
                                name_sn = ''
                                names = [f'{method}|classification_{n_points}_{n_folds}_{n_samples}|{n_neighbors}|{log_pN}']
                                for name in dataset_fnames:
                                    model_series_dir = f'../workdir/run_glue_for_model_series_nuq_hp/electra_{reg}_{sn}/{name}/classification_{n_neighbors}_{log_pN}_{n_points}_{n_folds}_{n_samples}/{method}'
                                    print(model_series_dir)
                                    run_dirs.append([model_series_dir])
                                agg_func = choose_agg_func(method)
                                try:
                                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                                    baselines.append(res_df.iloc[-1:])
                                    tables.append(res_df.iloc[:-1])
                                except:
                                    print('pass')
                                    pass

../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_0_10_10_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_0_50_10_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_0_10_20_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_0_50_20_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_0_10_10_5/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_0_50_10_5/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_0_10_20_5/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_0_50_20_5/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_-20_10_10_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst5/classification_20_-20_50_10_3/nuq
../wor

In [30]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-1:])]).reset_index()

In [42]:
def to_float(x):
    return float(x.split('±')[0])

table_all.sort_values(('SST-5', 'rpp'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,SST-5,SST-5,SST-5
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp
239,nuq|classification_50_20_5|50|0,nuq_total,0.19±0.32,609.93±37.23,9.92±0.48
171,nuq|classification_50_10_3|40|-20,nuq_aleatoric,0.08±0.24,621.93±26.74,10.00±0.42
173,nuq|classification_50_10_3|40|-20,nuq_total,0.02±0.21,635.79±25.94,10.07±0.41
77,nuq|classification_50_10_3|30|0,nuq_total,0.05±0.45,615.27±33.63,10.07±0.56
297,nuq|classification_50_20_3|60|0,nuq_aleatoric,0.03±0.38,613.46±27.69,10.07±0.49
...,...,...,...,...,...
70,nuq|classification_50_20_5|20|-40,nuq_epistemic,-0.89±0.11,679.74±16.44,10.99±0.17
52,nuq|classification_50_10_3|20|-40,nuq_epistemic,-0.91±0.10,680.23±16.02,11.00±0.16
274,nuq|classification_50_20_3|50|-40,nuq_epistemic,-0.80±0.17,686.07±16.94,11.01±0.34
34,nuq|classification_50_20_3|20|-20,nuq_epistemic,-0.93±0.10,680.28±15.87,11.02±0.13


# ARC-REJ tables

In [13]:
from analyze_results import from_model_outptus_calc_rejection_table, aggregate_runs_rejection_table, format_arc_table_results

In [156]:
import os 

def get_one_table(runs_dir, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], baseline=None, methods=None):
    default_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
        #"var.ratio": var_ratio,
        #"sampled_entropy": mean_entropy,
    }
    
    if methods is None:
        methods = default_methods

    table = []
    for metric_type in metric_types:
        metric = choose_metric(metric_type=metric_type)

        agg_res = aggregate_runs_rejection_table(
            runs_dir, methods=methods, metric=metric
        )

        if agg_res.empty:
            print("Broken\n")
            continue

            
    improvement = format_arc_table_results(agg_res, baseline_col='max_prob', percents=True)    
    improvement = improvement.loc[['max_prob'] + list(methods.keys())]
    improvement.index = ['baseline (max_prob)'] + list(improvement.index[1:])

    table.append(improvement)
    res_table = pd.concat(table, axis=1)

    return res_table


def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    elif  metric_type == "arc_tab":
        return from_model_outptus_calc_rejection_table
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods


def format_arc_table_results(results, baseline_col, subtract=False, percents=False, ndp=2):
    
    if subtract:
        baseline = results.T[[baseline_col]].astype(float)
        res = results.T.drop(columns=baseline_col).astype(float)
        baseline_columns = baseline.columns
        baseline.columns = res.columns
        diff_res = res.subtract(baseline, fill_value=0)
        baseline.columns = baseline_columns
        diff_res = pd.concat([diff_res, baseline], axis=1).T
    else:
        diff_res = results
        
    base_score = diff_res[diff_res.columns[0]]
    percents_res = (diff_res.divide(base_score, axis='rows') - 1)*100
    percents_res = percents_res.groupby(level=0).mean()
    
    mean_res = diff_res.groupby(level=0).mean()
    std_res = diff_res.groupby(level=0).std()
    if percents:
        mean_res *= 100
        std_res *= 100
    formatted_results = mean_res.applymap(
        lambda x: "{}±".format(round(x, ndp))
    ) + std_res.applymap(lambda x: "{}".format(round(x, ndp))) \
      + percents_res.applymap(lambda x: " ({}%)".format(round(x, ndp)))

    return formatted_results
    
metric_types=['arc_tab']
methods = ['mahalanobis', 'nuq']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['Amazon']
dataset_fnames = ['amazon']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}|{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            agg_func = choose_agg_func(method)
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass

../workdir/run_glue_for_model_series/electra_raw_sn/amazon/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_no_sn/amazon/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_sn/amazon/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_no_sn/amazon/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_sn/amazon/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_no_sn/amazon/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_sn/amazon/0.0/nuq
../workdir/run_glue_for_model_series/electra_raw_no_sn/amazon/0.0/nuq
../workdir/run_glue_for_model_series/electra_reg_sn/amazon/0.0/nuq
../workdir/run_glue_for_model_series/electra_reg_no_sn/amazon/0.0/nuq
../workdir/run_glue_for_model_series/electra_metric_sn/amazon/0.0/nuq
../workdir/run_glue_for_model_series/electra_metric_no_sn/amazon/0.0/nuq


In [157]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [158]:
def preproc_regs(x):
    reg = x.split('|')[1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'mc_mahalanobis' and not 'no_sn' in sn:
        return 'SMD SN (ours)'
    elif method == 'mc_mahalanobis':
        return 'SMD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'Deep Ensemble'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MP'

table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_regs(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [160]:
table_all.iloc[[0, 19, 22, 25, 27, 29]]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,Amazon,Amazon,Amazon,Amazon,Amazon,Amazon,Amazon,Amazon,Amazon,Amazon,Amazon,Amazon
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,0%,5%,10%,20%,30%,40%,50%,60%,70%,80%,90%,99%
0,MD SN (ours),-,MD,72.93±0.29 (0.0%),74.44±0.35 (2.07%),75.69±0.38 (3.79%),77.84±0.67 (6.73%),79.81±1.01 (9.43%),81.9±1.39 (12.3%),83.91±1.71 (15.07%),85.99±1.92 (17.92%),88.21±2.05 (20.95%),90.32±2.03 (23.86%),92.29±1.97 (26.55%),94.84±1.41 (30.05%)
19,NUQ SN,metric,epistemic,72.54±0.32 (0.0%),74.53±0.34 (2.75%),76.31±0.39 (5.2%),79.84±0.31 (10.06%),83.29±0.36 (14.82%),86.44±0.36 (19.16%),88.79±0.29 (22.4%),90.85±0.19 (25.24%),92.44±0.26 (27.44%),93.99±0.24 (29.58%),95.35±0.3 (31.44%),97.7±1.07 (34.68%)
22,NUQ,metric,epistemic,72.91±0.3 (0.0%),74.94±0.29 (2.78%),76.72±0.3 (5.23%),80.16±0.34 (9.94%),83.56±0.3 (14.61%),86.79±0.3 (19.03%),89.3±0.22 (22.47%),91.08±0.16 (24.92%),92.66±0.16 (27.08%),94.1±0.21 (29.07%),95.52±0.34 (31.01%),97.7±0.81 (34.0%)
25,SR,-,MP,72.97±0.3 (0.0%),74.79±0.33 (2.49%),76.43±0.33 (4.73%),79.65±0.27 (9.15%),82.74±0.26 (13.38%),86.08±0.17 (17.97%),89.33±0.12 (22.42%),91.53±0.15 (25.43%),93.24±0.28 (27.77%),94.98±0.28 (30.16%),96.86±0.27 (32.74%),98.71±0.65 (35.27%)
27,SR,CER,MP,72.88±0.34 (0.0%),74.72±0.34 (2.52%),76.31±0.35 (4.71%),79.53±0.27 (9.12%),82.83±0.26 (13.65%),86.08±0.28 (18.11%),89.26±0.16 (22.48%),91.51±0.18 (25.56%),93.33±0.2 (28.06%),95.07±0.31 (30.44%),96.94±0.18 (33.0%),98.2±1.1 (34.75%)
29,SR,metric,MP,72.91±0.3 (0.0%),74.75±0.34 (2.52%),76.42±0.3 (4.82%),79.63±0.25 (9.21%),82.72±0.25 (13.45%),86.04±0.15 (18.01%),89.37±0.15 (22.57%),91.53±0.19 (25.54%),93.26±0.28 (27.91%),94.94±0.28 (30.21%),96.88±0.29 (32.88%),98.65±0.52 (35.31%)


# SST-2 NUQ HP

In [3]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['nuq']
regs = ['raw']
spectralnorm = ['no_sn']
dataset_names = ['SST-2']
dataset_fnames = ['sst2']
names = []
tables = []
baselines = []

for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for n_neighbors in [20, 30, 40, 50, 60]:
                for log_pN in [0, -20, -40]:
                    for n_samples in [3, 5]:
                        for n_folds in [10, 20]:
                            for n_points in [10, 50]:
                                run_dirs = []
                                name_sn = ''
                                names = [f'{method}|classification_{n_points}_{n_folds}_{n_samples}|{n_neighbors}|{log_pN}']
                                for name in dataset_fnames:
                                    model_series_dir = f'../workdir/run_glue_for_model_series_nuq_hp/electra_{reg}_{sn}/{name}/classification_{n_neighbors}_{log_pN}_{n_points}_{n_folds}_{n_samples}/{method}'
                                    print(model_series_dir)
                                    run_dirs.append([model_series_dir])
                                agg_func = choose_agg_func(method)
                                try:
                                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                                    baselines.append(res_df.iloc[-1:])
                                    tables.append(res_df.iloc[:-1])
                                except:
                                    print('pass')
                                    pass

../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst2/classification_20_0_10_10_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst2/classification_20_0_50_10_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst2/classification_20_0_10_20_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst2/classification_20_0_50_20_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst2/classification_20_0_10_10_5/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst2/classification_20_0_50_10_5/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst2/classification_20_0_10_20_5/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst2/classification_20_0_50_20_5/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst2/classification_20_-20_10_10_3/nuq
../workdir/run_glue_for_model_series_nuq_hp/electra_raw_no_sn/sst2/classification_20_-20_50_10_3/nuq
../wor

In [4]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-1:])]).reset_index()

In [5]:
def to_float(x):
    return float(x.split('±')[0])

table_all.sort_values(('SST-2', 'rpp'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp
357,nuq|classification_50_20_5|60|-40,nuq_aleatoric,0.44±0.06,21.16±3.54,0.90±0.16
243,nuq|classification_50_10_3|50|-20,nuq_aleatoric,0.41±0.08,21.81±3.42,0.94±0.17
315,nuq|classification_50_10_3|60|-20,nuq_aleatoric,0.41±0.09,21.83±3.41,0.94±0.17
27,nuq|classification_50_10_3|20|-20,nuq_aleatoric,0.41±0.09,21.77±3.42,0.94±0.17
30,nuq|classification_10_20_3|20|-20,nuq_aleatoric,0.40±0.09,21.82±3.45,0.94±0.17
...,...,...,...,...,...
50,nuq|classification_10_10_3|20|-40,nuq_total,-1.21±0.36,75.23±14.75,2.88±0.57
49,nuq|classification_10_10_3|20|-40,nuq_epistemic,-1.21±0.36,75.24±14.75,2.88±0.57
67,nuq|classification_10_20_5|20|-40,nuq_epistemic,-1.21±0.37,75.31±14.69,2.88±0.57
284,nuq|classification_10_20_5|50|-40,nuq_total,-1.23±0.38,75.45±14.59,2.90±0.69


In [7]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='ddu':
        ddu = lambda x: -np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"ddu": ddu}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['mahalanobis', 'nuq_best1', 'ddu']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['SST-2']
dataset_fnames = ['sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}|{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            agg_func = choose_agg_func(method)
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass

../workdir/run_glue_for_model_series/electra_raw_sn/sst2/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_no_sn/sst2/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_sn/sst2/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_no_sn/sst2/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_sn/sst2/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_no_sn/sst2/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_sn/sst2/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_raw_no_sn/sst2/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_reg_sn/sst2/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_reg_no_sn/sst2/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_metric_sn/sst2/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_metric_no_sn/sst2/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_raw_sn/sst2/0.0/ddu
../workdir/run_glue_for_model_series/elec

In [8]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [9]:
def preproc_regs(x):
    reg = x.split('|')[1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'mc_mahalanobis' and not 'no_sn' in sn:
        return 'SMD SN (ours)'
    elif method == 'mc_mahalanobis':
        return 'SMD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    
    elif method == 'nuq_best' and not 'no_sn' in sn:
        return 'Best NUQ SN'
    elif method == 'nuq_best':
        return 'Best NUQ'
    
    elif method == 'nuq_best1' and not 'no_sn' in sn:
        return 'Best1 NUQ SN'
    elif method == 'nuq_best1':
        return 'Best1 NUQ'
    
    elif method == 'ddu' and not 'no_sn' in sn:
        return 'DDU SN'
    elif method == 'ddu':
        return 'DDU'
    
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'Deep Ensemble'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MP'

table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_regs(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [10]:
table_all

Unnamed: 0_level_0,Method,Reg. Type,UE Score,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
0,MD SN (ours),-,MD,0.37±0.30,12.77±1.83,0.90±0.12
1,MD,-,MD,0.44±0.21,11.61±2.26,0.81±0.13
2,MD SN (ours),CER,MD,0.54±0.34,11.51±2.19,0.82±0.14
3,MD,CER,MD,0.24±0.19,11.22±0.92,0.78±0.08
4,MD SN (ours),metric,MD,0.20±0.27,11.11±2.02,0.80±0.13
5,MD,metric,MD,0.02±0.21,11.33±1.62,0.80±0.10
6,Best1 NUQ SN,-,aleatoric,0.40±0.28,12.48±1.81,0.88±0.12
7,Best1 NUQ SN,-,epistemic,-0.94±0.54,38.59±10.26,2.62±0.64
8,Best1 NUQ SN,-,total,-0.94±0.54,38.56±10.28,2.60±0.66
9,Best1 NUQ,-,aleatoric,0.41±0.21,12.23±2.25,0.83±0.13


# MNLI

In [35]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='ddu' or method=='ddu_maha':
        ddu = lambda x: -np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"ddu": ddu}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['nuq_best1', 'mahalanobis', 'ddu', 'ddu_maha']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['MNLI']
dataset_fnames = ['mnli']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}|{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            agg_func = choose_agg_func(method)
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass

../workdir/run_glue_for_model_series/electra_raw_sn/mnli/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_raw_no_sn/mnli/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_reg_sn/mnli/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_reg_no_sn/mnli/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_metric_sn/mnli/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_metric_no_sn/mnli/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_raw_sn/mnli/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_no_sn/mnli/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_sn/mnli/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_no_sn/mnli/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_sn/mnli/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_no_sn/mnli/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_sn/mnli/0.0/ddu
../workdir/run_glue_for_model_series/elec

In [36]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [37]:
def preproc_regs(x):
    reg = x.split('|')[1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'mc_mahalanobis' and not 'no_sn' in sn:
        return 'SMD SN (ours)'
    elif method == 'mc_mahalanobis':
        return 'SMD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    
    elif method == 'nuq_best' and not 'no_sn' in sn:
        return 'Best NUQ SN'
    elif method == 'nuq_best':
        return 'Best NUQ'
    
    elif method == 'nuq_best1' and not 'no_sn' in sn:
        return 'Best1 NUQ SN'
    elif method == 'nuq_best1':
        return 'Best1 NUQ'
    
    elif method == 'ddu' and not 'no_sn' in sn:
        return 'DDU SN'
    elif method == 'ddu':
        return 'DDU'
    
    elif method == 'ddu_maha' and not 'no_sn' in sn:
        return 'DDU Maha SN'
    elif method == 'ddu_maha':
        return 'DDU Maha'
    
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'Deep Ensemble'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MP'

table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_regs(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [38]:
table_all

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MNLI,MNLI,MNLI
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
0,Best1 NUQ SN,-,aleatoric,0.91±0.16,467.00±24.64,2.43±0.16
1,Best1 NUQ SN,-,epistemic,0.81±0.09,486.90±18.83,2.54±0.09
2,Best1 NUQ SN,-,total,0.86±0.12,476.96±18.97,2.48±0.12
3,Best1 NUQ,-,aleatoric,1.45±0.44,471.74±21.82,2.37±0.08
4,Best1 NUQ,-,epistemic,1.43±0.44,475.45±26.74,2.38±0.09
5,Best1 NUQ,-,total,1.44±0.44,472.87±22.86,2.38±0.08
6,Best1 NUQ SN,CER,aleatoric,1.20±0.19,469.25±13.72,2.41±0.09
7,Best1 NUQ SN,CER,epistemic,1.01±0.19,492.63±18.06,2.58±0.09
8,Best1 NUQ SN,CER,total,1.07±0.18,484.18±15.36,2.51±0.09
9,Best1 NUQ,CER,aleatoric,0.97±0.18,472.62±32.88,2.38±0.14


In [30]:
table_all.iloc[[19, 30]]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MNLI,MNLI,MNLI
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
19,MD,-,MD,1.47±0.44,461.38±16.83,2.34±0.07
30,DDU Maha,-,MP,1.42±0.44,474.41±22.62,2.40±0.08


# Twitter HSO

In [10]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='ddu' or method=='ddu_maha':
        ddu = lambda x: -np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"ddu": ddu}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['nuq_best1', 'mahalanobis', 'ddu']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['Twitter']
dataset_fnames = ['twitter_hso']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}|{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            agg_func = choose_agg_func(method)
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass

../workdir/run_glue_for_model_series/electra_raw_sn/twitter_hso/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_raw_no_sn/twitter_hso/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_reg_sn/twitter_hso/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_reg_no_sn/twitter_hso/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_metric_sn/twitter_hso/0.0/nuq_best1
Broken

Broken

Broken

pass
../workdir/run_glue_for_model_series/electra_metric_no_sn/twitter_hso/0.0/nuq_best1
../workdir/run_glue_for_model_series/electra_raw_sn/twitter_hso/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_no_sn/twitter_hso/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_sn/twitter_hso/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_no_sn/twitter_hso/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_sn/twitter_hso/0.0/mahalanobis
Broken

Broken

Broken

pass
../workdir/run_glue_for_model_series/electra_metri

In [11]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [12]:
def preproc_regs(x):
    reg = x.split('|')[1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'mc_mahalanobis' and not 'no_sn' in sn:
        return 'SMD SN (ours)'
    elif method == 'mc_mahalanobis':
        return 'SMD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    
    elif method == 'nuq_best' and not 'no_sn' in sn:
        return 'Best NUQ SN'
    elif method == 'nuq_best':
        return 'Best NUQ'
    
    elif method == 'nuq_best1' and not 'no_sn' in sn:
        return 'Best1 NUQ SN'
    elif method == 'nuq_best1':
        return 'Best1 NUQ'
    
    elif method == 'ddu' and not 'no_sn' in sn:
        return 'DDU SN'
    elif method == 'ddu':
        return 'DDU'
    
    elif method == 'ddu_maha' and not 'no_sn' in sn:
        return 'DDU Maha SN'
    elif method == 'ddu_maha':
        return 'DDU Maha'
    
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'Deep Ensemble'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MP'

table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_regs(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [14]:
table_all

Unnamed: 0_level_0,Method,Reg. Type,UE Score,Twitter,Twitter,Twitter
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
0,Best1 NUQ SN,-,aleatoric,-0.09±0.19,1079.27±71.42,5.16±0.18
1,Best1 NUQ SN,-,epistemic,0.07±0.05,912.76±13.95,4.99±0.07
2,Best1 NUQ SN,-,total,0.13±0.07,905.58±12.33,4.92±0.07
3,Best1 NUQ,-,aleatoric,0.12±0.23,1019.10±59.22,5.09±0.11
4,Best1 NUQ,-,epistemic,0.16±0.15,905.53±8.47,5.04±0.07
5,Best1 NUQ,-,total,0.21±0.18,901.10±11.77,4.99±0.08
6,Best1 NUQ SN,CER,aleatoric,0.17±0.77,1190.37±186.84,5.73±0.63
7,Best1 NUQ SN,CER,epistemic,0.65±0.55,941.80±38.66,5.24±0.14
8,Best1 NUQ SN,CER,total,0.73±0.57,929.52±36.49,5.15±0.14
9,Best1 NUQ,CER,aleatoric,-0.39±0.51,1180.03±84.00,5.49±0.29


In [15]:
table_all.iloc[[2,14,15,19,21,26,27,28,29,30]]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,Twitter,Twitter,Twitter
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
2,Best1 NUQ SN,-,total,0.13±0.07,905.58±12.33,4.92±0.07
14,Best1 NUQ,metric,total,0.03±0.11,895.29±9.86,4.94±0.13
15,MD SN (ours),-,MD,0.29±0.04,892.32±15.71,4.76±0.07
19,MD,metric,MD,0.17±0.13,885.74±15.12,4.79±0.10
21,DDU,-,MP,0.36±0.15,888.51±7.79,4.85±0.09
26,SR SN,-,MP,88.28±0.07,936.65±13.05,5.06±0.04
27,SR,-,MP,88.19±0.15,968.42±33.97,5.19±0.16
28,SR SN,CER,MP,87.47±0.52,1200.61±187.28,5.89±0.53
29,SR,CER,MP,88.24±0.19,946.77±48.29,5.10±0.21
30,SR,metric,MP,88.39±0.12,897.13±30.30,4.97±0.14


# Decomposing SST-2

In [8]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='decomposing_md':
        disc_md = lambda x: np.squeeze(x[0], axis=-1)
        nondisc_md = lambda x: np.squeeze(x[1], axis=-1)
        sum_md = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {"disc_md": disc_md, 
                       'nondisc_md': nondisc_md,
                       'disc+nondisc_md': sum_md}
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[0], axis=-1)
        rel_maha_dist = lambda x: np.squeeze(x[1], axis=-1)
        marg_maha_dist = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist,
                       "relative_mahalanobis_distance": rel_maha_dist,
                       "marginal_mahalanobis_distance": marg_maha_dist}
    elif method=='ddu' or method=='ddu_maha':
        ddu = lambda x: -np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"ddu": ddu}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['rejection-curve-auc', "rcc-auc", 'rpp']
methods = ['nuq', 'mahalanobis', 'decomposing_md']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['MPRC', 'CoLA', 'SST-2', 'SST-5']
dataset_fnames = ['mrpc', 'cola', 'sst2', 'sst5']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}|{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            agg_func = choose_agg_func(method)
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass

../../workdir/run_glue_for_model_series/electra_raw_sn/mrpc/nuq
../../workdir/run_glue_for_model_series/electra_raw_sn/cola/nuq
../../workdir/run_glue_for_model_series/electra_raw_sn/sst2/nuq
../../workdir/run_glue_for_model_series/electra_raw_sn/sst5/nuq
../../workdir/run_glue_for_model_series/electra_raw_no_sn/mrpc/nuq
../../workdir/run_glue_for_model_series/electra_raw_no_sn/cola/nuq
../../workdir/run_glue_for_model_series/electra_raw_no_sn/sst2/nuq
../../workdir/run_glue_for_model_series/electra_raw_no_sn/sst5/nuq
../../workdir/run_glue_for_model_series/electra_reg_sn/mrpc/nuq
../../workdir/run_glue_for_model_series/electra_reg_sn/cola/nuq
../../workdir/run_glue_for_model_series/electra_reg_sn/sst2/nuq
../../workdir/run_glue_for_model_series/electra_reg_sn/sst5/nuq
../../workdir/run_glue_for_model_series/electra_reg_no_sn/mrpc/nuq
../../workdir/run_glue_for_model_series/electra_reg_no_sn/cola/nuq
../../workdir/run_glue_for_model_series/electra_reg_no_sn/sst2/nuq
../../workdir/run_g

In [9]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [11]:
def preproc_regs(x):
    reg = x.split('|')[1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'mc_mahalanobis' and not 'no_sn' in sn:
        return 'SMD SN (ours)'
    elif method == 'mc_mahalanobis':
        return 'SMD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    
    elif method == 'decomposing_md' and not 'no_sn' in sn:
        return 'Decomposing SN'
    elif method == 'decomposing_md':
        return 'Decomposing'
    
    elif method == 'nuq_best1' and not 'no_sn' in sn:
        return 'Best1 NUQ SN'
    elif method == 'nuq_best1':
        return 'Best1 NUQ'
    
    elif method == 'ddu' and not 'no_sn' in sn:
        return 'DDU SN'
    elif method == 'ddu':
        return 'DDU'
    
    elif method == 'ddu' and not 'no_sn' in sn:
        return 'DDU SN'
    elif method == 'ddu':
        return 'DDU'
    
    elif method == 'ddu_maha' and not 'no_sn' in sn:
        return 'DDU Maha SN'
    elif method == 'ddu_maha':
        return 'DDU Maha'
    
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'Deep Ensemble'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    elif method == 'disc_md':
        return 'Disc MD'
    elif method == 'nondisc_md':
        return 'Nondisc MD'
    elif method == 'disc+nondisc_md':
        return 'Disc+Nondisc MD'
    return 'MP'

table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_regs(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [12]:
table_all

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MPRC,MPRC,MPRC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2,SST-5,SST-5,SST-5
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,NUQ SN,-,aleatoric,0.51±0.18,13.75±1.53,1.82±0.24,0.55±0.15,40.54±3.95,1.98±0.19,0.41±0.37,12.86±1.95,0.87±0.18,0.32±0.29,448.59±13.75,10.51±0.21
1,NUQ SN,-,epistemic,0.12±0.21,16.05±1.67,2.19±0.24,0.18±0.13,46.64±4.00,2.35±0.14,0.12±0.54,16.82±5.54,1.16±0.42,-0.28±0.26,480.37±12.75,11.20±0.16
2,NUQ SN,-,total,0.22±0.27,15.51±1.68,2.10±0.24,0.26±0.14,45.60±4.01,2.29±0.17,0.30±0.44,14.51±4.47,1.00±0.33,0.13±0.30,462.40±15.61,10.74±0.25
3,NUQ,-,aleatoric,0.13±0.27,11.95±1.10,1.59±0.16,0.58±0.25,43.44±3.26,2.02±0.08,0.56±0.27,12.46±3.67,0.80±0.10,0.88±0.26,443.31±20.07,10.21±0.29
4,NUQ,-,epistemic,-0.21±0.38,14.36±1.87,1.95±0.25,0.13±0.17,49.36±2.44,2.47±0.13,0.32±0.30,15.04±3.02,1.04±0.22,-0.10±0.05,489.90±19.12,11.27±0.09
5,NUQ,-,total,-0.06±0.35,13.31±1.67,1.79±0.26,0.21±0.24,48.24±2.24,2.40±0.11,0.49±0.28,12.53±2.42,0.88±0.16,0.53±0.29,466.39±26.16,10.61±0.31
6,NUQ SN,CER,aleatoric,0.24±0.25,13.38±1.51,1.88±0.12,0.92±0.24,45.81±4.32,2.03±0.08,0.61±0.34,11.37±0.84,0.81±0.06,0.40±0.39,447.65±17.89,10.46±0.28
7,NUQ SN,CER,epistemic,-0.07±0.15,15.23±2.00,2.18±0.22,0.24±0.16,51.21±2.23,2.71±0.13,0.12±0.12,18.22±5.16,1.29±0.36,-0.30±0.30,477.48±15.58,11.17±0.23
8,NUQ SN,CER,total,0.04±0.18,14.62±2.05,2.07±0.22,0.44±0.21,48.60±2.88,2.52±0.18,0.38±0.21,14.51±3.87,1.02±0.26,0.19±0.47,459.42±21.55,10.71±0.36
9,NUQ,CER,aleatoric,0.27±0.23,13.14±1.00,1.78±0.15,0.57±0.07,43.54±4.63,2.00±0.11,0.33±0.16,10.69±2.06,0.75±0.14,0.19±0.35,439.01±15.14,10.43±0.20


In [6]:
import os 
from sklearn.metrics import accuracy_score

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    results["mahalanobis_distance"] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='decomposing_md':
        disc_md = lambda x: np.squeeze(x[0], axis=-1)
        nondisc_md = lambda x: np.squeeze(x[1], axis=-1)
        sum_md = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {"disc_md": disc_md, 
                       'nondisc_md': nondisc_md,
                       'disc+nondisc_md': sum_md}
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        rel_maha_dist = lambda x: np.squeeze(x[:, 1], axis=-1)
        marg_maha_dist = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist,
                       #"relative_mahalanobis_distance": rel_maha_dist,
                       #"marginal_mahalanobis_distance": marg_maha_dist
                      }
    elif method=='ddu' or method=='ddu_maha':
        ddu = lambda x: -np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"ddu": ddu}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['accuracy', "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
sn_values = [0.2, 0.4, 0.6, 0.8, 1, 2, 3]
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for sn_value in sn_values:
                run_dirs = []
                name_sn = ''
                names = [f'{method}|{reg}|{sn}_{sn_value}']
                for name in dataset_fnames:
                    model_series_dir = f'../../workdir/run_tasks_for_model_series_sn_params/electra_{reg}_{sn}/{name}/0.2/{sn_value}/{method}'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                agg_func = choose_agg_func(method)
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print('pass')
                    pass

../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/mrpc/0.2/0.2/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/cola/0.2/0.2/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/sst2/0.2/0.2/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/mrpc/0.2/0.4/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/cola/0.2/0.4/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/sst2/0.2/0.4/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/mrpc/0.2/0.6/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/cola/0.2/0.6/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/sst2/0.2/0.6/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/mrpc/0.2/0.8/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/cola/0.2/0.8/mahalanobis

In [7]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-18:])]).reset_index()

In [11]:
def to_float(x):
    return float(x.split('±')[0])

table_all.sort_values(by=('SST-2', 'rcc-auc'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp
17,mahalanobis|metric|sn_0.8,mahalanobis_distance,85.38±0.67,30.82±1.68,2.24±0.14,85.03±1.25,119.17±16.56,3.46±0.40,93.63±0.41,20.85±4.03,0.92±0.14
18,mahalanobis|metric|sn_2,mahalanobis_distance,85.51±0.22,29.01±0.99,2.10±0.08,85.47±1.29,107.51±15.20,3.14±0.41,93.69±0.46,21.26±4.13,0.92±0.13
19,mahalanobis|metric|sn_3,mahalanobis_distance,85.51±0.22,29.01±0.99,2.10±0.08,85.47±1.29,107.51±15.20,3.14±0.41,93.69±0.46,21.26±4.13,0.92±0.13
15,mahalanobis|metric|sn_0.4,mahalanobis_distance,85.72±0.36,29.89±1.69,2.21±0.18,84.58±1.47,131.34±29.13,3.78±0.74,93.74±0.73,21.77±4.79,0.96±0.15
36,baseline|metric|sn_2,max_prob,0,49.65±8.14,2.99±0.38,0,98.97±6.24,2.94±0.10,0,21.88±4.22,0.93±0.20
37,baseline|metric|sn_3,max_prob,0,49.65±8.14,2.99±0.38,0,98.97±6.24,2.94±0.10,0,21.88±4.22,0.93±0.20
16,mahalanobis|metric|sn_0.6,mahalanobis_distance,85.74±0.90,31.09±2.77,2.28±0.16,84.69±1.46,129.01±24.56,3.72±0.61,93.58±0.61,21.93±5.11,0.95±0.15
14,mahalanobis|metric|sn_0.2,mahalanobis_distance,84.58±1.00,31.00±2.98,2.16±0.14,85.71±0.52,102.05±9.56,2.95±0.28,93.29±0.86,21.96±3.48,0.96±0.13
3,mahalanobis|raw|sn_0.8,mahalanobis_distance,85.79±0.61,29.06±2.27,2.06±0.16,87.73±0.44,78.53±4.69,2.32±0.09,93.62±0.75,22.71±3.73,1.01±0.11
1,mahalanobis|raw|sn_0.4,mahalanobis_distance,85.04±1.30,30.68±3.43,2.15±0.16,87.20±0.45,83.07±3.78,2.42±0.07,93.58±0.76,23.34±3.54,1.03±0.12


In [12]:
old_sn_values = {
    'cola': {
        'metric': 2,
        'reg': 1,
        'raw': 2,
    },
    'mrpc': {
        'metric': 2,
        'reg': 1,
        'raw': 2,
    },
    'sst2': {
        'metric': 0.8,
        'reg': 0.6,
        'raw': 0.8,
    },
    'conll2003': {
        'metric': 2,
        'reg': 3,
        'raw': 3,
    },
}

# Electra tune SN values

In [57]:
import os 
from sklearn.metrics import accuracy_score

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    results["mahalanobis_distance"] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='decomposing_md':
        disc_md = lambda x: np.squeeze(x[0], axis=-1)
        nondisc_md = lambda x: np.squeeze(x[1], axis=-1)
        sum_md = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {"disc_md": disc_md, 
                       'nondisc_md': nondisc_md,
                       'disc+nondisc_md': sum_md}
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        rel_maha_dist = lambda x: np.squeeze(x[:, 1], axis=-1)
        marg_maha_dist = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist,
                       #"relative_mahalanobis_distance": rel_maha_dist,
                       #"marginal_mahalanobis_distance": marg_maha_dist
                      }
    elif method=='ddu' or method=='ddu_maha':
        ddu = lambda x: -np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"ddu": ddu}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['accuracy', "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn']
dataset_names = ['SST-2']#['MRPC', 'CoLA']#, 'SST-2']
dataset_fnames = ['sst2']#['mrpc', 'cola']#, 'sst2']
sn_values = [0.2, 0.4, 0.6, 0.8, 1, 2, 3]
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for sn_value in sn_values:
                run_dirs = []
                name_sn = ''
                names = [f'{method}|{reg}|{sn}_{sn_value}']
                for name in dataset_fnames:
                    model_series_dir = f'../../workdir/run_tasks_for_model_series_sn_20_old_net/electra_{reg}_{sn}/{name}/0.2/{sn_value}/{method}'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                agg_func = choose_agg_func(method)
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print('pass')
                    pass

../../workdir/run_tasks_for_model_series_sn_20_old_net/electra_raw_sn/sst2/0.2/0.2/mahalanobis
pass
../../workdir/run_tasks_for_model_series_sn_20_old_net/electra_raw_sn/sst2/0.2/0.4/mahalanobis
../../workdir/run_tasks_for_model_series_sn_20_old_net/electra_raw_sn/sst2/0.2/0.6/mahalanobis
../../workdir/run_tasks_for_model_series_sn_20_old_net/electra_raw_sn/sst2/0.2/0.8/mahalanobis
../../workdir/run_tasks_for_model_series_sn_20_old_net/electra_raw_sn/sst2/0.2/1/mahalanobis
../../workdir/run_tasks_for_model_series_sn_20_old_net/electra_raw_sn/sst2/0.2/2/mahalanobis
pass
../../workdir/run_tasks_for_model_series_sn_20_old_net/electra_raw_sn/sst2/0.2/3/mahalanobis
pass
../../workdir/run_tasks_for_model_series_sn_20_old_net/electra_reg_sn/sst2/0.2/0.2/mahalanobis
pass
../../workdir/run_tasks_for_model_series_sn_20_old_net/electra_reg_sn/sst2/0.2/0.4/mahalanobis
../../workdir/run_tasks_for_model_series_sn_20_old_net/electra_reg_sn/sst2/0.2/0.6/mahalanobis
../../workdir/run_tasks_for_model_se

In [58]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-18:])]).reset_index()

In [59]:
sn_values = {
    'cola': {
        'metric': 0.4,
        'reg': 0.4,
        'raw': 1,
    },
    'mrpc': {
        'metric': 0.4,
        'reg': 3,
        'raw': 1,
    },
    'sst2': {
        'metric': 1,
        'reg': 0.8,
        'raw': 0.8,
    },
    'conll2003': {
        'metric': 3,
        'reg': 1,
        'raw': 2,
    },
}

In [60]:
def to_float(x):
    return float(x.split('±')[0])

table_all.sort_values(by=('SST-2', 'rcc-auc'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,accuracy,rcc-auc,rpp
2,mahalanobis|raw|sn_0.8,mahalanobis_distance,93.36±0.64,20.00±3.44,0.87±0.13
6,mahalanobis|reg|sn_0.8,mahalanobis_distance,93.27±1.07,20.45±4.22,0.89±0.17
4,mahalanobis|reg|sn_0.4,mahalanobis_distance,93.38±0.68,20.64±2.72,0.90±0.12
5,mahalanobis|reg|sn_0.6,mahalanobis_distance,93.29±0.90,20.68±4.57,0.90±0.17
1,mahalanobis|raw|sn_0.6,mahalanobis_distance,93.43±1.08,20.98±4.53,0.89±0.15
29,baseline|metric|sn_1,max_prob,0,21.01±2.39,0.91±0.12
31,baseline|metric|sn_3,max_prob,0,21.15±3.46,0.91±0.14
30,baseline|metric|sn_2,max_prob,0,21.15±3.46,0.91±0.14
9,mahalanobis|reg|sn_3,mahalanobis_distance,93.53±0.98,21.26±2.88,0.93±0.12
0,mahalanobis|raw|sn_0.4,mahalanobis_distance,93.38±0.76,21.26±4.42,0.91±0.15


In [39]:
import os 
from sklearn.metrics import accuracy_score

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    results["mahalanobis_distance"] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='decomposing_md':
        disc_md = lambda x: np.squeeze(x[0], axis=-1)
        nondisc_md = lambda x: np.squeeze(x[1], axis=-1)
        sum_md = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {"disc_md": disc_md, 
                       'nondisc_md': nondisc_md,
                       'disc+nondisc_md': sum_md}
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        rel_maha_dist = lambda x: np.squeeze(x[:, 1], axis=-1)
        marg_maha_dist = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist,
                       #"relative_mahalanobis_distance": rel_maha_dist,
                       #"marginal_mahalanobis_distance": marg_maha_dist
                      }
    elif method=='ddu' or method=='ddu_maha':
        ddu = lambda x: -np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"ddu": ddu}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['accuracy', "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['metric']
spectralnorm = ['sn']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
sn_values = [0.2, 0.4, 0.6, 0.8, 1, 2, 3]
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for sn_value in sn_values:
                run_dirs = []
                name_sn = ''
                names = [f'{method}|{reg}|{sn}_{sn_value}']
                for name in dataset_fnames:
                    model_series_dir = f'../../workdir/run_tasks_for_model_series_sn_params/electra_{reg}_{sn}/{name}/0.2/{sn_value}/{method}'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                agg_func = choose_agg_func(method)
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print('pass')
                    pass

../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/mrpc/0.2/0.2/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/cola/0.2/0.2/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/sst2/0.2/0.2/mahalanobis
pass
../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/mrpc/0.2/0.4/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/cola/0.2/0.4/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/sst2/0.2/0.4/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/mrpc/0.2/0.6/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/cola/0.2/0.6/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/sst2/0.2/0.6/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/mrpc/0.2/0.8/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/elec

In [40]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [47]:
def to_float(x):
    return float(x.split('±')[0])

table_all.sort_values(by=('SST-2', 'rcc-auc'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp
3,mahalanobis|metric|sn_1,mahalanobis_distance,84.90±1.59,30.37±2.20,2.13±0.17,87.33±0.51,76.08±3.54,2.22±0.08,93.57±0.63,20.74±5.30,0.88±0.17
4,mahalanobis|metric|sn_2,mahalanobis_distance,84.60±1.00,30.37±1.19,2.10±0.11,87.40±0.30,75.55±3.73,2.21±0.08,93.57±0.60,21.01±4.79,0.90±0.16
5,mahalanobis|metric|sn_3,mahalanobis_distance,84.60±1.00,30.37±1.19,2.10±0.11,87.40±0.30,75.55±3.73,2.21±0.08,93.57±0.60,21.01±4.79,0.90±0.16
1,mahalanobis|metric|sn_0.6,mahalanobis_distance,85.22±0.71,29.58±0.81,2.10±0.06,87.39±0.32,75.76±3.99,2.21±0.11,93.42±0.85,21.14±3.33,0.92±0.09
9,baseline|metric|sn_1,max_prob,0,45.99±15.62,2.83±0.58,0,115.48±2.85,3.06±0.09,0,21.66±3.05,0.94±0.16
2,mahalanobis|metric|sn_0.8,mahalanobis_distance,84.72±0.29,30.22±1.27,2.09±0.13,87.09±0.36,76.68±3.64,2.22±0.11,93.65±0.82,21.87±4.32,0.93±0.10
8,baseline|metric|sn_0.8,max_prob,0,44.49±12.65,2.75±0.49,0,129.59±12.77,3.33±0.23,0,22.15±2.89,0.94±0.14
0,mahalanobis|metric|sn_0.4,mahalanobis_distance,85.42±0.64,29.47±2.00,2.09±0.13,87.31±0.40,75.26±2.02,2.20±0.03,93.44±0.96,22.66±4.38,0.95±0.11
10,baseline|metric|sn_2,max_prob,0,47.42±13.01,2.91±0.55,0,117.55±23.49,3.09±0.36,0,23.16±3.43,0.98±0.19
11,baseline|metric|sn_3,max_prob,0,47.42±13.01,2.91±0.55,0,117.55±23.49,3.09±0.36,0,23.16±3.43,0.98±0.19


## Deberta

In [87]:
import os 
from sklearn.metrics import accuracy_score

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    results["mahalanobis_distance"] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='decomposing_md':
        disc_md = lambda x: np.squeeze(x[0], axis=-1)
        nondisc_md = lambda x: np.squeeze(x[1], axis=-1)
        sum_md = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {"disc_md": disc_md, 
                       'nondisc_md': nondisc_md,
                       'disc+nondisc_md': sum_md}
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        rel_maha_dist = lambda x: np.squeeze(x[:, 1], axis=-1)
        marg_maha_dist = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist,
                       #"relative_mahalanobis_distance": rel_maha_dist,
                       #"marginal_mahalanobis_distance": marg_maha_dist
                      }
    elif method=='ddu' or method=='ddu_maha':
        ddu = lambda x: -np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"ddu": ddu}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['accuracy', "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
sn_values = [0.4, 0.6, 0.8, 1, 2, 3]
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for sn_value in sn_values:
                run_dirs = []
                name_sn = ''
                names = [f'{method}|{reg}|{sn}_{sn_value}']
                for name in dataset_fnames:
                    model_series_dir = f'../../workdir/run_tasks_for_model_series_sn_params/deberta_{reg}_{sn}/{name}/0.2/{sn_value}/{method}'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                agg_func = choose_agg_func(method)
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print('pass')
                    pass

../../workdir/run_tasks_for_model_series_sn_params/deberta_raw_sn/mrpc/0.2/0.4/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/deberta_raw_sn/cola/0.2/0.4/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/deberta_raw_sn/sst2/0.2/0.4/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/deberta_raw_sn/mrpc/0.2/0.6/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/deberta_raw_sn/cola/0.2/0.6/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/deberta_raw_sn/sst2/0.2/0.6/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/deberta_raw_sn/mrpc/0.2/0.8/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/deberta_raw_sn/cola/0.2/0.8/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/deberta_raw_sn/sst2/0.2/0.8/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/deberta_raw_sn/mrpc/0.2/1/mahalanobis
../../workdir/run_tasks_for_model_series_sn_params/deberta_raw_sn/cola/0.2/1/mahalanobis
../

In [88]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [91]:
def to_float(x):
    return float(x.split('±')[0])

table_all.sort_values(by=('MRPC', 'rcc-auc'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp
7,mahalanobis|reg|sn_0.6,mahalanobis_distance,86.94±0.57,24.14±1.94,1.78±0.17,85.15±0.50,105.89±4.66,2.96±0.12,91.92±0.86,26.96±4.19,1.18±0.17
8,mahalanobis|reg|sn_0.8,mahalanobis_distance,86.83±0.55,24.19±2.01,1.79±0.18,85.45±0.18,104.26±5.75,2.96±0.12,91.76±0.88,29.29±3.29,1.26±0.13
11,mahalanobis|reg|sn_3,mahalanobis_distance,86.90±0.83,25.32±1.20,1.86±0.06,85.22±0.34,106.45±9.05,3.01±0.20,91.95±1.00,28.73±5.05,1.25±0.20
10,mahalanobis|reg|sn_2,mahalanobis_distance,86.90±0.83,25.32±1.20,1.86±0.06,85.22±0.34,106.45±9.05,3.01±0.20,91.95±1.00,28.73±5.05,1.25±0.20
9,mahalanobis|reg|sn_1,mahalanobis_distance,87.13±0.67,25.53±2.32,1.94±0.16,85.57±0.31,105.70±5.66,3.02±0.11,91.91±0.69,29.21±2.42,1.26±0.09
6,mahalanobis|reg|sn_0.4,mahalanobis_distance,86.10±0.54,26.12±1.25,1.90±0.06,85.46±0.50,100.30±5.80,2.89±0.15,91.78±0.92,27.23±3.20,1.18±0.11
2,mahalanobis|raw|sn_0.8,mahalanobis_distance,86.42±0.82,27.36±0.82,2.05±0.11,84.72±0.27,109.78±4.81,3.03±0.14,92.20±0.65,27.61±3.33,1.22±0.13
4,mahalanobis|raw|sn_2,mahalanobis_distance,86.67±1.09,27.70±1.85,2.09±0.13,85.26±0.15,111.10±5.56,3.14±0.14,91.81±0.80,29.57±3.53,1.25±0.10
5,mahalanobis|raw|sn_3,mahalanobis_distance,86.67±1.09,27.70±1.85,2.09±0.13,85.26±0.15,111.10±5.56,3.14±0.14,91.81±0.80,29.57±3.53,1.25±0.10
3,mahalanobis|raw|sn_1,mahalanobis_distance,86.51±0.78,27.76±0.86,2.10±0.12,85.14±0.29,110.84±5.12,3.08±0.12,91.87±0.61,27.52±3.94,1.19±0.15


In [90]:
deberta_sn_values = {
    'cola': {
        'metric': 0.8,
        'reg': 0.4,
        'raw': 0.4,
    },
    'mrpc': {
        'metric': 1 (None),
        'reg': 0.6,
        'raw': 0.8,
    },
    'sst2': {
        'metric': 0.6,
        'reg': 0.6,
        'raw': 0.6,
    },
    'conll2003': {
        'metric': 2,
        'reg': 1,
        'raw': 1,
    },
}

TypeError: 'int' object is not callable

In [3]:
import os 
from sklearn.metrics import accuracy_score

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    results["mahalanobis_distance"] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='decomposing_md':
        disc_md = lambda x: np.squeeze(x[0], axis=-1)
        nondisc_md = lambda x: np.squeeze(x[1], axis=-1)
        sum_md = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {"disc_md": disc_md, 
                       'nondisc_md': nondisc_md,
                       'disc+nondisc_md': sum_md}
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        rel_maha_dist = lambda x: np.squeeze(x[:, 1], axis=-1)
        marg_maha_dist = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist,
                       #"relative_mahalanobis_distance": rel_maha_dist,
                       #"marginal_mahalanobis_distance": marg_maha_dist
                      }
    elif method=='ddu' or method=='ddu_maha':
        ddu = lambda x: -np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"ddu": ddu}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=['accuracy', "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['new', 'new_val']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}|{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../../workdir/run_tasks_for_model_series_{sn}/electra_{reg}_sn/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            agg_func = choose_agg_func(method)
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass

../../workdir/run_tasks_for_model_series_new/electra_raw_sn/mrpc/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_new/electra_raw_sn/cola/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_new/electra_raw_sn/sst2/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_new_val/electra_raw_sn/mrpc/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_new_val/electra_raw_sn/cola/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_new_val/electra_raw_sn/sst2/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_new/electra_reg_sn/mrpc/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_new/electra_reg_sn/cola/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_new/electra_reg_sn/sst2/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_new_val/electra_reg_sn/mrpc/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_new_val/electra_reg_sn/cola/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_new_val/electra_reg_sn/sst2/0.0/mahalanobis
../.

In [4]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [5]:
def preproc_regs(x):
    reg = x.split('|')[1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        if sn == 'new':
            return 'MD SN (ours)'
        elif sn == 'new_val':
            return 'MD SN (ours) opt.val.'
    elif method == 'mahalanobis':
        return 'MD'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        if sn == 'new':
            return 'SR SN'
        elif sn == 'new_val':
            return 'SR SN opt.val.'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    elif method == 'disc_md':
        return 'Disc MD'
    elif method == 'nondisc_md':
        return 'Nondisc MD'
    elif method == 'disc+nondisc_md':
        return 'Disc+Nondisc MD'
    return 'MP'

table_all['Reg. Type'] = table_all.Method.apply(lambda x: preproc_regs(x))
table_all['Method'] = table_all.Method.apply(lambda x: preproc_method(x))
table_all['UE Score'] = table_all['UE Score'].apply(lambda x: preproc_ue(x))
table_all = table_all[list(table_all.columns[:1]) + list(table_all.columns[-1:]) + list(table_all.columns[1:-1])].reset_index(drop=True)

In [6]:
table_all

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp,accuracy,rcc-auc,rpp
0,MD SN (ours),-,MD,88.64±1.04,12.79±2.34,1.76±0.33,86.64±0.37,41.05±2.64,1.98±0.10,93.29±0.78,12.74±2.66,0.86±0.12
1,MD SN (ours) opt.val.,-,MD,87.66±0.95,12.94±2.45,1.71±0.34,86.72±0.34,39.74±2.43,1.96±0.11,92.91±0.83,11.98±1.82,0.80±0.09
2,MD SN (ours),CER,MD,88.15±0.72,12.70±1.88,1.72±0.30,86.67±0.61,41.96±0.70,2.01±0.08,93.23±0.26,11.37±1.34,0.79±0.09
3,MD SN (ours) opt.val.,CER,MD,88.03±0.99,12.08±1.20,1.62±0.15,86.34±0.41,41.32±1.78,1.95±0.07,93.27±0.56,10.46±0.75,0.75±0.03
4,MD SN (ours),metric,MD,87.99±0.85,13.56±1.98,1.82±0.25,86.53±0.38,41.38±1.57,2.00±0.09,93.42±0.89,11.12±2.85,0.80±0.17
5,MD SN (ours) opt.val.,metric,MD,87.87±0.53,12.61±1.49,1.66±0.22,86.39±0.35,40.00±2.60,1.98±0.10,93.16±0.45,11.28±1.50,0.80±0.11
6,SR SN,-,MP,0,16.17±2.35,2.04±0.32,0,48.56±2.45,2.56±0.10,0,19.45±7.14,1.34±0.48
7,SR SN opt.val.,-,MP,0,20.40±5.70,2.34±0.48,0,76.62±9.34,3.51±0.18,0,18.54±4.98,1.16±0.31
8,SR SN,CER,MP,0,15.26±2.87,2.04±0.31,0,64.75±11.96,3.00±0.46,0,17.90±5.05,1.17±0.28
9,SR SN opt.val.,CER,MP,0,24.63±9.40,2.61±0.72,0,91.21±18.02,3.78±0.53,0,42.85±17.16,2.08±0.43


# DDPP HP

In [24]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']
max_fracs = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]
comsizes = [50]

dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
kernels = ['rbf']
names = []
tables = []
baselines = []
for method in methods:
    for max_frac in max_fracs:
        for cs in comsizes:
            for kernel in kernels:
                for reg in regs:
                    run_dirs = []
                    names = [f'ddpp_{method}|{max_frac}|{kernel}']
                    for name in dataset_fnames:
                        model_series_dir = f'../../workdir/run_tasks_for_model_series_dpp_hp/electra_raw_no_sn/{name}/0.0/{method}_{kernel}_{max_frac}_{cs}'
                        run_dirs.append([model_series_dir])
                    try:
                        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
                        baselines.append(res_df.iloc[-1:])
                        tables.append(res_df.iloc[:-1])
                    except:
                        print(f'Not exists one of this dirs: {run_dirs}')

In [9]:
table_all = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [19]:
def to_float(x):
    return float(x.split('±')[0])

table_all.sort_values(by=('CoLA', 'rcc-auc'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
8,ddpp_ddpp_dpp|0.4|rbf,variance,-0.13±0.35,43.42±9.25,2.53±0.35,0.21±0.28,97.15±7.43,2.73±0.12,0.00±0.14,25.55±1.98,1.12±0.09
6,ddpp_ddpp_dpp|0.4|rbf,bald,-0.19±0.30,45.10±10.64,2.57±0.38,0.20±0.28,97.25±7.24,2.73±0.12,-0.08±0.16,28.25±4.38,1.19±0.11
30,ddpp_ddpp_ood|0.45|rbf,bald,-0.10±0.09,39.64±7.78,2.50±0.25,0.17±0.29,98.19±7.24,2.78±0.12,0.04±0.17,27.05±2.72,1.09±0.20
9,ddpp_ddpp_dpp|0.45|rbf,bald,-0.05±0.15,39.35±9.14,2.44±0.36,0.12±0.29,99.45±7.38,2.79±0.11,0.01±0.10,26.49±5.83,1.11±0.17
11,ddpp_ddpp_dpp|0.45|rbf,variance,0.01±0.07,38.11±5.99,2.38±0.24,0.13±0.24,100.00±7.98,2.80±0.10,0.10±0.11,23.94±3.65,1.02±0.13
12,ddpp_ddpp_dpp|0.5|rbf,bald,0.01±0.06,38.04±6.70,2.40±0.23,0.13±0.29,100.27±7.07,2.81±0.09,-0.01±0.18,25.75±3.36,1.13±0.16
21,ddpp_ddpp_ood|0.3|rbf,bald,-0.08±0.14,39.78±6.75,2.48±0.24,0.14±0.28,100.29±12.15,2.79±0.16,-0.04±0.21,27.28±4.06,1.16±0.21
23,ddpp_ddpp_ood|0.3|rbf,variance,-0.08±0.11,39.33±6.90,2.46±0.25,0.14±0.25,100.50±13.65,2.77±0.17,-0.08±0.15,27.46±5.61,1.20±0.21
14,ddpp_ddpp_dpp|0.5|rbf,variance,0.02±0.05,38.10±7.25,2.39±0.24,0.14±0.28,100.73±8.23,2.79±0.10,-0.01±0.13,25.39±3.31,1.13±0.14
39,ddpp_ddpp_ood|0.6|rbf,bald,-0.08±0.08,40.75±7.92,2.49±0.30,0.13±0.21,101.32±17.26,2.78±0.25,-0.12±0.15,29.80±2.98,1.23±0.13


In [20]:
ds_to_mf = {
    'mrpc': {'ddpp_ood': 0.4, 'ddpp_dpp': 0.55},
    'cola': {'ddpp_ood': 0.45, 'ddpp_dpp': 0.4},
    'sst2': {'ddpp_ood': 0.35, 'ddpp_dpp': 0.45}
}

In [11]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']
max_fracs = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]
comsizes = [50]

dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
kernels = ['rbf']
names = []
tables = []
baselines = []
for method in methods:
    for max_frac in max_fracs:
        for cs in comsizes:
            for kernel in kernels:
                for reg in regs:
                    run_dirs = []
                    names = [f'ddpp_{method}|{max_frac}|{kernel}']
                    for name in dataset_fnames:
                        model_series_dir = f'../../workdir/run_tasks_for_model_series_dpp_hp/deberta_raw_no_sn/{name}/0.0/{method}_{kernel}_{max_frac}_{cs}'
                        run_dirs.append([model_series_dir])
                    try:
                        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
                        baselines.append(res_df.iloc[-1:])
                        tables.append(res_df.iloc[:-1])
                    except:
                        print(f'Not exists one of this dirs: {run_dirs}')

In [12]:
table_all_deberta = pd.concat([pd.concat(tables), pd.concat(baselines[-6:])]).reset_index()

In [28]:
def to_float(x):
    return float(x.split('±')[0])

table_all_deberta.sort_values(by=('MRPC', 'rcc-auc'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
31,ddpp_ddpp_ood|0.45|rbf,sampled_max_prob,0.23±0.36,41.42±24.84,2.59±0.92,-0.04±0.07,125.24±14.33,3.51±0.31,-0.09±0.07,38.49±3.93,1.58±0.14
34,ddpp_ddpp_ood|0.5|rbf,sampled_max_prob,0.13±0.33,42.25±24.94,2.67±0.89,-0.07±0.08,126.24±15.42,3.52±0.31,-0.11±0.16,39.16±2.47,1.60±0.07
22,ddpp_ddpp_ood|0.3|rbf,sampled_max_prob,0.18±0.40,42.28±24.24,2.63±0.84,-0.13±0.10,129.53±15.14,3.60±0.31,-0.09±0.08,38.41±3.27,1.56±0.09
33,ddpp_ddpp_ood|0.5|rbf,bald,0.13±0.32,42.42±25.33,2.67±0.89,-0.17±0.16,130.31±18.20,3.62±0.34,-0.08±0.18,38.26±3.36,1.56±0.11
35,ddpp_ddpp_ood|0.5|rbf,variance,0.13±0.32,42.53±25.89,2.67±0.91,-0.16±0.13,130.21±18.75,3.60±0.34,-0.10±0.18,38.83±3.17,1.59±0.11
32,ddpp_ddpp_ood|0.45|rbf,variance,0.21±0.36,42.74±24.37,2.59±0.94,-0.12±0.11,127.82±13.56,3.59±0.30,-0.02±0.13,36.50±6.74,1.50±0.23
19,ddpp_ddpp_dpp|0.6|rbf,sampled_max_prob,0.21±0.32,43.03±24.54,2.61±0.89,-0.11±0.06,126.98±13.29,3.57±0.28,0.05±0.06,35.29±4.15,1.43±0.13
28,ddpp_ddpp_ood|0.4|rbf,sampled_max_prob,0.14±0.36,43.31±22.59,2.67±0.83,-0.04±0.08,125.85±14.15,3.51±0.32,-0.16±0.17,39.86±2.47,1.64±0.11
30,ddpp_ddpp_ood|0.45|rbf,bald,0.21±0.34,43.47±24.51,2.62±0.95,-0.16±0.11,128.77±13.13,3.62±0.28,-0.01±0.11,36.45±6.10,1.49±0.23
21,ddpp_ddpp_ood|0.3|rbf,bald,0.12±0.41,43.50±23.17,2.71±0.83,-0.29±0.06,135.24±15.10,3.75±0.28,-0.02±0.09,38.30±5.43,1.51±0.12


In [25]:
deberta_ds_to_mf = {
    'mrpc': {'ddpp_ood': 0.45, 'ddpp_dpp': 0.6},
    'cola': {'ddpp_ood': 0.45, 'ddpp_dpp': 0.6},
    'sst2': {'ddpp_ood': 0.45, 'ddpp_dpp': 0.6}
}

# Final results

## Electra

In [35]:
import os 

metric_types=["rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw', 'reg', 'metric']

dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
kernels = ['rbf']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        run_dirs = []
        names = [f'ddpp_{method}|{reg}']
        for name in dataset_fnames:
            model_series_dir = f'../../workdir/run_tasks_for_model_series/electra_{reg}_no_sn/{name}/0.0/{method}'
            run_dirs.append([model_series_dir])
        try:
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
        except:
            print(f'Not exists one of this dirs: {run_dirs}')

In [36]:
table_res = pd.concat([pd.concat(tables), pd.concat(baselines[-3:])]).reset_index()

In [37]:
table_res

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,CoLA,CoLA,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp,rcc-auc,rpp,rcc-auc,rpp
0,ddpp_ddpp_dpp|raw,bald,23.08±7.00,2.63±0.63,49.59±5.40,2.48±0.31,16.08±2.37,1.05±0.18
1,ddpp_ddpp_dpp|raw,sampled_max_prob,21.79±7.72,2.57±0.68,47.86±5.51,2.39±0.31,17.55±3.03,1.19±0.23
2,ddpp_ddpp_dpp|raw,variance,22.30±7.15,2.58±0.65,49.75±3.96,2.44±0.29,16.70±1.38,1.12±0.12
3,ddpp_ddpp_dpp|reg,bald,15.94±3.77,2.07±0.36,55.11±7.42,2.61±0.31,14.87±2.22,0.96±0.13
4,ddpp_ddpp_dpp|reg,sampled_max_prob,14.75±1.43,2.02±0.16,54.01±9.79,2.55±0.18,14.47±1.63,0.99±0.11
5,ddpp_ddpp_dpp|reg,variance,15.12±2.27,2.03±0.24,54.51±8.80,2.58±0.22,13.56±1.37,0.91±0.14
6,ddpp_ddpp_dpp|metric,bald,20.54±4.72,2.52±0.34,43.95±1.68,2.17±0.12,15.48±1.81,1.03±0.08
7,ddpp_ddpp_dpp|metric,sampled_max_prob,18.45±2.88,2.41±0.26,43.61±1.61,2.16±0.11,16.78±3.43,1.14±0.26
8,ddpp_ddpp_dpp|metric,variance,19.51±3.40,2.47±0.28,43.82±1.82,2.17±0.14,15.79±1.67,1.07±0.14
9,ddpp_ddpp_ood|raw,bald,23.85±8.39,2.69±0.58,52.59±12.08,2.42±0.34,18.27±3.05,1.22±0.23


In [54]:
import os 
from sklearn.metrics import accuracy_score

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    results["mahalanobis_distance"] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='decomposing_md':
        disc_md = lambda x: np.squeeze(x[0], axis=-1)
        nondisc_md = lambda x: np.squeeze(x[1], axis=-1)
        sum_md = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {"disc_md": disc_md, 
                       'nondisc_md': nondisc_md,
                       'disc+nondisc_md': sum_md}
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        rel_maha_dist = lambda x: np.squeeze(x[:, 1], axis=-1)
        marg_maha_dist = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist,
                       #"relative_mahalanobis_distance": rel_maha_dist,
                       #"marginal_mahalanobis_distance": marg_maha_dist
                      }
    elif method=='ddu' or method=='ddu_maha':
        ddu = lambda x: -np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"ddu": ddu}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=["rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
                run_dirs = []
                name_sn = ''
                names = [f'{method}|{reg}|{sn}']
                for name in dataset_fnames:
                    model_series_dir = f'../../workdir/run_tasks_for_model_series_sn/electra_{reg}_{sn}/{name}/0.0/{method}'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                agg_func = choose_agg_func(method)
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print('pass')
                    pass

../../workdir/run_tasks_for_model_series_sn/electra_raw_sn/mrpc/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_sn/electra_raw_sn/cola/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_sn/electra_raw_sn/sst2/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_sn/electra_reg_sn/mrpc/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_sn/electra_reg_sn/cola/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_sn/electra_reg_sn/sst2/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_sn/electra_metric_sn/mrpc/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_sn/electra_metric_sn/cola/0.0/mahalanobis
../../workdir/run_tasks_for_model_series_sn/electra_metric_sn/sst2/0.0/mahalanobis


In [55]:
table_res = pd.concat([pd.concat(tables), pd.concat(baselines[-3:])]).reset_index()

In [56]:
table_res

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,CoLA,CoLA,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp,rcc-auc,rpp,rcc-auc,rpp
0,mahalanobis|raw|sn,mahalanobis_distance,13.44±1.28,1.85±0.20,40.07±3.62,1.95±0.16,11.77±1.33,0.83±0.08
1,mahalanobis|reg|sn,mahalanobis_distance,14.41±1.96,1.94±0.21,37.82±2.91,1.90±0.12,12.32±1.37,0.85±0.10
2,mahalanobis|metric|sn,mahalanobis_distance,12.04±1.33,1.56±0.12,39.37±2.00,1.97±0.15,12.05±1.42,0.84±0.07
3,baseline|raw|sn,max_prob,18.83±3.89,2.46±0.46,81.25±12.56,3.40±0.33,19.02±6.07,1.21±0.35
4,baseline|reg|sn,max_prob,19.27±6.27,2.49±0.65,94.11±12.51,3.65±0.36,28.37±12.91,1.55±0.46
5,baseline|metric|sn,max_prob,38.75±15.49,3.61±0.81,117.90±24.50,4.20±0.57,18.01±5.86,1.17±0.29


## Deberta

In [60]:
import os 

metric_types=["rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw', 'reg', 'metric']

dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        run_dirs = []
        names = [f'ddpp_{method}|{reg}']
        for name in dataset_fnames:
            model_series_dir = f'../../workdir/run_tasks_for_model_series/deberta_{reg}_no_sn/{name}/0.0/{method}'
            run_dirs.append([model_series_dir])
        try:
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
        except:
            print(f'Not exists one of this dirs: {run_dirs}')

In [61]:
table_res = pd.concat([pd.concat(tables), pd.concat(baselines[-3:])]).reset_index()

In [62]:
table_res

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,CoLA,CoLA,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp,rcc-auc,rpp,rcc-auc,rpp
0,ddpp_ddpp_dpp|raw,bald,18.05±2.65,2.37±0.29,70.87±7.90,3.47±0.29,18.65±2.68,1.21±0.15
1,ddpp_ddpp_dpp|raw,sampled_max_prob,18.13±3.27,2.30±0.34,68.12±6.34,3.29±0.23,17.74±4.17,1.17±0.24
2,ddpp_ddpp_dpp|raw,variance,18.12±2.53,2.36±0.27,69.81±7.82,3.40±0.29,18.41±3.57,1.20±0.19
3,ddpp_ddpp_dpp|reg,bald,15.03±2.43,1.92±0.22,75.21±6.18,3.47±0.34,18.17±6.79,1.13±0.31
4,ddpp_ddpp_dpp|reg,sampled_max_prob,16.69±5.35,1.99±0.45,72.15±7.10,3.29±0.34,16.57±6.35,1.08±0.31
5,ddpp_ddpp_dpp|reg,variance,14.80±2.56,1.88±0.22,73.34±8.08,3.39±0.39,17.61±7.41,1.10±0.32
6,ddpp_ddpp_dpp|metric,bald,22.01±5.14,2.47±0.38,94.80±26.55,4.36±0.82,19.11±3.55,1.26±0.26
7,ddpp_ddpp_dpp|metric,sampled_max_prob,20.98±5.38,2.35±0.42,86.19±16.29,3.90±0.51,17.15±2.96,1.14±0.23
8,ddpp_ddpp_dpp|metric,variance,21.89±5.24,2.45±0.39,90.05±20.81,4.11±0.63,18.50±3.46,1.22±0.26
9,ddpp_ddpp_ood|raw,bald,18.67±4.39,2.37±0.48,69.38±8.14,3.34±0.32,18.67±3.16,1.22±0.22


In [3]:
import os 
from sklearn.metrics import accuracy_score

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    results["mahalanobis_distance"] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='decomposing_md':
        disc_md = lambda x: np.squeeze(x[0], axis=-1)
        nondisc_md = lambda x: np.squeeze(x[1], axis=-1)
        sum_md = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {"disc_md": disc_md, 
                       'nondisc_md': nondisc_md,
                       'disc+nondisc_md': sum_md}
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        rel_maha_dist = lambda x: np.squeeze(x[:, 1], axis=-1)
        marg_maha_dist = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist,
                       #"relative_mahalanobis_distance": rel_maha_dist,
                       #"marginal_mahalanobis_distance": marg_maha_dist
                      }
    elif method=='ddu' or method=='ddu_maha':
        ddu = lambda x: -np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"ddu": ddu}
    elif method=='sngp':
        ddu = lambda x: np.squeeze(x[:, 0], axis=-1)
        agg_methods = {"stds": ddu}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

    
metric_types=["rcc-auc", 'rpp']
methods = ['sngp']
regs = ['raw']
spectralnorm = ['sngp']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
                run_dirs = []
                name_sn = ''
                names = [f'{method}|{reg}|{sn}']
                for name in dataset_fnames:
                    model_series_dir = f'../../workdir/run_tasks_for_model_series/electra_{reg}_{sn}/{name}/{method}'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                agg_func = choose_agg_func(method)
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print('pass')
                    pass

../../workdir/run_tasks_for_model_series/electra_raw_sngp/mrpc/sngp
../../workdir/run_tasks_for_model_series/electra_raw_sngp/cola/sngp
../../workdir/run_tasks_for_model_series/electra_raw_sngp/sst2/sngp


In [4]:
table_res = pd.concat([pd.concat(tables), pd.concat(baselines[-3:])]).reset_index()

In [5]:
table_res

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,CoLA,CoLA,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp,rcc-auc,rpp,rcc-auc,rpp
0,sngp|raw|sngp,stds,55.84±50.04,5.40±4.08,48.31±2.53,2.38±0.10,12.84±3.33,0.88±0.19
1,baseline|raw|sngp,max_prob,50.34±44.61,4.99±3.47,67.57±5.93,3.26±0.15,31.30±5.45,1.90±0.29


## Results with HS-RAU

In [243]:
import os 
from sklearn.metrics import accuracy_score
from ue4nlp.alpaca_calibrator import compute_ece, compute_sce

def from_model_outputs_calc_acc(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    for method in methods:
        results[method] = accuracy_score(labels, predictions)
    results["max_prob"] = accuracy_score(labels, predictions)
    return results

def from_model_outputs_calc_ece(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    for method in methods:
        results[method] = compute_ece(20, np.asarray(model_outputs["probabilities"]), labels, len(labels)).numpy()[0]
    results["max_prob"] = compute_ece(20, np.asarray(model_outputs["probabilities"]), labels, len(labels)).numpy()[0]
    return results

def from_model_outputs_calc_sce(model_outputs, methods=None):
    predictions = np.argmax(np.asarray(model_outputs["probabilities"]), axis=-1)
    labels = np.asarray(model_outputs["true_labels"])
    results = {}
    for method in methods:
        results[method] = compute_sce(20, np.asarray(model_outputs["probabilities"]), labels).numpy()[0]*10
    results["max_prob"] = compute_sce(20, np.asarray(model_outputs["probabilities"]), labels).numpy()[0]*10
    return results

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"
    elif metric_type  == "accuracy":
        return from_model_outputs_calc_acc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    elif metric_type == "ece":
        return from_model_outputs_calc_ece
    elif metric_type == "sce":
        return from_model_outputs_calc_sce
    else:
        raise ValueError("Wrong metric type!")
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        #rel_maha_dist = lambda x: np.squeeze(x[:, 1], axis=-1)
        #marg_maha_dist = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist,
                       #"relative_mahalanobis_distance": rel_maha_dist,
                       #"marginal_mahalanobis_distance": marg_maha_dist
                      }
    return agg_methods

    
metric_types=["rcc-auc", 'rpp', 'ece', 'sce', 'accuracy']
methods = ['mahalanobis', 'mc']
regs = ['hs_rau']
spectralnorm = ['no_sn']
dataset_names = ['MRPC', 'CoLA', 'SST-2', 'SST-5', 'Amazon']
dataset_fnames = ['mrpc', 'cola', 'sst2', 'sst5', 'amazon']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
                run_dirs = []
                name_sn = ''
                names = [f'{method}|{reg}|{sn}']
                for name in dataset_fnames:
                    model_series_dir = f'../../../uncertainty-estimation_cp/workdir/run_tasks_for_model_series/electra_{reg}_{sn}/{name}/0.0/{method}'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                agg_func = choose_agg_func(method)
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print('pass')
                    pass

../../../uncertainty-estimation_cp/workdir/run_tasks_for_model_series/electra_hs_rau_no_sn/mrpc/0.0/mahalanobis
../../../uncertainty-estimation_cp/workdir/run_tasks_for_model_series/electra_hs_rau_no_sn/cola/0.0/mahalanobis
../../../uncertainty-estimation_cp/workdir/run_tasks_for_model_series/electra_hs_rau_no_sn/sst2/0.0/mahalanobis
../../../uncertainty-estimation_cp/workdir/run_tasks_for_model_series/electra_hs_rau_no_sn/sst5/0.0/mahalanobis
../../../uncertainty-estimation_cp/workdir/run_tasks_for_model_series/electra_hs_rau_no_sn/amazon/0.0/mahalanobis
../../../uncertainty-estimation_cp/workdir/run_tasks_for_model_series/electra_hs_rau_no_sn/mrpc/0.0/mc
../../../uncertainty-estimation_cp/workdir/run_tasks_for_model_series/electra_hs_rau_no_sn/cola/0.0/mc
../../../uncertainty-estimation_cp/workdir/run_tasks_for_model_series/electra_hs_rau_no_sn/sst2/0.0/mc
../../../uncertainty-estimation_cp/workdir/run_tasks_for_model_series/electra_hs_rau_no_sn/sst5/0.0/mc
../../../uncertainty-estim

In [244]:
table_res = pd.concat([pd.concat([t[:-1] for t in tables]), pd.concat(baselines[-1:])]).reset_index()

In [245]:
table_res

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,...,SST-5,SST-5,SST-5,SST-5,SST-5,Amazon,Amazon,Amazon,Amazon,Amazon
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp,ece,sce,accuracy,rcc-auc,rpp,ece,...,rcc-auc,rpp,ece,sce,accuracy,rcc-auc,rpp,ece,sce,accuracy
0,mahalanobis|hs_rau|no_sn,mahalanobis_distance,13.58±1.03,1.91±0.17,11.06±0.74,55.97±3.49,88.60±0.67,41.17±1.19,2.03±0.03,12.53±0.54,...,474.62±14.04,11.51±0.19,14.43±1.36,36.62±1.30,54.74±0.88,4753.86±237.67,6.12±0.28,13.98±0.41,29.03±0.88,73.32±0.19
1,mc|hs_rau|no_sn,bald,11.07±2.47,1.38±0.25,11.06±0.74,55.97±3.49,88.60±0.67,45.94±2.35,2.16±0.09,12.53±0.54,...,423.89±12.97,10.19±0.27,14.43±1.36,36.62±1.30,54.74±0.88,3623.06±76.74,4.59±0.08,13.98±0.41,29.03±0.88,73.32±0.19
2,mc|hs_rau|no_sn,sampled_max_prob,10.52±1.44,1.35±0.21,11.06±0.74,55.97±3.49,88.60±0.67,44.09±1.82,2.04±0.08,12.53±0.54,...,405.04±15.21,9.59±0.29,14.43±1.36,36.62±1.30,54.74±0.88,3488.64±57.27,4.36±0.07,13.98±0.41,29.03±0.88,73.32±0.19
3,mc|hs_rau|no_sn,variance,10.80±2.04,1.37±0.23,11.06±0.74,55.97±3.49,88.60±0.67,45.32±2.05,2.12±0.09,12.53±0.54,...,410.81±15.88,9.87±0.30,14.43±1.36,36.62±1.30,54.74±0.88,3589.49±65.19,4.58±0.07,13.98±0.41,29.03±0.88,73.32±0.19
4,baseline|hs_rau|no_sn,max_prob,13.18±1.64,1.65±0.21,0,0,0,53.37±6.13,2.30±0.15,0,...,417.35±9.86,10.00±0.29,0,0,0,3525.96±60.09,4.44±0.06,0,0,0


In [246]:
def preproc_regs(x):
    reg = x.split('|')[1]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    elif reg == 'hs_rau':
        return 'HS-RAU'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'mc_mahalanobis' and not 'no_sn' in sn:
        return 'SMD SN (ours)'
    elif method == 'mc_mahalanobis':
        return 'SMD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    
    elif method == 'decomposing_md' and not 'no_sn' in sn:
        return 'Decomposing SN'
    elif method == 'decomposing_md':
        return 'Decomposing'
    
    elif method == 'nuq_best1' and not 'no_sn' in sn:
        return 'Best1 NUQ SN'
    elif method == 'nuq_best1':
        return 'Best1 NUQ'
    
    elif method == 'ddu' and not 'no_sn' in sn:
        return 'DDU SN'
    elif method == 'ddu':
        return 'DDU'
    
    elif method == 'ddu' and not 'no_sn' in sn:
        return 'DDU SN'
    elif method == 'ddu':
        return 'DDU'
    
    elif method == 'ddu_maha' and not 'no_sn' in sn:
        return 'DDU Maha SN'
    elif method == 'ddu_maha':
        return 'DDU Maha'
    
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'Deep Ensemble'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    elif method == 'disc_md':
        return 'Disc MD'
    elif method == 'nondisc_md':
        return 'Nondisc MD'
    elif method == 'disc+nondisc_md':
        return 'Disc+Nondisc MD'
    return 'MP'

table_res['Reg. Type'] = table_res.Method.apply(lambda x: preproc_regs(x))
table_res['Method'] = table_res.Method.apply(lambda x: preproc_method(x))
table_res['UE Score'] = table_res['UE Score'].apply(lambda x: preproc_ue(x))
table_res = table_res[list(table_res.columns[:1]) + list(table_res.columns[-1:]) + list(table_res.columns[1:-1])].reset_index(drop=True)

In [248]:
table_res5 = table_res[list(table_res.columns[:3])+list(table_res.columns[-10:])]

In [259]:
table_old = pd.DataFrame({('Method', ''): ['SR', 'SR'],
                          ('Reg. Type', ''): ['CER', 'metric'],
                          ('UE Score', ''): ['MP', 'MP'],
                          ('SST-5', 'rcc-auc'): ['-', '438.14±2.20'],
                          ('SST-5', 'rpp'): ['-', '10.60±0.23'],
                          ('SST-5', 'ece'): ['-', '-'],
                          ('SST-5', 'sce'): ['-', '-'],
                          ('SST-5', 'accuracy'): ['-', '-'],
                          ('Amazon', 'rcc-auc'): ['3651.25±45.92', '-'],
                          ('Amazon', 'rpp'): ['4.64±0.06', '-'],
                          ('Amazon', 'ece'): ['-', '-'],
                          ('Amazon', 'sce'): ['-', '-'],
                          ('Amazon', 'accuracy'): ['-', '-']
                         }, columns=table_res5.columns)

In [263]:
pd.concat([table_res5, table_old]).reset_index(drop=True)

Unnamed: 0_level_0,Method,Reg. Type,UE Score,SST-5,SST-5,SST-5,SST-5,SST-5,Amazon,Amazon,Amazon,Amazon,Amazon
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rcc-auc,rpp,ece,sce,accuracy,rcc-auc,rpp,ece,sce,accuracy
0,MD,HS-RAU,MD,474.62±14.04,11.51±0.19,14.43±1.36,36.62±1.30,54.74±0.88,4753.86±237.67,6.12±0.28,13.98±0.41,29.03±0.88,73.32±0.19
1,MC dropout,HS-RAU,BALD,423.89±12.97,10.19±0.27,14.43±1.36,36.62±1.30,54.74±0.88,3623.06±76.74,4.59±0.08,13.98±0.41,29.03±0.88,73.32±0.19
2,MC dropout,HS-RAU,SMP,405.04±15.21,9.59±0.29,14.43±1.36,36.62±1.30,54.74±0.88,3488.64±57.27,4.36±0.07,13.98±0.41,29.03±0.88,73.32±0.19
3,MC dropout,HS-RAU,PV,410.81±15.88,9.87±0.30,14.43±1.36,36.62±1.30,54.74±0.88,3589.49±65.19,4.58±0.07,13.98±0.41,29.03±0.88,73.32±0.19
4,SR,HS-RAU,MP,417.35±9.86,10.00±0.29,0,0,0,3525.96±60.09,4.44±0.06,0,0,0
5,SR,CER,MP,-,-,-,-,-,3651.25±45.92,4.64±0.06,-,-,-
6,SR,metric,MP,438.14±2.20,10.60±0.23,-,-,-,-,-,-,-,-


In [226]:
table_res = table_res[table_res.columns[:-10]]

In [227]:
import os 
from sklearn.metrics import accuracy_score
from ue4nlp.alpaca_calibrator import compute_ece
    
def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        #rel_maha_dist = lambda x: np.squeeze(x[:, 1], axis=-1)
        #marg_maha_dist = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist,
                       #"relative_mahalanobis_distance": rel_maha_dist,
                       #"marginal_mahalanobis_distance": marg_maha_dist
                      }
    return agg_methods

    
metric_types=["rcc-auc", 'rpp', 'ece', 'sce', 'accuracy']
methods = ['mahalanobis', 'mc']
regs = ['raw']
spectralnorm = ['no_sn']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
                run_dirs = []
                name_sn = ''
                names = [f'{method}|{reg}|{sn}']
                for name in dataset_fnames:
                    model_series_dir = f'../../workdir/run_tasks_for_model_series/electra_{reg}_{sn}/{name}/noise_perc_0.0/{method}'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                agg_func = choose_agg_func(method)
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print('pass')
                    pass

../../workdir/run_tasks_for_model_series/electra_raw_no_sn/mrpc/noise_perc_0.0/mahalanobis
../../workdir/run_tasks_for_model_series/electra_raw_no_sn/cola/noise_perc_0.0/mahalanobis
../../workdir/run_tasks_for_model_series/electra_raw_no_sn/sst2/noise_perc_0.0/mahalanobis
../../workdir/run_tasks_for_model_series/electra_raw_no_sn/mrpc/noise_perc_0.0/mc
../../workdir/run_tasks_for_model_series/electra_raw_no_sn/cola/noise_perc_0.0/mc
../../workdir/run_tasks_for_model_series/electra_raw_no_sn/sst2/noise_perc_0.0/mc


In [228]:
table_res1 = pd.concat([pd.concat([t[:-1] for t in tables]), pd.concat(baselines[-1:])]).reset_index()

In [229]:
table_res1['Reg. Type'] = table_res1.Method.apply(lambda x: preproc_regs(x))
table_res1['Method'] = table_res1.Method.apply(lambda x: preproc_method(x))
table_res1['UE Score'] = table_res1['UE Score'].apply(lambda x: preproc_ue(x))
table_res1 = table_res1[list(table_res1.columns[:1]) + list(table_res1.columns[-1:]) + list(table_res1.columns[1:-1])].reset_index(drop=True)

In [230]:
table_res1[('MRPC', 'rcc-auc')][0] = '13.69±1.25'
table_res1[('MRPC', 'rpp')][0] = '1.88±0.13'

table_res1[('CoLA', 'rcc-auc')][0] = '41.73±1.45'
table_res1[('CoLA', 'rpp')][0] = '1.96±0.04'

table_res1[('SST-2', 'rcc-auc')][0] = '13.08±2.58'
table_res1[('SST-2', 'rpp')][0] = '0.86±0.15'

In [240]:
def bold_max(table):
    attr = 'font-weight: bold'
    data = table[table.columns[3:]].apply(lambda x: x.str.split('±').str[0].astype(float))
    is_max = data == data.min()
    is_max[('MRPC', 'accuracy')] = False
    is_max[('CoLA', 'accuracy')] = False
    is_max[('SST-2', 'accuracy')] = False
    info_col = table[table.columns[:3]]
    for col in info_col.columns:
        info_col[col] = ''
    vals = pd.DataFrame(np.where(is_max, attr, ''),
                        index=data.index, columns=data.columns)
    return pd.concat([info_col, vals], axis=1)

def highlight_nmax(s):
    if s.name == ('MRPC', 'accuracy'):
        return ['']*len(s)
    if s.name == ('SST-2', 'accuracy'):
        return ['']*len(s)
    if s.name == ('CoLA', 'accuracy'):
        return ['']*len(s)
    try:
        s_vals = s.str.split('±').str[0].astype(float)
        is_large = s_vals.nsmallest(3).values
        return ['background-color: yellow' if v in is_large else '' for v in s_vals]
    except:
        return ['']*len(s)

In [242]:
res = pd.concat([table_res, table_res1]).reset_index(drop=True)
res.style.apply(highlight_nmax).apply(bold_max, axis=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rcc-auc,rpp,ece,sce,accuracy,rcc-auc,rpp,ece,sce,accuracy,rcc-auc,rpp,ece,sce,accuracy
0,MD,HS-RAU,MD,13.58±1.03,1.91±0.17,11.06±0.74,55.97±3.49,88.60±0.67,41.17±1.19,2.03±0.03,12.53±0.54,63.40±2.66,86.07±0.50,14.33±2.37,0.97±0.14,7.18±0.39,36.43±1.82,92.56±0.49
1,MC dropout,HS-RAU,BALD,11.07±2.47,1.38±0.25,11.06±0.74,55.97±3.49,88.60±0.67,45.94±2.35,2.16±0.09,12.53±0.54,63.40±2.66,86.07±0.50,11.57±1.40,0.75±0.12,7.18±0.39,36.43±1.82,92.56±0.49
2,MC dropout,HS-RAU,SMP,10.52±1.44,1.35±0.21,11.06±0.74,55.97±3.49,88.60±0.67,44.09±1.82,2.04±0.08,12.53±0.54,63.40±2.66,86.07±0.50,11.52±1.57,0.76±0.12,7.18±0.39,36.43±1.82,92.56±0.49
3,MC dropout,HS-RAU,PV,10.80±2.04,1.37±0.23,11.06±0.74,55.97±3.49,88.60±0.67,45.32±2.05,2.12±0.09,12.53±0.54,63.40±2.66,86.07±0.50,11.36±1.48,0.74±0.12,7.18±0.39,36.43±1.82,92.56±0.49
4,SR,HS-RAU,MP,13.18±1.64,1.65±0.21,0,0,0,53.37±6.13,2.30±0.15,0,0,0,13.96±3.11,0.92±0.20,0,0,0
5,MD,-,MD,13.69±1.25,1.88±0.13,11.22±0.82,56.66±4.22,88.28±0.79,41.73±1.45,1.96±0.04,13.11±0.54,66.12±2.62,86.55±0.63,13.08±2.58,0.86±0.15,6.77±0.56,34.42±2.86,92.58±0.72
6,MC dropout,-,BALD,14.21±1.00,1.69±0.09,11.22±0.82,56.66±4.22,88.28±0.79,45.06±4.67,2.08±0.16,13.11±0.54,66.12±2.62,86.55±0.63,12.98±1.78,0.82±0.10,6.77±0.56,34.42±2.86,92.58±0.72
7,MC dropout,-,SMP,14.38±1.98,1.76±0.18,11.22±0.82,56.66±4.22,88.28±0.79,42.95±5.70,2.01±0.15,13.11±0.54,66.12±2.62,86.55±0.63,14.00±2.10,0.91±0.14,6.77±0.56,34.42±2.86,92.58±0.72
8,MC dropout,-,PV,13.97±1.11,1.68±0.09,11.22±0.82,56.66±4.22,88.28±0.79,44.35±4.67,2.06±0.15,13.11±0.54,66.12±2.62,86.55±0.63,12.90±1.83,0.82±0.11,6.77±0.56,34.42±2.86,92.58±0.72
9,SR,-,MP,22.32±7.70,2.58±0.62,0,0,0,49.48±3.54,2.35±0.24,0,0,0,17.93±3.66,1.22±0.27,0,0,0


In [190]:
import os 

metric_types=["rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']

dataset_names = ['MRPC']#, 'CoLA', 'SST-2']
dataset_fnames = ['mrpc']#, 'cola', 'sst2']
names = []
tables = []
baselines = []
type_codes = ['new']#, 'old']
for method in methods:
    for reg in regs:
        for type_code in type_codes:
            run_dirs = []
            names = [f'ddpp_{method}|{type_code}']
            for name in dataset_fnames:
                model_series_dir = f'../../../refactoring/uncertainty-estimation/workdir/run_tasks_for_model_series/electra_{reg}_no_sn_{type_code}/{name}/0.0/{method}'
                run_dirs.append([model_series_dir])
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print(f'Not exists one of this dirs: {run_dirs}')

In [191]:
table_res2 = pd.concat([pd.concat([t[:-1] for t in tables]), pd.concat(baselines[-2:])]).reset_index()

In [192]:
table_res2

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp
0,ddpp_ddpp_dpp|new,bald,22.88±6.62,2.61±0.59
1,ddpp_ddpp_dpp|new,sampled_max_prob,21.71±7.39,2.56±0.65
2,ddpp_ddpp_dpp|new,variance,22.47±7.11,2.57±0.62
3,ddpp_ddpp_ood|new,bald,24.12±8.49,2.68±0.56
4,ddpp_ddpp_ood|new,sampled_max_prob,22.22±7.08,2.59±0.61
5,ddpp_ddpp_ood|new,variance,22.53±6.79,2.64±0.57
6,baseline|new,max_prob,22.31±7.71,2.58±0.62
7,baseline|new,max_prob,22.31±7.71,2.58±0.62


In [124]:
import os 

def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        #rel_maha_dist = lambda x: np.squeeze(x[:, 1], axis=-1)
        #marg_maha_dist = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist,
                       #"relative_mahalanobis_distance": rel_maha_dist,
                       #"marginal_mahalanobis_distance": marg_maha_dist
                      }
    return agg_methods

metric_types=["rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw']

dataset_names = ['MRPC']#, 'CoLA', 'SST-2']
dataset_fnames = ['mrpc']#, 'cola', 'sst2']
names = []
tables = []
baselines = []
type_codes = ['new', 'old']
for method in methods:
    for reg in regs:
        for type_code in type_codes:
            run_dirs = []
            names = [f'{method}|{type_code}']
            agg_func = choose_agg_func(method)
            for name in dataset_fnames:
                model_series_dir = f'../../../refactoring/uncertainty-estimation/workdir/run_tasks_for_model_series/electra_{reg}_no_sn_{type_code}/{name}/0.0/{method}'
                run_dirs.append([model_series_dir])
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print(f'Not exists one of this dirs: {run_dirs}')

In [125]:
table_res3 = pd.concat([pd.concat([t[:-1] for t in tables]), pd.concat(baselines[-2:])]).reset_index()

In [126]:
table_res3

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp
0,mahalanobis|new,mahalanobis_distance,13.16±1.24,1.80±0.13
1,mahalanobis|old,mahalanobis_distance,13.16±1.24,1.80±0.13
2,baseline|new,max_prob,22.32±7.70,2.58±0.62
3,baseline|old,max_prob,22.32±7.70,2.58±0.62


In [144]:
import os 

def choose_agg_func(method):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq' or method=='nuq_best' or method=='nuq_best1':
        nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='hybrid':
        maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        #rel_maha_dist = lambda x: np.squeeze(x[:, 1], axis=-1)
        #marg_maha_dist = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {"mahalanobis_distance": maha_dist,
                       #"relative_mahalanobis_distance": rel_maha_dist,
                       #"marginal_mahalanobis_distance": marg_maha_dist
                      }
    return agg_methods

metric_types=["rcc-auc", 'rpp']
methods = ['nuq']
regs = ['raw']

dataset_names = ['MRPC']#, 'CoLA', 'SST-2']
dataset_fnames = ['mrpc']#, 'cola', 'sst2']
names = []
tables = []
baselines = []
type_codes = ['new', 'old']
for method in methods:
    for reg in regs:
        for type_code in type_codes:
            run_dirs = []
            names = [f'{method}|{type_code}']
            agg_func = choose_agg_func(method)
            for name in dataset_fnames:
                model_series_dir = f'../../../refactoring/uncertainty-estimation/workdir/run_tasks_for_model_series/electra_{reg}_no_sn_{type_code}/{name}/0.0/{method}'
                run_dirs.append([model_series_dir])
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, methods=agg_func)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print(f'Not exists one of this dirs: {run_dirs}')

In [145]:
table_res3 = pd.concat([pd.concat([t[:-1] for t in tables]), pd.concat(baselines[-2:])]).reset_index()

In [146]:
table_res3

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp
0,nuq|new,nuq_aleatoric,14.36±1.46,1.86±0.12
1,nuq|new,nuq_epistemic,16.65±1.38,2.36±0.17
2,nuq|new,nuq_total,16.41±1.26,2.32±0.15
3,nuq|old,nuq_aleatoric,14.37±1.46,1.86±0.12
4,nuq|old,nuq_epistemic,16.65±1.39,2.36±0.17
5,nuq|old,nuq_total,16.41±1.27,2.32±0.16
6,baseline|new,max_prob,22.32±7.70,2.58±0.62
7,baseline|old,max_prob,22.32±7.70,2.58±0.62
