In [5]:
# Like common file, but with another table structure

In [6]:
import yaml
import os
from yaml import Loader as Loader
from pathlib import Path
import pandas as pd
import numpy as np
import json
from sklearn.metrics import roc_auc_score

from analyze_results import (
    extract_result,
    aggregate_runs,
    from_model_outputs_calc_rcc_auc,
)
from analyze_results import (
    format_results2,
    improvement_over_baseline,
    from_model_outputs_calc_pr_auc,
    from_model_outputs_calc_rpp,
    from_model_outputs_calc_roc_auc,
    from_model_outputs_calc_arc_auc
)

from utils.utils_wandb import init_wandb, wandb
from ue4nlp.ue_scores import *


In [7]:
def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return from_model_outputs_calc_arc_auc
    if metric_type == "roc-auc":
        return from_model_outputs_calc_roc_auc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc

    elif metric_type == "pr-auc":
        return from_model_outputs_calc_pr_auc

    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp

    else:
        raise ValueError("Wrong metric type!")


def get_one_table(runs_dir, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], baseline=None, methods=None):
    default_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
        "var.ratio": var_ratio,
        #"sampled_entropy": mean_entropy,
    }
    if methods is None:
        methods = default_methods

    table = []
    for metric_type in metric_types:
        metric = choose_metric(metric_type=metric_type)

        agg_res = aggregate_runs(
            runs_dir, methods=methods, metric=metric
        )

        if agg_res.empty:
            print("Broken\n")
            continue

        if metric_type == "rcc-auc":
            final_score = format_results2(agg_res, percents=False)
        elif metric_type == "rpp":
            final_score = format_results2(agg_res, percents=True)
        else:
            final_score = improvement_over_baseline(agg_res, baseline_col="max_prob", baseline=baseline, metric=metric_type, percents=True, subtract=True)
        table.append(final_score)
    res_table = pd.concat(table, axis=1)
    res_table.columns = metric_types
    # fix for rcc-auc and rpp
    if 'baseline (max_prob)' not in res_table.index:
        res_table.loc['baseline (max_prob)'] = 0
    for metric in ['rcc-auc', 'rpp']:
        try:
            res_table[metric].loc['baseline (max_prob)'] = res_table[metric].loc['max_prob']
        except:
            pass
    try:
        res_table = res_table.drop(['max_prob', 'count'])
    except:
        res_table = res_table.drop(['max_prob'])
    return res_table


def collect_tables(run_dirs, names, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], baseline=None, methods=None):
    all_tables = []
    for run_dir, name in zip(run_dirs, names):
        buf_table = get_one_table(run_dir, metric_types, baseline, methods)
        #print(buf_table)
        # add name to index
        indices = [(name, ind) for ind in list(buf_table.index)]
        baseline_name = 'baseline|'+'|'.join(name.split('|')[-1:])
        buf_table.loc[baseline_name] = buf_table.loc['baseline (max_prob)']
        # add reindex
        indices = indices + [(baseline_name, 'max_prob')]
        
        index = pd.MultiIndex.from_tuples(indices, names=['Method', 'UE Score'])
        buf_table.index = index
        buf_table.drop((name, 'baseline (max_prob)'), inplace=True)
        # add buf_table to final_table
        all_tables.append(buf_table)
    return pd.concat(all_tables)


def collect_datasets(runs_dirs, names, dataset_names, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], baselines={}, methods=None):
    all_tables = []
    for run_dir, dataset_name in zip(runs_dirs, dataset_names):
        try:
            dataset_table = collect_tables(run_dir, names, metric_types, baselines.get(dataset_name, None), methods)
            columns = pd.MultiIndex.from_tuples([(dataset_name, ind) for ind in list(dataset_table.columns)])
            dataset_table.columns = columns
            all_tables.append(dataset_table)
        except:
            print(f'empty dir {run_dir}')
    return pd.concat(all_tables, axis=1)

In [10]:
import os 

default_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw']
dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
raw_baselines = {}
for ds_fname, ds_name in zip(dataset_fnames, dataset_names):
    model_series_dir = f'../workdir/run_glue_for_model_series/electra-raw/{ds_fname}/0.0/ddpp_dpp_0.3_20/'
    table = []
    for metric_type in metric_types:
        metric = choose_metric(metric_type=metric_type)

        agg_res = aggregate_runs(
            model_series_dir, methods=default_methods, metric=metric
        )

        mean_res = agg_res.mean(axis=0)
        final_results = mean_res.T
        table.append(final_results.loc[['max_prob']])
    res_table = pd.concat(table, axis=1)
    res_table.columns = metric_types
    raw_baselines[ds_name] = res_table#
    #raw_baselines[ds_name]={k:v for k,v in zip(res_table.columns.values.tolist(), res_table.values[0].tolist())}

FileNotFoundError: [Errno 2] No such file or directory: '../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/ddpp_dpp_0.3_20/'

# Determenistic methods

In [14]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis', 'mc_mahalanobis']#'nuq',
regs = ['raw','reg']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['True']:#['sn', 'no_sn']:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|last|{reg}_{sn}']
            for name in dataset_fnames:
                #model_series_dir = f'../workdir/final_res/run_glue_for_model_series/{name}_electra_{reg}_{sn}/{method}/'\
                model_series_dir = f'/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-{reg}-True/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_det_sn = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-raw-True/mrpc/0.0/mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-raw-True/cola/0.0/mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-raw-True/sst2/0.0/mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-reg-True/mrpc/0.0/mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-reg-True/cola/0.0/mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-reg-True/sst2/0.0/mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-raw-True/mrpc/0.0/mc_mahalanobis
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra-raw-True/cola/0.0/mc_mahalanobis
/mnt/users/avazhentsev/unc

In [15]:
table_det_sn = table_det_sn.iloc[[0,1,3,5,6]].reset_index()
table_det_sn.Method = ['MD SN (ours)']*2+['SMD SN (ours)']*2+['SR SN']
table_det_sn['Reg. Type'] = ['-', 'CER', '-', 'CER', '-']

In [16]:
table_det_sn = table_det_sn[list(table_det_sn.columns[[0,-1]]) + list(table_det_sn.columns[list(range(2,11))])]

In [17]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']#'nuq',
regs = ['reg', 'raw']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['sn', 'no_sn']:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|last|{reg}_{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/final_res_det/run_glue_for_model_series/{name}_electra_{reg}_{sn}/{method}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_det = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_reg_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_reg_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_electra_reg_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_reg_no_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_reg_no_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_electra_reg_no_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_raw_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_raw_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_electra_raw_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_raw_no_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_raw_no_sn/mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_elect

In [18]:
table_det

Unnamed: 0_level_0,Unnamed: 1_level_0,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
mahalanobis|last|reg_sn,mahalanobis_distance,0.79±0.23,12.30±1.44,1.63±0.18,1.10±0.28,52.66±3.11,2.34±0.11,-0.07±0.21,14.93±2.21,1.03±0.14
mahalanobis|last|reg_no_sn,mahalanobis_distance,0.60±0.34,11.42±1.33,1.58±0.17,0.53±0.32,43.39±3.64,2.04±0.20,0.24±0.25,12.90±3.55,0.87±0.23
mahalanobis|last|raw_sn,mahalanobis_distance,1.00±0.39,13.57±1.40,1.84±0.20,1.99±0.25,43.41±1.81,2.05±0.07,0.18±0.25,12.98±2.22,0.88±0.14
mahalanobis|last|raw_no_sn,mahalanobis_distance,0.92±0.82,13.21±1.68,1.75±0.23,0.67±0.13,41.63±1.44,1.96±0.06,0.34±0.44,13.01±2.88,0.89±0.19
baseline|raw_sn,max_prob,91.93±0.51,27.08±5.47,2.86±0.56,90.61±0.25,95.49±10.90,4.06±0.26,93.94±0.30,15.84±4.45,1.06±0.29
baseline|raw_no_sn,max_prob,92.08±0.71,23.28±8.35,2.68±0.68,92.04±0.14,59.04±8.17,2.63±0.17,93.80±0.41,18.07±6.11,1.23±0.41


In [19]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['sngp']
regs = ['raw']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        run_dirs = []
        names = [f'{method}|last|{reg}']
        for name in dataset_fnames:
            model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}-sngp/{name}/0.0/'
            print(model_series_dir)
            run_dirs.append([model_series_dir])
        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
        baselines.append(res_df.iloc[-1:])
        tables.append(res_df.iloc[:-1])
table_sngp = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra-raw-sngp/mrpc/0.0/
../workdir/run_glue_for_model_series/electra-raw-sngp/cola/0.0/
../workdir/run_glue_for_model_series/electra-raw-sngp/sst2/0.0/


In [20]:
table_sngp

Unnamed: 0_level_0,Unnamed: 1_level_0,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
sngp|last|raw,sngp,0.42±0.25,15.78±3.30,2.19±0.43,0.71±0.05,51.87±2.38,2.64±0.05,1.45±0.32,13.88±1.63,0.94±0.10
baseline|raw,max_prob,92.24±0.26,17.87±2.46,2.62±0.24,91.31±0.09,64.62±4.29,3.41±0.09,92.64±0.37,45.53±10.95,2.40±0.39


In [14]:
import os 

def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return "rejection-curve-auc"#from_model_outputs_calc_arc_auc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp
    else:
        raise ValueError("Wrong metric type!")

nuq_aleatoric = lambda x: np.squeeze(x[0], axis=-1)
nuq_epistemic = lambda x: np.squeeze(x[1], axis=-1)
nuq_total = lambda x: np.squeeze(x[2], axis=-1)
agg_methods = {
    "nuq_aleatoric": nuq_aleatoric,
    "nuq_epistemic": nuq_epistemic,
    "nuq_total": nuq_total,
}

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['nuq']#'',
regs = ['metric', 'reg', 'raw']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['sn', 'no_sn']:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|last|{reg}_{sn}']
            for name in dataset_fnames:
                model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra_{reg}_{sn}/{name}/0.0/{method}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_methods)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass
table_nuq = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

/home/user/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra_metric_sn/mrpc/0.0/nuq/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra_metric_sn/cola/0.0/nuq/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra_metric_sn/sst2/0.0/nuq/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra_metric_no_sn/mrpc/0.0/nuq/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra_metric_no_sn/cola/0.0/nuq/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra_metric_no_sn/sst2/0.0/nuq/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra_reg_sn/mrpc/0.0/nuq/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra_reg_sn/cola/0.0/nuq/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series_sn/electra_reg_sn/sst2/0.0/nuq/
Broken

Broken

Broken

empty dir ['/home/user/uncertainty-estimation/wor

In [15]:
table_nuq

Unnamed: 0_level_0,Unnamed: 1_level_0,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
nuq|last|metric_sn,nuq_aleatoric,2.23±2.67,76.96±156.87,2.86±2.77,1.02±1.13,224.63±359.23,4.24±2.50,1.80±1.65,155.40±342.23,1.62±1.82
nuq|last|metric_sn,nuq_epistemic,1.75±2.18,79.51±153.75,3.33±2.27,1.19±1.37,210.54±377.20,4.07±2.75,1.82±1.69,153.76±345.58,1.60±1.88
nuq|last|metric_sn,nuq_total,1.81±2.24,79.21±154.19,3.27±2.34,1.20±1.41,210.32±379.76,4.07±2.80,1.82±1.69,153.76±345.59,1.60±1.88
nuq|last|metric_no_sn,nuq_aleatoric,-1.78±2.50,40.63±25.82,4.99±2.49,0.65±0.10,47.71±4.32,2.30±0.12,0.25±0.31,12.73±2.20,0.88±0.12
nuq|last|metric_no_sn,nuq_epistemic,-0.21±0.78,22.47±3.94,3.43±0.63,0.41±0.15,50.15±3.71,2.53±0.20,-0.02±0.48,15.16±4.69,1.17±0.40
nuq|last|metric_no_sn,nuq_total,-1.78±2.62,39.78±26.21,4.90±2.52,0.55±0.15,48.30±3.23,2.43±0.18,0.25±0.29,12.42±1.98,0.87±0.12
nuq|last|raw_sn,nuq_aleatoric,0.92±1.77,84.98±146.58,3.56±1.82,1.07±1.39,224.54±361.27,4.05±2.47,1.45±1.43,155.46±343.06,1.76±2.08
nuq|last|raw_sn,nuq_epistemic,1.17±1.81,79.62±152.53,3.32±1.93,1.34±1.78,207.97±384.29,3.76±2.86,1.44±1.49,155.16±345.31,1.79±2.14
nuq|last|raw_sn,nuq_total,1.17±1.82,79.56±152.57,3.32±1.93,1.35±1.78,207.44±384.94,3.75±2.87,1.43±1.50,155.11±345.56,1.79±2.14
nuq|last|raw_no_sn,nuq_aleatoric,0.35±0.20,13.56±0.57,1.83±0.12,0.82±0.24,44.61±3.44,2.05±0.10,0.41±0.25,12.09±2.10,0.80±0.12


# MC-Mahalanobis

In [21]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mc_mahalanobis']#'nuq',
regs = ['reg', 'raw']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['sn', 'no_sn']:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|last|{reg}_{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/final_res_det/run_glue_for_model_series/{name}_electra_{reg}_{sn}/{method}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_mc_det = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_reg_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_reg_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_electra_reg_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_reg_no_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_reg_no_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_electra_reg_no_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_raw_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_raw_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/sst2_electra_raw_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/mrpc_electra_raw_no_sn/mc_mahalanobis/
../workdir/final_res_det/run_glue_for_model_series/cola_electra_raw_no_sn/mc_mahalanobis/
../workdir/final_res_det/run

In [22]:
table_mc_det

Unnamed: 0_level_0,Unnamed: 1_level_0,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
mc_mahalanobis|last|reg_sn,mahalanobis_distance,0.79±0.23,12.30±1.42,1.63±0.18,1.10±0.28,52.70±3.15,2.34±0.11,-0.11±0.23,15.52±2.42,1.07±0.15
mc_mahalanobis|last|reg_sn,sampled_mahalanobis_distance,0.97±0.32,11.74±2.37,1.43±0.24,0.32±0.27,68.14±4.92,3.15±0.10,-0.05±0.07,16.54±4.37,1.02±0.13
mc_mahalanobis|last|reg_no_sn,mahalanobis_distance,0.60±0.34,11.42±1.33,1.58±0.17,0.53±0.32,43.39±3.64,2.04±0.20,0.18±0.27,14.05±4.21,0.94±0.28
mc_mahalanobis|last|reg_no_sn,sampled_mahalanobis_distance,0.69±0.55,11.38±2.76,1.49±0.29,0.14±0.53,51.32±7.44,2.43±0.39,0.13±0.21,16.32±3.58,0.99±0.19
mc_mahalanobis|last|raw_sn,mahalanobis_distance,1.00±0.38,13.57±1.39,1.84±0.20,1.99±0.25,43.41±1.81,2.05±0.07,0.14±0.25,13.63±2.36,0.93±0.15
mc_mahalanobis|last|raw_sn,sampled_mahalanobis_distance,1.03±0.44,14.51±1.17,1.84±0.14,1.55±0.26,50.95±2.66,2.51±0.10,0.13±0.25,14.70±1.95,0.94±0.08
mc_mahalanobis|last|raw_no_sn,mahalanobis_distance,0.92±0.82,13.21±1.68,1.75±0.23,0.67±0.13,41.63±1.44,1.96±0.06,0.27±0.44,14.07±3.23,0.96±0.22
mc_mahalanobis|last|raw_no_sn,sampled_mahalanobis_distance,1.10±0.83,13.36±2.21,1.57±0.23,0.30±0.18,47.82±3.71,2.35±0.15,0.22±0.43,17.38±3.72,1.02±0.14
baseline|raw_sn,max_prob,91.93±0.51,27.08±5.47,2.86±0.56,90.61±0.25,95.49±10.90,4.06±0.26,93.94±0.30,15.84±4.45,1.06±0.29
baseline|raw_no_sn,max_prob,92.08±0.71,23.28±8.35,2.68±0.68,92.04±0.14,59.04±8.17,2.63±0.17,93.80±0.41,18.07±6.11,1.23±0.41


In [23]:
det_res = pd.concat([table_det.iloc[[3,1,2,0]], table_mc_det.iloc[[7,3,5,1]], table_sngp.iloc[:1], pd.concat(baselines).iloc[[2,0,1,3]]])

In [24]:
det_res = det_res.reset_index()
det_res.Method = ['MD']*2+['MD SN (ours)']*2+['SMD (ours)']*2+['SMD SN (ours)']*2+['SNGP']+['SR SN']*2+['SR']*2
det_res['Reg. Type'] = ['-', 'CER']*4+['-']+['-','CER','CER','-']

In [25]:
det_res = det_res[list(det_res.columns[[0,-1]]) + list(det_res.columns[list(range(2,11))])]

In [26]:
det_res#.iloc[:2]

Unnamed: 0_level_0,Method,Reg. Type,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MD,-,0.92±0.82,13.21±1.68,1.75±0.23,0.67±0.13,41.63±1.44,1.96±0.06,0.34±0.44,13.01±2.88,0.89±0.19
1,MD,CER,0.60±0.34,11.42±1.33,1.58±0.17,0.53±0.32,43.39±3.64,2.04±0.20,0.24±0.25,12.90±3.55,0.87±0.23
2,MD SN (ours),-,1.00±0.39,13.57±1.40,1.84±0.20,1.99±0.25,43.41±1.81,2.05±0.07,0.18±0.25,12.98±2.22,0.88±0.14
3,MD SN (ours),CER,0.79±0.23,12.30±1.44,1.63±0.18,1.10±0.28,52.66±3.11,2.34±0.11,-0.07±0.21,14.93±2.21,1.03±0.14
4,SMD (ours),-,1.10±0.83,13.36±2.21,1.57±0.23,0.30±0.18,47.82±3.71,2.35±0.15,0.22±0.43,17.38±3.72,1.02±0.14
5,SMD (ours),CER,0.69±0.55,11.38±2.76,1.49±0.29,0.14±0.53,51.32±7.44,2.43±0.39,0.13±0.21,16.32±3.58,0.99±0.19
6,SMD SN (ours),-,1.03±0.44,14.51±1.17,1.84±0.14,1.55±0.26,50.95±2.66,2.51±0.10,0.13±0.25,14.70±1.95,0.94±0.08
7,SMD SN (ours),CER,0.97±0.32,11.74±2.37,1.43±0.24,0.32±0.27,68.14±4.92,3.15±0.10,-0.05±0.07,16.54±4.37,1.02±0.13
8,SNGP,-,0.42±0.25,15.78±3.30,2.19±0.43,0.71±0.05,51.87±2.38,2.64±0.05,1.45±0.32,13.88±1.63,0.94±0.10
9,SR SN,-,91.93±0.51,27.08±5.47,2.86±0.56,90.61±0.25,95.49±10.90,4.06±0.26,93.94±0.30,15.84±4.45,1.06±0.29


In [27]:
tab_res_det = pd.concat([det_res.iloc[[0,1]], table_det_sn.iloc[[0,1]], det_res.iloc[[4,5,6,7,8]], table_det_sn.iloc[[-1]], det_res.iloc[-3:]])

In [28]:
tab_res_det

Unnamed: 0_level_0,Method,Reg. Type,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MD,-,0.92±0.82,13.21±1.68,1.75±0.23,0.67±0.13,41.63±1.44,1.96±0.06,0.34±0.44,13.01±2.88,0.89±0.19
1,MD,CER,0.60±0.34,11.42±1.33,1.58±0.17,0.53±0.32,43.39±3.64,2.04±0.20,0.24±0.25,12.90±3.55,0.87±0.23
0,MD SN (ours),-,0.35±0.33,13.61±2.07,1.80±0.18,1.54±0.32,40.42±2.30,1.96±0.12,0.29±0.28,12.16±1.93,0.83±0.11
1,MD SN (ours),CER,0.57±0.72,14.57±1.49,1.93±0.07,1.41±0.30,39.51±2.61,1.87±0.07,0.30±0.32,10.89±1.25,0.75±0.06
4,SMD (ours),-,1.10±0.83,13.36±2.21,1.57±0.23,0.30±0.18,47.82±3.71,2.35±0.15,0.22±0.43,17.38±3.72,1.02±0.14
5,SMD (ours),CER,0.69±0.55,11.38±2.76,1.49±0.29,0.14±0.53,51.32±7.44,2.43±0.39,0.13±0.21,16.32±3.58,0.99±0.19
6,SMD SN (ours),-,1.03±0.44,14.51±1.17,1.84±0.14,1.55±0.26,50.95±2.66,2.51±0.10,0.13±0.25,14.70±1.95,0.94±0.08
7,SMD SN (ours),CER,0.97±0.32,11.74±2.37,1.43±0.24,0.32±0.27,68.14±4.92,3.15±0.10,-0.05±0.07,16.54±4.37,1.02±0.13
8,SNGP,-,0.42±0.25,15.78±3.30,2.19±0.43,0.71±0.05,51.87±2.38,2.64±0.05,1.45±0.32,13.88±1.63,0.94±0.10
4,SR SN,-,92.60±0.38,18.72±6.81,2.17±0.40,91.19±0.32,79.92±11.25,3.53±0.33,93.90±0.27,17.83±4.26,1.11±0.28


In [162]:
print(str(tab_res_det.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{lllllllllll}
\toprule
       Method & Reg. Type & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST-2} \\
              & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
           MD &         - &           0.92$\pm$0.23 &  13.21$\pm$1.68 &  1.75$\pm$0.23 &           0.67$\pm$0.13 &   41.63$\pm$1.44 &  1.96$\pm$0.06 &           0.34$\pm$0.44 &  13.01$\pm$2.88 &  0.89$\pm$0.19 \\
           MD &       CER &           1.15$\pm$0.21 &  11.42$\pm$1.33 &  1.58$\pm$0.17 &           0.53$\pm$0.32 &   43.39$\pm$3.64 &  2.04$\pm$0.20 &           0.24$\pm$0.25 &  12.90$\pm$3.55 &  0.87$\pm$0.23 \\
 MD SN (ours) &         - &           0.86$\pm$0.28 &  13.61$\pm$2.07 &  1.80$\pm$0.18 &           1.54$\pm$0.32 &   40.42$\pm$2.30 &  1.96$\pm$0.12 &           0.29$\pm$0.28 &  12.16$\pm$1.93 &  0.83$\pm$0.11 \\
 MD SN (ours) &       CER &           

# MC-Dropout 

In [33]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mc_all']#, 'mc_last']
regs = ['raw', 'reg']
dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for reg in regs:
    run_dirs = []
    #layer = method.split('_')[-1]
    names = [f'mc|{reg}']
    print(names)
    for name in dataset_fnames:
        model_series_dir = f'../workdir/final_res/run_mc_all/{name}_electra_{reg}_no_sn/mc_all/'
        print(model_series_dir)
        run_dirs.append([model_series_dir])
    try:
        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
        baselines.append(res_df.iloc[-1:])
        tables.append(res_df.iloc[:-1])
    except:
        print('skip')
        pass
table_mc = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])
table_mc = table_mc.reset_index()

['mc|raw']
../workdir/final_res/run_mc_all/mrpc_electra_raw_no_sn/mc_all/
../workdir/final_res/run_mc_all/cola_electra_raw_no_sn/mc_all/
../workdir/final_res/run_mc_all/sst2_electra_raw_no_sn/mc_all/
['mc|reg']
../workdir/final_res/run_mc_all/mrpc_electra_reg_no_sn/mc_all/
../workdir/final_res/run_mc_all/cola_electra_reg_no_sn/mc_all/
../workdir/final_res/run_mc_all/sst2_electra_reg_no_sn/mc_all/


In [34]:
def preproc_regs(x):
    regs = x.split('|')    
    return '-' if (regs[-1]=='raw' or regs[-1] == '') else 'CER'
        
table_mc['Reg. Type'] = table_mc.Method.apply(lambda x: preproc_regs(x))
table_mc['Dropout Layers'] = table_mc['Method'].apply(lambda x: x.split('|')[1] if 'baseline' not in x else '-')
table_mc['Method'] = table_mc['Method'].apply(lambda x: x.split('|')[0].upper() if 'baseline' not in x else x.split('|')[0])
table_mc = table_mc[list(table_mc.columns[:1]) + list(table_mc.columns[-2:]) + list(table_mc.columns[1:-2])]

In [36]:
table_mc['UE Score'] = ['BALD', 'SMP', 'PV', 'VR']*2+['MP']*2
table_mc['Reg. Type'] = list(table_mc['Reg. Type'].iloc[:-2].values) + ['-', 'CER']

In [37]:
table_mc[list(table_mc.columns[:2]) + list(table_mc.columns[3:])]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MC,-,BALD,1.00±0.13,14.99±1.47,1.66±0.17,0.45±0.13,48.13±3.96,2.21±0.13,0.37±0.32,13.59±3.84,0.86±0.14
1,MC,-,SMP,1.02±0.14,15.00±3.15,1.64±0.21,0.54±0.10,46.58±3.67,2.12±0.09,0.35±0.29,13.12±3.27,0.88±0.17
2,MC,-,PV,1.01±0.13,14.66±1.56,1.65±0.17,0.47±0.12,47.15±3.59,2.18±0.11,0.36±0.29,13.47±3.94,0.86±0.15
3,MC,-,VR,0.07±0.44,24.75±5.93,2.94±0.44,-0.42±0.15,69.07±6.33,2.96±0.13,-0.30±0.45,27.87±4.82,1.66±0.27
4,MC,CER,BALD,1.17±0.29,12.47±2.64,1.59±0.26,0.34±0.24,49.94±6.41,2.27±0.21,0.28±0.19,14.00±2.64,0.84±0.15
5,MC,CER,SMP,1.04±0.32,13.25±3.21,1.70±0.32,0.46±0.25,46.02±5.32,2.17±0.24,0.25±0.12,13.79±3.08,0.88±0.19
6,MC,CER,PV,1.12±0.27,12.62±2.51,1.61±0.26,0.37±0.23,48.39±5.93,2.23±0.21,0.27±0.17,14.13±2.71,0.85±0.16
7,MC,CER,VR,-0.18±0.22,25.39±2.42,3.27±0.23,-0.41±0.26,78.09±12.21,3.27±0.24,-0.50±0.46,29.23±5.85,1.65±0.32
8,baseline,-,MP,92.08±0.71,23.28±8.35,2.68±0.68,92.04±0.14,59.04±8.17,2.63±0.17,93.80±0.41,18.07±6.11,1.23±0.41
9,baseline,CER,MP,92.63±0.41,17.17±4.12,2.21±0.41,92.09±0.45,54.04±10.18,2.57±0.46,93.90±0.24,16.68±2.92,1.11±0.24


In [15]:
print(str(table_mc[list(table_mc.columns[:2]) + list(table_mc.columns[3:])].iloc[:8].to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
Method & Reg. Type & UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST-2} \\
       & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
    MC &         - &     BALD &           1.00$\pm$0.13 &  14.99$\pm$1.47 &  1.66$\pm$0.17 &           0.45$\pm$0.13 &   48.13$\pm$3.96 &  2.21$\pm$0.13 &           0.37$\pm$0.32 &  13.59$\pm$3.84 &  0.86$\pm$0.14 \\
    MC &         - &      SMP &           1.02$\pm$0.14 &  15.00$\pm$3.15 &  1.64$\pm$0.21 &           0.54$\pm$0.10 &   46.58$\pm$3.67 &  2.12$\pm$0.09 &           0.35$\pm$0.29 &  13.12$\pm$3.27 &  0.88$\pm$0.17 \\
    MC &         - &       PV &           1.01$\pm$0.13 &  14.66$\pm$1.56 &  1.65$\pm$0.17 &           0.47$\pm$0.12 &   47.15$\pm$3.59 &  2.18$\pm$0.11 &           0.36$\pm$0.29 &  13.47$\pm$3.94 &  0.86$\pm$0.15 \\
    MC &         - &       V

# MC-DPP all

In [25]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.5, 0.6]
comsizes = [20, 50]

dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for max_frac in max_fracs:
        for cs in comsizes:
            for reg in regs:
                run_dirs = []
                names = [f'ddpp_{method}|{max_frac}|{cs}']
                for name in dataset_fnames:
                    model_series_dir = f'../workdir/run_glue_for_model_series/electra-raw/{name}/0.0/{method}_{max_frac}_{cs}'
                    #print(model_series_dir)
                    run_dirs.append([model_series_dir])
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print(f'Not exists one of this dirs: {run_dirs}')

Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/ddpp_dpp_0.5_20']
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/ddpp_dpp_0.6_20']
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/ddpp_dpp_0.6_50']
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/ddpp_ood_0.6_20']
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/ddpp_ood_0.6_50']


In [26]:
table_dpp = pd.concat([pd.concat(tables), pd.concat(baselines[-1:])])

In [27]:
table_dpp

Unnamed: 0_level_0,Unnamed: 1_level_0,CoLA,CoLA,CoLA,MRPC,MRPC,MRPC,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rcc-auc,rejection-curve-auc,rpp,rcc-auc,rejection-curve-auc,rpp,rcc-auc,rejection-curve-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
ddpp_ddpp_dpp|0.3|20,bald,111.11±16.72,-0.05±0.12,2.98±0.26,56.43±22.61,-0.43±0.60,3.34±1.13,32.64±8.36,-0.13±0.20,1.36±0.32
ddpp_ddpp_dpp|0.3|20,sampled_max_prob,109.98±10.93,-0.02±0.08,2.93±0.22,48.79±22.51,-0.05±0.10,2.96±1.13,30.13±6.30,-0.06±0.14,1.29±0.27
ddpp_ddpp_dpp|0.3|20,variance,111.56±13.93,-0.05±0.09,2.97±0.24,54.27±21.51,-0.33±0.37,3.22±1.08,31.95±6.84,-0.09±0.13,1.33±0.26
ddpp_ddpp_dpp|0.3|50,bald,104.60±6.92,0.03±0.12,2.89±0.14,43.00±23.97,0.16±0.57,2.77±1.24,32.85±10.20,-0.06±0.19,1.30±0.34
ddpp_ddpp_dpp|0.3|50,sampled_max_prob,107.48±9.28,0.02±0.02,2.90±0.18,46.69±24.13,0.03±0.09,2.90±1.16,28.95±7.81,-0.02±0.12,1.24±0.28
ddpp_ddpp_dpp|0.3|50,variance,104.12±7.89,0.05±0.09,2.87±0.15,45.31±23.25,0.10±0.41,2.83±1.17,32.23±10.85,-0.06±0.16,1.30±0.33
ddpp_ddpp_dpp|0.4|20,bald,108.61±11.38,-0.06±0.06,2.98±0.24,49.17±23.52,-0.11±0.43,3.02±1.22,36.45±14.41,-0.21±0.28,1.44±0.46
ddpp_ddpp_dpp|0.4|20,sampled_max_prob,109.39±13.93,-0.03±0.10,2.95±0.27,48.31±22.84,-0.08±0.12,2.96±1.12,28.13±6.61,0.01±0.15,1.22±0.26
ddpp_ddpp_dpp|0.4|20,variance,108.27±11.59,-0.02±0.07,2.95±0.23,50.08±23.19,-0.15±0.34,3.06±1.16,34.06±12.32,-0.11±0.22,1.35±0.36
ddpp_ddpp_dpp|0.4|50,bald,108.04±7.63,-0.00±0.10,2.93±0.16,44.15±22.13,0.11±0.68,2.79±1.20,33.22±11.59,-0.08±0.22,1.32±0.36


In [28]:
table_dpp.sort_values(by=('SST-2', 'rcc-auc')).iloc[:50]

Unnamed: 0_level_0,Unnamed: 1_level_0,CoLA,CoLA,CoLA,MRPC,MRPC,MRPC,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,rcc-auc,rejection-curve-auc,rpp,rcc-auc,rejection-curve-auc,rpp,rcc-auc,rejection-curve-auc,rpp
ddpp_ddpp_dpp|0.5|50,sampled_max_prob,108.98±8.57,0.02±0.06,2.91±0.18,46.52±23.47,0.02±0.06,2.89±1.12,25.95±4.20,0.03±0.06,1.16±0.20
ddpp_ddpp_dpp|0.4|20,sampled_max_prob,109.39±13.93,-0.03±0.10,2.95±0.27,48.31±22.84,-0.08±0.12,2.96±1.12,28.13±6.61,0.01±0.15,1.22±0.26
ddpp_ddpp_dpp|0.4|50,sampled_max_prob,108.99±9.82,0.01±0.05,2.92±0.19,45.47±22.77,0.04±0.23,2.86±1.14,28.15±4.37,0.02±0.05,1.21±0.17
ddpp_ddpp_dpp|0.3|50,sampled_max_prob,107.48±9.28,0.02±0.02,2.90±0.18,46.69±24.13,0.03±0.09,2.90±1.16,28.95±7.81,-0.02±0.12,1.24±0.28
ddpp_ddpp_dpp|0.3|20,sampled_max_prob,109.98±10.93,-0.02±0.08,2.93±0.22,48.79±22.51,-0.05±0.10,2.96±1.13,30.13±6.30,-0.06±0.14,1.29±0.27
ddpp_ddpp_dpp|0.4|50,variance,107.11±7.33,0.03±0.10,2.90±0.15,44.30±21.54,0.12±0.61,2.79±1.17,30.81±8.46,-0.02±0.15,1.25±0.28
ddpp_ddpp_dpp|0.3|20,variance,111.56±13.93,-0.05±0.09,2.97±0.24,54.27±21.51,-0.33±0.37,3.22±1.08,31.95±6.84,-0.09±0.13,1.33±0.26
ddpp_ddpp_dpp|0.5|50,variance,106.94±9.41,0.03±0.06,2.89±0.18,43.74±21.31,0.15±0.32,2.77±1.01,32.14±13.13,-0.08±0.08,1.26±0.33
ddpp_ddpp_dpp|0.3|50,variance,104.12±7.89,0.05±0.09,2.87±0.15,45.31±23.25,0.10±0.41,2.83±1.17,32.23±10.85,-0.06±0.16,1.30±0.33
ddpp_ddpp_dpp|0.3|20,bald,111.11±16.72,-0.05±0.12,2.98±0.26,56.43±22.61,-0.43±0.60,3.34±1.13,32.64±8.36,-0.13±0.20,1.36±0.32


In [29]:
{'MRPC': {'ddpp_ood' : 0.6, 'ddpp_dpp' : 0.5},
'CoLA': {'ddpp_ood' : 0.5, 'ddpp_dpp' : 0.6},
'SST-2': {'ddpp_ood' : 0.5, 'ddpp_dpp' : 0.5}}

{'MRPC': {'ddpp_ood': 0.6, 'ddpp_dpp': 0.5},
 'CoLA': {'ddpp_ood': 0.5, 'ddpp_dpp': 0.6},
 'SST-2': {'ddpp_ood': 0.4, 'ddpp_dpp': 0.5}}

# MC-DPP calibration

In [16]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['False', 'True']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]

max_fracs_dicts = {'mrpc': {'False': 0.6, 'True': 0.6}, 
                   'cola': {'False': 0.6, 'True': 0.6}, 
                   'sst2': {'False': 0.3, 'True': 0.3}}

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for val_subsample in val_subsamples:
        for reg in regs:
            run_dirs = []
            dpp_type = 'with_ood' if method=='True' else 'on_masks'
            names = [f'DPP_{dpp_type}|{val_subsample}|{reg}']
            for name in dataset_fnames:
                max_frac = max_fracs_dicts[name][method]
                model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
                model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
                model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_dpp_cal = pd.concat([pd.concat(tables), pd.concat(baselines[-2:-1])])

../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/False/0.6/2021-10-02/03-46-27/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/False/0.6/2021-10-03/03-10-07/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/False/0.3/2021-10-03/10-23-36/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.1/dpp/False/0.6/2021-10-02/03-22-18/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/dpp/False/0.6/2021-10-03/02-29-18/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.1/dpp/False/0.3/2021-10-03/10-02-10/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/True/0.6/2021-10-02/05-02-54/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/True/0.6/2021-10-03/05-28-09/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/True/0.3/2021-10-03/11-51-10/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.1/dpp/True/0.6/2021-10-02/04-33-03/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/dpp/True/0.6/202

In [17]:
def preproc_cal(x):
    if 'baseline' in x:
        return '-'
    ds = x.split('|')[1]
    if ds == '0.1':
        return 'val.'
    return 'train'
        
table_dpp_cal = table_dpp_cal.reset_index()
table_dpp_cal['Calibr. Dataset'] = table_dpp_cal.Method.apply(lambda x: preproc_cal(x))
table_dpp_cal['Method'] = table_dpp_cal['Method'].apply(lambda x: x.split('|')[0])
table_dpp_cal = table_dpp_cal[list(table_dpp_cal.columns[:1]) + list(table_dpp_cal.columns[-1:]) + list(table_dpp_cal.columns[1:-1])]

In [18]:
table_dpp_cal

Unnamed: 0_level_0,Method,Calibr. Dataset,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DPP_on_masks,train,bald,-0.05±0.16,15.99±0.98,2.03±0.18,-0.11±0.39,51.39±9.34,2.54±0.37,-0.01±0.19,19.37±3.13,1.15±0.18
1,DPP_on_masks,train,sampled_max_prob,0.04±0.18,14.42±1.06,1.92±0.20,0.00±0.31,48.92±7.28,2.43±0.28,0.01±0.22,17.14±3.35,1.11±0.23
2,DPP_on_masks,train,variance,0.02±0.22,14.76±1.48,1.94±0.25,-0.04±0.32,49.79±8.13,2.47±0.32,-0.03±0.19,19.03±2.89,1.15±0.19
3,DPP_on_masks,val.,bald,-0.69±1.05,25.97±18.99,2.60±1.04,-0.44±0.26,59.58±7.91,2.86±0.20,0.09±0.23,15.88±3.32,1.07±0.23
4,DPP_on_masks,val.,sampled_max_prob,-0.02±0.45,15.18±3.32,1.92±0.36,-0.20±0.14,53.21±3.52,2.61±0.18,-0.01±0.20,17.46±3.05,1.18±0.22
5,DPP_on_masks,val.,variance,-0.06±0.44,15.36±3.35,1.95±0.35,-0.26±0.11,54.27±4.05,2.66±0.14,0.02±0.20,16.84±2.86,1.14±0.19
6,DPP_with_ood,train,bald,-0.31±0.51,20.21±6.49,2.30±0.48,0.04±0.30,51.21±10.43,2.38±0.30,-0.37±0.16,25.45±3.33,1.50±0.16
7,DPP_with_ood,train,sampled_max_prob,-0.05±0.16,15.22±0.86,2.04±0.19,0.15±0.27,48.60±8.17,2.27±0.24,0.05±0.24,17.60±5.34,1.08±0.24
8,DPP_with_ood,train,variance,-0.11±0.22,16.91±3.15,2.09±0.20,0.10±0.28,49.57±9.21,2.30±0.26,-0.24±0.26,22.98±4.39,1.37±0.24
9,DPP_with_ood,val.,bald,-1.82±1.11,37.60±17.51,3.75±1.07,-0.31±0.44,65.64±16.01,2.73±0.42,-0.42±0.07,27.17±3.71,1.57±0.08


In [19]:
print(str(table_dpp_cal.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
       Method & Calibr. Dataset &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
              & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
 DPP\_on\_masks &           train &              bald &          -0.05$\pm$0.16 &   15.99$\pm$0.98 &  2.03$\pm$0.18 &          -0.11$\pm$0.39 &   51.39$\pm$9.34 &  2.54$\pm$0.37 &          -0.01$\pm$0.19 &  19.37$\pm$3.13 &  1.15$\pm$0.18 \\
 DPP\_on\_masks &           train &  sampled\_max\_prob &           0.04$\pm$0.18 &   14.42$\pm$1.06 &  1.92$\pm$0.20 &           0.00$\pm$0.31 &   48.92$\pm$7.28 &  2.43$\pm$0.28 &           0.01$\pm$0.22 &  17.14$\pm$3.35 &  1.11$\pm$0.23 \\
 DPP\_on\_masks &           train &          variance &           0.02$\pm$0.22 &   14.76$\pm$1.48 &  1.94$\pm$0.25 &          -0.04$\pm$0.32 &   49.79$\pm$8

# MC-DPP regs

In [20]:
table_dpp.sort_values(by=('SST2 (10%)', 'rcc-auc')).iloc[:50]

NameError: name 'table_dpp' is not defined

In [21]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['False', 'True']
regs = ['reg']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]

max_fracs_dicts = {'mrpc': {'False': 0.3, 'True': 0.6}, 
                   'cola': {'False': 0.6, 'True': 0.3}, 
                   'sst2': {'False': 0.4, 'True': 0.6}}
val_subsamples_dicts = {'mrpc': {'False': 0.0, 'True': 0.0}, 
                        'cola': {'False': 0.0, 'True': 0.0}, 
                        'sst2': {'False': 0.0, 'True': 0.0}}

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        run_dirs = []
        dpp_type = 'with_ood' if method=='True' else 'on_masks'
        for name in dataset_fnames:      
            val_subsample = val_subsamples_dicts[name][method]
            max_frac = max_fracs_dicts[name][method]
            names = [f'DPP_{dpp_type}|{val_subsample}|{reg}']
            model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
            model_series_dir += os.listdir(model_series_dir)[-1]
            model_series_dir += f'/{os.listdir(model_series_dir)[-1]}/'
            print(model_series_dir)
            run_dirs.append([model_series_dir])
        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
        baselines.append(res_df.iloc[-1:])
        tables.append(res_df.iloc[:-1])

../workdir/run_glue_for_model_series/electra-reg/mrpc/0.0/dpp/False/0.3/2021-10-01/16-30-33/
../workdir/run_glue_for_model_series/electra-reg/cola/0.0/dpp/False/0.6/2021-10-02/22-25-22/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.0/dpp/False/0.4/2021-10-03/13-24-49/
../workdir/run_glue_for_model_series/electra-reg/mrpc/0.0/dpp/True/0.6/2021-10-02/02-26-43/
../workdir/run_glue_for_model_series/electra-reg/cola/0.0/dpp/True/0.3/2021-10-02/08-14-31/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.0/dpp/True/0.6/2021-10-03/21-04-44/


In [22]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

table_dpp_reg = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])
table_dpp_reg = table_dpp_reg.reset_index()
table_dpp_reg['Reg. Type'] = table_dpp_reg.Method.apply(lambda x: preproc_regs(x))
table_dpp_reg['Method'] = table_dpp_reg['Method'].apply(lambda x: x.split('|')[0])
table_dpp_reg = table_dpp_reg[list(table_dpp_reg.columns[:1]) + list(table_dpp_reg.columns[-1:]) + list(table_dpp_reg.columns[1:-1])]

In [23]:
table_dpp_reg

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DPP_on_masks,reg,bald,0.36±0.33,12.06±1.94,1.62±0.34,-0.36±0.34,54.66±6.57,2.77±0.37,0.17±0.13,14.86±1.94,0.97±0.13
1,DPP_on_masks,reg,sampled_max_prob,0.27±0.15,12.53±0.98,1.70±0.22,-0.17±0.29,51.43±5.55,2.57±0.27,0.23±0.13,13.23±1.86,0.91±0.12
2,DPP_on_masks,reg,variance,0.28±0.23,12.32±1.49,1.66±0.28,-0.22±0.31,52.46±5.81,2.63±0.30,0.20±0.10,13.71±1.31,0.93±0.09
3,DPP_with_ood,reg,bald,0.19±0.43,14.62±5.18,1.76±0.41,-0.18±0.19,52.10±3.75,2.60±0.18,0.12±0.25,16.52±4.27,1.04±0.26
4,DPP_with_ood,reg,sampled_max_prob,0.14±0.15,13.39±0.73,1.84±0.18,0.10±0.26,47.20±4.88,2.32±0.25,0.29±0.11,12.10±1.59,0.85±0.10
5,DPP_with_ood,reg,variance,0.34±0.14,12.35±0.69,1.66±0.22,0.00±0.21,48.90±4.38,2.41±0.20,0.26±0.10,13.22±2.01,0.88±0.09
6,baseline,reg,max_prob,93.01±0.16,12.75±0.96,1.74±0.22,92.08±0.30,51.20±5.73,2.56±0.28,94.13±0.13,12.97±1.73,0.90±0.11
7,baseline,reg,max_prob,93.01±0.16,12.75±0.96,1.74±0.22,92.08±0.30,51.20±5.73,2.56±0.28,94.13±0.13,12.97±1.73,0.90±0.11


In [24]:
print(str(table_dpp_reg.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
       Method & Reg. Type &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
              & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
 DPP\_on\_masks &       reg &              bald &           0.36$\pm$0.33 &  12.06$\pm$1.94 &  1.62$\pm$0.34 &          -0.36$\pm$0.34 &  54.66$\pm$6.57 &  2.77$\pm$0.37 &           0.17$\pm$0.13 &  14.86$\pm$1.94 &  0.97$\pm$0.13 \\
 DPP\_on\_masks &       reg &  sampled\_max\_prob &           0.27$\pm$0.15 &  12.53$\pm$0.98 &  1.70$\pm$0.22 &          -0.17$\pm$0.29 &  51.43$\pm$5.55 &  2.57$\pm$0.27 &           0.23$\pm$0.13 &  13.23$\pm$1.86 &  0.91$\pm$0.12 \\
 DPP\_on\_masks &       reg &          variance &           0.28$\pm$0.23 &  12.32$\pm$1.49 &  1.66$\pm$0.28 &          -0.22$\pm$0.31 &  52.46$\pm$5.81 &  2.63$\pm$0.30 &          

# Raw DPP

In [25]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
regs = ['reg', 'raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]


dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for max_frac in max_fracs:
    for val_subsample in val_subsamples:
        for reg in regs:
            run_dirs = []
            names = [f'dpp|{max_frac}|{val_subsample}|{reg}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/raw_dpp/{max_frac}/'
                model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
                model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_dpp = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra-reg/mrpc/0.0/raw_dpp/0.3/2021-10-01/16-36-07/
../workdir/run_glue_for_model_series/electra-reg/cola/0.0/raw_dpp/0.3/2021-10-02/04-14-06/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.0/raw_dpp/0.3/2021-10-03/02-09-45/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/raw_dpp/0.3/2021-10-01/18-07-08/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/raw_dpp/0.3/2021-10-02/07-18-23/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/raw_dpp/0.3/2021-10-03/04-38-20/
../workdir/run_glue_for_model_series/electra-reg/mrpc/0.1/raw_dpp/0.3/2021-10-01/16-14-13/
../workdir/run_glue_for_model_series/electra-reg/cola/0.1/raw_dpp/0.3/2021-10-02/03-37-17/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.1/raw_dpp/0.3/2021-10-03/01-42-58/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.1/raw_dpp/0.3/2021-10-01/17-47-49/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/raw_dpp/0.3/2021-10-02/06-46-16/

In [None]:
table_dpp.sort_values(by=('SST2 (10%)', 'rcc-auc')).iloc[:50]

In [26]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]

max_fracs_dicts = {'mrpc': 0.3, 
                   'cola': 0.4, 
                   'sst2': 0.6}
val_subsamples_dicts = {'mrpc': 0.0, 
                        'cola': 0.1, 
                        'sst2': 0.0}

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for reg in regs:
    run_dirs = []
    for name in dataset_fnames:      
        val_subsample = val_subsamples_dicts[name]
        max_frac = max_fracs_dicts[name]
        names = [f'DPP|{val_subsample}|{reg}']
        model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
        model_series_dir += os.listdir(model_series_dir)[-1]
        model_series_dir += f'/{os.listdir(model_series_dir)[-1]}/'
        print(model_series_dir)
        run_dirs.append([model_series_dir])
    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
    baselines.append(res_df.iloc[-1:])
    tables.append(res_df.iloc[:-1])

../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/True/0.3/2021-10-01/19-33-07/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/dpp/True/0.4/2021-10-02/19-28-25/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/True/0.6/2021-10-03/23-59-01/


In [27]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

table_raw_dpp = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])
table_raw_dpp = table_raw_dpp.reset_index()
table_raw_dpp['Method'] = table_raw_dpp['Method'].apply(lambda x: x.split('|')[0])
#table_raw_dpp = table_raw_dpp[list(table_raw_dpp.columns[:1]) + list(table_raw_dpp.columns[-1:]) + list(table_raw_dpp.columns[1:-1])]

In [28]:
table_raw_dpp

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DPP,bald,-0.31±0.52,20.57±6.22,2.28±0.51,-0.31±0.45,65.82±14.59,2.73±0.40,-0.19±0.29,21.60±6.41,1.33±0.29
1,DPP,sampled_max_prob,-0.08±0.17,15.65±1.56,2.07±0.22,-0.01±0.17,55.23±8.15,2.41±0.19,0.04±0.23,17.06±4.21,1.10±0.22
2,DPP,variance,0.01±0.40,15.95±5.33,1.98±0.43,-0.06±0.24,58.42±11.42,2.48±0.21,-0.01±0.20,18.76±5.07,1.15±0.22
3,baseline,max_prob,92.78±0.21,15.03±2.09,1.97±0.22,92.02±0.17,53.17±3.70,2.62±0.20,93.89±0.21,17.10±3.29,1.13±0.23


In [29]:
print(str(table_raw_dpp.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{lllllllllll}
\toprule
   Method &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
          & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
      DPP &              bald &          -0.31$\pm$0.52 &  20.57$\pm$6.22 &  2.28$\pm$0.51 &          -0.31$\pm$0.45 &  65.82$\pm$14.59 &  2.73$\pm$0.40 &          -0.19$\pm$0.29 &  21.60$\pm$6.41 &  1.33$\pm$0.29 \\
      DPP &  sampled\_max\_prob &          -0.08$\pm$0.17 &  15.65$\pm$1.56 &  2.07$\pm$0.22 &          -0.01$\pm$0.17 &   55.23$\pm$8.15 &  2.41$\pm$0.19 &           0.04$\pm$0.23 &  17.06$\pm$4.21 &  1.10$\pm$0.22 \\
      DPP &          variance &           0.01$\pm$0.40 &  15.95$\pm$5.33 &  1.98$\pm$0.43 &          -0.06$\pm$0.24 &  58.42$\pm$11.42 &  2.48$\pm$0.21 &          -0.01$\pm$0.20 &  18.76$\pm$5.07 &  1.15$\pm$0.22 \\
 baseline &       

In [30]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['False', 'True']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]

max_fracs_dicts = {'mrpc': {'False': 0.6, 'True': 0.6}, 
                   'cola': {'False': 0.6, 'True': 0.6}, 
                   'sst2': {'False': 0.3, 'True': 0.3}}
val_subsamples_dicts = {'mrpc': {'False': 0.0, 'True': 0.0}, 
                        'cola': {'False': 0.0, 'True': 0.0}, 
                        'sst2': {'False': 0.0, 'True': 0.1}}

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []


for method in methods:
    for reg in regs:
        run_dirs = []
        dpp_type = 'with_ood' if method=='True' else 'on_masks'
        for name in dataset_fnames:      
            val_subsample = val_subsamples_dicts[name][method]
            max_frac = max_fracs_dicts[name][method]
            names = [f'DPP_{dpp_type}|{val_subsample}|{reg}']
            model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
            model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
            model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/'
            print(model_series_dir)
            run_dirs.append([model_series_dir])
        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
        baselines.append(res_df.iloc[-1:])
        tables.append(res_df.iloc[:-1])

../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/False/0.6/2021-10-02/03-46-27/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/False/0.6/2021-10-03/03-10-07/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/False/0.3/2021-10-03/10-23-36/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/True/0.6/2021-10-02/05-02-54/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/True/0.6/2021-10-03/05-28-09/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.1/dpp/True/0.3/2021-10-03/11-22-29/


In [31]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

table_dpp_2 = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])
table_dpp_2 = table_dpp_2.reset_index()
table_dpp_2['Method'] = table_dpp_2['Method'].apply(lambda x: x.split('|')[0])

In [32]:
table_dpp_2

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DPP_on_masks,bald,-0.05±0.16,15.99±0.98,2.03±0.18,-0.11±0.39,51.39±9.34,2.54±0.37,-0.01±0.19,19.37±3.13,1.15±0.18
1,DPP_on_masks,sampled_max_prob,0.04±0.18,14.42±1.06,1.92±0.20,0.00±0.31,48.92±7.28,2.43±0.28,0.01±0.22,17.14±3.35,1.11±0.23
2,DPP_on_masks,variance,0.02±0.22,14.76±1.48,1.94±0.25,-0.04±0.32,49.79±8.13,2.47±0.32,-0.03±0.19,19.03±2.89,1.15±0.19
3,DPP_with_ood,bald,-0.31±0.51,20.21±6.49,2.30±0.48,0.04±0.30,51.21±10.43,2.38±0.30,-0.42±0.07,27.17±3.71,1.57±0.08
4,DPP_with_ood,sampled_max_prob,-0.05±0.16,15.22±0.86,2.04±0.19,0.15±0.27,48.60±8.17,2.27±0.24,0.21±0.19,13.76±2.39,0.94±0.20
5,DPP_with_ood,variance,-0.11±0.22,16.91±3.15,2.09±0.20,0.10±0.28,49.57±9.21,2.30±0.26,-0.17±0.12,22.83±4.46,1.32±0.12
6,baseline,max_prob,92.78±0.21,15.03±2.09,1.97±0.22,92.23±0.32,48.81±7.14,2.43±0.28,93.89±0.21,17.10±3.29,1.13±0.23
7,baseline,max_prob,92.78±0.21,15.03±2.09,1.97±0.22,92.23±0.32,48.81±7.14,2.43±0.28,93.86±0.21,17.31±2.91,1.19±0.22


In [33]:
print(str(table_dpp_2.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{lllllllllll}
\toprule
       Method &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
              & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
 DPP\_on\_masks &              bald &          -0.05$\pm$0.16 &  15.99$\pm$0.98 &  2.03$\pm$0.18 &          -0.11$\pm$0.39 &   51.39$\pm$9.34 &  2.54$\pm$0.37 &          -0.01$\pm$0.19 &  19.37$\pm$3.13 &  1.15$\pm$0.18 \\
 DPP\_on\_masks &  sampled\_max\_prob &           0.04$\pm$0.18 &  14.42$\pm$1.06 &  1.92$\pm$0.20 &           0.00$\pm$0.31 &   48.92$\pm$7.28 &  2.43$\pm$0.28 &           0.01$\pm$0.22 &  17.14$\pm$3.35 &  1.11$\pm$0.23 \\
 DPP\_on\_masks &          variance &           0.02$\pm$0.22 &  14.76$\pm$1.48 &  1.94$\pm$0.25 &          -0.04$\pm$0.32 &   49.79$\pm$8.13 &  2.47$\pm$0.32 &          -0.03$\pm$0.19 &  19.03$\pm$2.89 &  1.15$\pm$

# Ensemble

In [60]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']

dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
dataset_to_time = {'mrpc':'17-25-06', 'cola': '17-34-50', 'sst2': '17-45-49'}
run_dirs = []
for name in dataset_fnames:      
    names = [f'Deep Ensemble']
    time = dataset_to_time[name]
    model_series_dir = f'/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-12/{time}/final_results/'
    print(model_series_dir)
    run_dirs.append([model_series_dir])
ens_tab = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)

/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-12/17-25-06/final_results/
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-12/17-34-50/final_results/
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-12/17-45-49/final_results/


In [61]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

ens_tab = ens_tab.reset_index()
ens_tab['Reg. Type'] = '-'
ens_tab['Method'] = ens_tab['Method'].apply(lambda x: x.split('|')[0])
ens_tab['UE Score'] = ['BALD', 'SMP', 'PV', 'VR', 'MP']
ens_tab = ens_tab[list(ens_tab.columns[:1]) + list(ens_tab.columns[-1:]) + list(ens_tab.columns[1:-1])]

In [62]:
ens_tab

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,Deep Ensemble,-,BALD,0.29±0.19,26.61±4.14,2.51±0.29,-0.40±0.36,73.43±10.75,2.96±0.35,0.37±0.27,21.29±3.65,1.02±0.09
1,Deep Ensemble,-,SMP,0.82±0.30,16.48±4.34,1.96±0.34,-0.03±0.19,55.72±4.96,2.60±0.20,0.50±0.25,13.43±1.84,0.87±0.08
2,Deep Ensemble,-,PV,0.40±0.22,25.56±5.96,2.41±0.31,-0.30±0.32,68.49±8.94,2.87±0.30,0.41±0.24,17.88±2.59,0.97±0.07
3,Deep Ensemble,-,VR,0.06±0.33,24.98±1.55,3.04±0.46,-1.45±0.29,114.57±7.79,4.02±0.19,-0.33±0.30,27.82±3.65,1.74±0.20
4,baseline,-,MP,92.21±0.59,21.23±8.59,2.57±0.64,91.50±0.37,64.03±7.96,3.07±0.38,93.66±0.22,20.70±3.03,1.39±0.23


In [63]:
print(str(ens_tab.iloc[:-1].to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
        Method & Reg. Type & UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST-2} \\
               & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
 Deep Ensemble &         - &     BALD &           0.29$\pm$0.19 &  26.61$\pm$4.14 &  2.51$\pm$0.29 &          -0.40$\pm$0.36 &  73.43$\pm$10.75 &  2.96$\pm$0.35 &           0.37$\pm$0.27 &  21.29$\pm$3.65 &  1.02$\pm$0.09 \\
 Deep Ensemble &         - &      SMP &           0.82$\pm$0.30 &  16.48$\pm$4.34 &  1.96$\pm$0.34 &          -0.03$\pm$0.19 &   55.72$\pm$4.96 &  2.60$\pm$0.20 &           0.50$\pm$0.25 &  13.43$\pm$1.84 &  0.87$\pm$0.08 \\
 Deep Ensemble &         - &       PV &           0.40$\pm$0.22 &  25.56$\pm$5.96 &  2.41$\pm$0.31 &          -0.30$\pm$0.32 &   68.49$\pm$8.94 &  2.87$\pm$0.30 &           0.41$\pm$0.24 &  17.88$\pm$2.59 &  0.97$

# DE + mahalanobis

In [29]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']

dataset_names = ['MRPC', 'CoLA', 'SST-2']
dataset_fnames = ['mrpc', 'cola', 'sst2']
dataset_to_time = {'mrpc': '12-56-53', 'cola': '13-08-59', 'sst2': '13-44-40'}
run_dirs = []
for name in dataset_fnames:      
    names = [f'Deep Ensemble']
    time = dataset_to_time[name]
    model_series_dir = f'/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-19/{time}/final_results/'
    print(model_series_dir)
    run_dirs.append([model_series_dir])
ens_tab = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)

/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-19/12-56-53/final_results/
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-19/13-08-59/final_results/
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-10-19/13-44-40/final_results/


In [30]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

ens_tab = ens_tab.reset_index()
ens_tab['Reg. Type'] = '-'
ens_tab['UE Score'] = ['MD', 'SMD', 'MP']
ens_tab['Method'] = ens_tab['Method'].apply(lambda x: x.split('|')[0])
ens_tab = ens_tab[list(ens_tab.columns[:1]) + list(ens_tab.columns[-1:]) + list(ens_tab.columns[1:-1])]

In [31]:
ens_tab['Method'] = 'DE+'+ens_tab['UE Score']
ens_tab = ens_tab.drop(columns=['UE Score'])

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [32]:
tab_res_det = tab_res_det.reset_index(drop=True)

In [33]:
pd.concat([tab_res_det.iloc[:8], ens_tab.iloc[1:2], det_res.iloc[8:]]).reset_index(drop=True)

Unnamed: 0_level_0,Method,Reg. Type,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST-2,SST-2,SST-2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MD,-,0.92±0.82,13.21±1.68,1.75±0.23,0.67±0.13,41.63±1.44,1.96±0.06,0.34±0.44,13.01±2.88,0.89±0.19
1,MD,CER,0.60±0.34,11.42±1.33,1.58±0.17,0.53±0.32,43.39±3.64,2.04±0.20,0.24±0.25,12.90±3.55,0.87±0.23
2,MD SN (ours),-,0.35±0.33,13.61±2.07,1.80±0.18,1.54±0.32,40.42±2.30,1.96±0.12,0.29±0.28,12.16±1.93,0.83±0.11
3,MD SN (ours),CER,0.57±0.72,14.57±1.49,1.93±0.07,1.41±0.30,39.51±2.61,1.87±0.07,0.30±0.32,10.89±1.25,0.75±0.06
4,SMD (ours),-,1.10±0.83,13.36±2.21,1.57±0.23,0.30±0.18,47.82±3.71,2.35±0.15,0.22±0.43,17.38±3.72,1.02±0.14
5,SMD (ours),CER,0.69±0.55,11.38±2.76,1.49±0.29,0.14±0.53,51.32±7.44,2.43±0.39,0.13±0.21,16.32±3.58,0.99±0.19
6,SMD SN (ours),-,1.03±0.44,14.51±1.17,1.84±0.14,1.55±0.26,50.95±2.66,2.51±0.10,0.13±0.25,14.70±1.95,0.94±0.08
7,SMD SN (ours),CER,0.97±0.32,11.74±2.37,1.43±0.24,0.32±0.27,68.14±4.92,3.15±0.10,-0.05±0.07,16.54±4.37,1.02±0.13
8,DE+SMD,-,0.43±0.36,13.16±0.19,1.82±0.27,0.87±0.33,46.43±2.14,2.22±0.12,0.75±0.26,8.63±0.68,0.62±0.03
9,SNGP,-,0.42±0.25,15.78±3.30,2.19±0.43,0.71±0.05,51.87±2.38,2.64±0.05,1.45±0.32,13.88±1.63,0.94±0.10


# Combine all

In [37]:
table_dpp_reg['Dropout Layers'] = 'last'
table_det['Dropout Layers'] = '-'

In [38]:
table_dpp_reg

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%),Dropout Layers
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,Unnamed: 13_level_1
0,DPP_on_masks,reg,bald,0.36±0.33,12.06±1.94,1.62±0.34,-0.36±0.34,54.66±6.57,2.77±0.37,0.17±0.13,14.86±1.94,0.97±0.13,last
1,DPP_on_masks,reg,sampled_max_prob,0.27±0.15,12.53±0.98,1.70±0.22,-0.17±0.29,51.43±5.55,2.57±0.27,0.23±0.13,13.23±1.86,0.91±0.12,last
2,DPP_on_masks,reg,variance,0.28±0.23,12.32±1.49,1.66±0.28,-0.22±0.31,52.46±5.81,2.63±0.30,0.20±0.10,13.71±1.31,0.93±0.09,last
3,DPP_with_ood,reg,bald,0.19±0.43,14.62±5.18,1.76±0.41,-0.18±0.19,52.10±3.75,2.60±0.18,0.12±0.25,16.52±4.27,1.04±0.26,last
4,DPP_with_ood,reg,sampled_max_prob,0.14±0.15,13.39±0.73,1.84±0.18,0.10±0.26,47.20±4.88,2.32±0.25,0.29±0.11,12.10±1.59,0.85±0.10,last
5,DPP_with_ood,reg,variance,0.34±0.14,12.35±0.69,1.66±0.22,0.00±0.21,48.90±4.38,2.41±0.20,0.26±0.10,13.22±2.01,0.88±0.09,last
6,baseline,reg,max_prob,93.01±0.16,12.75±0.96,1.74±0.22,92.08±0.30,51.20±5.73,2.56±0.28,94.13±0.13,12.97±1.73,0.90±0.11,last
7,baseline,reg,max_prob,93.01±0.16,12.75±0.96,1.74±0.22,92.08±0.30,51.20±5.73,2.56±0.28,94.13±0.13,12.97±1.73,0.90±0.11,last


In [52]:
res = pd.concat([table_mc.iloc[[0,1,2]], table_dpp_reg.iloc[:-2], table_det.iloc[[9,10,11,15]], table_mc_det.iloc[[3]], ens_tab.iloc[:-1], table_dpp_reg.iloc[-2:]])
res = res[table_mc.columns].reset_index(drop=True)

In [53]:
res

Unnamed: 0_level_0,Method,Reg. Type,Dropout Layers,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MC,raw,all,bald,0.59±0.11,11.78±0.77,1.41±0.12,0.17±0.22,47.55±5.41,2.28±0.17,0.28±0.10,13.51±2.03,0.85±0.11
1,MC,raw,all,sampled_max_prob,0.53±0.13,11.60±1.21,1.45±0.14,0.30±0.18,43.51±4.19,2.15±0.15,0.22±0.12,13.99±1.69,0.91±0.12
2,MC,raw,all,variance,0.60±0.12,11.66±0.70,1.41±0.12,0.23±0.23,45.97±4.97,2.24±0.17,0.28±0.10,13.24±1.58,0.85±0.11
3,DPP_on_masks,reg,last,bald,0.36±0.33,12.06±1.94,1.62±0.34,-0.36±0.34,54.66±6.57,2.77±0.37,0.17±0.13,14.86±1.94,0.97±0.13
4,DPP_on_masks,reg,last,sampled_max_prob,0.27±0.15,12.53±0.98,1.70±0.22,-0.17±0.29,51.43±5.55,2.57±0.27,0.23±0.13,13.23±1.86,0.91±0.12
5,DPP_on_masks,reg,last,variance,0.28±0.23,12.32±1.49,1.66±0.28,-0.22±0.31,52.46±5.81,2.63±0.30,0.20±0.10,13.71±1.31,0.93±0.09
6,DPP_with_ood,reg,last,bald,0.19±0.43,14.62±5.18,1.76±0.41,-0.18±0.19,52.10±3.75,2.60±0.18,0.12±0.25,16.52±4.27,1.04±0.26
7,DPP_with_ood,reg,last,sampled_max_prob,0.14±0.15,13.39±0.73,1.84±0.18,0.10±0.26,47.20±4.88,2.32±0.25,0.29±0.11,12.10±1.59,0.85±0.10
8,DPP_with_ood,reg,last,variance,0.34±0.14,12.35±0.69,1.66±0.22,0.00±0.21,48.90±4.38,2.41±0.20,0.26±0.10,13.22±2.01,0.88±0.09
9,NUQ,spectral_norm,,epistemic,0.43±0.19,11.62±1.23,1.63±0.17,0.30±0.10,43.13±2.81,2.14±0.05,0.36±0.07,10.88±0.80,0.77±0.09


In [54]:
print(str(res.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{lllllllllllll}
\toprule
         Method &      Reg. Type & Dropout Layers &                      UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
                & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
             MC &            raw &            all &                          bald &           0.59$\pm$0.11 &  11.78$\pm$0.77 &  1.41$\pm$0.12 &           0.17$\pm$0.22 &  47.55$\pm$5.41 &  2.28$\pm$0.17 &           0.28$\pm$0.10 &  13.51$\pm$2.03 &  0.85$\pm$0.11 \\
             MC &            raw &            all &              sampled\_max\_prob &           0.53$\pm$0.13 &  11.60$\pm$1.21 &  1.45$\pm$0.14 &           0.30$\pm$0.18 &  43.51$\pm$4.19 &  2.15$\pm$0.15 &           0.22$\pm$0.12 &  13.99$\pm$1.69 &  0.91$\pm$0.12 \\
             MC &            raw &            all &                      v

# DPP All 20ng

In [10]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.5, 0.6]
comsizes = [50]

dataset_names = ['20newsgroups']
dataset_fnames = ['20newsgroups']
names = []
tables = []
baselines = []
for method in methods:
    for max_frac in max_fracs:
        for cs in comsizes:
            for reg in regs:
                run_dirs = []
                names = [f'{method}|{max_frac}|{cs}']
                for name in dataset_fnames:
                    model_series_dir = f'../workdir/run_glue_for_model_series/electra_raw_no_sn/{name}/0.0/{method}_{max_frac}_50/results/'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                try:
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    print(f'Not exists one of this dirs: {run_dirs}')

../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_dpp_0.3_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_dpp_0.4_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_dpp_0.5_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_dpp_0.6_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_ood_0.3_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_ood_0.4_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_ood_0.5_50/results/
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_ood_0.6_50/results/


In [11]:
table_dpp = pd.concat([pd.concat(tables), pd.concat(baselines[-1:])])

In [12]:
table_dpp.sort_values(by= ('20newsgroups',             'rcc-auc'))

Unnamed: 0_level_0,Unnamed: 1_level_0,20newsgroups,20newsgroups,20newsgroups
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ddpp_dpp|0.3|50,var.ratio,-2.97±0.17,1093.25±23.79,5.80±0.19
ddpp_ood|0.3|50,var.ratio,-3.05±0.15,1104.95±22.85,5.86±0.20
ddpp_dpp|0.4|50,var.ratio,-3.16±0.14,1123.48±18.32,6.00±0.19
ddpp_ood|0.4|50,var.ratio,-3.20±0.13,1130.50±16.95,6.02±0.22
ddpp_dpp|0.5|50,var.ratio,-3.32±0.15,1146.23±17.34,6.14±0.19
ddpp_ood|0.5|50,var.ratio,-3.32±0.16,1149.10±19.71,6.16±0.16
ddpp_dpp|0.6|50,var.ratio,-3.44±0.13,1165.61±20.28,6.26±0.21
ddpp_ood|0.6|50,var.ratio,-3.45±0.13,1169.64±16.75,6.27±0.18
baseline|50,max_prob,91.38±0.18,473.46±28.89,2.82±0.16
ddpp_dpp|0.6|50,sampled_max_prob,-0.01±0.11,475.87±24.75,2.83±0.15


# 20 ng

In [15]:
import os 

default_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw']
dataset_names = ['20newsgroups']
dataset_fnames = ['20newsgroups']
names = []
tables = []
raw_baselines = {}
for ds_fname, ds_name in zip(dataset_fnames, dataset_names):
    model_series_dir = f'../workdir/run_glue_for_model_series/electra_raw_no_sn/{ds_fname}/0.0/ddpp_dpp_best/'
    table = []
    for metric_type in metric_types:
        metric = choose_metric(metric_type=metric_type)

        agg_res = aggregate_runs(
            model_series_dir, methods=default_methods, metric=metric
        )

        mean_res = agg_res.mean(axis=0)
        final_results = mean_res.T
        table.append(final_results.loc[['max_prob']])
    res_table = pd.concat(table, axis=1)
    res_table.columns = metric_types
    raw_baselines[ds_name] = res_table#
    #raw_baselines[ds_name]={k:v for k,v in zip(res_table.columns.values.tolist(), res_table.values[0].tolist())}

In [65]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['metric', 'reg', 'raw']
dataset_names = ['20newsgroups']
dataset_fnames = ['20newsgroups']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['sn', 'no_sn']:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}_{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass
table_det = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra_metric_sn/20newsgroups/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_sn/20newsgroups/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_reg_no_sn/20newsgroups/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_sn/20newsgroups/0.0/mahalanobis
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/mahalanobis


In [25]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp_best', 'ddpp_ood_best', 'mc_all']
regs = ['metric', 'reg', 'raw']
dataset_names = ['20newsgroups']
dataset_fnames = ['20newsgroups']
names = []
tables = []
#baselines = []
for method in methods:
    for reg in regs:
        for sn in ['no_sn']:
            run_dirs = []
            name_sn = ''
            names = [f'{method}|{reg}_{sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.0/{method}'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            try:
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
                #baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
            except:
                print('pass')
                pass
table_mc = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/ddpp_dpp_best
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/ddpp_dpp_best']
pass
../workdir/run_glue_for_model_series/electra_reg_no_sn/20newsgroups/0.0/ddpp_dpp_best
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_dpp_best
../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/ddpp_ood_best
Broken

Broken

Broken

empty dir ['../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/ddpp_ood_best']
pass
../workdir/run_glue_for_model_series/electra_reg_no_sn/20newsgroups/0.0/ddpp_ood_best
../workdir/run_glue_for_model_series/electra_raw_no_sn/20newsgroups/0.0/ddpp_ood_best
../workdir/run_glue_for_model_series/electra_metric_no_sn/20newsgroups/0.0/mc_all
../workdir/run_glue_for_model_series/electra_reg_no_sn/20newsgroups/0.0/mc_all
../workdir/run_glue_for_model_series/electra_raw_no_s

In [110]:
res_table = pd.concat([table_mc.iloc[:-2], table_det.iloc[:-2], pd.concat(baselines[-6:])]).reset_index(drop=False)

In [111]:
res_table = res_table[res_table['UE Score']!='var.ratio']

In [112]:
def preproc_regs(x):
    reg = x.split('|')[-1].split('_')[0]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif x == 'mahalanobis':
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    return 'MD'

res_table['Reg. Type'] = res_table.Method.apply(lambda x: preproc_regs(x))
res_table['Method'] = res_table.Method.apply(lambda x: preproc_method(x))
res_table['UE Score'] = res_table['UE Score'].apply(lambda x: preproc_ue(x))

In [113]:
res_table = res_table[list(res_table.columns[:1]) + list(res_table.columns[-1:]) + list(res_table.columns[1:-1])].reset_index(drop=True)

In [114]:
res_table

Unnamed: 0_level_0,Method,Reg. Type,UE Score,20newsgroups,20newsgroups,20newsgroups
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
0,DDPP (+DPP) (ours),CER,BALD,-0.11±0.22,463.87±35.71,2.78±0.22
1,DDPP (+DPP) (ours),CER,SMP,-0.03±0.16,443.63±30.31,2.71±0.17
2,DDPP (+DPP) (ours),CER,PV,-0.15±0.19,466.33±28.88,2.82±0.19
3,DDPP (+DPP) (ours),-,BALD,-0.03±0.18,441.21±22.50,2.64±0.17
4,DDPP (+DPP) (ours),-,SMP,-0.01±0.16,438.45±30.94,2.63±0.16
5,DDPP (+DPP) (ours),-,PV,-0.01±0.19,436.32±26.77,2.63±0.17
6,DDPP (+OOD) (ours),CER,BALD,-0.15±0.16,465.83±25.44,2.81±0.15
7,DDPP (+OOD) (ours),CER,SMP,-0.15±0.20,467.87±37.65,2.82±0.18
8,DDPP (+OOD) (ours),CER,PV,-0.17±0.20,469.24±30.78,2.83±0.18
9,DDPP (+OOD) (ours),-,BALD,-0.14±0.33,459.79±47.69,2.76±0.33


In [115]:
res_table.iloc[list(range(12)) + list(range(27, 33))]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,20newsgroups,20newsgroups,20newsgroups
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
0,DDPP (+DPP) (ours),CER,BALD,-0.11±0.22,463.87±35.71,2.78±0.22
1,DDPP (+DPP) (ours),CER,SMP,-0.03±0.16,443.63±30.31,2.71±0.17
2,DDPP (+DPP) (ours),CER,PV,-0.15±0.19,466.33±28.88,2.82±0.19
3,DDPP (+DPP) (ours),-,BALD,-0.03±0.18,441.21±22.50,2.64±0.17
4,DDPP (+DPP) (ours),-,SMP,-0.01±0.16,438.45±30.94,2.63±0.16
5,DDPP (+DPP) (ours),-,PV,-0.01±0.19,436.32±26.77,2.63±0.17
6,DDPP (+OOD) (ours),CER,BALD,-0.15±0.16,465.83±25.44,2.81±0.15
7,DDPP (+OOD) (ours),CER,SMP,-0.15±0.20,467.87±37.65,2.82±0.18
8,DDPP (+OOD) (ours),CER,PV,-0.17±0.20,469.24±30.78,2.83±0.18
9,DDPP (+OOD) (ours),-,BALD,-0.14±0.33,459.79±47.69,2.76±0.33


In [116]:
res_table.iloc[list(range(12,33))]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,20newsgroups,20newsgroups,20newsgroups
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp
12,MC dropout,metric,BALD,-0.04±0.12,497.47±73.40,2.71±0.27
13,MC dropout,metric,SMP,-0.06±0.16,493.36±95.29,2.73±0.33
14,MC dropout,metric,PV,-0.05±0.14,495.59±87.65,2.74±0.30
15,MC dropout,CER,BALD,0.47±0.09,367.30±17.41,2.18±0.09
16,MC dropout,CER,SMP,0.46±0.10,368.76±20.08,2.19±0.10
17,MC dropout,CER,PV,0.44±0.09,369.50±19.59,2.21±0.09
18,MC dropout,-,BALD,0.47±0.08,364.66±10.97,2.15±0.08
19,MC dropout,-,SMP,0.48±0.09,364.41±15.67,2.14±0.09
20,MC dropout,-,PV,0.46±0.09,363.92±12.14,2.16±0.08
21,MD SN (ours),metric,MD,0.61±0.08,351.46±12.24,2.00±0.08


# Legacy

In [4]:
# Get results with new dpp models
names = ['MC|last|reg', 'MC|all|reg']
dataset_names = ['SST2 (10%)', 'MRPC', 'CoLA']
dataset_fnames = ['SST2', 'MRPC', 'CoLA']
run_dirs = [[f'../workdir/run_glue_for_model_series/electra-reg/{name.lower()}/last/results', f'../workdir/run_glue_for_model_series/electra-reg/{name.lower()}/all/results'] for name in dataset_fnames]
mc_table_reg = collect_datasets(run_dirs, names, dataset_names)

names = ['DPP2|reg', 'DPP OOD|reg']
dataset_names = ['SST2 (10%)', 'MRPC', 'CoLA']
dataset_fnames = ['SST2', 'MRPC', 'CoLA']
run_dirs = [[f'../workdir/run_glue_for_model_series/electra-reg-calibrate/{name.lower()}/dpp/results', f'../workdir/run_glue_for_model_series/electra-reg-calibrate/{name.lower()}/dpp_with_ood/results'] for name in dataset_fnames]
dpp_table_reg = collect_datasets(run_dirs, names, dataset_names)

names = ['MC|last|no reg', 'MC|all|no reg']
dataset_names = ['SST2 (10%)', 'MRPC', 'CoLA']
dataset_fnames = ['SST2', 'MRPC', 'CoLA']
run_dirs = [[f'../workdir/run_glue_for_model_series/electra-raw/{name.lower()}/last/results', f'../workdir/run_glue_for_model_series/electra-raw/{name.lower()}/all/results'] for name in dataset_fnames]
mc_table_no_reg = collect_datasets(run_dirs, names, dataset_names)

names = ['DPP2|no reg', 'DPP OOD|no reg']
dataset_names = ['SST2 (10%)', 'MRPC', 'CoLA']
dataset_fnames = ['SST2', 'MRPC', 'CoLA']
run_dirs = [[f'../workdir/run_glue_for_model_series/electra-raw/{name.lower()}/dpp/results', f'../workdir/run_glue_for_model_series/electra-raw/{name.lower()}/dpp_with_ood/results'] for name in dataset_fnames]
dpp_table_no_reg = collect_datasets(run_dirs, names, dataset_names)

In [5]:
names = ['DPP2|no reg calibrated', 'DPP OOD|no reg calibrated']
dataset_names = ['SST2 (10%)', 'MRPC', 'CoLA']
dataset_fnames = ['SST2', 'MRPC', 'CoLA']
run_dirs = [[f'../workdir/run_glue_for_model_series/electra-raw-calibrate/{name.lower()}/dpp/results', f'../workdir/run_glue_for_model_series/electra-raw-calibrate/{name.lower()}/dpp_with_ood/results'] for name in dataset_fnames]
dpp_table_no_reg_cal = collect_datasets(run_dirs, names, dataset_names)

In [6]:
dpp_table_no_reg_cal

Unnamed: 0_level_0,Unnamed: 1_level_0,SST2 (10%),SST2 (10%),SST2 (10%),SST2 (10%),SST2 (10%),MRPC,MRPC,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,CoLA,CoLA
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,roc-auc,rcc-auc,pr-auc,rpp,rejection-curve-auc,roc-auc,rcc-auc,pr-auc,rpp,rejection-curve-auc,roc-auc,rcc-auc,pr-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
DPP2|no reg calibrated,bald,93.7±0.3,78.3±4.4,22.4±5.6,27.4±4.4,1.4±0.3,92.8±0.8,78.6±7.9,18.6±10.4,34.0±5.1,2.1±0.8,91.9±0.3,76.0±2.8,55.8±4.2,30.1±5.1,2.8±0.3
DPP2|no reg calibrated,sampled_max_prob,93.7±0.2,79.3±2.6,19.6±3.1,28.5±4.2,1.3±0.2,93.1±0.4,82.3±3.6,12.9±2.8,37.2±5.7,1.7±0.3,92.1±0.2,78.1±0.6,50.7±2.5,36.9±3.9,2.6±0.1
DPP2|no reg calibrated,variance,93.7±0.2,79.7±3.3,19.8±3.7,28.5±3.8,1.3±0.2,93.1±0.3,82.0±3.5,13.9±3.3,36.8±4.8,1.8±0.3,92.0±0.2,77.5±1.2,52.6±1.6,33.5±4.0,2.6±0.1
DPP2|no reg calibrated,sampled_entropy,93.7±0.2,79.3±2.6,19.6±3.1,28.5±4.2,1.3±0.2,93.1±0.4,82.3±3.6,12.9±2.8,37.2±5.7,1.7±0.3,92.1±0.2,78.1±0.6,50.7±2.5,36.9±3.9,2.6±0.1
DPP OOD|no reg calibrated,bald,93.8±0.2,81.0±3.1,18.9±3.2,29.8±3.4,1.2±0.2,93.0±0.4,81.4±3.5,14.6±4.5,35.4±4.3,1.8±0.4,92.0±0.3,77.2±2.6,54.4±4.7,31.4±4.4,2.7±0.3
DPP OOD|no reg calibrated,sampled_max_prob,93.8±0.2,79.9±2.6,18.8±2.9,28.6±4.3,1.3±0.2,93.1±0.3,82.1±3.6,13.0±2.8,37.0±5.8,1.8±0.3,92.3±0.2,79.8±0.5,47.5±2.7,38.8±3.5,2.4±0.1
DPP OOD|no reg calibrated,variance,93.8±0.2,81.2±3.0,18.1±3.2,29.2±3.5,1.2±0.2,93.1±0.4,82.6±3.4,13.2±3.3,36.7±4.4,1.7±0.3,92.3±0.2,79.4±1.6,49.0±2.7,35.3±3.6,2.4±0.2
DPP OOD|no reg calibrated,sampled_entropy,93.8±0.2,79.9±2.6,18.8±2.9,28.6±4.3,1.3±0.2,93.1±0.3,82.1±3.6,13.0±2.8,37.0±5.8,1.8±0.3,92.3±0.2,79.8±0.5,47.5±2.7,38.8±3.5,2.4±0.1


In [7]:
names = ['DPP2|reg calibrated', 'DPP OOD|reg calibrated']
dataset_names = ['SST2 (10%)', 'MRPC', 'CoLA']
dataset_fnames = ['SST2', 'MRPC', 'CoLA']
run_dirs = [[f'../workdir/run_glue_for_model_series/electra-reg-calibrate/{name.lower()}/dpp/results', f'../workdir/run_glue_for_model_series/electra-reg-calibrate/{name.lower()}/dpp_with_ood/results'] for name in dataset_fnames]
dpp_table_reg_cal = collect_datasets(run_dirs, names, dataset_names)

In [8]:
overall_table_cal = pd.concat([mc_table_reg, dpp_table_reg_cal, mc_table_no_reg, dpp_table_no_reg, dpp_table_no_reg_cal])

In [9]:
overall_table_cal

Unnamed: 0_level_0,Unnamed: 1_level_0,SST2 (10%),SST2 (10%),SST2 (10%),SST2 (10%),SST2 (10%),MRPC,MRPC,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,CoLA,CoLA
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,roc-auc,rcc-auc,pr-auc,rpp,rejection-curve-auc,roc-auc,rcc-auc,pr-auc,rpp,rejection-curve-auc,roc-auc,rcc-auc,pr-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
MC|last|reg,bald,93.7±0.2,78.7±2.1,24.4±5.1,27.2±2.6,1.4±0.2,92.8±0.5,80.8±3.1,16.5±3.8,40.5±3.4,2.0±0.4,91.6±0.3,75.0±2.3,65.2±7.2,32.8±2.3,3.0±0.3
MC|last|reg,sampled_max_prob,94.0±0.1,82.9±1.1,16.5±1.5,31.3±3.4,1.1±0.1,93.1±0.5,84.7±3.1,12.2±2.8,46.4±4.4,1.6±0.4,91.9±0.2,77.8±1.7,54.6±5.4,36.1±1.2,2.7±0.2
MC|last|reg,variance,93.8±0.1,81.3±1.0,20.9±3.3,30.2±2.8,1.2±0.1,93.0±0.5,83.7±3.3,13.5±3.2,45.0±3.4,1.7±0.4,91.8±0.3,77.2±1.9,58.0±6.2,35.5±1.7,2.8±0.2
MC|last|reg,sampled_entropy,94.0±0.1,82.9±1.1,16.5±1.5,31.3±3.4,1.1±0.1,93.1±0.5,84.7±3.1,12.2±2.8,46.4±4.4,1.6±0.4,91.9±0.2,77.8±1.7,54.6±5.4,36.1±1.2,2.7±0.2
SR|reg,SR|reg,94.0±0.1,83.2±1.1,15.9±1.3,31.3±3.4,1.1±0.1,93.1±0.5,84.8±3.1,12.1±2.8,46.5±4.5,1.6±0.4,91.9±0.2,77.8±1.7,54.4±5.5,36.0±1.2,2.7±0.2
MC|all|reg,bald,94.1±0.1,85.9±1.4,14.0±2.3,32.0±3.1,0.9±0.1,93.5±0.2,87.7±0.9,10.4±1.6,45.4±2.5,1.3±0.2,92.1±0.1,79.4±1.2,49.4±3.1,34.0±1.6,2.5±0.1
MC|all|reg,sampled_max_prob,94.1±0.1,85.2±0.9,13.7±0.9,31.2±2.8,0.9±0.1,93.5±0.2,87.9±1.3,10.0±1.3,46.5±5.0,1.3±0.2,92.2±0.2,80.3±1.4,47.6±2.9,37.5±1.2,2.4±0.2
MC|all|reg,variance,94.1±0.1,85.8±1.1,13.8±1.6,32.0±3.2,0.9±0.1,93.5±0.2,87.9±1.0,10.2±1.5,46.0±2.6,1.3±0.2,92.2±0.1,79.7±1.2,48.8±2.6,35.0±1.4,2.5±0.1
MC|all|reg,sampled_entropy,94.1±0.1,85.2±0.9,13.7±0.9,31.2±2.8,0.9±0.1,93.5±0.2,87.9±1.3,10.0±1.3,46.5±5.0,1.3±0.2,92.2±0.2,80.3±1.4,47.6±2.9,37.5±1.2,2.4±0.2
DPP2|reg calibrated,bald,93.8±0.3,81.0±4.4,18.7±4.2,30.0±5.6,1.2±0.3,92.8±0.7,81.5±6.5,16.9±8.3,41.6±6.0,2.0±0.7,91.7±0.2,75.9±1.8,59.0±6.0,32.9±2.8,2.9±0.2
