In [1]:
# Like common file, but with another table structure

In [2]:
import yaml
import os
from yaml import Loader as Loader
from pathlib import Path
import pandas as pd
import numpy as np
import json
from sklearn.metrics import roc_auc_score

from analyze_results import (
    extract_result,
    aggregate_runs,
    from_model_outputs_calc_rcc_auc,
)
from analyze_results import (
    format_results2,
    improvement_over_baseline,
    from_model_outputs_calc_pr_auc,
    from_model_outputs_calc_rpp,
    from_model_outputs_calc_roc_auc,
    from_model_outputs_calc_arc_auc
)

from utils.utils_wandb import init_wandb, wandb
from ue4nlp.ue_scores import *


In [3]:
def choose_metric(metric_type):
    if metric_type  == "rejection-curve-auc":
        return from_model_outputs_calc_arc_auc
    if metric_type == "roc-auc":
        return from_model_outputs_calc_roc_auc
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc

    elif metric_type == "pr-auc":
        return from_model_outputs_calc_pr_auc

    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp

    else:
        raise ValueError("Wrong metric type!")


def get_one_table(runs_dir, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], baseline=None):
    default_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
        #"sampled_entropy": mean_entropy,
    }

    table = []
    for metric_type in metric_types:
        metric = choose_metric(metric_type=metric_type)

        agg_res = aggregate_runs(
            runs_dir, methods=default_methods, metric=metric
        )

        if agg_res.empty:
            print("Broken\n")
            continue

        if metric_type == "rcc-auc":
            final_score = format_results2(agg_res, percents=False)
        elif metric_type == "rpp":
            final_score = format_results2(agg_res, percents=True)
        else:
            final_score = improvement_over_baseline(agg_res, baseline_col="max_prob", baseline=baseline, metric=metric_type, percents=True, subtract=True)
        table.append(final_score)
    res_table = pd.concat(table, axis=1)
    res_table.columns = metric_types
    # fix for rcc-auc and rpp
    if 'baseline (max_prob)' not in res_table.index:
        res_table.loc['baseline (max_prob)'] = 0
    for metric in ['rcc-auc', 'rpp']:
        try:
            res_table[metric].loc['baseline (max_prob)'] = res_table[metric].loc['max_prob']
        except:
            pass
    try:
        res_table = res_table.drop(['max_prob', 'count'])
    except:
        res_table = res_table.drop(['max_prob'])
    return res_table


def collect_tables(run_dirs, names, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], baseline=None):
    all_tables = []
    for run_dir, name in zip(run_dirs, names):
        buf_table = get_one_table(run_dir, metric_types, baseline)
        #print(buf_table)
        # add name to index
        indices = [(name, ind) for ind in list(buf_table.index)]
        baseline_name = 'baseline|'+'|'.join(name.split('|')[2:])
        buf_table.loc[baseline_name] = buf_table.loc['baseline (max_prob)']
        # add reindex
        indices = indices + [(baseline_name, 'max_prob')]
        
        index = pd.MultiIndex.from_tuples(indices, names=['Method', 'UE Score'])
        buf_table.index = index
        buf_table.drop((name, 'baseline (max_prob)'), inplace=True)
        # add buf_table to final_table
        all_tables.append(buf_table)
    return pd.concat(all_tables)


def collect_datasets(runs_dirs, names, dataset_names, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], baselines={}):
    all_tables = []
    for run_dir, dataset_name in zip(runs_dirs, dataset_names):
        dataset_table = collect_tables(run_dir, names, metric_types, baselines.get(dataset_name, None))
        columns = pd.MultiIndex.from_tuples([(dataset_name, ind) for ind in list(dataset_table.columns)])
        dataset_table.columns = columns
        all_tables.append(dataset_table)
    return pd.concat(all_tables, axis=1)

In [4]:
import os 

default_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw']
dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
raw_baselines = {}
for ds_fname, ds_name in zip(dataset_fnames, dataset_names):
    model_series_dir = f'../workdir/run_glue_for_model_series/electra-raw/{ds_fname}/0.0/mahalanobis/'
    model_series_dir += os.listdir(model_series_dir)[-1]
    model_series_dir += f'/{os.listdir(model_series_dir)[-1]}/'
    table = []
    for metric_type in metric_types:
        metric = choose_metric(metric_type=metric_type)

        agg_res = aggregate_runs(
            model_series_dir, methods=default_methods, metric=metric
        )

        mean_res = agg_res.mean(axis=0)
        final_results = mean_res.T
        table.append(final_results.loc[['max_prob']])
    res_table = pd.concat(table, axis=1)
    res_table.columns = metric_types
    raw_baselines[ds_name] = res_table#{k:v for k,v in zip(res_table.columns.values.tolist(), res_table.values[0].tolist())}

In [5]:
raw_baselines

{'MRPC':           rejection-curve-auc    rcc-auc      rpp
 max_prob             0.927839  15.032621  0.01966,
 'CoLA':           rejection-curve-auc    rcc-auc       rpp
 max_prob             0.922343  48.808382  0.024289,
 'SST2 (10%)':           rejection-curve-auc    rcc-auc       rpp
 max_prob               0.9389  17.100472  0.011332}

# Determenistic methods

In [45]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['nuq', 'mahalanobis']
regs = ['reg', 'raw']
dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['', '-True']:
            run_dirs = []
            name_sn = '|spectral_norm' if sn == '-True' else ''
            names = [f'{method}|last|{reg}{name_sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}{sn}/{name}/0.0/{method}/'
                model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
                model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_det = pd.concat([pd.concat(tables), pd.concat(baselines[-4:])])

../workdir/run_glue_for_model_series/electra-reg/mrpc/0.0/nuq/2021-09-21/11-25-22/
../workdir/run_glue_for_model_series/electra-reg/cola/0.0/nuq/2021-09-21/11-46-33/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.0/nuq/2021-09-21/12-20-34/
../workdir/run_glue_for_model_series/electra-reg-True/mrpc/0.0/nuq/2021-09-22/07-09-01/
../workdir/run_glue_for_model_series/electra-reg-True/cola/0.0/nuq/2021-09-22/07-29-50/
../workdir/run_glue_for_model_series/electra-reg-True/sst2/0.0/nuq/2021-09-22/08-03-27/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/nuq/2021-09-21/11-29-31/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/nuq/2021-09-21/11-53-38/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/nuq/2021-09-21/12-50-18/
../workdir/run_glue_for_model_series/electra-raw-True/mrpc/0.0/nuq/2021-09-22/07-13-05/
../workdir/run_glue_for_model_series/electra-raw-True/cola/0.0/nuq/2021-09-22/07-36-36/
../workdir/run_glue_for_model_series/electra-raw-True/sst2/0.0

In [46]:
def preproc_regs(x):
    ind = 2
    if 'baseline' in x:
        ind = 1
    regs = x.split('|')[ind:]
    if len(regs) == 2 and regs[0] == 'raw':
        return regs[-1]
    return '+'.join(regs)
        
table_det = table_det.reset_index()
table_det['Reg. Type'] = table_det.Method.apply(lambda x: preproc_regs(x))
table_det['Method'] = table_det['Method'].apply(lambda x: 'NUQ' if 'nuq' in x else x.split('|')[0].capitalize() )
table_det = table_det[list(table_det.columns[:1]) + list(table_det.columns[-1:]) + list(table_det.columns[1:-1])]

In [47]:
table_det

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,NUQ,reg,epistemic,0.29±0.20,12.48±1.54,1.68±0.26,0.12±0.19,45.10±4.09,2.26±0.18,0.28±0.10,11.98±1.31,0.85±0.10
1,NUQ,reg,aleatoric,0.14±0.23,13.26±1.70,1.84±0.31,0.09±0.18,45.75±3.86,2.31±0.17,0.28±0.11,12.01±1.69,0.85±0.12
2,NUQ,reg,total,0.27±0.21,12.59±1.56,1.70±0.26,0.11±0.20,45.29±4.01,2.28±0.18,0.32±0.10,11.80±1.49,0.83±0.10
3,NUQ,reg+spectral_norm,epistemic,0.29±0.22,12.30±1.47,1.70±0.24,0.04±0.11,46.43±1.01,2.33±0.11,0.20±0.07,13.36±1.25,0.92±0.06
4,NUQ,reg+spectral_norm,aleatoric,0.17±0.26,13.06±1.55,1.85±0.25,0.00±0.10,47.00±1.14,2.37±0.12,0.18±0.11,13.71±1.55,0.94±0.08
5,NUQ,reg+spectral_norm,total,0.27±0.23,12.45±1.51,1.73±0.24,0.04±0.10,46.57±1.06,2.34±0.12,0.20±0.10,13.30±1.52,0.91±0.08
6,NUQ,raw,epistemic,0.19±0.21,13.40±1.29,1.80±0.22,0.34±0.13,41.91±2.11,2.06±0.12,0.26±0.11,12.38±1.59,0.87±0.11
7,NUQ,raw,aleatoric,0.16±0.21,13.59±1.30,1.84±0.22,0.31±0.13,42.34±2.00,2.10±0.12,0.22±0.12,12.85±1.70,0.91±0.12
8,NUQ,raw,total,0.19±0.21,13.44±1.29,1.81±0.22,0.33±0.13,41.94±2.08,2.06±0.12,0.26±0.12,12.43±1.67,0.87±0.12
9,NUQ,spectral_norm,epistemic,0.43±0.19,11.62±1.23,1.63±0.17,0.30±0.10,43.13±2.81,2.14±0.05,0.36±0.07,10.88±0.80,0.77±0.09


In [48]:
print(str(table_det.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
      Method &          Reg. Type &              UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
             & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
         NUQ &                reg &             epistemic &           0.29$\pm$0.20 &  12.48$\pm$1.54 &  1.68$\pm$0.26 &           0.12$\pm$0.19 &  45.10$\pm$4.09 &  2.26$\pm$0.18 &           0.28$\pm$0.10 &  11.98$\pm$1.31 &  0.85$\pm$0.10 \\
         NUQ &                reg &             aleatoric &           0.14$\pm$0.23 &  13.26$\pm$1.70 &  1.84$\pm$0.31 &           0.09$\pm$0.18 &  45.75$\pm$3.86 &  2.31$\pm$0.17 &           0.28$\pm$0.11 &  12.01$\pm$1.69 &  0.85$\pm$0.12 \\
         NUQ &                reg &                 total &           0.27$\pm$0.21 &  12.59$\pm$1.56 &  1.70$\pm$0.26 &           0.11$\pm$0.20 &  45.2

In [11]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['sngp']
regs = ['raw']
dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        run_dirs = []
        names = [f'{method}|last|{reg}']
        for name in dataset_fnames:
            model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}-sngp/{name}/0.0/'
            print(model_series_dir)
            run_dirs.append([model_series_dir])
        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
        baselines.append(res_df.iloc[-1:])
        tables.append(res_df.iloc[:-1])
table_det = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra-raw-sngp/mrpc/0.0/
../workdir/run_glue_for_model_series/electra-raw-sngp/cola/0.0/
../workdir/run_glue_for_model_series/electra-raw-sngp/sst2/0.0/


In [12]:
def preproc_regs(x):
    ind = 2
    if 'baseline' in x:
        ind = 1
    regs = x.split('|')[ind:]
    if len(regs) == 2 and regs[0] == 'raw':
        return '-'
    return '+'.join(regs)
        
table_det = table_det.reset_index()
table_det['Reg. Type'] = table_det.Method.apply(lambda x: preproc_regs(x))
table_det['Method'] = table_det['Method'].apply(lambda x: 'SNGP' if 'sngp' in x else x.split('|')[0])
table_det.loc[list(range(2)), ('UE Score', '')] = 'std'
table_det = table_det[list(table_det.columns[:1]) + list(table_det.columns[-1:]) + list(table_det.columns[1:-1])]

In [13]:
table_det

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,SNGP,raw,std,-0.24±0.30,16.59±2.53,2.27±0.30,-0.47±0.29,59.21±8.50,2.85±0.25,-0.03±0.26,19.90±7.41,1.17±0.26
1,baseline,raw,std,91.97±0.41,21.61±6.58,2.87±0.39,90.84±0.55,84.29±23.30,3.82±0.50,92.59±0.30,45.39±7.71,2.44±0.31


In [11]:
print(str(table_det.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
   Method & Reg. Type &  UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
          & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &       rcc-auc &        rpp & rejection-curve-auc &        rcc-auc &        rpp \\
\midrule
     SNGP &       reg &       std &          -2.47$\pm$3.93 &  38.40$\pm$38.03 &  3.80$\pm$2.62 &          -1.44$\pm$0.32 &    86.34$\pm$8.70 &  3.62$\pm$0.32 &          -5.69$\pm$8.08 &  128.97$\pm$149.41 &  4.86$\pm$4.26 \\
     SNGP &       raw &       std &          -0.34$\pm$0.18 &   17.27$\pm$1.69 &  2.33$\pm$0.15 &          -0.68$\pm$0.23 &    65.33$\pm$6.41 &  3.02$\pm$0.21 &          -0.23$\pm$0.15 &     24.92$\pm$6.43 &  1.37$\pm$0.16 \\
 baseline &       reg &  max\_prob &          90.22$\pm$3.45 &  37.81$\pm$28.42 &  3.92$\pm$2.10 &          89.86$\pm$0.63 &  116.22$\pm$31.18 &  4.60$\pm$0.62 &          88.21$\pm$7.29 &  128.68$\pm$146.73 & 

# MC-Mahalanobis

In [49]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mc_mahalanobis']
regs = ['raw', 'reg']
dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['', '-True']:
            run_dirs = []
            name_sn = '|spectral_norm' if sn == '-True' else ''
            names = [f'{method}|last|{reg}{name_sn}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}{sn}/{name}/0.0/{method}/'
                model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
                model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_mc_det = pd.concat([pd.concat(tables), pd.concat(baselines[-4:])])

../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/mc_mahalanobis/2021-09-28/21-10-42/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/mc_mahalanobis/2021-09-28/22-30-16/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/mc_mahalanobis/2021-09-29/01-10-54/
../workdir/run_glue_for_model_series/electra-raw-True/mrpc/0.0/mc_mahalanobis/2021-09-29/06-52-29/
../workdir/run_glue_for_model_series/electra-raw-True/cola/0.0/mc_mahalanobis/2021-09-29/07-41-30/
../workdir/run_glue_for_model_series/electra-raw-True/sst2/0.0/mc_mahalanobis/2021-09-29/09-14-39/
../workdir/run_glue_for_model_series/electra-reg/mrpc/0.0/mc_mahalanobis/2021-09-28/21-50-20/
../workdir/run_glue_for_model_series/electra-reg/cola/0.0/mc_mahalanobis/2021-09-28/23-50-17/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.0/mc_mahalanobis/2021-09-29/09-47-45/
../workdir/run_glue_for_model_series/electra-reg-True/mrpc/0.0/mc_mahalanobis/2021-09-29/07-17-10/
../workdir/run_glue_for_model_series/ele

In [5]:
def preproc_regs(x):
    ind = 2
    if 'baseline' in x:
        ind = 1
    regs = x.split('|')[ind:]
    if len(regs) == 2 and regs[0] == 'raw':
        return regs[-1]
    return '+'.join(regs)
        
table_mc_det = table_mc_det.reset_index()
table_mc_det['Reg. Type'] = table_mc_det.Method.apply(lambda x: preproc_regs(x))
table_mc_det['Method'] = table_mc_det['Method'].apply(lambda x: x.split('|')[0][:2].upper() + ' ' + x.split('|')[0][3:].capitalize() if 'maha' in x else x.split('|')[0])
table_mc_det = table_mc_det[list(table_mc_det.columns[:1]) + list(table_mc_det.columns[-1:]) + list(table_mc_det.columns[1:-1])]

In [51]:
table_mc_det

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MC Mahalanobis,raw,mahalanobis_distance,0.18±0.18,13.38±1.17,1.81±0.20,0.41±0.19,40.68±2.63,2.02±0.17,0.22±0.11,12.75±1.38,0.91±0.11
1,MC Mahalanobis,raw,sampled_mahalanobis_distance,0.31±0.19,13.06±1.43,1.67±0.14,-0.09±0.32,51.27±6.49,2.53±0.29,-0.00±0.22,22.13±5.52,1.14±0.22
2,MC Mahalanobis,spectral_norm,mahalanobis_distance,0.44±0.24,11.50±1.54,1.62±0.23,0.51±0.08,39.43±2.04,1.97±0.07,0.29±0.03,11.64±0.69,0.85±0.06
3,MC Mahalanobis,spectral_norm,sampled_mahalanobis_distance,0.64±0.17,10.73±1.83,1.44±0.16,0.11±0.12,47.63±2.59,2.36±0.11,0.11±0.12,18.12±5.20,1.04±0.14
4,MC Mahalanobis,reg,mahalanobis_distance,0.33±0.17,12.11±1.30,1.64±0.21,0.21±0.22,43.42±4.26,2.18±0.23,0.15±0.20,13.98±2.63,1.00±0.20
5,MC Mahalanobis,reg,sampled_mahalanobis_distance,0.18±0.29,13.82±2.31,1.83±0.28,-1.22±0.74,82.36±29.02,3.65±0.82,-0.35±0.43,26.71±9.85,1.50±0.44
6,MC Mahalanobis,reg+spectral_norm,mahalanobis_distance,0.33±0.26,12.13±1.98,1.70±0.32,0.25±0.09,43.45±1.48,2.15±0.12,-14.27±22.26,295.49±434.88,2.38±2.09
7,MC Mahalanobis,reg+spectral_norm,sampled_mahalanobis_distance,0.24±0.20,13.06±1.84,1.79±0.24,-0.94±0.44,68.33±10.40,3.34±0.45,-14.42±22.24,297.98±432.64,2.51±2.08
8,baseline,raw,max_prob,92.78±0.21,15.03±2.09,1.97±0.22,92.23±0.32,48.81±7.14,2.43±0.28,93.89±0.21,17.10±3.29,1.13±0.23
9,baseline,spectral_norm,max_prob,92.60±0.21,16.60±3.53,2.25±0.17,91.55±0.17,65.50±6.87,3.15±0.18,93.91±0.21,16.21±2.95,1.12±0.23


In [11]:
print(str(table_det.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
         Method &          Reg. Type &                      UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
                & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &        rcc-auc &        rpp \\
\midrule
 MC Mahalanobis &                raw &          mahalanobis\_distance &           0.18$\pm$0.18 &  13.38$\pm$1.17 &  1.81$\pm$0.20 &           0.41$\pm$0.19 &   40.68$\pm$2.63 &  2.02$\pm$0.17 &           0.22$\pm$0.11 &     12.75$\pm$1.38 &  0.91$\pm$0.11 \\
 MC Mahalanobis &                raw &  sampled\_mahalanobis\_distance &           0.31$\pm$0.19 &  13.06$\pm$1.43 &  1.67$\pm$0.14 &          -0.09$\pm$0.32 &   51.27$\pm$6.49 &  2.53$\pm$0.29 &          -0.00$\pm$0.22 &     22.13$\pm$5.52 &  1.14$\pm$0.22 \\
 MC Mahalanobis &      spectral\_norm &          mahalanobis\_distance &           0.44$\pm$0.24 &  1

# MC-Dropout 

In [12]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mc/all', 'mc/last']
regs = ['raw']
dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        run_dirs = []
        layer = method.split('/')[-1]
        names = [f'mc|{layer}|{reg}']
        for name in dataset_fnames:
            model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/0.0/{method}/'
            model_series_dir += os.listdir(model_series_dir)[-1]
            model_series_dir += f'/{os.listdir(model_series_dir)[-1]}/'
            print(model_series_dir)
            run_dirs.append([model_series_dir])
        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
        baselines.append(res_df.iloc[-1:])
        tables.append(res_df.iloc[:-1])
table_mc = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])
table_mc = table_mc.reset_index()

../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/mc/all/2021-09-23/08-47-35/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/mc/all/2021-09-23/09-38-50/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/mc/all/2021-09-23/10-26-04/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/mc/last/2021-09-23/08-37-24/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/mc/last/2021-09-23/09-06-43/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/mc/last/2021-09-23/10-06-44/


In [13]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]
        
table_mc['Reg. Type'] = table_mc.Method.apply(lambda x: preproc_regs(x))
table_mc['Dropout Layers'] = table_mc['Method'].apply(lambda x: x.split('|')[1] if 'baseline' not in x else '-')
table_mc['Method'] = table_mc['Method'].apply(lambda x: x.split('|')[0].upper() if 'baseline' not in x else x.split('|')[0])
table_mc = table_mc[list(table_mc.columns[:1]) + list(table_mc.columns[-2:]) + list(table_mc.columns[1:-2])]

In [14]:
table_mc

Unnamed: 0_level_0,Method,Reg. Type,Dropout Layers,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MC,raw,all,bald,0.59±0.11,11.78±0.77,1.41±0.12,0.17±0.22,47.55±5.41,2.28±0.17,0.28±0.10,13.51±2.03,0.85±0.11
1,MC,raw,all,sampled_max_prob,0.53±0.13,11.60±1.21,1.45±0.14,0.30±0.18,43.51±4.19,2.15±0.15,0.22±0.12,13.99±1.69,0.91±0.12
2,MC,raw,all,variance,0.60±0.12,11.66±0.70,1.41±0.12,0.23±0.23,45.97±4.97,2.24±0.17,0.28±0.10,13.24±1.58,0.85±0.11
3,MC,raw,last,bald,-0.82±0.32,25.62±2.90,2.84±0.30,-0.47±0.50,64.35±14.49,2.89±0.48,-0.35±0.17,26.10±5.15,1.50±0.18
4,MC,raw,last,sampled_max_prob,-0.06±0.27,16.16±2.88,2.04±0.30,-0.01±0.32,49.20±6.71,2.45±0.28,-0.02±0.23,18.09±4.47,1.15±0.25
5,MC,raw,last,variance,-0.40±0.26,20.70±2.96,2.39±0.26,-0.17±0.43,55.38±10.10,2.60±0.39,-0.14±0.19,21.72±4.77,1.28±0.20
6,baseline,raw,-,max_prob,92.78±0.21,15.03±2.09,1.97±0.22,92.23±0.32,48.81±7.14,2.43±0.28,93.89±0.21,17.10±3.29,1.13±0.23
7,baseline,raw,-,max_prob,92.78±0.21,15.03±2.09,1.97±0.22,92.23±0.32,48.81±7.14,2.43±0.28,93.89±0.21,17.10±3.29,1.13±0.23


In [15]:
print(str(table_mc.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{lllllllllllll}
\toprule
   Method & Reg. Type & Dropout Layers &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
          & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
       MC &       raw &            all &              bald &           0.59$\pm$0.11 &  11.78$\pm$0.77 &  1.41$\pm$0.12 &           0.17$\pm$0.22 &   47.55$\pm$5.41 &  2.28$\pm$0.17 &           0.28$\pm$0.10 &  13.51$\pm$2.03 &  0.85$\pm$0.11 \\
       MC &       raw &            all &  sampled\_max\_prob &           0.53$\pm$0.13 &  11.60$\pm$1.21 &  1.45$\pm$0.14 &           0.30$\pm$0.18 &   43.51$\pm$4.19 &  2.15$\pm$0.15 &           0.22$\pm$0.12 &  13.99$\pm$1.69 &  0.91$\pm$0.12 \\
       MC &       raw &            all &          variance &           0.60$\pm$0.12 &  11.66$\pm$0.70 &  1.41$\pm$0.12 &           0.23$\pm$0.23 

# MC-DPP all

In [5]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['False', 'True']
regs = ['reg', 'raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]


dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for max_frac in max_fracs:
        for val_subsample in val_subsamples:
            for reg in regs:
                run_dirs = []
                dpp_type = 'with_ood' if method=='True' else 'on_masks'
                names = [f'dpp_{dpp_type}|{max_frac}|{val_subsample}|{reg}']
                for name in dataset_fnames:
                    model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
                    model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
                    model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])
table_dpp = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra-reg/mrpc/0.0/dpp/False/0.3/2021-10-01/16-30-33/
../workdir/run_glue_for_model_series/electra-reg/cola/0.0/dpp/False/0.3/2021-10-02/06-23-06/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.0/dpp/False/0.3/2021-10-03/07-25-53/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/False/0.3/2021-10-01/18-31-05/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/False/0.3/2021-10-02/10-05-55/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/False/0.3/2021-10-03/10-23-36/
../workdir/run_glue_for_model_series/electra-reg/mrpc/0.1/dpp/False/0.3/2021-10-01/16-14-02/
../workdir/run_glue_for_model_series/electra-reg/cola/0.1/dpp/False/0.3/2021-10-02/05-55-17/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.1/dpp/False/0.3/2021-10-03/07-04-44/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.1/dpp/False/0.3/2021-10-01/18-15-59/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/dpp/False/0.

In [6]:
table_dpp.sort_values(by=('SST2 (10%)', 'rcc-auc')).iloc[:50]

Unnamed: 0_level_0,Unnamed: 1_level_0,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
dpp_with_ood|0.4|0.0|reg,sampled_max_prob,-0.12±0.10,13.55±0.73,1.87±0.19,0.21±0.10,47.52±5.06,2.35±0.26,0.05±0.07,12.03±1.49,0.84±0.09
dpp_with_ood|0.6|0.0|reg,sampled_max_prob,-0.09±0.07,13.39±0.73,1.84±0.18,0.15±0.07,48.73±5.30,2.41±0.27,0.06±0.05,12.10±1.59,0.85±0.10
dpp_with_ood|0.3|0.0|reg,sampled_max_prob,-0.14±0.09,13.67±0.74,1.90±0.18,0.25±0.11,47.20±4.88,2.32±0.25,0.04±0.10,12.49±1.58,0.86±0.08
dpp_on_masks|0.6|0.0|reg,sampled_max_prob,0.02±0.02,12.75±0.98,1.74±0.21,-0.02±0.02,51.43±5.55,2.57±0.27,-0.01±0.02,13.07±1.80,0.90±0.11
dpp_on_masks|0.6|0.0|reg,variance,-0.01±0.12,12.89±1.23,1.74±0.22,-0.07±0.10,52.46±5.81,2.63±0.30,-0.01±0.05,13.18±1.98,0.91±0.13
dpp_with_ood|0.6|0.0|reg,variance,0.11±0.14,12.35±0.69,1.66±0.22,0.11±0.14,49.10±5.99,2.45±0.31,0.02±0.16,13.22±2.01,0.88±0.09
dpp_on_masks|0.4|0.0|reg,sampled_max_prob,0.06±0.06,12.43±1.31,1.68±0.26,0.01±0.06,51.06±4.97,2.56±0.26,-0.01±0.02,13.23±1.86,0.91±0.12
dpp_on_masks|0.3|0.0|reg,sampled_max_prob,0.04±0.03,12.53±0.98,1.70±0.22,-0.01±0.08,51.19±5.18,2.57±0.28,-0.05±0.08,13.66±2.10,0.94±0.15
dpp_on_masks|0.4|0.0|reg,variance,0.11±0.12,12.20±1.66,1.64±0.30,-0.09±0.13,52.46±4.39,2.65±0.23,-0.03±0.06,13.71±1.31,0.93±0.09
dpp_with_ood|0.3|0.1|raw,sampled_max_prob,-0.18±0.14,16.59±3.02,2.16±0.31,0.17±0.23,59.09±9.99,2.44±0.21,0.24±0.13,13.76±2.39,0.94±0.20


# MC-DPP calibration

In [16]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['False', 'True']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]

max_fracs_dicts = {'mrpc': {'False': 0.6, 'True': 0.6}, 
                   'cola': {'False': 0.6, 'True': 0.6}, 
                   'sst2': {'False': 0.3, 'True': 0.3}}

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for val_subsample in val_subsamples:
        for reg in regs:
            run_dirs = []
            dpp_type = 'with_ood' if method=='True' else 'on_masks'
            names = [f'DPP_{dpp_type}|{val_subsample}|{reg}']
            for name in dataset_fnames:
                max_frac = max_fracs_dicts[name][method]
                model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
                model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
                model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_dpp_cal = pd.concat([pd.concat(tables), pd.concat(baselines[-2:-1])])

../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/False/0.6/2021-10-02/03-46-27/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/False/0.6/2021-10-03/03-10-07/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/False/0.3/2021-10-03/10-23-36/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.1/dpp/False/0.6/2021-10-02/03-22-18/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/dpp/False/0.6/2021-10-03/02-29-18/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.1/dpp/False/0.3/2021-10-03/10-02-10/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/True/0.6/2021-10-02/05-02-54/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/True/0.6/2021-10-03/05-28-09/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/True/0.3/2021-10-03/11-51-10/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.1/dpp/True/0.6/2021-10-02/04-33-03/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/dpp/True/0.6/202

In [17]:
def preproc_cal(x):
    if 'baseline' in x:
        return '-'
    ds = x.split('|')[1]
    if ds == '0.1':
        return 'val.'
    return 'train'
        
table_dpp_cal = table_dpp_cal.reset_index()
table_dpp_cal['Calibr. Dataset'] = table_dpp_cal.Method.apply(lambda x: preproc_cal(x))
table_dpp_cal['Method'] = table_dpp_cal['Method'].apply(lambda x: x.split('|')[0])
table_dpp_cal = table_dpp_cal[list(table_dpp_cal.columns[:1]) + list(table_dpp_cal.columns[-1:]) + list(table_dpp_cal.columns[1:-1])]

In [18]:
table_dpp_cal

Unnamed: 0_level_0,Method,Calibr. Dataset,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DPP_on_masks,train,bald,-0.05±0.16,15.99±0.98,2.03±0.18,-0.11±0.39,51.39±9.34,2.54±0.37,-0.01±0.19,19.37±3.13,1.15±0.18
1,DPP_on_masks,train,sampled_max_prob,0.04±0.18,14.42±1.06,1.92±0.20,0.00±0.31,48.92±7.28,2.43±0.28,0.01±0.22,17.14±3.35,1.11±0.23
2,DPP_on_masks,train,variance,0.02±0.22,14.76±1.48,1.94±0.25,-0.04±0.32,49.79±8.13,2.47±0.32,-0.03±0.19,19.03±2.89,1.15±0.19
3,DPP_on_masks,val.,bald,-0.69±1.05,25.97±18.99,2.60±1.04,-0.44±0.26,59.58±7.91,2.86±0.20,0.09±0.23,15.88±3.32,1.07±0.23
4,DPP_on_masks,val.,sampled_max_prob,-0.02±0.45,15.18±3.32,1.92±0.36,-0.20±0.14,53.21±3.52,2.61±0.18,-0.01±0.20,17.46±3.05,1.18±0.22
5,DPP_on_masks,val.,variance,-0.06±0.44,15.36±3.35,1.95±0.35,-0.26±0.11,54.27±4.05,2.66±0.14,0.02±0.20,16.84±2.86,1.14±0.19
6,DPP_with_ood,train,bald,-0.31±0.51,20.21±6.49,2.30±0.48,0.04±0.30,51.21±10.43,2.38±0.30,-0.37±0.16,25.45±3.33,1.50±0.16
7,DPP_with_ood,train,sampled_max_prob,-0.05±0.16,15.22±0.86,2.04±0.19,0.15±0.27,48.60±8.17,2.27±0.24,0.05±0.24,17.60±5.34,1.08±0.24
8,DPP_with_ood,train,variance,-0.11±0.22,16.91±3.15,2.09±0.20,0.10±0.28,49.57±9.21,2.30±0.26,-0.24±0.26,22.98±4.39,1.37±0.24
9,DPP_with_ood,val.,bald,-1.82±1.11,37.60±17.51,3.75±1.07,-0.31±0.44,65.64±16.01,2.73±0.42,-0.42±0.07,27.17±3.71,1.57±0.08


In [19]:
print(str(table_dpp_cal.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
       Method & Calibr. Dataset &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
              & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
 DPP\_on\_masks &           train &              bald &          -0.05$\pm$0.16 &   15.99$\pm$0.98 &  2.03$\pm$0.18 &          -0.11$\pm$0.39 &   51.39$\pm$9.34 &  2.54$\pm$0.37 &          -0.01$\pm$0.19 &  19.37$\pm$3.13 &  1.15$\pm$0.18 \\
 DPP\_on\_masks &           train &  sampled\_max\_prob &           0.04$\pm$0.18 &   14.42$\pm$1.06 &  1.92$\pm$0.20 &           0.00$\pm$0.31 &   48.92$\pm$7.28 &  2.43$\pm$0.28 &           0.01$\pm$0.22 &  17.14$\pm$3.35 &  1.11$\pm$0.23 \\
 DPP\_on\_masks &           train &          variance &           0.02$\pm$0.22 &   14.76$\pm$1.48 &  1.94$\pm$0.25 &          -0.04$\pm$0.32 &   49.79$\pm$8

# MC-DPP regs

In [20]:
table_dpp.sort_values(by=('SST2 (10%)', 'rcc-auc')).iloc[:50]

NameError: name 'table_dpp' is not defined

In [21]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['False', 'True']
regs = ['reg']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]

max_fracs_dicts = {'mrpc': {'False': 0.3, 'True': 0.6}, 
                   'cola': {'False': 0.6, 'True': 0.3}, 
                   'sst2': {'False': 0.4, 'True': 0.6}}
val_subsamples_dicts = {'mrpc': {'False': 0.0, 'True': 0.0}, 
                        'cola': {'False': 0.0, 'True': 0.0}, 
                        'sst2': {'False': 0.0, 'True': 0.0}}

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        run_dirs = []
        dpp_type = 'with_ood' if method=='True' else 'on_masks'
        for name in dataset_fnames:      
            val_subsample = val_subsamples_dicts[name][method]
            max_frac = max_fracs_dicts[name][method]
            names = [f'DPP_{dpp_type}|{val_subsample}|{reg}']
            model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
            model_series_dir += os.listdir(model_series_dir)[-1]
            model_series_dir += f'/{os.listdir(model_series_dir)[-1]}/'
            print(model_series_dir)
            run_dirs.append([model_series_dir])
        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
        baselines.append(res_df.iloc[-1:])
        tables.append(res_df.iloc[:-1])

../workdir/run_glue_for_model_series/electra-reg/mrpc/0.0/dpp/False/0.3/2021-10-01/16-30-33/
../workdir/run_glue_for_model_series/electra-reg/cola/0.0/dpp/False/0.6/2021-10-02/22-25-22/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.0/dpp/False/0.4/2021-10-03/13-24-49/
../workdir/run_glue_for_model_series/electra-reg/mrpc/0.0/dpp/True/0.6/2021-10-02/02-26-43/
../workdir/run_glue_for_model_series/electra-reg/cola/0.0/dpp/True/0.3/2021-10-02/08-14-31/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.0/dpp/True/0.6/2021-10-03/21-04-44/


In [22]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

table_dpp_reg = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])
table_dpp_reg = table_dpp_reg.reset_index()
table_dpp_reg['Reg. Type'] = table_dpp_reg.Method.apply(lambda x: preproc_regs(x))
table_dpp_reg['Method'] = table_dpp_reg['Method'].apply(lambda x: x.split('|')[0])
table_dpp_reg = table_dpp_reg[list(table_dpp_reg.columns[:1]) + list(table_dpp_reg.columns[-1:]) + list(table_dpp_reg.columns[1:-1])]

In [23]:
table_dpp_reg

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DPP_on_masks,reg,bald,0.36±0.33,12.06±1.94,1.62±0.34,-0.36±0.34,54.66±6.57,2.77±0.37,0.17±0.13,14.86±1.94,0.97±0.13
1,DPP_on_masks,reg,sampled_max_prob,0.27±0.15,12.53±0.98,1.70±0.22,-0.17±0.29,51.43±5.55,2.57±0.27,0.23±0.13,13.23±1.86,0.91±0.12
2,DPP_on_masks,reg,variance,0.28±0.23,12.32±1.49,1.66±0.28,-0.22±0.31,52.46±5.81,2.63±0.30,0.20±0.10,13.71±1.31,0.93±0.09
3,DPP_with_ood,reg,bald,0.19±0.43,14.62±5.18,1.76±0.41,-0.18±0.19,52.10±3.75,2.60±0.18,0.12±0.25,16.52±4.27,1.04±0.26
4,DPP_with_ood,reg,sampled_max_prob,0.14±0.15,13.39±0.73,1.84±0.18,0.10±0.26,47.20±4.88,2.32±0.25,0.29±0.11,12.10±1.59,0.85±0.10
5,DPP_with_ood,reg,variance,0.34±0.14,12.35±0.69,1.66±0.22,0.00±0.21,48.90±4.38,2.41±0.20,0.26±0.10,13.22±2.01,0.88±0.09
6,baseline,reg,max_prob,93.01±0.16,12.75±0.96,1.74±0.22,92.08±0.30,51.20±5.73,2.56±0.28,94.13±0.13,12.97±1.73,0.90±0.11
7,baseline,reg,max_prob,93.01±0.16,12.75±0.96,1.74±0.22,92.08±0.30,51.20±5.73,2.56±0.28,94.13±0.13,12.97±1.73,0.90±0.11


In [24]:
print(str(table_dpp_reg.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{llllllllllll}
\toprule
       Method & Reg. Type &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
              & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
 DPP\_on\_masks &       reg &              bald &           0.36$\pm$0.33 &  12.06$\pm$1.94 &  1.62$\pm$0.34 &          -0.36$\pm$0.34 &  54.66$\pm$6.57 &  2.77$\pm$0.37 &           0.17$\pm$0.13 &  14.86$\pm$1.94 &  0.97$\pm$0.13 \\
 DPP\_on\_masks &       reg &  sampled\_max\_prob &           0.27$\pm$0.15 &  12.53$\pm$0.98 &  1.70$\pm$0.22 &          -0.17$\pm$0.29 &  51.43$\pm$5.55 &  2.57$\pm$0.27 &           0.23$\pm$0.13 &  13.23$\pm$1.86 &  0.91$\pm$0.12 \\
 DPP\_on\_masks &       reg &          variance &           0.28$\pm$0.23 &  12.32$\pm$1.49 &  1.66$\pm$0.28 &          -0.22$\pm$0.31 &  52.46$\pm$5.81 &  2.63$\pm$0.30 &          

# Raw DPP

In [25]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
regs = ['reg', 'raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]


dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for max_frac in max_fracs:
    for val_subsample in val_subsamples:
        for reg in regs:
            run_dirs = []
            names = [f'dpp|{max_frac}|{val_subsample}|{reg}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/raw_dpp/{max_frac}/'
                model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
                model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])
table_dpp = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])

../workdir/run_glue_for_model_series/electra-reg/mrpc/0.0/raw_dpp/0.3/2021-10-01/16-36-07/
../workdir/run_glue_for_model_series/electra-reg/cola/0.0/raw_dpp/0.3/2021-10-02/04-14-06/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.0/raw_dpp/0.3/2021-10-03/02-09-45/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/raw_dpp/0.3/2021-10-01/18-07-08/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/raw_dpp/0.3/2021-10-02/07-18-23/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/raw_dpp/0.3/2021-10-03/04-38-20/
../workdir/run_glue_for_model_series/electra-reg/mrpc/0.1/raw_dpp/0.3/2021-10-01/16-14-13/
../workdir/run_glue_for_model_series/electra-reg/cola/0.1/raw_dpp/0.3/2021-10-02/03-37-17/
../workdir/run_glue_for_model_series/electra-reg/sst2/0.1/raw_dpp/0.3/2021-10-03/01-42-58/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.1/raw_dpp/0.3/2021-10-01/17-47-49/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/raw_dpp/0.3/2021-10-02/06-46-16/

In [None]:
table_dpp.sort_values(by=('SST2 (10%)', 'rcc-auc')).iloc[:50]

In [26]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]

max_fracs_dicts = {'mrpc': 0.3, 
                   'cola': 0.4, 
                   'sst2': 0.6}
val_subsamples_dicts = {'mrpc': 0.0, 
                        'cola': 0.1, 
                        'sst2': 0.0}

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []
for reg in regs:
    run_dirs = []
    for name in dataset_fnames:      
        val_subsample = val_subsamples_dicts[name]
        max_frac = max_fracs_dicts[name]
        names = [f'DPP|{val_subsample}|{reg}']
        model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
        model_series_dir += os.listdir(model_series_dir)[-1]
        model_series_dir += f'/{os.listdir(model_series_dir)[-1]}/'
        print(model_series_dir)
        run_dirs.append([model_series_dir])
    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
    baselines.append(res_df.iloc[-1:])
    tables.append(res_df.iloc[:-1])

../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/True/0.3/2021-10-01/19-33-07/
../workdir/run_glue_for_model_series/electra-raw/cola/0.1/dpp/True/0.4/2021-10-02/19-28-25/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/True/0.6/2021-10-03/23-59-01/


In [27]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

table_raw_dpp = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])
table_raw_dpp = table_raw_dpp.reset_index()
table_raw_dpp['Method'] = table_raw_dpp['Method'].apply(lambda x: x.split('|')[0])
#table_raw_dpp = table_raw_dpp[list(table_raw_dpp.columns[:1]) + list(table_raw_dpp.columns[-1:]) + list(table_raw_dpp.columns[1:-1])]

In [28]:
table_raw_dpp

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DPP,bald,-0.31±0.52,20.57±6.22,2.28±0.51,-0.31±0.45,65.82±14.59,2.73±0.40,-0.19±0.29,21.60±6.41,1.33±0.29
1,DPP,sampled_max_prob,-0.08±0.17,15.65±1.56,2.07±0.22,-0.01±0.17,55.23±8.15,2.41±0.19,0.04±0.23,17.06±4.21,1.10±0.22
2,DPP,variance,0.01±0.40,15.95±5.33,1.98±0.43,-0.06±0.24,58.42±11.42,2.48±0.21,-0.01±0.20,18.76±5.07,1.15±0.22
3,baseline,max_prob,92.78±0.21,15.03±2.09,1.97±0.22,92.02±0.17,53.17±3.70,2.62±0.20,93.89±0.21,17.10±3.29,1.13±0.23


In [29]:
print(str(table_raw_dpp.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{lllllllllll}
\toprule
   Method &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
          & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
      DPP &              bald &          -0.31$\pm$0.52 &  20.57$\pm$6.22 &  2.28$\pm$0.51 &          -0.31$\pm$0.45 &  65.82$\pm$14.59 &  2.73$\pm$0.40 &          -0.19$\pm$0.29 &  21.60$\pm$6.41 &  1.33$\pm$0.29 \\
      DPP &  sampled\_max\_prob &          -0.08$\pm$0.17 &  15.65$\pm$1.56 &  2.07$\pm$0.22 &          -0.01$\pm$0.17 &   55.23$\pm$8.15 &  2.41$\pm$0.19 &           0.04$\pm$0.23 &  17.06$\pm$4.21 &  1.10$\pm$0.22 \\
      DPP &          variance &           0.01$\pm$0.40 &  15.95$\pm$5.33 &  1.98$\pm$0.43 &          -0.06$\pm$0.24 &  58.42$\pm$11.42 &  2.48$\pm$0.21 &          -0.01$\pm$0.20 &  18.76$\pm$5.07 &  1.15$\pm$0.22 \\
 baseline &       

In [30]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['False', 'True']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.6]
val_subsamples = [0.0, 0.1]

max_fracs_dicts = {'mrpc': {'False': 0.6, 'True': 0.6}, 
                   'cola': {'False': 0.6, 'True': 0.6}, 
                   'sst2': {'False': 0.3, 'True': 0.3}}
val_subsamples_dicts = {'mrpc': {'False': 0.0, 'True': 0.0}, 
                        'cola': {'False': 0.0, 'True': 0.0}, 
                        'sst2': {'False': 0.0, 'True': 0.1}}

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
names = []
tables = []
baselines = []


for method in methods:
    for reg in regs:
        run_dirs = []
        dpp_type = 'with_ood' if method=='True' else 'on_masks'
        for name in dataset_fnames:      
            val_subsample = val_subsamples_dicts[name][method]
            max_frac = max_fracs_dicts[name][method]
            names = [f'DPP_{dpp_type}|{val_subsample}|{reg}']
            model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}/{name}/{val_subsample}/dpp/{method}/{max_frac}/'
            model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
            model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/'
            print(model_series_dir)
            run_dirs.append([model_series_dir])
        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)
        baselines.append(res_df.iloc[-1:])
        tables.append(res_df.iloc[:-1])

../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/False/0.6/2021-10-02/03-46-27/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/False/0.6/2021-10-03/03-10-07/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.0/dpp/False/0.3/2021-10-03/10-23-36/
../workdir/run_glue_for_model_series/electra-raw/mrpc/0.0/dpp/True/0.6/2021-10-02/05-02-54/
../workdir/run_glue_for_model_series/electra-raw/cola/0.0/dpp/True/0.6/2021-10-03/05-28-09/
../workdir/run_glue_for_model_series/electra-raw/sst2/0.1/dpp/True/0.3/2021-10-03/11-22-29/


In [31]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

table_dpp_2 = pd.concat([pd.concat(tables), pd.concat(baselines[-2:])])
table_dpp_2 = table_dpp_2.reset_index()
table_dpp_2['Method'] = table_dpp_2['Method'].apply(lambda x: x.split('|')[0])

In [32]:
table_dpp_2

Unnamed: 0_level_0,Method,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DPP_on_masks,bald,-0.05±0.16,15.99±0.98,2.03±0.18,-0.11±0.39,51.39±9.34,2.54±0.37,-0.01±0.19,19.37±3.13,1.15±0.18
1,DPP_on_masks,sampled_max_prob,0.04±0.18,14.42±1.06,1.92±0.20,0.00±0.31,48.92±7.28,2.43±0.28,0.01±0.22,17.14±3.35,1.11±0.23
2,DPP_on_masks,variance,0.02±0.22,14.76±1.48,1.94±0.25,-0.04±0.32,49.79±8.13,2.47±0.32,-0.03±0.19,19.03±2.89,1.15±0.19
3,DPP_with_ood,bald,-0.31±0.51,20.21±6.49,2.30±0.48,0.04±0.30,51.21±10.43,2.38±0.30,-0.42±0.07,27.17±3.71,1.57±0.08
4,DPP_with_ood,sampled_max_prob,-0.05±0.16,15.22±0.86,2.04±0.19,0.15±0.27,48.60±8.17,2.27±0.24,0.21±0.19,13.76±2.39,0.94±0.20
5,DPP_with_ood,variance,-0.11±0.22,16.91±3.15,2.09±0.20,0.10±0.28,49.57±9.21,2.30±0.26,-0.17±0.12,22.83±4.46,1.32±0.12
6,baseline,max_prob,92.78±0.21,15.03±2.09,1.97±0.22,92.23±0.32,48.81±7.14,2.43±0.28,93.89±0.21,17.10±3.29,1.13±0.23
7,baseline,max_prob,92.78±0.21,15.03±2.09,1.97±0.22,92.23±0.32,48.81±7.14,2.43±0.28,93.86±0.21,17.31±2.91,1.19±0.22


In [33]:
print(str(table_dpp_2.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{lllllllllll}
\toprule
       Method &          UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
              & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &      rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
 DPP\_on\_masks &              bald &          -0.05$\pm$0.16 &  15.99$\pm$0.98 &  2.03$\pm$0.18 &          -0.11$\pm$0.39 &   51.39$\pm$9.34 &  2.54$\pm$0.37 &          -0.01$\pm$0.19 &  19.37$\pm$3.13 &  1.15$\pm$0.18 \\
 DPP\_on\_masks &  sampled\_max\_prob &           0.04$\pm$0.18 &  14.42$\pm$1.06 &  1.92$\pm$0.20 &           0.00$\pm$0.31 &   48.92$\pm$7.28 &  2.43$\pm$0.28 &           0.01$\pm$0.22 &  17.14$\pm$3.35 &  1.11$\pm$0.23 \\
 DPP\_on\_masks &          variance &           0.02$\pm$0.22 &  14.76$\pm$1.48 &  1.94$\pm$0.25 &          -0.04$\pm$0.32 &   49.79$\pm$8.13 &  2.47$\pm$0.32 &          -0.03$\pm$0.19 &  19.03$\pm$2.89 &  1.15$\pm$

# Ensemble

In [34]:
import os 

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']

dataset_names = ['MRPC', 'CoLA', 'SST2 (10%)']
dataset_fnames = ['mrpc', 'cola', 'sst2']
dataset_to_time = {'mrpc':'09-07-34', 'cola': '09-13-51', 'sst2': '09-20-48'}
run_dirs = []
for name in dataset_fnames:      
    names = [f'Deep Ensemble']
    time = dataset_to_time[name]
    model_series_dir = f'/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-09-24/{time}/final_results/'
    print(model_series_dir)
    run_dirs.append([model_series_dir])
ens_tab = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines=raw_baselines)

/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-09-24/09-07-34/final_results/
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-09-24/09-13-51/final_results/
/mnt/users/avazhentsev/uncertainty-estimation/workdir/run_glue_for_ensemble_series/2021-09-24/09-20-48/final_results/


In [35]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

ens_tab = ens_tab.reset_index()
ens_tab['Reg. Type'] = 'raw'
ens_tab['Dropout Layers'] = '-'
ens_tab['Method'] = ens_tab['Method'].apply(lambda x: x.split('|')[0])
ens_tab = ens_tab[list(ens_tab.columns[:1]) + list(ens_tab.columns[-2:]) + list(ens_tab.columns[1:-2])]

In [36]:
ens_tab

Unnamed: 0_level_0,Method,Reg. Type,Dropout Layers,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,Deep Ensemble,raw,-,bald,0.49±0.17,13.29±1.10,1.61±0.10,0.09±0.03,52.01±3.77,2.36±0.04,0.51±0.05,10.68±1.91,0.66±0.03
1,Deep Ensemble,raw,-,sampled_max_prob,0.53±0.09,11.44±0.28,1.56±0.09,0.20±0.15,44.90±2.62,2.28±0.16,0.47±0.01,9.93±0.30,0.69±0.01
2,Deep Ensemble,raw,-,variance,0.56±0.12,13.23±1.16,1.60±0.09,0.15±0.05,50.19±2.73,2.33±0.06,0.48±0.04,9.82±0.55,0.68±0.03
3,baseline,raw,-,max_prob,92.83±0.28,14.72±2.28,1.95±0.18,91.59±0.43,62.64±12.58,3.03±0.44,94.15±0.09,13.44±1.93,0.91±0.10


# Combine all

In [37]:
table_dpp_reg['Dropout Layers'] = 'last'
table_det['Dropout Layers'] = '-'

In [38]:
table_dpp_reg

Unnamed: 0_level_0,Method,Reg. Type,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%),Dropout Layers
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,Unnamed: 13_level_1
0,DPP_on_masks,reg,bald,0.36±0.33,12.06±1.94,1.62±0.34,-0.36±0.34,54.66±6.57,2.77±0.37,0.17±0.13,14.86±1.94,0.97±0.13,last
1,DPP_on_masks,reg,sampled_max_prob,0.27±0.15,12.53±0.98,1.70±0.22,-0.17±0.29,51.43±5.55,2.57±0.27,0.23±0.13,13.23±1.86,0.91±0.12,last
2,DPP_on_masks,reg,variance,0.28±0.23,12.32±1.49,1.66±0.28,-0.22±0.31,52.46±5.81,2.63±0.30,0.20±0.10,13.71±1.31,0.93±0.09,last
3,DPP_with_ood,reg,bald,0.19±0.43,14.62±5.18,1.76±0.41,-0.18±0.19,52.10±3.75,2.60±0.18,0.12±0.25,16.52±4.27,1.04±0.26,last
4,DPP_with_ood,reg,sampled_max_prob,0.14±0.15,13.39±0.73,1.84±0.18,0.10±0.26,47.20±4.88,2.32±0.25,0.29±0.11,12.10±1.59,0.85±0.10,last
5,DPP_with_ood,reg,variance,0.34±0.14,12.35±0.69,1.66±0.22,0.00±0.21,48.90±4.38,2.41±0.20,0.26±0.10,13.22±2.01,0.88±0.09,last
6,baseline,reg,max_prob,93.01±0.16,12.75±0.96,1.74±0.22,92.08±0.30,51.20±5.73,2.56±0.28,94.13±0.13,12.97±1.73,0.90±0.11,last
7,baseline,reg,max_prob,93.01±0.16,12.75±0.96,1.74±0.22,92.08±0.30,51.20±5.73,2.56±0.28,94.13±0.13,12.97±1.73,0.90±0.11,last


In [52]:
res = pd.concat([table_mc.iloc[[0,1,2]], table_dpp_reg.iloc[:-2], table_det.iloc[[9,10,11,15]], table_mc_det.iloc[[3]], ens_tab.iloc[:-1], table_dpp_reg.iloc[-2:]])
res = res[table_mc.columns].reset_index(drop=True)

In [53]:
res

Unnamed: 0_level_0,Method,Reg. Type,Dropout Layers,UE Score,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,SST2 (10%),SST2 (10%),SST2 (10%)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MC,raw,all,bald,0.59±0.11,11.78±0.77,1.41±0.12,0.17±0.22,47.55±5.41,2.28±0.17,0.28±0.10,13.51±2.03,0.85±0.11
1,MC,raw,all,sampled_max_prob,0.53±0.13,11.60±1.21,1.45±0.14,0.30±0.18,43.51±4.19,2.15±0.15,0.22±0.12,13.99±1.69,0.91±0.12
2,MC,raw,all,variance,0.60±0.12,11.66±0.70,1.41±0.12,0.23±0.23,45.97±4.97,2.24±0.17,0.28±0.10,13.24±1.58,0.85±0.11
3,DPP_on_masks,reg,last,bald,0.36±0.33,12.06±1.94,1.62±0.34,-0.36±0.34,54.66±6.57,2.77±0.37,0.17±0.13,14.86±1.94,0.97±0.13
4,DPP_on_masks,reg,last,sampled_max_prob,0.27±0.15,12.53±0.98,1.70±0.22,-0.17±0.29,51.43±5.55,2.57±0.27,0.23±0.13,13.23±1.86,0.91±0.12
5,DPP_on_masks,reg,last,variance,0.28±0.23,12.32±1.49,1.66±0.28,-0.22±0.31,52.46±5.81,2.63±0.30,0.20±0.10,13.71±1.31,0.93±0.09
6,DPP_with_ood,reg,last,bald,0.19±0.43,14.62±5.18,1.76±0.41,-0.18±0.19,52.10±3.75,2.60±0.18,0.12±0.25,16.52±4.27,1.04±0.26
7,DPP_with_ood,reg,last,sampled_max_prob,0.14±0.15,13.39±0.73,1.84±0.18,0.10±0.26,47.20±4.88,2.32±0.25,0.29±0.11,12.10±1.59,0.85±0.10
8,DPP_with_ood,reg,last,variance,0.34±0.14,12.35±0.69,1.66±0.22,0.00±0.21,48.90±4.38,2.41±0.20,0.26±0.10,13.22±2.01,0.88±0.09
9,NUQ,spectral_norm,,epistemic,0.43±0.19,11.62±1.23,1.63±0.17,0.30±0.10,43.13±2.81,2.14±0.05,0.36±0.07,10.88±0.80,0.77±0.09


In [54]:
print(str(res.to_latex(index=False)).replace('±', '$\pm$'))

\begin{tabular}{lllllllllllll}
\toprule
         Method &      Reg. Type & Dropout Layers &                      UE Score & \multicolumn{3}{l}{MRPC} & \multicolumn{3}{l}{CoLA} & \multicolumn{3}{l}{SST2 (10\%)} \\
                & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp & rejection-curve-auc &     rcc-auc &        rpp \\
\midrule
             MC &            raw &            all &                          bald &           0.59$\pm$0.11 &  11.78$\pm$0.77 &  1.41$\pm$0.12 &           0.17$\pm$0.22 &  47.55$\pm$5.41 &  2.28$\pm$0.17 &           0.28$\pm$0.10 &  13.51$\pm$2.03 &  0.85$\pm$0.11 \\
             MC &            raw &            all &              sampled\_max\_prob &           0.53$\pm$0.13 &  11.60$\pm$1.21 &  1.45$\pm$0.14 &           0.30$\pm$0.18 &  43.51$\pm$4.19 &  2.15$\pm$0.15 &           0.22$\pm$0.12 &  13.99$\pm$1.69 &  0.91$\pm$0.12 \\
             MC &            raw &            all &                      v

# Legacy

In [4]:
# Get results with new dpp models
names = ['MC|last|reg', 'MC|all|reg']
dataset_names = ['SST2 (10%)', 'MRPC', 'CoLA']
dataset_fnames = ['SST2', 'MRPC', 'CoLA']
run_dirs = [[f'../workdir/run_glue_for_model_series/electra-reg/{name.lower()}/last/results', f'../workdir/run_glue_for_model_series/electra-reg/{name.lower()}/all/results'] for name in dataset_fnames]
mc_table_reg = collect_datasets(run_dirs, names, dataset_names)

names = ['DPP2|reg', 'DPP OOD|reg']
dataset_names = ['SST2 (10%)', 'MRPC', 'CoLA']
dataset_fnames = ['SST2', 'MRPC', 'CoLA']
run_dirs = [[f'../workdir/run_glue_for_model_series/electra-reg-calibrate/{name.lower()}/dpp/results', f'../workdir/run_glue_for_model_series/electra-reg-calibrate/{name.lower()}/dpp_with_ood/results'] for name in dataset_fnames]
dpp_table_reg = collect_datasets(run_dirs, names, dataset_names)

names = ['MC|last|no reg', 'MC|all|no reg']
dataset_names = ['SST2 (10%)', 'MRPC', 'CoLA']
dataset_fnames = ['SST2', 'MRPC', 'CoLA']
run_dirs = [[f'../workdir/run_glue_for_model_series/electra-raw/{name.lower()}/last/results', f'../workdir/run_glue_for_model_series/electra-raw/{name.lower()}/all/results'] for name in dataset_fnames]
mc_table_no_reg = collect_datasets(run_dirs, names, dataset_names)

names = ['DPP2|no reg', 'DPP OOD|no reg']
dataset_names = ['SST2 (10%)', 'MRPC', 'CoLA']
dataset_fnames = ['SST2', 'MRPC', 'CoLA']
run_dirs = [[f'../workdir/run_glue_for_model_series/electra-raw/{name.lower()}/dpp/results', f'../workdir/run_glue_for_model_series/electra-raw/{name.lower()}/dpp_with_ood/results'] for name in dataset_fnames]
dpp_table_no_reg = collect_datasets(run_dirs, names, dataset_names)

In [5]:
names = ['DPP2|no reg calibrated', 'DPP OOD|no reg calibrated']
dataset_names = ['SST2 (10%)', 'MRPC', 'CoLA']
dataset_fnames = ['SST2', 'MRPC', 'CoLA']
run_dirs = [[f'../workdir/run_glue_for_model_series/electra-raw-calibrate/{name.lower()}/dpp/results', f'../workdir/run_glue_for_model_series/electra-raw-calibrate/{name.lower()}/dpp_with_ood/results'] for name in dataset_fnames]
dpp_table_no_reg_cal = collect_datasets(run_dirs, names, dataset_names)

In [6]:
dpp_table_no_reg_cal

Unnamed: 0_level_0,Unnamed: 1_level_0,SST2 (10%),SST2 (10%),SST2 (10%),SST2 (10%),SST2 (10%),MRPC,MRPC,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,CoLA,CoLA
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,roc-auc,rcc-auc,pr-auc,rpp,rejection-curve-auc,roc-auc,rcc-auc,pr-auc,rpp,rejection-curve-auc,roc-auc,rcc-auc,pr-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
DPP2|no reg calibrated,bald,93.7±0.3,78.3±4.4,22.4±5.6,27.4±4.4,1.4±0.3,92.8±0.8,78.6±7.9,18.6±10.4,34.0±5.1,2.1±0.8,91.9±0.3,76.0±2.8,55.8±4.2,30.1±5.1,2.8±0.3
DPP2|no reg calibrated,sampled_max_prob,93.7±0.2,79.3±2.6,19.6±3.1,28.5±4.2,1.3±0.2,93.1±0.4,82.3±3.6,12.9±2.8,37.2±5.7,1.7±0.3,92.1±0.2,78.1±0.6,50.7±2.5,36.9±3.9,2.6±0.1
DPP2|no reg calibrated,variance,93.7±0.2,79.7±3.3,19.8±3.7,28.5±3.8,1.3±0.2,93.1±0.3,82.0±3.5,13.9±3.3,36.8±4.8,1.8±0.3,92.0±0.2,77.5±1.2,52.6±1.6,33.5±4.0,2.6±0.1
DPP2|no reg calibrated,sampled_entropy,93.7±0.2,79.3±2.6,19.6±3.1,28.5±4.2,1.3±0.2,93.1±0.4,82.3±3.6,12.9±2.8,37.2±5.7,1.7±0.3,92.1±0.2,78.1±0.6,50.7±2.5,36.9±3.9,2.6±0.1
DPP OOD|no reg calibrated,bald,93.8±0.2,81.0±3.1,18.9±3.2,29.8±3.4,1.2±0.2,93.0±0.4,81.4±3.5,14.6±4.5,35.4±4.3,1.8±0.4,92.0±0.3,77.2±2.6,54.4±4.7,31.4±4.4,2.7±0.3
DPP OOD|no reg calibrated,sampled_max_prob,93.8±0.2,79.9±2.6,18.8±2.9,28.6±4.3,1.3±0.2,93.1±0.3,82.1±3.6,13.0±2.8,37.0±5.8,1.8±0.3,92.3±0.2,79.8±0.5,47.5±2.7,38.8±3.5,2.4±0.1
DPP OOD|no reg calibrated,variance,93.8±0.2,81.2±3.0,18.1±3.2,29.2±3.5,1.2±0.2,93.1±0.4,82.6±3.4,13.2±3.3,36.7±4.4,1.7±0.3,92.3±0.2,79.4±1.6,49.0±2.7,35.3±3.6,2.4±0.2
DPP OOD|no reg calibrated,sampled_entropy,93.8±0.2,79.9±2.6,18.8±2.9,28.6±4.3,1.3±0.2,93.1±0.3,82.1±3.6,13.0±2.8,37.0±5.8,1.8±0.3,92.3±0.2,79.8±0.5,47.5±2.7,38.8±3.5,2.4±0.1


In [7]:
names = ['DPP2|reg calibrated', 'DPP OOD|reg calibrated']
dataset_names = ['SST2 (10%)', 'MRPC', 'CoLA']
dataset_fnames = ['SST2', 'MRPC', 'CoLA']
run_dirs = [[f'../workdir/run_glue_for_model_series/electra-reg-calibrate/{name.lower()}/dpp/results', f'../workdir/run_glue_for_model_series/electra-reg-calibrate/{name.lower()}/dpp_with_ood/results'] for name in dataset_fnames]
dpp_table_reg_cal = collect_datasets(run_dirs, names, dataset_names)

In [8]:
overall_table_cal = pd.concat([mc_table_reg, dpp_table_reg_cal, mc_table_no_reg, dpp_table_no_reg, dpp_table_no_reg_cal])

In [9]:
overall_table_cal

Unnamed: 0_level_0,Unnamed: 1_level_0,SST2 (10%),SST2 (10%),SST2 (10%),SST2 (10%),SST2 (10%),MRPC,MRPC,MRPC,MRPC,MRPC,CoLA,CoLA,CoLA,CoLA,CoLA
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,roc-auc,rcc-auc,pr-auc,rpp,rejection-curve-auc,roc-auc,rcc-auc,pr-auc,rpp,rejection-curve-auc,roc-auc,rcc-auc,pr-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
MC|last|reg,bald,93.7±0.2,78.7±2.1,24.4±5.1,27.2±2.6,1.4±0.2,92.8±0.5,80.8±3.1,16.5±3.8,40.5±3.4,2.0±0.4,91.6±0.3,75.0±2.3,65.2±7.2,32.8±2.3,3.0±0.3
MC|last|reg,sampled_max_prob,94.0±0.1,82.9±1.1,16.5±1.5,31.3±3.4,1.1±0.1,93.1±0.5,84.7±3.1,12.2±2.8,46.4±4.4,1.6±0.4,91.9±0.2,77.8±1.7,54.6±5.4,36.1±1.2,2.7±0.2
MC|last|reg,variance,93.8±0.1,81.3±1.0,20.9±3.3,30.2±2.8,1.2±0.1,93.0±0.5,83.7±3.3,13.5±3.2,45.0±3.4,1.7±0.4,91.8±0.3,77.2±1.9,58.0±6.2,35.5±1.7,2.8±0.2
MC|last|reg,sampled_entropy,94.0±0.1,82.9±1.1,16.5±1.5,31.3±3.4,1.1±0.1,93.1±0.5,84.7±3.1,12.2±2.8,46.4±4.4,1.6±0.4,91.9±0.2,77.8±1.7,54.6±5.4,36.1±1.2,2.7±0.2
SR|reg,SR|reg,94.0±0.1,83.2±1.1,15.9±1.3,31.3±3.4,1.1±0.1,93.1±0.5,84.8±3.1,12.1±2.8,46.5±4.5,1.6±0.4,91.9±0.2,77.8±1.7,54.4±5.5,36.0±1.2,2.7±0.2
MC|all|reg,bald,94.1±0.1,85.9±1.4,14.0±2.3,32.0±3.1,0.9±0.1,93.5±0.2,87.7±0.9,10.4±1.6,45.4±2.5,1.3±0.2,92.1±0.1,79.4±1.2,49.4±3.1,34.0±1.6,2.5±0.1
MC|all|reg,sampled_max_prob,94.1±0.1,85.2±0.9,13.7±0.9,31.2±2.8,0.9±0.1,93.5±0.2,87.9±1.3,10.0±1.3,46.5±5.0,1.3±0.2,92.2±0.2,80.3±1.4,47.6±2.9,37.5±1.2,2.4±0.2
MC|all|reg,variance,94.1±0.1,85.8±1.1,13.8±1.6,32.0±3.2,0.9±0.1,93.5±0.2,87.9±1.0,10.2±1.5,46.0±2.6,1.3±0.2,92.2±0.1,79.7±1.2,48.8±2.6,35.0±1.4,2.5±0.1
MC|all|reg,sampled_entropy,94.1±0.1,85.2±0.9,13.7±0.9,31.2±2.8,0.9±0.1,93.5±0.2,87.9±1.3,10.0±1.3,46.5±5.0,1.3±0.2,92.2±0.2,80.3±1.4,47.6±2.9,37.5±1.2,2.4±0.2
DPP2|reg calibrated,bald,93.8±0.3,81.0±4.4,18.7±4.2,30.0±5.6,1.2±0.3,92.8±0.7,81.5±6.5,16.9±8.3,41.6±6.0,2.0±0.7,91.7±0.2,75.9±1.8,59.0±6.0,32.9±2.8,2.9±0.2
