In [4]:
from analyze_results import aggregate_runs, improvement_over_baseline, default_methods, collect_configs, aggregate_runs_de, from_model_outputs_calc_rpp, from_model_outputs_calc_rcc_auc, calc_rejection_curve_auc_seq
from pathlib import Path
import pandas as pd
from ue4nlp.ue_scores import *

def create_section_col(runs_dir, metric='roc-auc', task_type='classification', de = False, oos=False, subtract=False, percents=True):
    runs_dir = Path(runs_dir) #/ 'results'
    if not runs_dir.is_dir():
        raise ValueError()
    print(runs_dir)
    if 'ner' in task_type:
        default_methods = {
            "bald": bald,
            "var_ratio": var_ratio,
            "sampled_max_prob": sampled_max_prob,
            #"variance": probability_variance,
            "entropy": mean_entropy,
        }
    else:
        default_methods = {
            "bald": bald,
            "sampled_max_prob": sampled_max_prob,
            "variance": probability_variance,
            #"var_ratio": var_ratio,
            #"entropy": mean_entropy,
        }
    if de:
        agg_res = aggregate_runs_de(runs_dir, methods=default_methods, metric=metric)
    else:
        agg_res = aggregate_runs(runs_dir, methods=default_methods, metric=metric, task_type=task_type, oos=oos)
    #print(agg_res)
    #print(collect_configs(runs_dir))
    if agg_res.empty:
        print('Broken')
        print()
        raise ValueError()
        
    improvement = improvement_over_baseline(agg_res, baseline_col='max_prob', metric=metric, subtract=subtract, percents=percents)
    #print(improvement)
    print()
    return improvement


def build_eval_table(dataset_paths, metric='roc-auc', mc_types=None, task_type='classification', de = False, oos=False, subtract=False, percents=True):
    BASELINE_INDEX = 'baseline (max_prob)'
    
    columns = []
    names = []
    for name, paths in dataset_paths.items():
        method_results = []
        baseline = None
        for path, dropout_type, layer in paths:
            if ('token' in name) and (task_type == 'ner'):
                task_type_full = task_type + '-token'
            elif (task_type == 'ner'):
                task_type_full = task_type + '-seq'
            else:
                task_type_full = task_type
            if '|' in name:
                _, metric = name.split('|')
                #name = metric
            method_batch = create_section_col(path, metric, task_type_full, de, oos, subtract, percents)
            method_batch.drop(index=['count'], inplace=True)#, 'ens_accuracy', 'model_accuracy'
            baseline = method_batch[BASELINE_INDEX: BASELINE_INDEX]
            method_batch.drop(index=BASELINE_INDEX, inplace=True)
            method_batch = pd.Series(method_batch.values, index=[f'{dropout_type}|{layer}|{e}' for e in method_batch.index])
            
            #print(method_batch)
            method_results.append(method_batch)
        
        col = pd.concat([baseline] + method_results, axis=0)
        columns.append(col)
        names.append(name)
    
    name_map = {i : n for i,n in enumerate(names)}
    
    return pd.concat(columns, axis=1).rename(columns=name_map)

In [16]:
tasks = ['sst2', 'cola', 'mrpc']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated"

eval_table_md = build_eval_table(
    {
        'MRPC' : [
            #0.3 better
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.3/False/True/0.0', 'DPP_on_masks_calibrated_train', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.3/False/True/0.1', 'DPP_on_masks_calibrated_val', 'last'),
            (f'{path}/{tasks[2]}/{mc_types[0]}/0.3/False/False/0.0', 'DPP_on_masks', 'last'),
            
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.3/True/True/0.0', 'DPP_with_ood_calibrated_train', 'last'),
            (f'{path}/{tasks[2]}/{mc_types[0]}/0.3/True/True/0.1', 'DPP_with_ood_calibrated_val', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.3/True/False/0.0', 'DPP_with_ood', 'last'),
         
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.4/False/True/0.0', 'DPP_on_masks_0.4_calibrated_train', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.4/False/True/0.1', 'DPP_on_masks_0.4_calibrated_val', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.4/False/False/0.0', 'DPP_on_masks_0.4', 'last'),
            
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.4/True/True/0.0', 'DPP_with_ood_0.4_calibrated_train', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.4/True/True/0.1', 'DPP_with_ood_0.4_calibrated_val', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.4/True/False/0.0', 'DPP_with_ood_0.4', 'last'),
            
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.6/False/True/0.0', 'DPP_on_masks_0.6_calibrated_train', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.6/False/True/0.1', 'DPP_on_masks_0.6_calibrated_val', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.6/False/False/0.0', 'DPP_on_masks_0.6', 'last'),
            
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.6/True/True/0.0', 'DPP_with_ood_0.6_calibrated_train', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.6/True/True/0.1', 'DPP_with_ood_0.6_calibrated_val', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.6/True/False/0.0', 'DPP_with_ood_0.6', 'last'),
        ],
        'COLA' : [
            #0.4 better
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/False/True/0.0', 'DPP_on_masks_0.3_calibrated_train', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/False/True/0.1', 'DPP_on_masks_0.3_calibrated_val', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/False/False/0.0', 'DPP_on_masks_0.3', 'last'),
            
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/True/True/0.0', 'DPP_with_ood_0.3_calibrated_train', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/True/True/0.1', 'DPP_with_ood_0.3_calibrated_val', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/True/False/0.0', 'DPP_with_ood_0.3', 'last'),
         
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.4/False/True/0.0', 'DPP_on_masks_calibrated_train', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.4/False/True/0.1', 'DPP_on_masks_calibrated_val', 'last'),
            (f'{path}/{tasks[1]}/{mc_types[0]}/0.4/False/False/0.0', 'DPP_on_masks', 'last'),
            
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.4/True/True/0.0', 'DPP_with_ood_calibrated_train', 'last'),
            (f'{path}/{tasks[1]}/{mc_types[0]}/0.4/True/True/0.1', 'DPP_with_ood_calibrated_val', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.4/True/False/0.0', 'DPP_with_ood', 'last'),
            
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.6/False/True/0.0', 'DPP_on_masks_0.6_calibrated_train', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.6/False/True/0.1', 'DPP_on_masks_0.6_calibrated_val', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.6/False/False/0.0', 'DPP_on_masks_0.6', 'last'),
            
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.6/True/True/0.0', 'DPP_with_ood_0.6_calibrated_train', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.6/True/True/0.1', 'DPP_with_ood_0.6_calibrated_val', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.6/True/False/0.0', 'DPP_with_ood_0.6', 'last'),
        ],
        'SST2' : [
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.3/False/True/0.0', 'DPP_on_masks_calibrated_train', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.3/False/True/0.1', 'DPP_on_masks_calibrated_val', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/0.3/False/False/0.0', 'DPP_on_masks', 'last'),
            
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.3/True/True/0.0', 'DPP_with_ood_0.3_calibrated_train', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.3/True/True/0.1', 'DPP_with_ood', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.3/True/False/0.0', 'DPP_with_ood_0.3', 'last'),
         
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.4/False/True/0.0', 'DPP_on_masks_0.4_calibrated_train', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.4/False/True/0.1', 'DPP_on_masks_0.4_calibrated_val', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.4/False/False/0.0', 'DPP_on_masks_0.4', 'last'),
            
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.4/True/True/0.0', 'DPP_with_ood_0.4_calibrated_train', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.4/True/True/0.1', 'DPP_with_ood_0.4_calibrated_val', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.4/True/False/0.0', 'DPP_with_ood_0.4', 'last'),
            
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.6/False/True/0.0', 'DPP_on_masks_0.6_calibrated_train', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.6/False/True/0.1', 'DPP_on_masks_0.6_calibrated_val', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.6/False/False/0.0', 'DPP_on_masks_0.6', 'last'),
        
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.6/True/True/0.0', 'DPP_with_ood_calibrated_train', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/0.6/True/True/0.1', 'DPP_with_ood_calibrated_val', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.6/True/False/0.0', 'DPP_with_ood', 'last'),
        ],
    },
    metric='rejection-curve-auc',#from_model_outputs_calc_rcc_auc,
    subtract=True,
    percents=True
)

eval_table_md

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/mrpc/DPP/0.3/False/False/0.0

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/mrpc/DPP/0.3/True/True/0.1

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/cola/DPP/0.4/False/False/0.0

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/cola/DPP/0.4/True/True/0.1

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/sst2/DPP/0.3/False/False/0.0

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/sst2/DPP/0.6/True/True/0.1



Unnamed: 0,MRPC,COLA,SST2
baseline (max_prob),92.944±0.118,92.039±0.204,93.869±0.187
DPP_on_masks|last|bald,0.173±0.125,-0.154±0.212,0.026±0.306
DPP_on_masks|last|sampled_max_prob,0.092±0.132,-0.02±0.024,0.021±0.054
DPP_on_masks|last|variance,0.142±0.127,-0.096±0.16,0.03±0.17
DPP_with_ood_calibrated_val|last|bald,-0.251±0.741,0.102±0.277,-0.106±0.237
DPP_with_ood_calibrated_val|last|sampled_max_prob,-0.189±0.106,0.224±0.165,0.161±0.071
DPP_with_ood_calibrated_val|last|variance,-0.123±0.419,0.174±0.211,0.178±0.31


In [21]:
tasks = ['sst2', 'cola', 'mrpc']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated"

eval_table_md = build_eval_table(
    {
        'COLA' : [
            #0.3 better
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/False/True/0.0', 'DPP_on_masks_calibrated_train', 'last'),
            (f'{path}/{tasks[2]}/{mc_types[0]}/0.3/False/True/0.1', 'DPP_on_masks_calibrated_val', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.3/False/False/0.0', 'DPP_on_masks', 'last'),
        ],
        
    },
    metric=from_model_outputs_calc_rcc_auc,
    subtract=False,
    percents=False
)

eval_table_md

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/mrpc/DPP/0.3/False/True/0.1



Unnamed: 0,COLA
baseline (max_prob),13.7±0.5
DPP_on_masks_calibrated_val|last|bald,15.5±5.5
DPP_on_masks_calibrated_val|last|sampled_max_prob,13.5±1.0
DPP_on_masks_calibrated_val|last|variance,15.1±3.8


# NUQ, Maha for misclassification detection

In [9]:
tasks = ['sst2', 'cola', 'mrpc']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/single_ue"

eval_table_md = build_eval_table(
    {
        'MRPC' : [
            (f'{path}/{tasks[2]}/{mc_types[2]}/True/', 'Mahalanobis', 'spectral_norm'),
            (f'{path}/{tasks[2]}/{mc_types[1]}/True/', 'NUQ', 'spectral_norm'),      
            
            (f'{path}/{tasks[2]}/{mc_types[2]}/False/', 'Mahalanobis', '-'),
            (f'{path}/{tasks[2]}/{mc_types[1]}/False/', 'NUQ', '-'),

        ],
        'COLA' : [
            (f'{path}/{tasks[1]}/{mc_types[2]}/True/', 'Mahalanobis', 'spectral_norm'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/True/', 'NUQ', 'spectral_norm'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/False/', 'Mahalanobis', '-'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/False/', 'NUQ', '-'),
        ],
        'SST2' : [
            (f'{path}/{tasks[0]}/{mc_types[2]}/True/', 'Mahalanobis', 'spectral_norm'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/True/', 'NUQ', 'spectral_norm'),
           
            (f'{path}/{tasks[0]}/{mc_types[2]}/False/', 'Mahalanobis', '-'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/False/', 'NUQ', '-'),
        ],
    },
    metric=from_model_outputs_calc_rcc_auc,#from_model_outputs_calc_rpp,#'rejection-curve-auc',
    subtract=False,
    percents=False
)

eval_table_md

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/single_ue/mrpc/maha/True

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/single_ue/mrpc/nuq/True

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/single_ue/mrpc/maha/False

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/single_ue/mrpc/nuq/False

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/single_ue/cola/maha/True

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/single_ue/cola/nuq/True

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/single_ue/cola/maha/False

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/single_ue/cola/nuq/False

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/single_ue/sst2/maha/True

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/single_ue/sst2/nuq/True

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/single_ue/sst2/maha/False

/mnt/users/avazhentsev/uncertain

Unnamed: 0,MRPC,COLA,SST2
baseline (max_prob),13.8±0.5,54.5±3.8,17.3±3.6
Mahalanobis|spectral_norm|mahalanobis_distance,12.6±0.9,45.3±2.4,12.3±1.9
NUQ|spectral_norm|epistemic,12.5±0.8,47.6±1.2,12.3±1.9
NUQ|spectral_norm|aleatoric,12.7±0.6,48.0±1.2,12.6±1.8
NUQ|spectral_norm|total,12.6±0.7,47.7±1.2,12.4±1.9
Mahalanobis|-|mahalanobis_distance,13.3±0.9,44.1±2.2,13.1±2.3
NUQ|-|epistemic,12.9±1.3,45.8±2.6,12.9±2.8
NUQ|-|aleatoric,13.1±1.5,46.1±2.4,12.9±2.9
NUQ|-|total,13.0±1.4,45.9±2.5,12.9±2.8


# DPP

In [14]:
tasks = ['sst2', 'cola', 'mrpc']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated"

eval_table_md = build_eval_table(
    {
        'MRPC' : [
            #0.3 better
            (f'{path}/{tasks[2]}/{mc_types[0]}/0.3/False/True/0.0', 'DPP_on_masks_calibrated_train', 'last'),
            (f'{path}/{tasks[2]}/{mc_types[0]}/0.3/False/True/0.1', 'DPP_on_masks_calibrated_val', 'last'),
            (f'{path}/{tasks[2]}/{mc_types[0]}/0.3/False/False/0.0', 'DPP_on_masks', 'last'),
            
            (f'{path}/{tasks[2]}/{mc_types[0]}/0.3/True/True/0.0', 'DPP_with_ood_calibrated_train', 'last'),
            (f'{path}/{tasks[2]}/{mc_types[0]}/0.3/True/True/0.1', 'DPP_with_ood_calibrated_val', 'last'),
            (f'{path}/{tasks[2]}/{mc_types[0]}/0.3/True/False/0.0', 'DPP_with_ood', 'last'),
         
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.4/False/True/0.0', 'DPP_on_masks_0.4_calibrated_train', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.4/False/True/0.1', 'DPP_on_masks_0.4_calibrated_val', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.4/False/False/0.0', 'DPP_on_masks_0.4', 'last'),
            
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.4/True/True/0.0', 'DPP_with_ood_0.4_calibrated_train', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.4/True/True/0.1', 'DPP_with_ood_0.4_calibrated_val', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.4/True/False/0.0', 'DPP_with_ood_0.4', 'last'),
            
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.6/False/True/0.0', 'DPP_on_masks_0.6_calibrated_train', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.6/False/True/0.1', 'DPP_on_masks_0.6_calibrated_val', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.6/False/False/0.0', 'DPP_on_masks_0.6', 'last'),
            
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.6/True/True/0.0', 'DPP_with_ood_0.6_calibrated_train', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.6/True/True/0.1', 'DPP_with_ood_0.6_calibrated_val', 'last'),
            #(f'{path}/{tasks[2]}/{mc_types[0]}/0.6/True/False/0.0', 'DPP_with_ood_0.6', 'last'),
        ],
        'COLA' : [
            #0.4 better
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/False/True/0.0', 'DPP_on_masks_0.3_calibrated_train', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/False/True/0.1', 'DPP_on_masks_0.3_calibrated_val', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/False/False/0.0', 'DPP_on_masks_0.3', 'last'),
            
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/True/True/0.0', 'DPP_with_ood_0.3_calibrated_train', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/True/True/0.1', 'DPP_with_ood_0.3_calibrated_val', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.3/True/False/0.0', 'DPP_with_ood_0.3', 'last'),
         
            (f'{path}/{tasks[1]}/{mc_types[0]}/0.4/False/True/0.0', 'DPP_on_masks_calibrated_train', 'last'),
            (f'{path}/{tasks[1]}/{mc_types[0]}/0.4/False/True/0.1', 'DPP_on_masks_calibrated_val', 'last'),
            (f'{path}/{tasks[1]}/{mc_types[0]}/0.4/False/False/0.0', 'DPP_on_masks', 'last'),
            
            (f'{path}/{tasks[1]}/{mc_types[0]}/0.4/True/True/0.0', 'DPP_with_ood_calibrated_train', 'last'),
            (f'{path}/{tasks[1]}/{mc_types[0]}/0.4/True/True/0.1', 'DPP_with_ood_calibrated_val', 'last'),
            (f'{path}/{tasks[1]}/{mc_types[0]}/0.4/True/False/0.0', 'DPP_with_ood', 'last'),
            
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.6/False/True/0.0', 'DPP_on_masks_0.6_calibrated_train', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.6/False/True/0.1', 'DPP_on_masks_0.6_calibrated_val', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.6/False/False/0.0', 'DPP_on_masks_0.6', 'last'),
            
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.6/True/True/0.0', 'DPP_with_ood_0.6_calibrated_train', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.6/True/True/0.1', 'DPP_with_ood_0.6_calibrated_val', 'last'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/0.6/True/False/0.0', 'DPP_with_ood_0.6', 'last'),
        ],
        'SST2' : [
            (f'{path}/{tasks[0]}/{mc_types[0]}/0.3/False/True/0.0', 'DPP_on_masks_calibrated_train', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/0.3/False/True/0.1', 'DPP_on_masks_calibrated_val', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/0.3/False/False/0.0', 'DPP_on_masks', 'last'),
            
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.3/True/True/0.0', 'DPP_with_ood_0.3_calibrated_train', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.3/True/True/0.1', 'DPP_with_ood_0.3_calibrated_val', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.3/True/False/0.0', 'DPP_with_ood_0.3', 'last'),
         
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.4/False/True/0.0', 'DPP_on_masks_0.4_calibrated_train', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.4/False/True/0.1', 'DPP_on_masks_0.4_calibrated_val', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.4/False/False/0.0', 'DPP_on_masks_0.4', 'last'),
            
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.4/True/True/0.0', 'DPP_with_ood_0.4_calibrated_train', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.4/True/True/0.1', 'DPP_with_ood_0.4_calibrated_val', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.4/True/False/0.0', 'DPP_with_ood_0.4', 'last'),
            
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.6/False/True/0.0', 'DPP_on_masks_0.6_calibrated_train', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.6/False/True/0.1', 'DPP_on_masks_0.6_calibrated_val', 'last'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/0.6/False/False/0.0', 'DPP_on_masks_0.6', 'last'),
        
            (f'{path}/{tasks[0]}/{mc_types[0]}/0.6/True/True/0.0', 'DPP_with_ood_calibrated_train', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/0.6/True/True/0.1', 'DPP_with_ood_calibrated_val', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/0.6/True/False/0.0', 'DPP_with_ood', 'last'),
        ],
    },
    metric='rejection-curve-auc',
    subtract=True
    
)

eval_table_md

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/mrpc/DPP/0.3/False/True/0.0

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/mrpc/DPP/0.3/False/True/0.1

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/mrpc/DPP/0.3/False/False/0.0

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/mrpc/DPP/0.3/True/True/0.0

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/mrpc/DPP/0.3/True/True/0.1

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/mrpc/DPP/0.3/True/False/0.0

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/cola/DPP/0.4/False/True/0.0

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/cola/DPP/0.4/False/True/0.1

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/dpp_calibrated/cola/DPP/0.4/False/False/0.0

/mnt/users/avazhentsev/uncertainty-estimation/

Unnamed: 0,MRPC,COLA,SST2
baseline (max_prob),92.792±0.177,92.251±0.264,93.88±0.206
DPP_on_masks_calibrated_train|last|bald,0.158±0.111,-0.158±0.217,0.024±0.295
DPP_on_masks_calibrated_train|last|sampled_max_prob,0.097±0.138,-0.022±0.023,0.036±0.057
DPP_on_masks_calibrated_train|last|variance,0.143±0.115,-0.097±0.16,0.018±0.185
DPP_on_masks_calibrated_val|last|bald,-0.042±0.535,-0.174±0.196,0.146±0.207
DPP_on_masks_calibrated_val|last|sampled_max_prob,0.017±0.125,-0.044±0.037,0.057±0.101
DPP_on_masks_calibrated_val|last|variance,-0.054±0.353,-0.11±0.165,0.123±0.195
DPP_on_masks|last|bald,0.173±0.125,-0.154±0.212,0.026±0.306
DPP_on_masks|last|sampled_max_prob,0.092±0.132,-0.02±0.024,0.021±0.054
DPP_on_masks|last|variance,0.142±0.127,-0.096±0.16,0.03±0.17


# Fixed Maha

In [2]:
tasks = ['clinc_oos', 'rostd', 'snips']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct"

eval_table_oos = build_eval_table(
    {
        'CLINC|roc-auc' : [
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/1e-05/16', 'Mahalanobis', 'roberta/1e-05/16'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/3e-05/16', 'Mahalanobis', 'roberta/3e-05/16'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/5e-05/16', 'Mahalanobis', 'roberta/5e-05/16'),
            
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/1e-05/32', 'Mahalanobis', 'roberta/1e-05/32'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/3e-05/32', 'Mahalanobis', 'roberta/3e-05/32'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/5e-05/32', 'Mahalanobis', 'roberta/5e-05/32'),

        ],
        'CLINC|pr-auc' : [
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/1e-05/16', 'Mahalanobis', 'roberta/1e-05/16'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/3e-05/16', 'Mahalanobis', 'roberta/3e-05/16'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/5e-05/16', 'Mahalanobis', 'roberta/5e-05/16'),
            
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/1e-05/32', 'Mahalanobis', 'roberta/1e-05/32'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/3e-05/32', 'Mahalanobis', 'roberta/3e-05/32'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/5e-05/32', 'Mahalanobis', 'roberta/5e-05/32'),
        ],
        
        'ROSTD|roc-auc' : [
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/1e-05/16', 'Mahalanobis', 'roberta/1e-05/16'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/3e-05/16', 'Mahalanobis', 'roberta/3e-05/16'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/5e-05/16', 'Mahalanobis', 'roberta/5e-05/16'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/1e-05/32', 'Mahalanobis', 'roberta/1e-05/32'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/3e-05/32', 'Mahalanobis', 'roberta/3e-05/32'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/5e-05/32', 'Mahalanobis', 'roberta/5e-05/32'),

        ],
        'ROSTD|pr-auc' : [
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/1e-05/16', 'Mahalanobis', 'roberta/1e-05/16'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/3e-05/16', 'Mahalanobis', 'roberta/3e-05/16'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/5e-05/16', 'Mahalanobis', 'roberta/5e-05/16'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/1e-05/32', 'Mahalanobis', 'roberta/1e-05/32'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/3e-05/32', 'Mahalanobis', 'roberta/3e-05/32'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/5e-05/32', 'Mahalanobis', 'roberta/5e-05/32'),
        ],
        
        'SNIPS|roc-auc' : [
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/1e-05/16', 'Mahalanobis', 'roberta/1e-05/16'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/3e-05/16', 'Mahalanobis', 'roberta/3e-05/16'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/5e-05/16', 'Mahalanobis', 'roberta/5e-05/16'),
            
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/1e-05/32', 'Mahalanobis', 'roberta/1e-05/32'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/3e-05/32', 'Mahalanobis', 'roberta/3e-05/32'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/5e-05/32', 'Mahalanobis', 'roberta/5e-05/32'),

        ],
        'SNIPS|pr-auc' : [
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/1e-05/16', 'Mahalanobis', 'roberta/1e-05/16'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/3e-05/16', 'Mahalanobis', 'roberta/3e-05/16'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/5e-05/16', 'Mahalanobis', 'roberta/5e-05/16'),
            
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/1e-05/32', 'Mahalanobis', 'roberta/1e-05/32'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/3e-05/32', 'Mahalanobis', 'roberta/3e-05/32'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/5e-05/32', 'Mahalanobis', 'roberta/5e-05/32'),
        ],
        
    },
    oos=True,
    subtract=False
)

eval_table_oos

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/roberta-large/1e-05/16

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/roberta-large/3e-05/16

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/roberta-large/5e-05/16

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/roberta-large/1e-05/32

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/roberta-large/3e-05/32

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/roberta-large/5e-05/32

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/roberta-large/1e-05/16

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/roberta-large/3e-05/16

/mnt/users/avazhentsev/uncertain

Unnamed: 0,CLINC|roc-auc,CLINC|pr-auc,ROSTD|roc-auc,ROSTD|pr-auc,SNIPS|roc-auc,SNIPS|pr-auc
baseline (max_prob),96.8±0.3,90.0±0.2,83.5±5.2,73.7±7.7,82.5±5.0,57.5±3.8
Mahalanobis|roberta/1e-05/16|mahalanobis_distance_fixed,98.0±0.1,92.2±0.5,98.2±1.8,97.7±1.4,88.7±5.1,84.3±4.1
Mahalanobis|roberta/1e-05/16|mahalanobis_distance,98.4±0.1,94.6±0.3,99.7±0.0,99.3±0.1,96.8±0.5,92.2±0.7
Mahalanobis|roberta/3e-05/16|mahalanobis_distance_fixed,78.0±20.5,54.6±34.7,81.7±34.5,84.1±28.9,88.5±3.2,83.8±2.5
Mahalanobis|roberta/3e-05/16|mahalanobis_distance,75.3±22.5,53.7±37.4,99.6±0.1,99.1±0.2,96.5±0.6,90.8±1.2
Mahalanobis|roberta/5e-05/16|mahalanobis_distance_fixed,85.5±28.2,76.2±35.2,59.5±22.2,66.5±18.4,76.1±5.5,70.2±5.9
Mahalanobis|roberta/5e-05/16|mahalanobis_distance,88.6±21.9,80.6±30.8,98.8±0.6,97.7±0.9,94.7±1.7,85.9±3.9
Mahalanobis|roberta/1e-05/32|mahalanobis_distance_fixed,98.2±0.1,92.8±0.4,99.6±0.1,98.9±0.2,87.4±3.5,82.9±3.1
Mahalanobis|roberta/1e-05/32|mahalanobis_distance,98.6±0.1,95.4±0.3,99.7±0.1,99.3±0.1,96.9±0.5,92.5±0.4
Mahalanobis|roberta/3e-05/32|mahalanobis_distance_fixed,98.1±0.1,92.2±0.4,89.0±20.4,90.6±16.1,89.6±4.2,84.9±3.0


In [3]:
tasks = ['clinc_oos', 'rostd', 'snips']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct"

eval_table_oos = build_eval_table(
    {
        'CLINC|roc-auc' : [
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/1e-05/16', 'Mahalanobis', 'electra/1e-05/16'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/3e-05/16', 'Mahalanobis', 'electra/3e-05/16'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/5e-05/16', 'Mahalanobis', 'electra/5e-05/16'),
            
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/1e-05/32', 'Mahalanobis', 'electra/1e-05/32'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/3e-05/32', 'Mahalanobis', 'electra/3e-05/32'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/5e-05/32', 'Mahalanobis', 'electra/5e-05/32'),

        ],
        'CLINC|pr-auc' : [
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/1e-05/16', 'Mahalanobis', 'electra/1e-05/16'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/3e-05/16', 'Mahalanobis', 'electra/3e-05/16'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/5e-05/16', 'Mahalanobis', 'electra/5e-05/16'),
            
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/1e-05/32', 'Mahalanobis', 'electra/1e-05/32'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/3e-05/32', 'Mahalanobis', 'electra/3e-05/32'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/5e-05/32', 'Mahalanobis', 'electra/5e-05/32'),
        ],
        
        'ROSTD|roc-auc' : [
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/1e-05/16', 'Mahalanobis', 'electra/1e-05/16'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/3e-05/16', 'Mahalanobis', 'electra/3e-05/16'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/5e-05/16', 'Mahalanobis', 'electra/5e-05/16'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/1e-05/32', 'Mahalanobis', 'electra/1e-05/32'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/3e-05/32', 'Mahalanobis', 'electra/3e-05/32'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/5e-05/32', 'Mahalanobis', 'electra/5e-05/32'),

        ],
        'ROSTD|pr-auc' : [
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/1e-05/16', 'Mahalanobis', 'electra/1e-05/16'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/3e-05/16', 'Mahalanobis', 'electra/3e-05/16'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/5e-05/16', 'Mahalanobis', 'electra/5e-05/16'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/1e-05/32', 'Mahalanobis', 'electra/1e-05/32'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/3e-05/32', 'Mahalanobis', 'electra/3e-05/32'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/5e-05/32', 'Mahalanobis', 'electra/5e-05/32'),
        ],
        
        'SNIPS|roc-auc' : [
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/1e-05/16', 'Mahalanobis', 'electra/1e-05/16'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/3e-05/16', 'Mahalanobis', 'electra/3e-05/16'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/5e-05/16', 'Mahalanobis', 'electra/5e-05/16'),
            
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/1e-05/32', 'Mahalanobis', 'electra/1e-05/32'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/3e-05/32', 'Mahalanobis', 'electra/3e-05/32'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/5e-05/32', 'Mahalanobis', 'electra/5e-05/32'),

        ],
        'SNIPS|pr-auc' : [
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/1e-05/16', 'Mahalanobis', 'electra/1e-05/16'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/3e-05/16', 'Mahalanobis', 'electra/3e-05/16'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/5e-05/16', 'Mahalanobis', 'electra/5e-05/16'),
            
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/1e-05/32', 'Mahalanobis', 'electra/1e-05/32'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/3e-05/32', 'Mahalanobis', 'electra/3e-05/32'),
            (f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/5e-05/32', 'Mahalanobis', 'electra/5e-05/32'),
        ],
        
    },
    oos=True,
    subtract=False
)

eval_table_oos

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/google/electra-base-discriminator/1e-05/16

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/google/electra-base-discriminator/3e-05/16

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/google/electra-base-discriminator/5e-05/16

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/google/electra-base-discriminator/1e-05/32

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/google/electra-base-discriminator/3e-05/32

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/google/electra-base-discriminator/5e-05/32

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/google/electra-base-discriminator/1e-05/16

/mnt/users/av

Unnamed: 0,CLINC|roc-auc,CLINC|pr-auc,ROSTD|roc-auc,ROSTD|pr-auc,SNIPS|roc-auc,SNIPS|pr-auc
baseline (max_prob),92.9±0.6,81.2±1.5,71.8±6.1,56.8±9.4,88.7±1.1,66.7±1.5
Mahalanobis|electra/1e-05/16|mahalanobis_distance_fixed,95.7±0.2,81.2±1.5,87.7±19.4,88.3±15.7,83.6±1.2,77.4±1.4
Mahalanobis|electra/1e-05/16|mahalanobis_distance,94.4±0.4,77.6±1.9,99.2±0.3,97.8±0.9,95.5±0.8,88.0±2.5
Mahalanobis|electra/3e-05/16|mahalanobis_distance_fixed,97.1±0.1,86.7±1.0,88.7±8.6,89.5±6.5,83.2±6.5,76.8±4.5
Mahalanobis|electra/3e-05/16|mahalanobis_distance,97.3±0.1,88.7±0.6,99.2±0.1,97.8±0.3,95.5±0.4,87.5±0.7
Mahalanobis|electra/5e-05/16|mahalanobis_distance_fixed,97.1±0.1,86.9±0.7,96.5±2.7,95.6±2.1,80.6±7.0,73.7±3.9
Mahalanobis|electra/5e-05/16|mahalanobis_distance,97.4±0.1,88.9±0.5,99.2±0.1,98.0±0.2,94.7±1.0,85.6±2.1
Mahalanobis|electra/1e-05/32|mahalanobis_distance_fixed,94.5±0.3,76.7±1.1,82.2±24.6,83.7±20.7,81.0±6.2,74.7±5.2
Mahalanobis|electra/1e-05/32|mahalanobis_distance,90.4±0.5,64.7±2.0,99.0±0.4,97.4±1.2,95.4±0.6,86.0±3.2
Mahalanobis|electra/3e-05/32|mahalanobis_distance_fixed,96.9±0.2,86.2±1.0,95.8±3.5,95.0±2.9,86.4±1.7,80.2±1.3


# spectral norm all

In [7]:
tasks = ['clinc_oos', 'rostd', 'snips']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn_all"

eval_table_oos = build_eval_table(
    {
        'CLINC|roc-auc' : [
            #(f'{path}/{tasks[0]}/{mc_types[1]}/1e-05/4', 'NUQ', 'electra/1e-05/4'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/3e-05/4', 'NUQ', 'electra/3e-05/4'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/5e-05/4', 'NUQ', 'electra/5e-05/4'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/1e-05/8', 'NUQ', 'electra/1e-05/8'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/3e-05/8', 'NUQ', 'electra/3e-05/8'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/5e-05/8', 'NUQ', 'electra/5e-05/8'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/1e-05/4', 'Maha', 'electra/1e-05/4'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/3e-05/4', 'Maha', 'electra/3e-05/4'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/5e-05/4', 'Maha', 'electra/5e-05/4'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/1e-05/8', 'Maha', 'electra/1e-05/8'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/3e-05/8', 'Maha', 'electra/3e-05/8'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/5e-05/8', 'Maha', 'electra/5e-05/8'),

        ],
        'CLINC|pr-auc' : [
            #(f'{path}/{tasks[0]}/{mc_types[1]}/1e-05/4', 'NUQ', 'electra/1e-05/4'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/3e-05/4', 'NUQ', 'electra/3e-05/4'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/5e-05/4', 'NUQ', 'electra/5e-05/4'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/1e-05/8', 'NUQ', 'electra/1e-05/8'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/3e-05/8', 'NUQ', 'electra/3e-05/8'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/5e-05/8', 'NUQ', 'electra/5e-05/8'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/1e-05/4', 'Maha', 'electra/1e-05/4'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/3e-05/4', 'Maha', 'electra/3e-05/4'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/5e-05/4', 'Maha', 'electra/5e-05/4'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/1e-05/8', 'Maha', 'electra/1e-05/8'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/3e-05/8', 'Maha', 'electra/3e-05/8'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/5e-05/8', 'Maha', 'electra/5e-05/8'),
        ],
        
    },
    oos=True,
    subtract=False
)

eval_table_oos

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn_all/clinc_oos/nuq/5e-05/8

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/rostd/maha/google/electra-base-discriminator/3e-05/32

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn_all/clinc_oos/nuq/5e-05/8

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/rostd/maha/google/electra-base-discriminator/3e-05/32



Unnamed: 0,CLINC|roc-auc,CLINC|pr-auc
baseline (max_prob),74.5±7.7,60.8±11.9
NUQ|electra/5e-05/8|epistemic,95.5±0.3,80.3±2.0
NUQ|electra/5e-05/8|aleatoric,95.4±0.4,80.1±1.9
NUQ|electra/5e-05/8|total,95.5±0.4,80.2±2.0
Mahalanobis|electra/3e-05/32|mahalanobis_distance,99.3±0.2,98.0±0.8


In [7]:
tasks = ['clinc_oos', 'rostd', 'snips']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn_all"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            #(f'{path}/{tasks[1]}/{mc_types[1]}/1e-05/4', 'NUQ', 'electra/1e-05/4'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/3e-05/4', 'NUQ', 'electra/3e-05/4'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/5e-05/4', 'NUQ', 'electra/5e-05/4'),
            
            (f'{path}/{tasks[1]}/{mc_types[1]}/1e-05/8', 'NUQ', 'electra/1e-05/8'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/3e-05/8', 'NUQ', 'electra/3e-05/8'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/5e-05/8', 'NUQ', 'electra/5e-05/8'),
            
            #(f'{path}/{tasks[1]}/{mc_types[2]}/1e-05/4', 'Maha', 'electra/1e-05/4'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/3e-05/4', 'Maha', 'electra/3e-05/4'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/5e-05/4', 'Maha', 'electra/5e-05/4'),
            
            #(f'{path}/{tasks[1]}/{mc_types[2]}/1e-05/8', 'Maha', 'electra/1e-05/8'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/3e-05/8', 'Maha', 'electra/3e-05/8'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/5e-05/8', 'Maha', 'electra/5e-05/8'),

        ],
        'ROSTD|pr-auc' : [
            #(f'{path}/{tasks[1]}/{mc_types[1]}/1e-05/4', 'NUQ', 'electra/1e-05/4'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/3e-05/4', 'NUQ', 'electra/3e-05/4'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/5e-05/4', 'NUQ', 'electra/5e-05/4'),
            
            (f'{path}/{tasks[1]}/{mc_types[1]}/1e-05/8', 'NUQ', 'electra/1e-05/8'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/3e-05/8', 'NUQ', 'electra/3e-05/8'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/5e-05/8', 'NUQ', 'electra/5e-05/8'),
            
            #(f'{path}/{tasks[1]}/{mc_types[2]}/1e-05/4', 'Maha', 'electra/1e-05/4'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/3e-05/4', 'Maha', 'electra/3e-05/4'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/5e-05/4', 'Maha', 'electra/5e-05/4'),
            
            #(f'{path}/{tasks[1]}/{mc_types[2]}/1e-05/8', 'Maha', 'electra/1e-05/8'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/3e-05/8', 'Maha', 'electra/3e-05/8'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/5e-05/8', 'Maha', 'electra/5e-05/8')
            
        ],
        
    },
    oos=True,
    subtract=False
)

eval_table_oos

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn_all/rostd/nuq/1e-05/8

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn_all/rostd/maha/3e-05/4

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn_all/rostd/nuq/1e-05/8

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn_all/rostd/maha/3e-05/4



Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),96.4±0.3,89.9±0.6
NUQ|electra/1e-05/8|epistemic,98.4±0.1,95.6±0.2
NUQ|electra/1e-05/8|aleatoric,97.9±0.0,94.1±0.1
NUQ|electra/1e-05/8|total,98.2±0.1,95.0±0.2
Maha|electra/3e-05/4|mahalanobis_distance,96.4±0.3,89.9±0.6


# SNIPS75

In [8]:
tasks = ['clinc_oos', 'rostd', 'snips']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn"
path1 = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct"
eval_table_oos = build_eval_table(
    {
        'SNIPS|roc-auc' : [
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/16/32/1e-05', 'NUQ', 'electra/16/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/16/32/3e-05', 'NUQ', 'electra/16/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/16/32/5e-05', 'NUQ', 'electra/16/32/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/32/32/1e-05', 'NUQ', 'electra/32/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/32/32/3e-05', 'NUQ', 'electra/32/32/3e-05'),
            (f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/32/32/5e-05', 'NUQ', 'electra/32/32/5e-05'),
            
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/16/128/1e-05', 'NUQ', 'electra/16/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/16/128/3e-05', 'NUQ', 'electra/16/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/16/128/5e-05', 'NUQ', 'electra/16/128/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/32/128/1e-05', 'NUQ', 'electra/32/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/32/128/3e-05', 'NUQ', 'electra/32/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/32/128/5e-05', 'NUQ', 'electra/32/128/5e-05'),
            
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/16/32/1e-05', 'Maha', 'electra/16/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/16/32/3e-05', 'Maha', 'electra/16/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/16/32/5e-05', 'Maha', 'electra/16/32/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/32/32/1e-05', 'Maha', 'electra/32/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/32/32/3e-05', 'Maha', 'electra/32/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/32/32/5e-05', 'Maha', 'electra/32/32/5e-05'),
            
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/16/128/1e-05', 'Maha', 'electra/16/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/16/128/3e-05', 'Maha', 'electra/16/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/16/128/5e-05', 'Maha', 'electra/16/128/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/32/128/1e-05', 'Maha', 'electra/32/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/32/128/3e-05', 'Maha', 'electra/32/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/32/128/5e-05', 'Maha', 'electra/32/128/5e-05'),
            
            (f'{path1}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/3e-05/32', 'Mahalanobis', 'electra/3e-05/32'),
        ],
        'SNIPS|pr-auc' : [
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/16/32/1e-05', 'NUQ', 'electra/16/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/16/32/3e-05', 'NUQ', 'electra/16/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/16/32/5e-05', 'NUQ', 'electra/16/32/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/32/32/1e-05', 'NUQ', 'electra/32/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/32/32/3e-05', 'NUQ', 'electra/32/32/3e-05'),
            (f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/32/32/5e-05', 'NUQ', 'electra/32/32/5e-05'),
            
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/16/128/1e-05', 'NUQ', 'electra/16/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/16/128/3e-05', 'NUQ', 'electra/16/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/16/128/5e-05', 'NUQ', 'electra/16/128/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/32/128/1e-05', 'NUQ', 'electra/32/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/32/128/3e-05', 'NUQ', 'electra/32/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[0]}/32/128/5e-05', 'NUQ', 'electra/32/128/5e-05'),
            
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/16/32/1e-05', 'Maha', 'electra/16/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/16/32/3e-05', 'Maha', 'electra/16/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/16/32/5e-05', 'Maha', 'electra/16/32/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/32/32/1e-05', 'Maha', 'electra/32/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/32/32/3e-05', 'Maha', 'electra/32/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/32/32/5e-05', 'Maha', 'electra/32/32/5e-05'),
            
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/16/128/1e-05', 'Maha', 'electra/16/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/16/128/3e-05', 'Maha', 'electra/16/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/16/128/5e-05', 'Maha', 'electra/16/128/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/32/128/1e-05', 'Maha', 'electra/32/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/32/128/3e-05', 'Maha', 'electra/32/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/32/128/5e-05', 'Maha', 'electra/32/128/5e-05'),
            
            (f'{path1}/{tasks[2]}/{mc_types[2]}/{train_model[0]}/3e-05/32', 'Mahalanobis', 'electra/3e-05/32'),

        ],
        
    },
    oos=True,
)

eval_table_oos

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn/snips/nuq/google/electra-base-discriminator/32/32/5e-05

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/snips/maha/google/electra-base-discriminator/3e-05/32

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn/snips/nuq/google/electra-base-discriminator/32/32/5e-05

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/snips/maha/google/electra-base-discriminator/3e-05/32



Unnamed: 0,SNIPS|roc-auc,SNIPS|pr-auc
baseline (max_prob),87.4±1.5,67.2±2.0
NUQ|electra/32/32/5e-05|epistemic,96.1±0.2,87.8±0.3
NUQ|electra/32/32/5e-05|aleatoric,96.0±0.2,87.2±0.5
NUQ|electra/32/32/5e-05|total,96.1±0.2,87.7±0.3
Mahalanobis|electra/3e-05/32|mahalanobis_distance,95.9±0.6,89.1±1.1


In [9]:
tasks = ['clinc_oos', 'rostd', 'snips']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn"
path1 = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct"
eval_table_oos = build_eval_table(
    {
        'SNIPS|roc-auc' : [
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/16/32/1e-05', 'NUQ', 'roberta/16/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/16/32/3e-05', 'NUQ', 'roberta/16/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/16/32/5e-05', 'NUQ', 'roberta/16/32/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/32/32/1e-05', 'NUQ', 'roberta/32/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/32/32/3e-05', 'NUQ', 'roberta/32/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/32/32/5e-05', 'NUQ', 'roberta/32/32/5e-05'),
            
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/16/128/1e-05', 'NUQ', 'roberta/16/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/16/128/3e-05', 'NUQ', 'roberta/16/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/16/128/5e-05', 'NUQ', 'roberta/16/128/5e-05'),

            (f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/32/128/1e-05', 'NUQ', 'roberta/32/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/32/128/3e-05', 'NUQ', 'roberta/32/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/32/128/5e-05', 'NUQ', 'roberta/32/128/5e-05'),
            
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/16/32/1e-05', 'Maha', 'roberta/16/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/16/32/3e-05', 'Maha', 'roberta/16/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/16/32/5e-05', 'Maha', 'roberta/16/32/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/32/32/1e-05', 'Maha', 'roberta/32/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/32/32/3e-05', 'Maha', 'roberta/32/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/32/32/5e-05', 'Maha', 'roberta/32/32/5e-05'),
            
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/16/128/1e-05', 'Maha', 'roberta/16/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/16/128/3e-05', 'Maha', 'roberta/16/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/16/128/5e-05', 'Maha', 'roberta/16/128/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/32/128/1e-05', 'Maha', 'roberta/32/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/32/128/3e-05', 'Maha', 'roberta/32/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/32/128/5e-05', 'Maha', 'roberta/32/128/5e-05'),
            
            (f'{path1}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/3e-05/32', 'Mahalanobis', 'roberta/3e-05/32'),
        ],
        'SNIPS|pr-auc' : [
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/16/32/1e-05', 'NUQ', 'roberta/16/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/16/32/3e-05', 'NUQ', 'roberta/16/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/16/32/5e-05', 'NUQ', 'roberta/16/32/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/32/32/1e-05', 'NUQ', 'roberta/32/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/32/32/3e-05', 'NUQ', 'roberta/32/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/32/32/5e-05', 'NUQ', 'roberta/32/32/5e-05'),
            
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/16/128/1e-05', 'NUQ', 'roberta/16/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/16/128/3e-05', 'NUQ', 'roberta/16/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/16/128/5e-05', 'NUQ', 'roberta/16/128/5e-05'),

            (f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/32/128/1e-05', 'NUQ', 'roberta/32/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/32/128/3e-05', 'NUQ', 'roberta/32/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[1]}/{train_model[1]}/32/128/5e-05', 'NUQ', 'roberta/32/128/5e-05'),
            
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/16/32/1e-05', 'Maha', 'roberta/16/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/16/32/3e-05', 'Maha', 'roberta/16/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/16/32/5e-05', 'Maha', 'roberta/16/32/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/32/32/1e-05', 'Maha', 'roberta/32/32/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/32/32/3e-05', 'Maha', 'roberta/32/32/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/32/32/5e-05', 'Maha', 'roberta/32/32/5e-05'),
            
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/16/128/1e-05', 'Maha', 'roberta/16/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/16/128/3e-05', 'Maha', 'roberta/16/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/16/128/5e-05', 'Maha', 'roberta/16/128/5e-05'),

            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/32/128/1e-05', 'Maha', 'roberta/32/128/1e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/32/128/3e-05', 'Maha', 'roberta/32/128/3e-05'),
            #(f'{path}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/32/128/5e-05', 'Maha', 'roberta/32/128/5e-05'),
            
            (f'{path1}/{tasks[2]}/{mc_types[2]}/{train_model[1]}/3e-05/32', 'Mahalanobis', 'roberta/3e-05/32'),

        ],
        
    },
    oos=True,
)

eval_table_oos

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn/snips/nuq/roberta-large/32/128/1e-05

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/snips/maha/roberta-large/3e-05/32

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test_sn/snips/nuq/roberta-large/32/128/1e-05

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/snips/maha/roberta-large/3e-05/32



Unnamed: 0,SNIPS|roc-auc,SNIPS|pr-auc
baseline (max_prob),87.2±2.7,65.1±6.0
NUQ|roberta/32/128/1e-05|epistemic,97.0±0.4,91.9±1.0
NUQ|roberta/32/128/1e-05|aleatoric,96.8±0.4,91.1±0.8
NUQ|roberta/32/128/1e-05|total,97.0±0.4,91.7±0.9
Mahalanobis|roberta/3e-05/32|mahalanobis_distance,97.0±0.4,91.5±1.3


# ROSTD

## Roberta-large, test, paper params

In [3]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "/mnt/users/avazhentsev/uncertainty-estimation-params/workdir/results/oos_test"
path1 = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/1e-05', 'NUQ', 'roberta/16/1e-5'),
            (f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/2e-05', 'NUQ', 'roberta/16/2e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/3e-05', 'NUQ', 'roberta/16/3e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/5e-05', 'NUQ', 'roberta/16/5e-5'),

            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/1e-05', 'NUQ', 'roberta/32/1e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/2e-05', 'NUQ', 'roberta/32/2e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/3e-05', 'NUQ', 'roberta/32/3e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/5e-05', 'NUQ', 'roberta/32/5e-5'),

            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/1e-05', 'Mahalanobis', 'roberta/16/1e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/2e-05', 'Mahalanobis', 'roberta/16/2e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/3e-05', 'Mahalanobis', 'roberta/16/3e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/5e-05', 'Mahalanobis', 'roberta/16/5e-5'),

            (f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/1e-05', 'Mahalanobis', 'roberta/32/1e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/2e-05', 'Mahalanobis', 'roberta/32/2e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/3e-05', 'Mahalanobis', 'roberta/32/3e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/5e-05', 'Mahalanobis', 'roberta/32/5e-5'),
            
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/16/1e-05', 'DPP', 'roberta/16/1e-5'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/16/2e-05', 'DPP', 'roberta/16/2e-5'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/16/3e-05', 'DPP', 'roberta/16/3e-5'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/16/5e-05', 'DPP', 'roberta/16/5e-5'),

            (f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/32/1e-05', 'DPP', 'roberta/32/1e-5'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/32/2e-05', 'DPP', 'roberta/32/2e-5'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/32/3e-05', 'DPP', 'roberta/32/3e-5'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/32/5e-05', 'DPP', 'roberta/32/5e-5'),
        ],
        'ROSTD|pr-auc' : [
            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/1e-05', 'NUQ', 'roberta/16/1e-5'),
            (f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/2e-05', 'NUQ', 'roberta/16/2e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/3e-05', 'NUQ', 'roberta/16/3e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/5e-05', 'NUQ', 'roberta/16/5e-5'),

            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/1e-05', 'NUQ', 'roberta/32/1e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/2e-05', 'NUQ', 'roberta/32/2e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/3e-05', 'NUQ', 'roberta/32/3e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/5e-05', 'NUQ', 'roberta/32/5e-5'),

            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/1e-05', 'Mahalanobis', 'roberta/16/1e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/2e-05', 'Mahalanobis', 'roberta/16/2e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/3e-05', 'Mahalanobis', 'roberta/16/3e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/5e-05', 'Mahalanobis', 'roberta/16/5e-5'),

            (f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/1e-05', 'Mahalanobis', 'roberta/32/1e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/2e-05', 'Mahalanobis', 'roberta/32/2e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/3e-05', 'Mahalanobis', 'roberta/32/3e-5'),
            #(f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/5e-05', 'Mahalanobis', 'roberta/32/5e-5'),
            
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/16/1e-05', 'DPP', 'roberta/16/1e-5'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/16/2e-05', 'DPP', 'roberta/16/2e-5'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/16/3e-05', 'DPP', 'roberta/16/3e-5'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/16/5e-05', 'DPP', 'roberta/16/5e-5'),

            (f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/32/1e-05', 'DPP', 'roberta/32/1e-5'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/32/2e-05', 'DPP', 'roberta/32/2e-5'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/32/3e-05', 'DPP', 'roberta/32/3e-5'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/32/5e-05', 'DPP', 'roberta/32/5e-5'),
        ],
        
    },
    oos=True,
    #metric='pr-auc'
)

eval_table_oos

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test/rostd/nuq/roberta-large/16/2e-05

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test/rostd/maha/roberta-large/32/1e-05

/mnt/users/avazhentsev/uncertainty-estimation-params/workdir/results/oos_test/rostd/DPP/roberta-large/32/1e-05

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test/rostd/nuq/roberta-large/16/2e-05

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/oos_test/rostd/maha/roberta-large/32/1e-05

/mnt/users/avazhentsev/uncertainty-estimation-params/workdir/results/oos_test/rostd/DPP/roberta-large/32/1e-05



Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),95.4±0.5,92.3±0.8
NUQ|roberta/16/2e-5|epistemic,99.7±0.1,99.2±0.4
NUQ|roberta/16/2e-5|aleatoric,99.6±0.1,98.8±0.4
NUQ|roberta/16/2e-5|total,99.7±0.1,99.1±0.4
Mahalanobis|roberta/32/1e-5|mahalanobis_distance,97.3±0.3,86.1±2.5
DPP|roberta/32/1e-5|bald,96.5±0.5,94.3±0.6
DPP|roberta/32/1e-5|sampled_max_prob,96.2±0.6,93.7±0.7
DPP|roberta/32/1e-5|variance,96.1±0.5,91.7±0.7
DPP|roberta/32/1e-5|var_ratio,81.5±1.9,71.9±2.8
DPP|roberta/32/1e-5|entropy,96.3±0.6,94.4±0.7


## Roberta-large, test, my params

In [7]:
#roberta, ROSTD, test

tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test"

eval_table_oos = build_eval_table(
    {
        'ROSTD' : [
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/4', 'NUQ', 'roberta/32/4'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/6', 'NUQ', 'roberta/32/6'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/8', 'NUQ', 'roberta/32/8'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/10', 'NUQ', 'roberta/32/10'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/4', 'Mahalanobis', 'roberta/32/4'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/6', 'Mahalanobis', 'roberta/32/6'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/8', 'Mahalanobis', 'roberta/32/8'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/10', 'Mahalanobis', 'roberta/32/10'),
            
            (f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/32/4', 'DPP', 'roberta/32/4'),
            (f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/32/6', 'DPP', 'roberta/32/6'),
            (f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/32/8', 'DPP', 'roberta/32/8'),
            (f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[1]}/32/10', 'DPP', 'roberta/32/10'),
        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test/rostd/nuq/roberta-large/32/4

../workdir/results/oos_test/rostd/nuq/roberta-large/32/6

../workdir/results/oos_test/rostd/nuq/roberta-large/32/8

../workdir/results/oos_test/rostd/nuq/roberta-large/32/10

../workdir/results/oos_test/rostd/maha/roberta-large/32/4

../workdir/results/oos_test/rostd/maha/roberta-large/32/6

../workdir/results/oos_test/rostd/maha/roberta-large/32/8

../workdir/results/oos_test/rostd/maha/roberta-large/32/10

../workdir/results/oos_test/rostd/DPP/roberta-large/32/4

../workdir/results/oos_test/rostd/DPP/roberta-large/32/6

../workdir/results/oos_test/rostd/DPP/roberta-large/32/8

../workdir/results/oos_test/rostd/DPP/roberta-large/32/10



Unnamed: 0,ROSTD
baseline (max_prob),72.6±16.3
NUQ|roberta/32/4|epistemic,98.8±0.9
NUQ|roberta/32/4|aleatoric,98.7±1.0
NUQ|roberta/32/4|total,98.8±0.9
NUQ|roberta/32/6|epistemic,99.0±0.0
NUQ|roberta/32/6|aleatoric,98.8±0.0
NUQ|roberta/32/6|total,98.9±0.0
NUQ|roberta/32/8|epistemic,97.3±0.2
NUQ|roberta/32/8|aleatoric,97.0±0.2
NUQ|roberta/32/8|total,97.1±0.2


## Electra, test, my params

In [5]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/4', 'NUQ', 'electra/32/4'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/google/electra-base-discriminator/32/6', 'nuq', 'electra/32/6'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/google/electra-base-discriminator/32/8', 'nuq', 'electra/32/8'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/google/electra-base-discriminator/32/10', 'nuq', 'electra/32/10'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/4', 'Mahalanobis', 'electra/32/4'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/google/electra-base-discriminator/32/6', 'maha', 'electra/32/6'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/google/electra-base-discriminator/32/8', 'maha', 'electra/32/8'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/google/electra-base-discriminator/32/10', 'maha', 'electra/32/10'),
            
            (f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[0]}/32/4', 'DPP', 'electra/32/4'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/32/6', 'DPP', 'electra/32/6'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/32/8', 'DPP', 'electra/32/8'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/32/10', 'DPP', 'electra/32/10'),
        ],
        'ROSTD|pr-auc' : [
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/4', 'NUQ', 'electra/32/4'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/google/electra-base-discriminator/32/6', 'nuq', 'electra/32/6'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/google/electra-base-discriminator/32/8', 'nuq', 'electra/32/8'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/google/electra-base-discriminator/32/10', 'nuq', 'electra/32/10'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/4', 'Mahalanobis', 'electra/32/4'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/google/electra-base-discriminator/32/6', 'maha', 'electra/32/6'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/google/electra-base-discriminator/32/8', 'maha', 'electra/32/8'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/google/electra-base-discriminator/32/10', 'maha', 'electra/32/10'),
            
            (f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[0]}/32/4', 'DPP', 'electra/32/4'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/32/6', 'DPP', 'electra/32/6'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/32/8', 'DPP', 'electra/32/8'),
            #(f'{path}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/32/10', 'DPP', 'electra/32/10'),
        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test/rostd/nuq/google/electra-base-discriminator/32/4

../workdir/results/oos_test/rostd/maha/google/electra-base-discriminator/32/4

../workdir/results/oos_test/rostd/DPP/google/electra-base-discriminator/32/4

../workdir/results/oos_test/rostd/nuq/google/electra-base-discriminator/32/4

../workdir/results/oos_test/rostd/maha/google/electra-base-discriminator/32/4

../workdir/results/oos_test/rostd/DPP/google/electra-base-discriminator/32/4



Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),78.2±6.5,66.4±10.0
NUQ|electra/32/4|epistemic,99.0±0.0,97.4±0.5
NUQ|electra/32/4|aleatoric,98.7±0.1,96.5±0.8
NUQ|electra/32/4|total,98.9±0.0,97.2±0.6
Mahalanobis|electra/32/4|mahalanobis_distance,93.1±2.7,75.7±5.2
DPP|electra/32/4|bald,82.7±5.6,73.0±9.2
DPP|electra/32/4|sampled_max_prob,79.9±6.3,68.7±9.9
DPP|electra/32/4|variance,81.7±5.8,70.9±9.6
DPP|electra/32/4|var_ratio,61.4±4.1,42.2±6.2
DPP|electra/32/4|entropy,80.1±6.3,69.5±9.8


## Electra, test, my params

In [17]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test"

eval_table_oos = build_eval_table(
    {
        'ROSTD' : [
            
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/1e-05', 'NUQ', 'electra/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/2e-05', 'NUQ', 'electra/16/2e-05'),

            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/1e-05', 'Mahalanobis', 'electra/16/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/2e-05', 'Mahalanobis', 'electra/16/2e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[0]}/16/1e-05', 'DPP', 'electra/16/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[0]}/{train_model[0]}/16/2e-05', 'DPP', 'electra/16/2e-05'),
        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test/rostd/nuq/google/electra-base-discriminator/16/1e-05

../workdir/results/oos_test/rostd/maha/google/electra-base-discriminator/16/2e-05

../workdir/results/oos_test/rostd/DPP/google/electra-base-discriminator/16/2e-05



Unnamed: 0,ROSTD
baseline (max_prob),75.8±5.2
NUQ|electra/16/1e-05|epistemic,99.2±0.2
NUQ|electra/16/1e-05|aleatoric,98.9±0.3
NUQ|electra/16/1e-05|total,99.1±0.2
Mahalanobis|electra/16/2e-05|mahalanobis_distance,93.9±1.4
DPP|electra/16/2e-05|bald,84.5±4.5
DPP|electra/16/2e-05|sampled_max_prob,77.7±3.9
DPP|electra/16/2e-05|variance,83.7±5.4
DPP|electra/16/2e-05|var_ratio,60.3±1.8
DPP|electra/16/2e-05|entropy,77.8±3.9


## Electra, NUQ, ROSTD,  test, SN

In [19]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/128/1e-05', 'NUQ', 'electra/128/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/128/2e-05', 'NUQ', 'electra/128/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/128/3e-05', 'NUQ', 'electra/128/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/128/5e-05', 'NUQ', 'electra/128/16/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/256/1e-05', 'NUQ', 'electra/256/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/256/2e-05', 'NUQ', 'electra/256/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/256/3e-05', 'NUQ', 'electra/256/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/256/5e-05', 'NUQ', 'electra/256/16/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/128/1e-05', 'NUQ', 'electra/128/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/128/2e-05', 'NUQ', 'electra/128/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/128/3e-05', 'NUQ', 'electra/128/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/128/5e-05', 'NUQ', 'electra/128/32/5e-05'),
            
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/256/1e-05', 'NUQ', 'electra/256/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/256/2e-05', 'NUQ', 'electra/256/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/256/3e-05', 'NUQ', 'electra/256/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/256/5e-05', 'NUQ', 'electra/256/32/5e-05'),
            

        ],
        'ROSTD|pr-auc' : [
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/128/1e-05', 'NUQ', 'electra/128/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/128/2e-05', 'NUQ', 'electra/128/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/128/3e-05', 'NUQ', 'electra/128/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/128/5e-05', 'NUQ', 'electra/128/16/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/256/1e-05', 'NUQ', 'electra/256/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/256/2e-05', 'NUQ', 'electra/256/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/256/3e-05', 'NUQ', 'electra/256/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/256/5e-05', 'NUQ', 'electra/256/16/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/128/1e-05', 'NUQ', 'electra/128/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/128/2e-05', 'NUQ', 'electra/128/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/128/3e-05', 'NUQ', 'electra/128/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/128/5e-05', 'NUQ', 'electra/128/32/5e-05'),
            
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/256/1e-05', 'NUQ', 'electra/256/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/256/2e-05', 'NUQ', 'electra/256/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/256/3e-05', 'NUQ', 'electra/256/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/256/5e-05', 'NUQ', 'electra/256/32/5e-05'),

        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/rostd/nuq/google/electra-base-discriminator/32/256/1e-05

../workdir/results/oos_test_sn/rostd/nuq/google/electra-base-discriminator/32/256/1e-05



Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),98.9±0.1,97.0±0.5
NUQ|electra/256/32/1e-05|epistemic,99.1±0.1,97.6±0.4
NUQ|electra/256/32/1e-05|aleatoric,98.9±0.2,96.7±0.6
NUQ|electra/256/32/1e-05|total,99.0±0.1,97.3±0.4


In [8]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"
path1 = "../workdir/results/mahalanobis_correct"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [

            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/256/1e-05', 'NUQ', 'electra/256/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/256/1e-05', 'maha', 'electra/256/32/1e-05'),
            (f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/3e-05/32', 'Mahalanobis', 'electra/3e-05/32'),

            

        ],
        'ROSTD|pr-auc' : [

            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/256/1e-05', 'NUQ', 'electra/256/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/256/1e-05', 'maha', 'electra/256/32/1e-05'),
            (f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/3e-05/32', 'Mahalanobis', 'electra/3e-05/32'),


        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/rostd/nuq/google/electra-base-discriminator/32/256/1e-05

../workdir/results/mahalanobis_correct/rostd/maha/google/electra-base-discriminator/3e-05/32

../workdir/results/oos_test_sn/rostd/nuq/google/electra-base-discriminator/32/256/1e-05

../workdir/results/mahalanobis_correct/rostd/maha/google/electra-base-discriminator/3e-05/32



Unnamed: 0,ROSTD|roc-auc,ROSTD|pr-auc
baseline (max_prob),74.5±7.7,60.8±11.9
NUQ|electra/256/32/1e-05|epistemic,99.1±0.1,97.6±0.4
NUQ|electra/256/32/1e-05|aleatoric,98.9±0.2,96.7±0.6
NUQ|electra/256/32/1e-05|total,99.0±0.1,97.3±0.4
Mahalanobis|electra/3e-05/32|mahalanobis_distance,99.3±0.2,98.0±0.8


## Roberta, NUQ, ROSTD,  test, SN

In [20]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/128/1e-05', 'NUQ', 'roberta/128/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/128/2e-05', 'NUQ', 'roberta/128/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/128/3e-05', 'NUQ', 'roberta/128/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/128/5e-05', 'NUQ', 'roberta/128/16/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/256/1e-05', 'NUQ', 'roberta/256/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/256/2e-05', 'NUQ', 'roberta/256/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/256/3e-05', 'NUQ', 'roberta/256/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/256/5e-05', 'NUQ', 'roberta/256/16/5e-05'),
            
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/128/1e-05', 'NUQ', 'roberta/128/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/128/2e-05', 'NUQ', 'roberta/128/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/128/3e-05', 'NUQ', 'roberta/128/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/128/5e-05', 'NUQ', 'roberta/128/32/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/256/1e-05', 'NUQ', 'roberta/256/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/256/2e-05', 'NUQ', 'roberta/256/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/256/3e-05', 'NUQ', 'roberta/256/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/256/5e-05', 'NUQ', 'roberta/256/32/5e-05'),
            

        ],
        'ROSTD|pr-auc' : [
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/128/1e-05', 'NUQ', 'roberta/128/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/128/2e-05', 'NUQ', 'roberta/128/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/128/3e-05', 'NUQ', 'roberta/128/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/128/5e-05', 'NUQ', 'roberta/128/16/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/256/1e-05', 'NUQ', 'roberta/256/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/256/2e-05', 'NUQ', 'roberta/256/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/256/3e-05', 'NUQ', 'roberta/256/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/256/5e-05', 'NUQ', 'roberta/256/16/5e-05'),
            
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/128/1e-05', 'NUQ', 'roberta/128/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/128/2e-05', 'NUQ', 'roberta/128/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/128/3e-05', 'NUQ', 'roberta/128/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/128/5e-05', 'NUQ', 'roberta/128/32/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/256/1e-05', 'NUQ', 'roberta/256/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/256/2e-05', 'NUQ', 'roberta/256/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/256/3e-05', 'NUQ', 'roberta/256/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/256/5e-05', 'NUQ', 'roberta/256/32/5e-05'),

        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/rostd/nuq/roberta-large/32/128/1e-05

../workdir/results/oos_test_sn/rostd/nuq/roberta-large/32/128/1e-05



Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),99.6±0.0,99.0±0.1
NUQ|roberta/128/32/1e-05|epistemic,99.7±0.0,99.2±0.0
NUQ|roberta/128/32/1e-05|aleatoric,99.5±0.0,98.6±0.1
NUQ|roberta/128/32/1e-05|total,99.6±0.0,99.2±0.0


In [6]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"
path1 = "../workdir/results/mahalanobis_correct"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            
            (f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/3e-05/32', 'Mahalanobis', 'roberta/3e-05/32'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/128/1e-05', 'NUQ', 'roberta/128/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/128/1e-05', 'maha', 'roberta/128/32/1e-05'),

            

        ],
        'ROSTD|pr-auc' : [

            (f'{path1}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/3e-05/32', 'Mahalanobis', 'roberta/3e-05/32'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/128/1e-05', 'NUQ', 'roberta/128/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/128/1e-05', 'maha', 'roberta/128/32/1e-05'),


        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/mahalanobis_correct/rostd/maha/roberta-large/3e-05/32

../workdir/results/oos_test_sn/rostd/nuq/roberta-large/32/128/1e-05

../workdir/results/mahalanobis_correct/rostd/maha/roberta-large/3e-05/32

../workdir/results/oos_test_sn/rostd/nuq/roberta-large/32/128/1e-05



Unnamed: 0,ROSTD|roc-auc,ROSTD|pr-auc
baseline (max_prob),99.6±0.0,99.0±0.1
Mahalanobis|roberta/3e-05/32|mahalanobis_distance,99.7±0.1,99.4±0.2
NUQ|roberta/128/32/1e-05|epistemic,99.7±0.0,99.2±0.0
NUQ|roberta/128/32/1e-05|aleatoric,99.5±0.0,98.6±0.1
NUQ|roberta/128/32/1e-05|total,99.6±0.0,99.2±0.0


## Electra, Maha, ROSTD,  test, SN

In [21]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/128/1e-05', 'maha', 'electra/128/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/128/2e-05', 'maha', 'electra/128/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/128/3e-05', 'maha', 'electra/128/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/128/5e-05', 'maha', 'electra/128/16/5e-05'),
        
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/256/1e-05', 'maha', 'electra/256/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/256/2e-05', 'maha', 'electra/256/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/256/3e-05', 'maha', 'electra/256/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/256/5e-05', 'maha', 'electra/256/16/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/128/1e-05', 'maha', 'electra/128/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/128/2e-05', 'maha', 'electra/128/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/128/3e-05', 'maha', 'electra/128/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/128/5e-05', 'maha', 'electra/128/32/5e-05'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/256/1e-05', 'maha', 'electra/256/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/256/2e-05', 'maha', 'electra/256/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/256/3e-05', 'maha', 'electra/256/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/256/5e-05', 'maha', 'electra/256/32/5e-05'),
            

        ],
        'ROSTD|pr-auc' : [
            
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/128/1e-05', 'maha', 'electra/128/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/128/2e-05', 'maha', 'electra/128/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/128/3e-05', 'maha', 'electra/128/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/128/5e-05', 'maha', 'electra/128/16/5e-05'),
        
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/256/1e-05', 'maha', 'electra/256/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/256/2e-05', 'maha', 'electra/256/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/256/3e-05', 'maha', 'electra/256/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/256/5e-05', 'maha', 'electra/256/16/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/128/1e-05', 'maha', 'electra/128/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/128/2e-05', 'maha', 'electra/128/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/128/3e-05', 'maha', 'electra/128/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/128/5e-05', 'maha', 'electra/128/32/5e-05'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/256/1e-05', 'maha', 'electra/256/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/256/2e-05', 'maha', 'electra/256/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/256/3e-05', 'maha', 'electra/256/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/256/5e-05', 'maha', 'electra/256/32/5e-05'),

        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/rostd/maha/google/electra-base-discriminator/32/256/1e-05

../workdir/results/oos_test_sn/rostd/maha/google/electra-base-discriminator/32/256/1e-05



Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),96.6±0.0,89.6±0.0
maha|electra/256/32/1e-05|mahalanobis_distance,96.6±0.0,89.6±0.0


## Roberta, Maha, ROSTD,  test, SN

In [22]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/128/1e-05', 'maha', 'roberta/128/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/128/2e-05', 'maha', 'roberta/128/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/128/3e-05', 'maha', 'roberta/128/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/128/5e-05', 'maha', 'roberta/128/16/5e-05'),
        
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/256/1e-05', 'maha', 'roberta/256/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/256/2e-05', 'maha', 'roberta/256/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/256/3e-05', 'maha', 'roberta/256/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/256/5e-05', 'maha', 'roberta/256/16/5e-05'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/128/1e-05', 'maha', 'roberta/128/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/128/2e-05', 'maha', 'roberta/128/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/128/3e-05', 'maha', 'roberta/128/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/128/5e-05', 'maha', 'roberta/128/32/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/256/1e-05', 'maha', 'roberta/256/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/256/2e-05', 'maha', 'roberta/256/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/256/3e-05', 'maha', 'roberta/256/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/256/5e-05', 'maha', 'roberta/256/32/5e-05'),
            

        ],
        'ROSTD|pr-auc' : [
            
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/128/1e-05', 'maha', 'roberta/128/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/128/2e-05', 'maha', 'roberta/128/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/128/3e-05', 'maha', 'roberta/128/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/128/5e-05', 'maha', 'roberta/128/16/5e-05'),
        
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/256/1e-05', 'maha', 'roberta/256/16/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/256/2e-05', 'maha', 'roberta/256/16/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/256/3e-05', 'maha', 'roberta/256/16/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/256/5e-05', 'maha', 'roberta/256/16/5e-05'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/128/1e-05', 'maha', 'roberta/128/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/128/2e-05', 'maha', 'roberta/128/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/128/3e-05', 'maha', 'roberta/128/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/128/5e-05', 'maha', 'roberta/128/32/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/256/1e-05', 'maha', 'roberta/256/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/256/2e-05', 'maha', 'roberta/256/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/256/3e-05', 'maha', 'roberta/256/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/256/5e-05', 'maha', 'roberta/256/32/5e-05'),

        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/rostd/maha/roberta-large/32/128/1e-05

../workdir/results/oos_test_sn/rostd/maha/roberta-large/32/128/1e-05



Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),97.4±0.6,90.9±1.8
maha|roberta/128/32/1e-05|mahalanobis_distance,97.4±0.6,90.9±1.8


## Electra, NUQ, CLINC,  test, SN

In [13]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/128/3e-05', 'NUQ', 'electra/128/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/128/5e-05', 'NUQ', 'electra/128/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/128/7e-05', 'NUQ', 'electra/128/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/128/0.0001', 'NUQ', 'electra/128/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/32/3e-05', 'NUQ', 'electra/32/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/32/5e-05', 'NUQ', 'electra/32/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/32/7e-05', 'NUQ', 'electra/32/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/32/0.0001', 'NUQ', 'electra/32/16/1e-04'),
            
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/128/3e-05', 'NUQ', 'electra/128/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/128/5e-05', 'NUQ', 'electra/128/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/128/7e-05', 'NUQ', 'electra/128/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/128/0.0001', 'NUQ', 'electra/128/32/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/32/3e-05', 'NUQ', 'electra/32/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/32/5e-05', 'NUQ', 'electra/32/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/32/7e-05', 'NUQ', 'electra/32/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/32/0.0001', 'NUQ', 'electra/32/32/1e-04'),

        ],
        'ROSTD|pr-auc' : [
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/128/3e-05', 'NUQ', 'electra/128/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/128/5e-05', 'NUQ', 'electra/128/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/128/7e-05', 'NUQ', 'electra/128/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/128/0.0001', 'NUQ', 'electra/128/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/32/3e-05', 'NUQ', 'electra/32/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/32/5e-05', 'NUQ', 'electra/32/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/32/7e-05', 'NUQ', 'electra/32/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/32/0.0001', 'NUQ', 'electra/32/16/1e-04'),
            
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/128/3e-05', 'NUQ', 'electra/128/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/128/5e-05', 'NUQ', 'electra/128/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/128/7e-05', 'NUQ', 'electra/128/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/128/0.0001', 'NUQ', 'electra/128/32/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/32/3e-05', 'NUQ', 'electra/32/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/32/5e-05', 'NUQ', 'electra/32/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/32/7e-05', 'NUQ', 'electra/32/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/32/0.0001', 'NUQ', 'electra/32/32/1e-04'),

        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/clinc_oos/nuq/google/electra-base-discriminator/32/128/3e-05

../workdir/results/oos_test_sn/clinc_oos/nuq/google/electra-base-discriminator/32/128/3e-05



Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),96.4±0.3,85.4±0.3
NUQ|electra/128/32/3e-05|epistemic,96.8±0.1,86.0±0.2
NUQ|electra/128/32/3e-05|aleatoric,95.8±0.5,80.7±1.8
NUQ|electra/128/32/3e-05|total,96.6±0.2,85.8±0.0


## Electra, Maha, CLINC,  test, SN

In [14]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/128/3e-05', 'Maha', 'electra/128/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/128/5e-05', 'Maha', 'electra/128/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/128/7e-05', 'Maha', 'electra/128/16/7e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/128/0.0001', 'Maha', 'electra/128/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/32/3e-05', 'Maha', 'electra/32/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/32/5e-05', 'Maha', 'electra/32/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/32/7e-05', 'Maha', 'electra/32/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/32/0.0001', 'Maha', 'electra/32/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/128/3e-05', 'Maha', 'electra/128/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/128/5e-05', 'Maha', 'electra/128/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/128/7e-05', 'Maha', 'electra/128/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/128/0.0001', 'Maha', 'electra/128/32/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/32/3e-05', 'Maha', 'electra/32/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/32/5e-05', 'Maha', 'electra/32/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/32/7e-05', 'Maha', 'electra/32/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/32/0.0001', 'Maha', 'electra/32/32/1e-04'),

        ],
        'ROSTD|pr-auc' : [
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/128/3e-05', 'Maha', 'electra/128/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/128/5e-05', 'Maha', 'electra/128/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/128/7e-05', 'Maha', 'electra/128/16/7e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/128/0.0001', 'Maha', 'electra/128/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/32/3e-05', 'Maha', 'electra/32/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/32/5e-05', 'Maha', 'electra/32/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/32/7e-05', 'Maha', 'electra/32/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/32/0.0001', 'Maha', 'electra/32/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/128/3e-05', 'Maha', 'electra/128/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/128/5e-05', 'Maha', 'electra/128/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/128/7e-05', 'Maha', 'electra/128/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/128/0.0001', 'Maha', 'electra/128/32/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/32/3e-05', 'Maha', 'electra/32/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/32/5e-05', 'Maha', 'electra/32/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/32/7e-05', 'Maha', 'electra/32/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/32/0.0001', 'Maha', 'electra/32/32/1e-04'),

        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/clinc_oos/maha/google/electra-base-discriminator/16/128/0.0001

../workdir/results/oos_test_sn/clinc_oos/maha/google/electra-base-discriminator/16/128/0.0001



Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),96.6±0.2,85.0±0.8
Maha|electra/128/16/1e-04|mahalanobis_distance,96.6±0.2,85.0±0.8


## Roberta, NUQ, CLINC,  test, SN

In [15]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/128/3e-05', 'NUQ', 'roberta/128/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/128/5e-05', 'NUQ', 'roberta/128/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/128/7e-05', 'NUQ', 'roberta/128/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/128/0.0001', 'NUQ', 'roberta/128/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/32/3e-05', 'NUQ', 'roberta/32/16/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/32/5e-05', 'NUQ', 'roberta/32/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/32/7e-05', 'NUQ', 'roberta/32/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/32/0.0001', 'NUQ', 'roberta/32/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/128/3e-05', 'NUQ', 'roberta/128/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/128/5e-05', 'NUQ', 'roberta/128/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/128/7e-05', 'NUQ', 'roberta/128/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/128/0.0001', 'NUQ', 'roberta/128/32/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/32/3e-05', 'NUQ', 'roberta/32/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/32/5e-05', 'NUQ', 'roberta/32/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/32/7e-05', 'NUQ', 'roberta/32/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/32/0.0001', 'NUQ', 'roberta/32/32/1e-04'),

        ],
        'ROSTD|pr-auc' : [
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/128/3e-05', 'NUQ', 'roberta/128/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/128/5e-05', 'NUQ', 'roberta/128/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/128/7e-05', 'NUQ', 'roberta/128/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/128/0.0001', 'NUQ', 'roberta/128/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/32/3e-05', 'NUQ', 'roberta/32/16/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/32/5e-05', 'NUQ', 'roberta/32/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/32/7e-05', 'NUQ', 'roberta/32/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/32/0.0001', 'NUQ', 'roberta/32/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/128/3e-05', 'NUQ', 'roberta/128/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/128/5e-05', 'NUQ', 'roberta/128/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/128/7e-05', 'NUQ', 'roberta/128/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/128/0.0001', 'NUQ', 'roberta/128/32/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/32/3e-05', 'NUQ', 'roberta/32/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/32/5e-05', 'NUQ', 'roberta/32/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/32/7e-05', 'NUQ', 'roberta/32/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/32/0.0001', 'NUQ', 'roberta/32/32/1e-04'),

        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/clinc_oos/nuq/roberta-large/16/32/5e-05

../workdir/results/oos_test_sn/clinc_oos/nuq/roberta-large/16/32/5e-05



Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),97.7±0.5,87.3±4.4
NUQ|roberta/32/16/5e-05|epistemic,97.7±0.5,86.8±5.3
NUQ|roberta/32/16/5e-05|aleatoric,97.6±0.5,87.7±3.7
NUQ|roberta/32/16/5e-05|total,97.7±0.5,86.8±5.3


## Roberta, Maha, CLINC,  test, SN

In [16]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/128/3e-05', 'Maha', 'roberta/128/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/128/5e-05', 'Maha', 'roberta/128/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/128/7e-05', 'Maha', 'roberta/128/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/128/0.0001', 'Maha', 'roberta/128/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/32/3e-05', 'Maha', 'roberta/32/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/32/5e-05', 'Maha', 'roberta/32/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/32/7e-05', 'Maha', 'roberta/32/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/32/0.0001', 'Maha', 'roberta/32/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/128/3e-05', 'Maha', 'roberta/128/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/128/5e-05', 'Maha', 'roberta/128/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/128/7e-05', 'Maha', 'roberta/128/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/128/0.0001', 'Maha', 'roberta/128/32/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/32/3e-05', 'Maha', 'roberta/32/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/32/5e-05', 'Maha', 'roberta/32/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/32/7e-05', 'Maha', 'roberta/32/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/32/0.0001', 'Maha', 'roberta/32/32/1e-04'),

        ],
        'ROSTD|pr-auc' : [
            
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/128/3e-05', 'Maha', 'roberta/128/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/128/5e-05', 'Maha', 'roberta/128/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/128/7e-05', 'Maha', 'roberta/128/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/128/0.0001', 'Maha', 'roberta/128/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/32/3e-05', 'Maha', 'roberta/32/16/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/32/5e-05', 'Maha', 'roberta/32/16/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/32/7e-05', 'Maha', 'roberta/32/16/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/32/0.0001', 'Maha', 'roberta/32/16/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/128/3e-05', 'Maha', 'roberta/128/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/128/5e-05', 'Maha', 'roberta/128/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/128/7e-05', 'Maha', 'roberta/128/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/128/0.0001', 'Maha', 'roberta/128/32/1e-04'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/32/3e-05', 'Maha', 'roberta/32/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/32/5e-05', 'Maha', 'roberta/32/32/5e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/32/7e-05', 'Maha', 'roberta/32/32/7e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/32/0.0001', 'Maha', 'roberta/32/32/1e-04'),

        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/clinc_oos/maha/roberta-large/16/128/3e-05

../workdir/results/oos_test_sn/clinc_oos/maha/roberta-large/16/128/3e-05



Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),97.6±0.1,89.8±0.8
Maha|roberta/128/16/3e-05|mahalanobis_distance,97.6±0.1,89.8±0.8


In [3]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"
path1 = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct"

eval_table_oos = build_eval_table(
    {
        'CLINC|roc-auc' : [
            
            (f'{path1}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/1e-05/16', 'Mahalanobis', 'roberta/1e-05/16'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/128/3e-05', 'Maha', 'roberta/128/16/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/32/5e-05', 'NUQ', 'roberta/32/16/5e-05'),

        ],
        'CLINC|pr-auc' : [
            
            (f'{path1}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/1e-05/16', 'Mahalanobis', 'roberta/1e-05/16'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/128/3e-05', 'Maha', 'roberta/128/16/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/32/5e-05', 'NUQ', 'roberta/32/16/5e-05'),

        ],
    },
    oos=True
)

eval_table_oos

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/roberta-large/1e-05/16

../workdir/results/oos_test_sn/clinc_oos/nuq/roberta-large/16/32/5e-05

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/roberta-large/1e-05/16

../workdir/results/oos_test_sn/clinc_oos/nuq/roberta-large/16/32/5e-05



Unnamed: 0,CLINC|roc-auc,CLINC|pr-auc
baseline (max_prob),97.7±0.5,87.3±4.4
Mahalanobis|roberta/1e-05/16|mahalanobis_distance,98.4±0.1,94.6±0.3
NUQ|roberta/32/16/5e-05|epistemic,97.7±0.5,86.8±5.3
NUQ|roberta/32/16/5e-05|aleatoric,97.6±0.5,87.7±3.7
NUQ|roberta/32/16/5e-05|total,97.7±0.5,86.8±5.3


In [4]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"
path1 = "/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct"

eval_table_oos = build_eval_table(
    {
        'CLINC|roc-auc' : [
            
            (f'{path1}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/5e-05/32', 'Mahalanobis', 'electra/5e-05/32'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/128/0.0001', 'Maha', 'electra/128/16/1e-04'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/128/3e-05', 'NUQ', 'electra/128/32/3e-05'),

        ],
        'CLINC|pr-auc' : [
            
            (f'{path1}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/5e-05/32', 'Mahalanobis', 'electra/5e-05/32'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/128/0.0001', 'Maha', 'electra/128/16/1e-04'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/128/3e-05', 'NUQ', 'electra/128/32/3e-05'),

        ],
    },
    oos=True
)

eval_table_oos

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/google/electra-base-discriminator/5e-05/32

../workdir/results/oos_test_sn/clinc_oos/nuq/google/electra-base-discriminator/32/128/3e-05

/mnt/users/avazhentsev/uncertainty-estimation/workdir/results/mahalanobis_correct/clinc_oos/maha/google/electra-base-discriminator/5e-05/32

../workdir/results/oos_test_sn/clinc_oos/nuq/google/electra-base-discriminator/32/128/3e-05



Unnamed: 0,CLINC|roc-auc,CLINC|pr-auc
baseline (max_prob),96.4±0.3,85.4±0.3
Mahalanobis|electra/5e-05/32|mahalanobis_distance,97.5±0.1,89.2±0.4
NUQ|electra/128/32/3e-05|epistemic,96.8±0.1,86.0±0.2
NUQ|electra/128/32/3e-05|aleatoric,95.8±0.5,80.7±1.8
NUQ|electra/128/32/3e-05|total,96.6±0.2,85.8±0.0


In [8]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/1e-05', 'NUQ', 'electra/16/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/2e-05', 'NUQ', 'electra/16/2e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/3e-05', 'NUQ', 'electra/16/3e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/5e-05', 'NUQ', 'electra/16/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/1e-05', 'NUQ', 'electra/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/2e-05', 'NUQ', 'electra/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/3e-05', 'NUQ', 'electra/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/5e-05', 'NUQ', 'electra/32/5e-05'),

            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/1e-05', 'Mahalanobis', 'electra/16/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/2e-05', 'Mahalanobis', 'electra/16/2e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/3e-05', 'Mahalanobis', 'electra/16/3e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/5e-05', 'Mahalanobis', 'electra/16/5e-05'),
        
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/1e-05', 'Mahalanobis', 'electra/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/2e-05', 'Mahalanobis', 'electra/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/3e-05', 'Mahalanobis', 'electra/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/5e-05', 'Mahalanobis', 'electra/32/5e-05'),

        ],
        'ROSTD|pr-auc' : [
            
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/1e-05', 'NUQ', 'electra/16/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/2e-05', 'NUQ', 'electra/16/2e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/3e-05', 'NUQ', 'electra/16/3e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/16/5e-05', 'NUQ', 'electra/16/5e-05'),
            
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/1e-05', 'NUQ', 'electra/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/2e-05', 'NUQ', 'electra/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/3e-05', 'NUQ', 'electra/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[0]}/32/5e-05', 'NUQ', 'electra/32/5e-05'),

            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/1e-05', 'Mahalanobis', 'electra/16/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/2e-05', 'Mahalanobis', 'electra/16/2e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/3e-05', 'Mahalanobis', 'electra/16/3e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/16/5e-05', 'Mahalanobis', 'electra/16/5e-05'),

            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/1e-05', 'Mahalanobis', 'electra/32/1e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/2e-05', 'Mahalanobis', 'electra/32/2e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/3e-05', 'Mahalanobis', 'electra/32/3e-05'),
            #(f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[0]}/32/5e-05', 'Mahalanobis', 'electra/32/5e-05'),

        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/rostd/nuq/google/electra-base-discriminator/16/1e-05

../workdir/results/oos_test_sn/rostd/nuq/google/electra-base-discriminator/16/2e-05

../workdir/results/oos_test_sn/rostd/nuq/google/electra-base-discriminator/16/3e-05

../workdir/results/oos_test_sn/rostd/nuq/google/electra-base-discriminator/16/5e-05

../workdir/results/oos_test_sn/rostd/maha/google/electra-base-discriminator/16/1e-05

../workdir/results/oos_test_sn/rostd/maha/google/electra-base-discriminator/16/2e-05

../workdir/results/oos_test_sn/rostd/maha/google/electra-base-discriminator/16/3e-05

../workdir/results/oos_test_sn/rostd/maha/google/electra-base-discriminator/16/5e-05

../workdir/results/oos_test_sn/rostd/nuq/google/electra-base-discriminator/16/1e-05

../workdir/results/oos_test_sn/rostd/nuq/google/electra-base-discriminator/16/2e-05

../workdir/results/oos_test_sn/rostd/nuq/google/electra-base-discriminator/16/3e-05

../workdir/results/oos_test_sn/rostd/nuq/google/electra-base-

Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),89.2±2.7,75.9±3.5
NUQ|electra/16/1e-05|epistemic,98.3±0.3,94.6±0.8
NUQ|electra/16/1e-05|aleatoric,97.8±0.4,93.0±1.5
NUQ|electra/16/1e-05|total,98.0±0.4,93.8±1.1
NUQ|electra/16/2e-05|epistemic,98.1±0.6,93.2±4.0
NUQ|electra/16/2e-05|aleatoric,97.7±0.8,91.8±3.9
NUQ|electra/16/2e-05|total,97.9±0.8,92.3±4.6
NUQ|electra/16/3e-05|epistemic,98.0±0.6,94.0±2.6
NUQ|electra/16/3e-05|aleatoric,97.7±0.7,92.5±3.3
NUQ|electra/16/3e-05|total,97.8±0.6,93.4±3.1


In [9]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"

eval_table_oos = build_eval_table(
    {
        'ROSTD|roc-auc' : [
            
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/1e-05', 'NUQ', 'roberta/16/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/2e-05', 'NUQ', 'roberta/16/2e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/3e-05', 'NUQ', 'roberta/16/3e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/5e-05', 'NUQ', 'roberta/16/5e-05'),
            
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/1e-05', 'NUQ', 'roberta/32/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/2e-05', 'NUQ', 'roberta/32/2e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/3e-05', 'NUQ', 'roberta/32/3e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/5e-05', 'NUQ', 'roberta/32/5e-05'),

            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/1e-05', 'Mahalanobis', 'roberta/16/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/2e-05', 'Mahalanobis', 'roberta/16/2e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/3e-05', 'Mahalanobis', 'roberta/16/3e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/5e-05', 'Mahalanobis', 'roberta/16/5e-05'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/1e-05', 'Mahalanobis', 'roberta/32/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/2e-05', 'Mahalanobis', 'roberta/32/2e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/3e-05', 'Mahalanobis', 'roberta/32/3e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/5e-05', 'Mahalanobis', 'roberta/32/5e-05'),
        
        ],
        'ROSTD|pr-auc' : [

            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/1e-05', 'NUQ', 'roberta/16/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/2e-05', 'NUQ', 'roberta/16/2e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/3e-05', 'NUQ', 'roberta/16/3e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/16/5e-05', 'NUQ', 'roberta/16/5e-05'),
            
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/1e-05', 'NUQ', 'roberta/32/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/2e-05', 'NUQ', 'roberta/32/2e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/3e-05', 'NUQ', 'roberta/32/3e-05'),
            (f'{path}/{tasks[1]}/{mc_types[1]}/{train_model[1]}/32/5e-05', 'NUQ', 'roberta/32/5e-05'),

            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/1e-05', 'Mahalanobis', 'roberta/16/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/2e-05', 'Mahalanobis', 'roberta/16/2e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/3e-05', 'Mahalanobis', 'roberta/16/3e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/16/5e-05', 'Mahalanobis', 'roberta/16/5e-05'),
            
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/1e-05', 'Mahalanobis', 'roberta/32/1e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/2e-05', 'Mahalanobis', 'roberta/32/2e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/3e-05', 'Mahalanobis', 'roberta/32/3e-05'),
            (f'{path}/{tasks[1]}/{mc_types[2]}/{train_model[1]}/32/5e-05', 'Mahalanobis', 'roberta/32/5e-05'),

        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/rostd/nuq/roberta-large/16/1e-05

../workdir/results/oos_test_sn/rostd/nuq/roberta-large/16/2e-05

../workdir/results/oos_test_sn/rostd/nuq/roberta-large/16/3e-05

../workdir/results/oos_test_sn/rostd/nuq/roberta-large/16/5e-05

../workdir/results/oos_test_sn/rostd/nuq/roberta-large/32/1e-05

../workdir/results/oos_test_sn/rostd/nuq/roberta-large/32/2e-05

../workdir/results/oos_test_sn/rostd/nuq/roberta-large/32/3e-05

../workdir/results/oos_test_sn/rostd/nuq/roberta-large/32/5e-05

../workdir/results/oos_test_sn/rostd/maha/roberta-large/16/1e-05

../workdir/results/oos_test_sn/rostd/maha/roberta-large/16/2e-05

../workdir/results/oos_test_sn/rostd/maha/roberta-large/16/3e-05

../workdir/results/oos_test_sn/rostd/maha/roberta-large/16/5e-05

../workdir/results/oos_test_sn/rostd/maha/roberta-large/32/1e-05

../workdir/results/oos_test_sn/rostd/maha/roberta-large/32/2e-05

../workdir/results/oos_test_sn/rostd/maha/roberta-large/32/3e-05

../workdir/results

Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),97.7±1.1,90.1±1.7
NUQ|roberta/16/1e-05|epistemic,99.6±0.1,98.9±0.1
NUQ|roberta/16/1e-05|aleatoric,99.4±0.1,98.2±0.3
NUQ|roberta/16/1e-05|total,99.5±0.1,98.8±0.2
NUQ|roberta/16/2e-05|epistemic,99.7±0.0,99.1±0.1
NUQ|roberta/16/2e-05|aleatoric,99.6±0.1,98.8±0.1
NUQ|roberta/16/2e-05|total,99.7±0.0,99.1±0.1
NUQ|roberta/16/3e-05|epistemic,99.4±0.3,98.4±0.7
NUQ|roberta/16/3e-05|aleatoric,99.3±0.3,97.9±0.8
NUQ|roberta/16/3e-05|total,99.3±0.3,98.3±0.8


In [10]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"

eval_table_oos = build_eval_table(
    {
        'CLINC|roc-auc' : [
            
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/1e-05', 'NUQ', 'electra/16/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/2e-05', 'NUQ', 'electra/16/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/3e-05', 'NUQ', 'electra/16/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/5e-05', 'NUQ', 'electra/16/5e-05'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/1e-05', 'NUQ', 'electra/32/1e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/2e-05', 'NUQ', 'electra/32/2e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/3e-05', 'NUQ', 'electra/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/5e-05', 'NUQ', 'electra/32/5e-05'),

            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/1e-05', 'Mahalanobis', 'electra/16/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/2e-05', 'Mahalanobis', 'electra/16/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/3e-05', 'Mahalanobis', 'electra/16/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/5e-05', 'Mahalanobis', 'electra/16/5e-05'),
        
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/1e-05', 'Mahalanobis', 'electra/32/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/2e-05', 'Mahalanobis', 'electra/32/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/3e-05', 'Mahalanobis', 'electra/32/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/5e-05', 'Mahalanobis', 'electra/32/5e-05'),

        ],
        'CLINC|pr-auc' : [
            
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/1e-05', 'NUQ', 'electra/16/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/2e-05', 'NUQ', 'electra/16/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/3e-05', 'NUQ', 'electra/16/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/5e-05', 'NUQ', 'electra/16/5e-05'),
            
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/1e-05', 'NUQ', 'electra/32/1e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/2e-05', 'NUQ', 'electra/32/2e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/3e-05', 'NUQ', 'electra/32/3e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/5e-05', 'NUQ', 'electra/32/5e-05'),

            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/1e-05', 'Mahalanobis', 'electra/16/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/2e-05', 'Mahalanobis', 'electra/16/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/3e-05', 'Mahalanobis', 'electra/16/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/5e-05', 'Mahalanobis', 'electra/16/5e-05'),

            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/1e-05', 'Mahalanobis', 'electra/32/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/2e-05', 'Mahalanobis', 'electra/32/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/3e-05', 'Mahalanobis', 'electra/32/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/5e-05', 'Mahalanobis', 'electra/32/5e-05'),

        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/clinc_oos/nuq/google/electra-base-discriminator/16/1e-05

../workdir/results/oos_test_sn/clinc_oos/nuq/google/electra-base-discriminator/16/2e-05

../workdir/results/oos_test_sn/clinc_oos/nuq/google/electra-base-discriminator/16/3e-05

../workdir/results/oos_test_sn/clinc_oos/nuq/google/electra-base-discriminator/16/5e-05

../workdir/results/oos_test_sn/clinc_oos/maha/google/electra-base-discriminator/16/1e-05

../workdir/results/oos_test_sn/clinc_oos/maha/google/electra-base-discriminator/16/2e-05

../workdir/results/oos_test_sn/clinc_oos/maha/google/electra-base-discriminator/16/3e-05

../workdir/results/oos_test_sn/clinc_oos/maha/google/electra-base-discriminator/16/5e-05

../workdir/results/oos_test_sn/clinc_oos/maha/google/electra-base-discriminator/32/1e-05

../workdir/results/oos_test_sn/clinc_oos/maha/google/electra-base-discriminator/32/2e-05

../workdir/results/oos_test_sn/clinc_oos/maha/google/electra-base-discriminator/32/3e-05

../workdir/res

Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),96.2±0.2,83.1±1.4
NUQ|electra/16/1e-05|epistemic,94.2±0.7,77.6±2.1
NUQ|electra/16/1e-05|aleatoric,89.5±1.1,59.4±1.3
NUQ|electra/16/1e-05|total,93.3±0.8,76.6±2.0
NUQ|electra/16/2e-05|epistemic,96.6±0.1,84.3±1.1
NUQ|electra/16/2e-05|aleatoric,95.6±0.5,79.2±1.5
NUQ|electra/16/2e-05|total,96.4±0.2,84.0±1.3
NUQ|electra/16/3e-05|epistemic,96.5±0.3,80.4±3.6
NUQ|electra/16/3e-05|aleatoric,95.9±0.3,78.2±3.9
NUQ|electra/16/3e-05|total,96.4±0.3,80.2±3.6


In [11]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test_sn"

eval_table_oos = build_eval_table(
    {
        'CLINC|roc-auc' : [
            
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/1e-05', 'NUQ', 'roberta/16/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/2e-05', 'NUQ', 'roberta/16/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/3e-05', 'NUQ', 'roberta/16/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/5e-05', 'NUQ', 'roberta/16/5e-05'),
            
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/1e-05', 'NUQ', 'roberta/32/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/2e-05', 'NUQ', 'roberta/32/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/3e-05', 'NUQ', 'roberta/32/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/5e-05', 'NUQ', 'roberta/32/5e-05'),

            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/1e-05', 'Mahalanobis', 'roberta/16/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/2e-05', 'Mahalanobis', 'roberta/16/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/3e-05', 'Mahalanobis', 'roberta/16/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/5e-05', 'Mahalanobis', 'roberta/16/5e-05'),
        
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/1e-05', 'Mahalanobis', 'roberta/32/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/2e-05', 'Mahalanobis', 'roberta/32/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/3e-05', 'Mahalanobis', 'roberta/32/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/5e-05', 'Mahalanobis', 'roberta/32/5e-05'),

        ],
        'CLINC|pr-auc' : [
            
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/1e-05', 'NUQ', 'roberta/16/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/2e-05', 'NUQ', 'roberta/16/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/3e-05', 'NUQ', 'roberta/16/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/5e-05', 'NUQ', 'roberta/16/5e-05'),
            
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/1e-05', 'NUQ', 'roberta/32/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/2e-05', 'NUQ', 'roberta/32/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/3e-05', 'NUQ', 'roberta/32/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/5e-05', 'NUQ', 'roberta/32/5e-05'),

            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/1e-05', 'Mahalanobis', 'roberta/16/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/2e-05', 'Mahalanobis', 'roberta/16/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/3e-05', 'Mahalanobis', 'roberta/16/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/5e-05', 'Mahalanobis', 'roberta/16/5e-05'),

            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/1e-05', 'Mahalanobis', 'roberta/32/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/2e-05', 'Mahalanobis', 'roberta/32/2e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/3e-05', 'Mahalanobis', 'roberta/32/3e-05'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/5e-05', 'Mahalanobis', 'roberta/32/5e-05'),

        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test_sn/clinc_oos/nuq/roberta-large/16/1e-05

../workdir/results/oos_test_sn/clinc_oos/nuq/roberta-large/16/2e-05

../workdir/results/oos_test_sn/clinc_oos/nuq/roberta-large/16/3e-05

../workdir/results/oos_test_sn/clinc_oos/nuq/roberta-large/16/5e-05

../workdir/results/oos_test_sn/clinc_oos/nuq/roberta-large/32/1e-05

../workdir/results/oos_test_sn/clinc_oos/nuq/roberta-large/32/2e-05

../workdir/results/oos_test_sn/clinc_oos/nuq/roberta-large/32/3e-05

../workdir/results/oos_test_sn/clinc_oos/nuq/roberta-large/32/5e-05

../workdir/results/oos_test_sn/clinc_oos/maha/roberta-large/16/1e-05

../workdir/results/oos_test_sn/clinc_oos/maha/roberta-large/16/2e-05

../workdir/results/oos_test_sn/clinc_oos/maha/roberta-large/16/3e-05

../workdir/results/oos_test_sn/clinc_oos/maha/roberta-large/16/5e-05

../workdir/results/oos_test_sn/clinc_oos/maha/roberta-large/32/1e-05

../workdir/results/oos_test_sn/clinc_oos/maha/roberta-large/32/2e-05

../workdir/results/oos_test_

Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),97.4±0.1,88.9±1.2
NUQ|roberta/16/1e-05|epistemic,97.4±0.4,84.5±5.5
NUQ|roberta/16/1e-05|aleatoric,97.2±0.5,85.4±6.6
NUQ|roberta/16/1e-05|total,97.4±0.5,84.4±5.6
NUQ|roberta/16/2e-05|epistemic,97.0±0.1,79.0±0.3
NUQ|roberta/16/2e-05|aleatoric,97.0±0.3,81.8±3.1
NUQ|roberta/16/2e-05|total,96.9±0.2,78.9±0.3
NUQ|roberta/16/3e-05|epistemic,81.8±26.2,59.3±35.4
NUQ|roberta/16/3e-05|aleatoric,81.2±26.9,58.9±35.5
NUQ|roberta/16/3e-05|total,81.7±26.1,59.2±35.4


# CLINC_OOS

## roberta, test, paper params

In [3]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "/mnt/users/avazhentsev/uncertainty-estimation-params/workdir/results/oos_test"

eval_table_oos = build_eval_table(
    {
        'clinc_oos|roc-auc' : [
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/4', 'NUQ', 'roberta/16/4'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/4', 'Mahalanobis', 'roberta/16/4'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[1]}/16/4', 'DPP', 'roberta/16/4'),
        ],
        'clinc_oos|pr-auc' : [
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/16/4', 'NUQ', 'roberta/16/4'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/16/4', 'Mahalanobis', 'roberta/16/4'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[1]}/16/4', 'DPP', 'roberta/16/4'),
        ],
    },
    oos=True
)

eval_table_oos

/mnt/users/avazhentsev/uncertainty-estimation-params/workdir/results/oos_test/clinc_oos/nuq/roberta-large/16/4

/mnt/users/avazhentsev/uncertainty-estimation-params/workdir/results/oos_test/clinc_oos/maha/roberta-large/16/4

/mnt/users/avazhentsev/uncertainty-estimation-params/workdir/results/oos_test/clinc_oos/DPP/roberta-large/16/4

/mnt/users/avazhentsev/uncertainty-estimation-params/workdir/results/oos_test/clinc_oos/nuq/roberta-large/16/4

/mnt/users/avazhentsev/uncertainty-estimation-params/workdir/results/oos_test/clinc_oos/maha/roberta-large/16/4

/mnt/users/avazhentsev/uncertainty-estimation-params/workdir/results/oos_test/clinc_oos/DPP/roberta-large/16/4
0.0712886424581054 0.2938853587328287
0.15529095888137823 0.9841610654862598
0.005112031405357828 0.06011288733463707
0.0 0.92
1.1913960134246129 4.919537572325503
0.07294227878028448 0.301697885037596
0.15552629649639127 0.9812759801838546
0.005437949466124164 0.07551509897561254
0.0 0.92
1.1954991698543094 4.920801683147808

Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),97.1±0.4,90.7±0.6
NUQ|roberta/16/4|epistemic,97.2±0.6,81.4±5.4
NUQ|roberta/16/4|aleatoric,97.1±0.6,81.6±5.1
NUQ|roberta/16/4|total,97.2±0.6,81.3±5.4
Mahalanobis|roberta/16/4|mahalanobis_distance,97.2±0.1,88.7±0.8
DPP|roberta/16/4|bald,96.9±0.2,90.4±1.0
DPP|roberta/16/4|sampled_max_prob,97.1±0.2,91.5±0.4
DPP|roberta/16/4|variance,41.3±8.5,21.2±4.5
DPP|roberta/16/4|var_ratio,89.6±0.8,78.7±0.6
DPP|roberta/16/4|entropy,97.4±0.2,92.5±0.2


## roberta, test, my params

In [50]:
#roberta, clinc_oos, test
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test"

eval_table_oos = build_eval_table(
    {
        'clinc_oos' : [
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/4', 'NUQ', 'roberta/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/6', 'NUQ', 'roberta/32/6'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/8', 'NUQ', 'roberta/32/8'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/10', 'NUQ', 'roberta/32/10'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[1]}/32/40', 'NUQ', 'roberta/32/40'),
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/4', 'Mahalanobis', 'roberta/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/6', 'Mahalanobis', 'roberta/32/6'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[1]}/32/8', 'Mahalanobis', 'roberta/32/8'),
            
            (f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[1]}/32/4', 'DPP', 'roberta/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[1]}/32/6', 'DPP', 'roberta/32/6'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[1]}/32/8', 'DPP', 'roberta/32/8'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[1]}/32/10', 'DPP', 'roberta/32/10'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[1]}/32/40', 'DPP', 'roberta/32/40'),            
        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test/clinc_oos/nuq/roberta-large/32/4

../workdir/results/oos_test/clinc_oos/maha/roberta-large/32/8

../workdir/results/oos_test/clinc_oos/DPP/roberta-large/32/4



Unnamed: 0,clinc_oos
baseline (max_prob),96.8±0.2
NUQ|roberta/32/4|epistemic,97.2±0.3
NUQ|roberta/32/4|aleatoric,97.1±0.3
NUQ|roberta/32/4|total,97.1±0.3
Mahalanobis|roberta/32/8|mahalanobis_distance,97.5±0.3
DPP|roberta/32/4|bald,96.9±0.3
DPP|roberta/32/4|sampled_max_prob,96.9±0.2
DPP|roberta/32/4|variance,95.1±0.2
DPP|roberta/32/4|var_ratio,84.5±0.8
DPP|roberta/32/4|entropy,97.1±0.2


## electra, test, my params

In [7]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test"

eval_table_oos = build_eval_table(
    {
        'clinc_oos|roc-auc' : [

            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/4', 'NUQ', 'electra/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/6', 'NUQ', 'electra/32/6'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/8', 'NUQ', 'electra/32/8'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/10', 'NUQ', 'electra/32/10'),
            
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/4', 'SNGP', 'electra/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/6', 'SNGP', 'electra/32/6'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/8', 'SNGP', 'electra/32/8'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/10', 'Mahalanobis', 'electra/32/10'),
            
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/32/4', 'DPP', 'electra/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/32/6', 'DPP', 'electra/32/6'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/32/8', 'DPP', 'electra/32/8'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/32/10', 'DPP', 'electra/32/10'),
  
            #(f'{path}/{tasks[0]}/{mc_types[3]}/{train_model[0]}/32/4', 'SNGP', 'electra/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[3]}/{train_model[0]}/32/6', 'SNGP', 'electra/32/6'),
            #(f'{path}/{tasks[0]}/{mc_types[3]}/{train_model[0]}/32/8', 'SNGP', 'electra/32/8'),
            #(f'{path}/{tasks[0]}/{mc_types[3]}/{train_model[0]}/32/10', 'SNGP', 'electra/32/10'),
        ],
        'clinc_oos|pr-auc' : [

            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/4', 'NUQ', 'electra/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/6', 'NUQ', 'electra/32/6'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/8', 'NUQ', 'electra/32/8'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/10', 'NUQ', 'electra/32/10'),
            
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/4', 'SNGP', 'electra/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/6', 'SNGP', 'electra/32/6'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/8', 'SNGP', 'electra/32/8'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/10', 'Mahalanobis', 'electra/32/10'),
            
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/32/4', 'DPP', 'electra/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/32/6', 'DPP', 'electra/32/6'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/32/8', 'DPP', 'electra/32/8'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/32/10', 'DPP', 'electra/32/10'),
  
            #(f'{path}/{tasks[0]}/{mc_types[3]}/{train_model[0]}/32/4', 'SNGP', 'electra/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[3]}/{train_model[0]}/32/6', 'SNGP', 'electra/32/6'),
            #(f'{path}/{tasks[0]}/{mc_types[3]}/{train_model[0]}/32/8', 'SNGP', 'electra/32/8'),
            #(f'{path}/{tasks[0]}/{mc_types[3]}/{train_model[0]}/32/10', 'SNGP', 'electra/32/10'),
        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test/clinc_oos/nuq/google/electra-base-discriminator/32/10

../workdir/results/oos_test/clinc_oos/maha/google/electra-base-discriminator/32/10

../workdir/results/oos_test/clinc_oos/DPP/google/electra-base-discriminator/32/8

../workdir/results/oos_test/clinc_oos/nuq/google/electra-base-discriminator/32/10

../workdir/results/oos_test/clinc_oos/maha/google/electra-base-discriminator/32/10

../workdir/results/oos_test/clinc_oos/DPP/google/electra-base-discriminator/32/8



Unnamed: 0,roc-auc,pr-auc
baseline (max_prob),96.0±0.1,84.3±0.4
NUQ|electra/32/10|epistemic,97.1±0.1,81.9±0.6
NUQ|electra/32/10|aleatoric,97.2±0.1,86.0±1.2
NUQ|electra/32/10|total,97.1±0.1,81.9±0.6
Mahalanobis|electra/32/10|mahalanobis_distance,97.1±0.1,87.1±0.2
DPP|electra/32/8|bald,96.0±0.2,86.9±1.1
DPP|electra/32/8|sampled_max_prob,96.0±0.2,85.4±0.4
DPP|electra/32/8|variance,95.0±0.3,78.7±2.0
DPP|electra/32/8|var_ratio,89.4±0.0,75.9±0.4
DPP|electra/32/8|entropy,96.3±0.2,87.3±0.3


In [3]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test"

eval_table_oos = build_eval_table(
    {
        'clinc_oos' : [

            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/4', 'NUQ', 'electra/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/6', 'NUQ', 'electra/32/6'),
            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/8', 'NUQ', 'electra/32/8'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/32/10', 'NUQ', 'electra/32/10'),
            
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/4', 'SNGP', 'electra/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/6', 'SNGP', 'electra/32/6'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/8', 'SNGP', 'electra/32/8'),
            (f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/32/10', 'Mahalanobis', 'electra/32/10'),
            
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/32/4', 'DPP', 'electra/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/32/6', 'DPP', 'electra/32/6'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/32/8', 'DPP', 'electra/32/8'),
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/32/10', 'DPP', 'electra/32/10'),
  
            #(f'{path}/{tasks[0]}/{mc_types[3]}/{train_model[0]}/32/4', 'SNGP', 'electra/32/4'),
            #(f'{path}/{tasks[0]}/{mc_types[3]}/{train_model[0]}/32/6', 'SNGP', 'electra/32/6'),
            #(f'{path}/{tasks[0]}/{mc_types[3]}/{train_model[0]}/32/8', 'SNGP', 'electra/32/8'),
            #(f'{path}/{tasks[0]}/{mc_types[3]}/{train_model[0]}/32/10', 'SNGP', 'electra/32/10'),
        ],
    },
    oos=True,
    metric='pr-auc'
)

eval_table_oos

../workdir/results/oos_test/clinc_oos/nuq/google/electra-base-discriminator/32/10

../workdir/results/oos_test/clinc_oos/maha/google/electra-base-discriminator/32/10

../workdir/results/oos_test/clinc_oos/DPP/google/electra-base-discriminator/32/8



Unnamed: 0,clinc_oos
baseline (max_prob),84.3±0.4
NUQ|electra/32/10|epistemic,81.9±0.5
NUQ|electra/32/10|aleatoric,85.7±3.6
NUQ|electra/32/10|total,81.9±0.5
Mahalanobis|electra/32/10|mahalanobis_distance,87.0±0.1
DPP|electra/32/8|bald,86.9±1.1
DPP|electra/32/8|sampled_max_prob,85.4±0.4
DPP|electra/32/8|variance,78.6±2.0
DPP|electra/32/8|var_ratio,81.0±0.7
DPP|electra/32/8|entropy,87.3±0.3


## electra, test, my params

In [16]:
tasks = ['clinc_oos', 'rostd']
train_model = ['google/electra-base-discriminator', 'roberta-large']
mc_types = ['DPP', 'nuq', 'maha', 'sngp']
path = "../workdir/results/oos_test"

eval_table_oos = build_eval_table(
    {
        'clinc_oos' : [

            #(f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/1e-05', 'NUQ', 'electra/16/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[1]}/{train_model[0]}/16/2e-05', 'NUQ', 'electra/16/2e-05'),

            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/1e-05', 'Mahalanobis', 'electra/16/1e-05'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/{train_model[0]}/16/2e-05', 'Mahalanobis', 'electra/16/2e-05'),
            
            #(f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/16/1e-05', 'DPP', 'electra/16/1e-05'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/{train_model[0]}/16/2e-05', 'DPP', 'electra/16/2e-05'),
        ],
    },
    oos=True
)

eval_table_oos

../workdir/results/oos_test/clinc_oos/nuq/google/electra-base-discriminator/16/2e-05

../workdir/results/oos_test/clinc_oos/DPP/google/electra-base-discriminator/16/2e-05



Unnamed: 0,clinc_oos
baseline (max_prob),92.4±1.3
NUQ|electra/16/2e-05|epistemic,96.8±0.1
NUQ|electra/16/2e-05|aleatoric,95.8±0.2
NUQ|electra/16/2e-05|total,96.7±0.1
DPP|electra/16/2e-05|bald,78.5±0.8
DPP|electra/16/2e-05|sampled_max_prob,92.1±1.3
DPP|electra/16/2e-05|variance,15.7±1.6
DPP|electra/16/2e-05|var_ratio,88.6±1.4
DPP|electra/16/2e-05|entropy,90.5±1.3


## Mahalanobis, val, clinc, electra

In [7]:
tasks = ['clinc_oos']
mc_types = ['mahalanobis']
path = "../workdir/results/maha/"

eval_table_maha = build_eval_table(
    {
        'clinc_oos' : [
            (f'{path}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/32/4', 'mahalanobis', 'electra/32/4'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/32/8', 'mahalanobis', 'electra/32/8'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/32/12', 'mahalanobis', 'electra/32/12'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/32/16', 'mahalanobis', 'electra/32/16'),
            
            (f'{path}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/64/4', 'mahalanobis', 'electra/64/4'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/64/8', 'mahalanobis', 'electra/64/8'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/64/12', 'mahalanobis', 'electra/64/12'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/64/16', 'mahalanobis', 'electra/64/16'),
            
            (f'{path}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/32/4', 'mahalanobis', 'bert/32/4'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/32/8', 'mahalanobis', 'bert/32/8'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/32/12', 'mahalanobis', 'bert/32/12'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/32/16', 'mahalanobis', 'bert/32/16'),
            
            (f'{path}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/64/4', 'mahalanobis', 'bert/64/4'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/64/8', 'mahalanobis', 'bert/64/8'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/64/12', 'mahalanobis', 'bert/64/12'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/64/16', 'mahalanobis', 'bert/64/16'),
        ]
    },
)

eval_table_maha

../workdir/results/maha/clinc_oos/mahalanobis/google/electra-base-discriminator/32/4

../workdir/results/maha/clinc_oos/mahalanobis/google/electra-base-discriminator/32/8

../workdir/results/maha/clinc_oos/mahalanobis/google/electra-base-discriminator/32/12

../workdir/results/maha/clinc_oos/mahalanobis/google/electra-base-discriminator/32/16

../workdir/results/maha/clinc_oos/mahalanobis/google/electra-base-discriminator/64/4

../workdir/results/maha/clinc_oos/mahalanobis/google/electra-base-discriminator/64/8

../workdir/results/maha/clinc_oos/mahalanobis/google/electra-base-discriminator/64/12

../workdir/results/maha/clinc_oos/mahalanobis/google/electra-base-discriminator/64/16

../workdir/results/maha/clinc_oos/mahalanobis/bert-base-uncased/32/4

../workdir/results/maha/clinc_oos/mahalanobis/bert-base-uncased/32/8

../workdir/results/maha/clinc_oos/mahalanobis/bert-base-uncased/32/12

../workdir/results/maha/clinc_oos/mahalanobis/bert-base-uncased/32/16

../workdir/results/maha/cl

Unnamed: 0,clinc_oos
baseline (max_prob),94.4±1.2
mahalanobis|electra/32/4|mahalanobis_distance,96.9±0.4
mahalanobis|electra/32/8|mahalanobis_distance,97.3±0.3
mahalanobis|electra/32/12|mahalanobis_distance,97.3±0.4
mahalanobis|electra/32/16|mahalanobis_distance,97.3±0.3
mahalanobis|electra/64/4|mahalanobis_distance,96.2±0.3
mahalanobis|electra/64/8|mahalanobis_distance,97.2±0.2
mahalanobis|electra/64/12|mahalanobis_distance,97.4±0.3
mahalanobis|electra/64/16|mahalanobis_distance,97.5±0.3
mahalanobis|bert/32/4|mahalanobis_distance,80.2±3.0


## Symptoms, DeepEnsemble

In [2]:
tasks = ['symptoms']
mc_types = ['MC']
path = "../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_train_63/"

eval_table_sde = build_eval_table(
    {
        'symptoms' : [
            (f'{path}/{tasks[0]}/{mc_types[0]}/last/10', '-', '-'),
        ]
    },
    metric='rejection-curve-auc',
    de=True
)

eval_table_sde

../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_train_63/symptoms/MC/last/10



Unnamed: 0,symptoms
baseline (max_prob),56.626±0.162
-|-|bald,-2.381±0.185
-|-|sampled_max_prob,0.949±0.098
-|-|variance,-3.956±0.238
-|-|var_ratio,-2.726±0.311
-|-|entropy,0.587±0.073


## Symptoms, MC

In [3]:
tasks = ['symptoms']
mc_types = ['MC']
path = "../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_train_63/"

eval_table_sy = build_eval_table(
    {
        'symptoms' : [
            (f'{path}/{tasks[0]}/{mc_types[0]}/last/10', 'MC', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/all/10', 'MC', 'all'),
        ]
    },
    metric='rejection-curve-auc',
)

eval_table_sy

../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_train_63/symptoms/MC/last/10

../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_train_63/symptoms/MC/all/10



Unnamed: 0,symptoms
baseline (max_prob),56.999±0.192
MC|last|bald,-0.665±0.150
MC|last|sampled_max_prob,0.017±0.032
MC|last|variance,-3.065±0.390
MC|last|var_ratio,-7.269±0.240
MC|last|entropy,-0.219±0.082
MC|all|bald,-2.819±0.371
MC|all|sampled_max_prob,0.269±0.070
MC|all|variance,-3.916±0.510
MC|all|var_ratio,-5.172±0.243


In [4]:
pd.concat([eval_table_sde, eval_table_sy.iloc[1:]])

Unnamed: 0,symptoms
baseline (max_prob),56.626±0.162
-|-|bald,-2.381±0.185
-|-|sampled_max_prob,0.949±0.098
-|-|variance,-3.956±0.238
-|-|var_ratio,-2.726±0.311
-|-|entropy,0.587±0.073
MC|last|bald,-0.665±0.150
MC|last|sampled_max_prob,0.017±0.032
MC|last|variance,-3.065±0.390
MC|last|var_ratio,-7.269±0.240


In [11]:
tasks = ['cola', 'mrpc', 'sst2']
mc_types = ['SNGP']
path3 = "../workdir/results/glue_sngp"
batch_size = 64

eval_table = build_eval_table(
    {
        'cola' : [           
            (f'{path3}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/4/{batch_size}', 'SNGP', 'bert/4'),
            (f'{path3}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/8/{batch_size}', 'SNGP', 'bert/8'),
            (f'{path3}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/12/{batch_size}', 'SNGP', 'bert/12'),
            (f'{path3}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/16/{batch_size}', 'SNGP', 'bert/16'),
            
            (f'{path3}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/4/{batch_size}', 'SNGP', 'electra/4'),
            (f'{path3}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/8/{batch_size}', 'SNGP', 'electra/8'),
            (f'{path3}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/12/{batch_size}', 'SNGP', 'electra/12'),  
            (f'{path3}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/16/{batch_size}', 'SNGP', 'electra/16'),  
        ],
        
        'mrpc' : [           
            (f'{path3}/{tasks[1]}/{mc_types[0]}/bert-base-uncased/4/{batch_size}', 'SNGP', 'bert/4'),
            (f'{path3}/{tasks[1]}/{mc_types[0]}/bert-base-uncased/8/{batch_size}', 'SNGP', 'bert/8'),
            (f'{path3}/{tasks[1]}/{mc_types[0]}/bert-base-uncased/12/{batch_size}', 'SNGP', 'bert/12'),
            (f'{path3}/{tasks[1]}/{mc_types[0]}/bert-base-uncased/16/{batch_size}', 'SNGP', 'bert/16'),
            
            (f'{path3}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/4/{batch_size}', 'SNGP', 'electra/4'),
            (f'{path3}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/8/{batch_size}', 'SNGP', 'electra/8'),
            (f'{path3}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/12/{batch_size}', 'SNGP', 'electra/12'),
            (f'{path3}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/16/{batch_size}', 'SNGP', 'electra/16'),  
        ],
        
        'sst2' : [           
            (f'{path3}/{tasks[2]}/{mc_types[0]}/bert-base-uncased/4/{batch_size}', 'SNGP', 'bert/4'),
        #    (f'{path3}/{tasks[2]}/{mc_types[0]}/bert-base-uncased/8/{batch_size}', 'SNGP', 'bert/8'),
        #    (f'{path3}/{tasks[2]}/{mc_types[0]}/bert-base-uncased/12/{batch_size}', 'SNGP', 'bert/12'),
        #    (f'{path3}/{tasks[2]}/{mc_types[0]}/bert-base-uncased/16/{batch_size}', 'SNGP', 'bert/16'),
        #
            (f'{path3}/{tasks[2]}/{mc_types[0]}/google/electra-base-discriminator/4/{batch_size}', 'SNGP', 'electra/4'),
        #    (f'{path3}/{tasks[2]}/{mc_types[0]}/google/electra-base-discriminator/8/{batch_size}', 'SNGP', 'electra/8'),
        #    (f'{path3}/{tasks[2]}/{mc_types[0]}/google/electra-base-discriminator/12/{batch_size}', 'SNGP', 'electra/12'),
        #    (f'{path3}/{tasks[2]}/{mc_types[0]}/google/electra-base-discriminator/16/{batch_size}', 'SNGP', 'electra/16'),  
        ]
    },
    task_type='classification',
    metric='rejection-curve-auc'
)

eval_table

../workdir/results/glue_sngp/cola/SNGP/bert-base-uncased/4/64

../workdir/results/glue_sngp/cola/SNGP/bert-base-uncased/8/64

../workdir/results/glue_sngp/cola/SNGP/bert-base-uncased/12/64

../workdir/results/glue_sngp/cola/SNGP/bert-base-uncased/16/64

../workdir/results/glue_sngp/cola/SNGP/google/electra-base-discriminator/4/64

../workdir/results/glue_sngp/cola/SNGP/google/electra-base-discriminator/8/64

../workdir/results/glue_sngp/cola/SNGP/google/electra-base-discriminator/12/64

../workdir/results/glue_sngp/cola/SNGP/google/electra-base-discriminator/16/64

../workdir/results/glue_sngp/mrpc/SNGP/bert-base-uncased/4/64

../workdir/results/glue_sngp/mrpc/SNGP/bert-base-uncased/8/64

../workdir/results/glue_sngp/mrpc/SNGP/bert-base-uncased/12/64

../workdir/results/glue_sngp/mrpc/SNGP/bert-base-uncased/16/64

../workdir/results/glue_sngp/mrpc/SNGP/google/electra-base-discriminator/4/64

../workdir/results/glue_sngp/mrpc/SNGP/google/electra-base-discriminator/8/64

../workdir/resul

Unnamed: 0,cola,mrpc,sst2
baseline (max_prob),90.862±0.322,87.581±0.467,93.740±0.238
SNGP|bert/4|bald,88.498±0.391,86.745±0.351,93.203±0.195
SNGP|bert/4|sampled_max_prob,88.494±0.163,86.657±0.304,93.136±0.188
SNGP|bert/4|variance,88.488±0.341,86.702±0.335,93.179±0.184
SNGP|bert/4|var_ratio,87.365±0.284,85.002±0.454,91.149±0.233
SNGP|bert/4|entropy,88.494±0.163,86.657±0.304,93.136±0.188
SNGP|bert/8|bald,88.563±0.112,86.679±0.332,
SNGP|bert/8|sampled_max_prob,88.455±0.383,86.251±0.265,
SNGP|bert/8|variance,88.535±0.113,86.656±0.300,
SNGP|bert/8|var_ratio,87.731±0.496,84.566±0.730,


In [4]:
tasks = ['cola', 'mrpc', 'sst2']
mc_types = ['NUQ']
path3 = "../workdir/results/glue_nuq"

eval_table = build_eval_table(
    {
        'cola' : [           
            (f'{path3}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/4', 'NUQ', 'bert/4'),
            (f'{path3}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/8', 'NUQ', 'bert/8'),
            (f'{path3}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/12', 'NUQ', 'bert/12'),
            
            (f'{path3}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/4', 'NUQ', 'electra/4'),
            (f'{path3}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/8', 'NUQ', 'electra/8'),
            (f'{path3}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/12', 'NUQ', 'electra/12'),        
        ],
        
        'mrpc' : [           
            (f'{path3}/{tasks[1]}/{mc_types[0]}/bert-base-uncased/4', 'NUQ', 'bert/4'),
            (f'{path3}/{tasks[1]}/{mc_types[0]}/bert-base-uncased/8', 'NUQ', 'bert/8'),
            (f'{path3}/{tasks[1]}/{mc_types[0]}/bert-base-uncased/12', 'NUQ', 'bert/12'),
            
            (f'{path3}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/4', 'NUQ', 'electra/4'),
            (f'{path3}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/8', 'NUQ', 'electra/8'),
            (f'{path3}/{tasks[1]}/{mc_types[0]}/google/electra-base-discriminator/12', 'NUQ', 'electra/12'),
        ],
        
        'sst2' : [           
            (f'{path3}/{tasks[2]}/{mc_types[0]}/bert-base-uncased/4', 'NUQ', 'bert/4'),
            (f'{path3}/{tasks[2]}/{mc_types[0]}/bert-base-uncased/8', 'NUQ', 'bert/8'),
            (f'{path3}/{tasks[2]}/{mc_types[0]}/bert-base-uncased/12', 'NUQ', 'bert/12'),
        
            (f'{path3}/{tasks[2]}/{mc_types[0]}/google/electra-base-discriminator/4', 'NUQ', 'electra/4'),
            (f'{path3}/{tasks[2]}/{mc_types[0]}/google/electra-base-discriminator/8', 'NUQ', 'electra/8'),
            (f'{path3}/{tasks[2]}/{mc_types[0]}/google/electra-base-discriminator/12', 'NUQ', 'electra/12'),        
        ],
    },
    task_type='classification',
    metric='rejection-curve-auc'
)

eval_table

../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/4

../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/8

../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/12

../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/4

../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/8

../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/12

../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/4

../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/8

../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/12

../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/4

../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/8

../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/12

../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/4

../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/8

../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/12

../workdir/results/glue_nuq/s

Unnamed: 0,cola,mrpc,sst2
baseline (max_prob),92.137±0.120,87.678±0.353,93.524±0.184
NUQ|bert/4|epistemic,90.207±0.152,86.248±0.816,93.288±0.145
NUQ|bert/4|aleatoric,90.130±0.165,85.949±0.663,93.295±0.125
NUQ|bert/4|total,90.193±0.155,86.210±0.792,93.306±0.127
NUQ|bert/8|epistemic,89.812±0.338,86.140±0.337,92.918±0.355
NUQ|bert/8|aleatoric,89.791±0.368,86.129±0.332,92.918±0.355
NUQ|bert/8|total,89.810±0.338,86.135±0.330,92.922±0.352
NUQ|bert/12|epistemic,89.790±0.394,86.471±0.256,92.612±0.103
NUQ|bert/12|aleatoric,89.790±0.393,86.451±0.274,92.593±0.077
NUQ|bert/12|total,89.791±0.393,86.466±0.258,92.603±0.087


In [10]:
tasks = ['clinc_oos']
mc_types = ['DPP', 'SNGP', 'NUQ']
path1 = "../workdir/results/dpp_oos"
path2 = "../workdir/results/sngp_oos"
path3 = "../workdir/results/nuq_oos"
eval_table = build_eval_table(
    {
        'clinc_oos' : [           
            #(f'{path1}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/20/100/0.3/40', 'DPP', 'electra/0.3'),
            (f'{path1}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/20/100/0.4/40', 'DPP', 'electra/0.4'),
            #(f'{path1}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/20/100/0.3/40', 'DPP', 'bert/0.3'),
            #(f'{path1}/{tasks[0]}/{mc_types[0]}/bert-base-uncased/20/100/0.4/40', 'DPP', 'bert/0.4'),
            
            (f'{path2}/{tasks[0]}/{mc_types[1]}/google/electra-base-discriminator/12', 'SNGP', 'electra/12'),
            (f'{path2}/{tasks[0]}/{mc_types[1]}/google/electra-base-discriminator/16', 'SNGP', 'electra/16'),
            (f'{path2}/{tasks[0]}/{mc_types[1]}/google/electra-base-discriminator/20', 'SNGP', 'electra/20'),
            #(f'{path2}/{tasks[0]}/{mc_types[1]}/google/electra-base-discriminator/10', 'SNGP', 'electra/10'),
                
            (f'{path2}/{tasks[0]}/{mc_types[1]}/bert-base-uncased/4', 'SNGP', 'bert/4'),
            (f'{path2}/{tasks[0]}/{mc_types[1]}/bert-base-uncased/6', 'SNGP', 'bert/6'),
            (f'{path2}/{tasks[0]}/{mc_types[1]}/bert-base-uncased/8', 'SNGP', 'bert/8'),
            (f'{path2}/{tasks[0]}/{mc_types[1]}/bert-base-uncased/10', 'SNGP', 'bert/10'),
            
            (f'{path3}/{tasks[0]}/{mc_types[2]}/google/electra-base-discriminator/4', 'NUQ', 'electra/4'),
            (f'{path3}/{tasks[0]}/{mc_types[2]}/google/electra-base-discriminator/6', 'NUQ', 'electra/6'),
            (f'{path3}/{tasks[0]}/{mc_types[2]}/google/electra-base-discriminator/8', 'NUQ', 'electra/8'),
            (f'{path3}/{tasks[0]}/{mc_types[2]}/google/electra-base-discriminator/10', 'NUQ', 'electra/10'),
            
            (f'{path3}/{tasks[0]}/{mc_types[2]}/bert-base-uncased/4', 'NUQ', 'bert/4'),
            (f'{path3}/{tasks[0]}/{mc_types[2]}/bert-base-uncased/6', 'NUQ', 'bert/6'),
            (f'{path3}/{tasks[0]}/{mc_types[2]}/bert-base-uncased/8', 'NUQ', 'bert/8'),
            (f'{path3}/{tasks[0]}/{mc_types[2]}/bert-base-uncased/10', 'NUQ', 'bert/10'),

        ],
    },
    task_type='classification',
    oos=True
)

eval_table

../workdir/results/dpp_oos/clinc_oos/DPP/google/electra-base-discriminator/20/100/0.4/40

../workdir/results/sngp_oos/clinc_oos/SNGP/google/electra-base-discriminator/12

../workdir/results/sngp_oos/clinc_oos/SNGP/google/electra-base-discriminator/16

../workdir/results/sngp_oos/clinc_oos/SNGP/google/electra-base-discriminator/20

../workdir/results/sngp_oos/clinc_oos/SNGP/bert-base-uncased/4

../workdir/results/sngp_oos/clinc_oos/SNGP/bert-base-uncased/6

../workdir/results/sngp_oos/clinc_oos/SNGP/bert-base-uncased/8

../workdir/results/sngp_oos/clinc_oos/SNGP/bert-base-uncased/10

../workdir/results/nuq_oos/clinc_oos/NUQ/google/electra-base-discriminator/4

../workdir/results/nuq_oos/clinc_oos/NUQ/google/electra-base-discriminator/6

../workdir/results/nuq_oos/clinc_oos/NUQ/google/electra-base-discriminator/8

../workdir/results/nuq_oos/clinc_oos/NUQ/google/electra-base-discriminator/10

../workdir/results/nuq_oos/clinc_oos/NUQ/bert-base-uncased/4

../workdir/results/nuq_oos/clinc_oo

Unnamed: 0,clinc_oos
baseline (max_prob),95.7±0.5
DPP|electra/0.4|bald,0.6±0.6
DPP|electra/0.4|sampled_max_prob,0.4±0.2
DPP|electra/0.4|variance,-1.3±0.6
DPP|electra/0.4|var_ratio,-2.4±0.5
...,...
NUQ|bert/8|aleatoric,-0.3±0.3
NUQ|bert/8|total,0.1±0.1
NUQ|bert/10|epistemic,0.1±0.1
NUQ|bert/10|aleatoric,-0.7±0.5


In [27]:
eval_table.iloc[[0, 5, 23, 48]]

Unnamed: 0,clinc_oos
baseline (max_prob),95.7±0.5
DPP|electra/0.4|entropy,97.4±0.3
SNGP|electra/20|stds,96.8±0.7
NUQ|electra/4|epistemic,97.5±0.4


In [8]:
tasks = ['clinc_oos']
mc_types = ['DPP', 'SNGP', 'NUQ', 'MC']
path = "../workdir/results/oos"
path1 = "../workdir/results/dpp_oos"
path2 = "../workdir/results/sngp_oos"
path3 = "../workdir/results/nuq_oos"
eval_table = build_eval_table(
    {
        'clinc_oos' : [           
            (f'{path1}/{tasks[0]}/{mc_types[0]}/google/electra-base-discriminator/20/100/0.4/40', 'DPP_on_masks', 'electra/0.4'),

            (f'{path}/{tasks[0]}/{mc_types[0]}/last', 'DPP_with_OOD', 'DPP/last'),
            
            (f'{path}/{tasks[0]}/{mc_types[3]}/last', 'MC', 'MC/last'),
            
            (f'{path}/{tasks[0]}/{mc_types[3]}/all', 'MC', 'MC/all'),

            (f'{path2}/{tasks[0]}/{mc_types[1]}/google/electra-base-discriminator/12', 'SNGP', 'electra/12'),
            (f'{path2}/{tasks[0]}/{mc_types[1]}/google/electra-base-discriminator/16', 'SNGP', 'electra/16'),
            (f'{path2}/{tasks[0]}/{mc_types[1]}/google/electra-base-discriminator/20', 'SNGP', 'electra/20'),
                
            (f'{path2}/{tasks[0]}/{mc_types[1]}/bert-base-uncased/4', 'SNGP', 'bert/4'),
            (f'{path2}/{tasks[0]}/{mc_types[1]}/bert-base-uncased/6', 'SNGP', 'bert/6'),
            (f'{path2}/{tasks[0]}/{mc_types[1]}/bert-base-uncased/8', 'SNGP', 'bert/8'),
            (f'{path2}/{tasks[0]}/{mc_types[1]}/bert-base-uncased/10', 'SNGP', 'bert/10'),
            
            (f'{path3}/{tasks[0]}/{mc_types[2]}/google/electra-base-discriminator/4', 'NUQ', 'electra/4'),
            (f'{path3}/{tasks[0]}/{mc_types[2]}/google/electra-base-discriminator/6', 'NUQ', 'electra/6'),
            (f'{path3}/{tasks[0]}/{mc_types[2]}/google/electra-base-discriminator/8', 'NUQ', 'electra/8'),
            (f'{path3}/{tasks[0]}/{mc_types[2]}/google/electra-base-discriminator/10', 'NUQ', 'electra/10'),
            
            (f'{path3}/{tasks[0]}/{mc_types[2]}/bert-base-uncased/4', 'NUQ', 'bert/4'),
            (f'{path3}/{tasks[0]}/{mc_types[2]}/bert-base-uncased/6', 'NUQ', 'bert/6'),
            (f'{path3}/{tasks[0]}/{mc_types[2]}/bert-base-uncased/8', 'NUQ', 'bert/8'),
            (f'{path3}/{tasks[0]}/{mc_types[2]}/bert-base-uncased/10', 'NUQ', 'bert/10'),
            
        ],
    },
    task_type='classification',
    oos=True
)

eval_table

../workdir/results/dpp_oos/clinc_oos/DPP/google/electra-base-discriminator/20/100/0.4/40

../workdir/results/oos/clinc_oos/DPP/last

../workdir/results/oos/clinc_oos/MC/last

../workdir/results/oos/clinc_oos/MC/all

../workdir/results/sngp_oos/clinc_oos/SNGP/google/electra-base-discriminator/12

../workdir/results/sngp_oos/clinc_oos/SNGP/google/electra-base-discriminator/16

../workdir/results/sngp_oos/clinc_oos/SNGP/google/electra-base-discriminator/20

../workdir/results/sngp_oos/clinc_oos/SNGP/bert-base-uncased/4

../workdir/results/sngp_oos/clinc_oos/SNGP/bert-base-uncased/6

../workdir/results/sngp_oos/clinc_oos/SNGP/bert-base-uncased/8

../workdir/results/sngp_oos/clinc_oos/SNGP/bert-base-uncased/10

../workdir/results/nuq_oos/clinc_oos/NUQ/google/electra-base-discriminator/4

../workdir/results/nuq_oos/clinc_oos/NUQ/google/electra-base-discriminator/6

../workdir/results/nuq_oos/clinc_oos/NUQ/google/electra-base-discriminator/8

../workdir/results/nuq_oos/clinc_oos/NUQ/google/el

Unnamed: 0,clinc_oos
baseline (max_prob),95.7±0.5
DPP_on_masks|electra/0.4|bald,97.2±0.6
DPP_on_masks|electra/0.4|sampled_max_prob,97.0±0.3
DPP_on_masks|electra/0.4|variance,95.3±0.6
DPP_on_masks|electra/0.4|var_ratio,94.2±0.4
...,...
NUQ|bert/8|aleatoric,95.2±0.8
NUQ|bert/8|total,95.6±0.4
NUQ|bert/10|epistemic,95.8±0.6
NUQ|bert/10|aleatoric,95.0±0.7


In [10]:
eval_table.iloc[:21]

Unnamed: 0,clinc_oos
baseline (max_prob),95.7±0.5
DPP_on_masks|electra/0.4|bald,97.2±0.6
DPP_on_masks|electra/0.4|sampled_max_prob,97.0±0.3
DPP_on_masks|electra/0.4|variance,95.3±0.6
DPP_on_masks|electra/0.4|var_ratio,94.2±0.4
DPP_on_masks|electra/0.4|entropy,97.4±0.3
DPP_with_OOD|DPP/last|bald,97.0±0.5
DPP_with_OOD|DPP/last|sampled_max_prob,96.9±0.4
DPP_with_OOD|DPP/last|variance,95.0±0.4
DPP_with_OOD|DPP/last|var_ratio,94.3±0.7


In [6]:
tasks = ['symptoms']
mc_types = ['MC', 'DC_MC', 'DPP']
path = "../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_20"

eval_table = build_eval_table(
    {
        'symptom_checker' : [
            
            (f'{path}/{tasks[0]}/{mc_types[2]}/5/50/0.8', 'DPP', 'last/5/50/0.8'),  
            #(f'{path}/{tasks[0]}/{mc_types[2]}/5/50/0.6', 'DPP', 'last/5/50/0.6'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/5/50/0.4', 'DPP', 'last//5/50/0.4'),  
            
            #(f'{path}/{tasks[0]}/{mc_types[2]}/5/20/0.8', 'DPP', 'last/5/20/0.8'),  
            #(f'{path}/{tasks[0]}/{mc_types[2]}/5/20/0.6', 'DPP', 'last/5/20/0.6'),
            #(f'{path}/{tasks[0]}/{mc_types[2]}/5/20/0.4', 'DPP', 'last/5/20/0.4'),  
            
            (f'{path}/{tasks[0]}/{mc_types[0]}/all/20', 'MC', 'all/20'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/last/20', 'MC', 'last/20'),
        ],
    },
    task_type='classification',
    metric='rejection-curve-auc'
)

eval_table

../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_20/symptoms/DPP/5/50/0.8

../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_20/symptoms/MC/all/20

../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_20/symptoms/MC/last/20



Unnamed: 0,symptom_checker
baseline (max_prob),76.707±0.127
DPP|last/5/50/0.8|bald,-0.629±0.032
DPP|last/5/50/0.8|sampled_max_prob,0.003±0.035
DPP|last/5/50/0.8|variance,-3.079±0.213
DPP|last/5/50/0.8|var_ratio,-1.384±0.086
DPP|last/5/50/0.8|entropy,-0.394±0.041
MC|all/20|bald,-2.290±0.052
MC|all/20|sampled_max_prob,0.335±0.027
MC|all/20|variance,-3.409±0.082
MC|all/20|var_ratio,-2.187±0.031


In [2]:
tasks = ['cola', 'mrpc', 'sst2']
mc_types = ['DC_MC', 'MC']
path = "../workdir/results/dcmc_masks/"
path1 = "../workdir/results/mc_masks/"

eval_table = build_eval_table(
    {
        'SST-2' : [
            (f'{path}/{tasks[2]}/{mc_types[0]}/all/20/0.1', 'DC_MC', 'all'),
            (f'{path}/{tasks[2]}/{mc_types[0]}/last/20/0.5', 'DC_MC', 'last'),
            (f'{path1}/{tasks[2]}/{mc_types[1]}/all/20/0.1', 'MC', 'all'),
            (f'{path1}/{tasks[2]}/{mc_types[1]}/last/20/0.2', 'MC', 'last'),
        ],
        'MRPC' : [
            (f'{path}/{tasks[1]}/{mc_types[0]}/all/20/0.1', 'DC_MC', 'all'),
            (f'{path}/{tasks[1]}/{mc_types[0]}/last/20/0.5', 'DC_MC', 'last'),
            (f'{path1}/{tasks[1]}/{mc_types[1]}/all/20/0.1', 'MC', 'all'),
            (f'{path1}/{tasks[1]}/{mc_types[1]}/last/20/0.2', 'MC', 'last'),
        ],
        'CoLA' : [
            (f'{path}/{tasks[0]}/{mc_types[0]}/all/20/0.1', 'DC_MC', 'all'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/last/20/0.5', 'DC_MC', 'last'),
            (f'{path1}/{tasks[0]}/{mc_types[1]}/all/20/0.1', 'MC', 'all'),
            (f'{path1}/{tasks[0]}/{mc_types[1]}/last/20/0.2', 'MC', 'last'),
        ]
    },
    task_type='classification',
    metric='rejection-curve-auc'
)

eval_table

../workdir/results/dcmc_masks/sst2/DC_MC/all/20/0.1

../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5

../workdir/results/mc_masks/sst2/MC/all/20/0.1

../workdir/results/mc_masks/sst2/MC/last/20/0.2

../workdir/results/dcmc_masks/mrpc/DC_MC/all/20/0.1

../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.5

../workdir/results/mc_masks/mrpc/MC/all/20/0.1

../workdir/results/mc_masks/mrpc/MC/last/20/0.2

../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.1

../workdir/results/dcmc_masks/cola/DC_MC/last/20/0.5

../workdir/results/mc_masks/cola/MC/all/20/0.1

../workdir/results/mc_masks/cola/MC/last/20/0.2



Unnamed: 0,SST-2,MRPC,CoLA
baseline (max_prob),94.485±0.030,92.746±0.167,92.342±0.135
DC_MC|all|bald,0.035±0.138,0.120±0.123,0.500±0.384
DC_MC|all|sampled_max_prob,0.057±0.083,0.299±0.199,0.635±0.218
DC_MC|all|variance,0.061±0.134,0.219±0.115,0.525±0.371
DC_MC|last|bald,-0.165±0.209,0.006±0.134,-0.259±0.303
DC_MC|last|sampled_max_prob,-0.003±0.022,0.043±0.044,0.019±0.025
DC_MC|last|variance,-0.046±0.099,0.098±0.090,-0.120±0.185
MC|all|bald,0.064±0.055,0.685±0.130,0.268±0.225
MC|all|sampled_max_prob,0.053±0.027,0.641±0.146,0.335±0.172
MC|all|variance,0.062±0.050,0.640±0.136,0.292±0.209


In [31]:
tasks = ['cola', 'mrpc', 'sst2']
mc_types = ['DPP_last']
path = "../workdir/results/aug/results"

eval_table = build_eval_table(
    {
        'SST-2' : [
            (f'{path}/{tasks[2]}/', 'DPP_with_OOD_40/100', 'last'),
        ],
        'MRPC' : [
            (f'{path}/{tasks[1]}/', 'DPP_with_OOD_40/100', 'last'),
        ],
        'CoLA' : [
            (f'{path}/{tasks[0]}/', 'DPP_with_OOD_40/100', 'last'),
        ]
    },
    task_type='classification',
    metric='rejection-curve-auc'
)

eval_table

../workdir/results/aug/results/sst2

../workdir/results/aug/results/mrpc

../workdir/results/aug/results/cola



Unnamed: 0,SST-2,MRPC,CoLA
baseline (max_prob),92.418±0.538,88.152±2.055,73.221±3.884
DPP_with_OOD_40/100|last|bald,0.647±0.271,-0.461±0.328,-2.204±0.427
DPP_with_OOD_40/100|last|sampled_max_prob,1.251±0.412,1.473±0.193,-1.081±0.826
DPP_with_OOD_40/100|last|variance,0.764±0.274,0.058±0.231,-2.028±0.467


In [3]:
tasks = ['cola', 'mrpc', 'sst2']
mc_types = ['DPP_last']
path = "../workdir/results/dpp_masks_odd"

eval_table = build_eval_table(
    {
        'SST-2' : [
            (f'{path}/{tasks[2]}/ht_dpp/True/40/100/0.3', 'DPP_with_OOD_40/100', 'last'),
        ],
        'MRPC' : [
            (f'{path}/{tasks[1]}/ht_dpp/True/40/100/0.3', 'DPP_with_OOD_40/100', 'last'),
        ],
        'CoLA' : [
            (f'{path}/{tasks[0]}/ht_dpp/True/40/100/0.3', 'DPP_with_OOD_40/100', 'last'),
        ]
    },
    task_type='classification',
    metric='rejection-curve-auc'
)

eval_table

../workdir/results/dpp_masks_odd/sst2/ht_dpp/True/40/100/0.3

../workdir/results/dpp_masks_odd/mrpc/ht_dpp/True/40/100/0.3

../workdir/results/dpp_masks_odd/cola/ht_dpp/True/40/100/0.3



Unnamed: 0,SST-2,MRPC,CoLA
baseline (max_prob),93.942±0.093,92.895±0.308,92.300±0.275
DPP_with_OOD_40/100|last|bald,0.053±0.022,0.196±0.035,0.012±0.077
DPP_with_OOD_40/100|last|sampled_max_prob,0.034±0.053,0.218±0.103,0.070±0.017
DPP_with_OOD_40/100|last|variance,0.108±0.084,0.159±0.106,0.023±0.057


In [7]:
tasks = ['cola', 'mrpc', 'sst-2']
mc_types = ['MC_last', 'MC_all']
path = "../workdir/results/deep_ensemble_calibrated"

eval_table = build_eval_table(
    {
        'SST-2' : [
            (f'{path}/{tasks[2]}/{mc_types[0]}/0.63/', '-', '-'),
        ],
        'MRPC' : [
            (f'{path}/{tasks[1]}/{mc_types[0]}/0.63/', '-', '-'),
        ],
        'CoLA' : [
            (f'{path}/{tasks[0]}/{mc_types[0]}/0.63/', '-', '-'),
        ]
    },
    metric='rejection-curve-auc',#
    #de=True
)

eval_table

../workdir/results/deep_ensemble_calibrated/sst-2/MC_last/0.63

../workdir/results/deep_ensemble_calibrated/mrpc/MC_last/0.63

../workdir/results/deep_ensemble_calibrated/cola/MC_last/0.63



Unnamed: 0,SST-2,MRPC,CoLA
baseline (max_prob),94.449±0.050,91.984±0.072,92.121±0.133
-|-|bald,0.024±0.076,0.551±0.112,0.463±0.179
-|-|sampled_max_prob,0.045±0.054,0.561±0.080,0.538±0.175
-|-|variance,0.035±0.057,0.545±0.104,0.498±0.160


In [3]:
tasks = ['conll2003']
mc_types = ['DPP']
path = "../workdir/results/dpp_ood"

eval_table = build_eval_table(
    {
        'CoNLL-2003 (token level)' : [
            (f'{path}/{tasks[0]}/{mc_types[0]}/last/20/0.3', 'DPP_with_OOD/20/0.3', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/last/20/0.4', 'DPP_with_OOD/20/0.4', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/last/50/0.3', 'DPP_with_OOD/50/0.3', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/last/50/0.4', 'DPP_with_OOD/50/0.4', 'last'),
        ],
        'CoNLL-2003 (sequence level)' : [
            (f'{path}/{tasks[0]}/{mc_types[0]}/last/20/0.3', 'DPP_with_OOD/20/0.3', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/last/20/0.4', 'DPP_with_OOD/20/0.4', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/last/50/0.3', 'DPP_with_OOD/50/0.3', 'last'),
            (f'{path}/{tasks[0]}/{mc_types[0]}/last/50/0.4', 'DPP_with_OOD/50/0.4', 'last'),
        ]
    },
    task_type='ner',
    metric='rejection-curve-auc'
)

eval_table

../workdir/results/dpp_ood/conll2003/DPP/last/20/0.3

../workdir/results/dpp_ood/conll2003/DPP/last/20/0.4

../workdir/results/dpp_ood/conll2003/DPP/last/50/0.3

../workdir/results/dpp_ood/conll2003/DPP/last/50/0.4

../workdir/results/dpp_ood/conll2003/DPP/last/20/0.3

../workdir/results/dpp_ood/conll2003/DPP/last/20/0.4

../workdir/results/dpp_ood/conll2003/DPP/last/50/0.3

../workdir/results/dpp_ood/conll2003/DPP/last/50/0.4



Unnamed: 0,CoNLL-2003 (token level),CoNLL-2003 (sequence level)
baseline (max_prob),93.892±0.082,86.471±0.728
DPP_with_OOD/20/0.3|last|bald,-0.348±0.338,-4.235±1.049
DPP_with_OOD/20/0.3|last|var_ratio,0.427±0.051,-3.324±0.954
DPP_with_OOD/20/0.3|last|sampled_max_prob,-0.034±0.025,0.193±0.370
DPP_with_OOD/20/0.3|last|variance,-0.189±0.324,-5.361±1.137
DPP_with_OOD/20/0.3|last|entropy,-0.698±0.334,-5.814±1.316
DPP_with_OOD/20/0.4|last|bald,-0.256±0.303,-3.973±1.116
DPP_with_OOD/20/0.4|last|var_ratio,0.375±0.070,-3.169±0.935
DPP_with_OOD/20/0.4|last|sampled_max_prob,-0.021±0.019,0.110±0.223
DPP_with_OOD/20/0.4|last|variance,-0.108±0.301,-5.103±1.210


In [12]:
tasks = ['cola', 'mrpc', 'sst2']
mc_types = ['DPP_last']
path = "../workdir/results/dpp_masks_calibrate"

eval_table = build_eval_table(
    {
        'SST-2' : [
            (f'{path}/{tasks[2]}/ht_dpp/rbf/True/50/0.3/', 'DPP_on_masks', 'last'),
        ],
        'MRPC' : [
            (f'{path}/{tasks[1]}/ht_dpp/rbf/True/50/0.3/', 'DPP_on_masks', 'last'),
        ],
        'CoLA' : [
            (f'{path}/{tasks[0]}/ht_dpp/cosine/True/50/0.6/', 'DPP_on_masks', 'last'),
        ]
    },
    task_type='classification',
    metric='rejection-curve-auc'
)

eval_table

../workdir/results/dpp_masks_calibrate/sst2/ht_dpp/rbf/True/50/0.3

../workdir/results/dpp_masks_calibrate/mrpc/ht_dpp/rbf/True/50/0.3

../workdir/results/dpp_masks_calibrate/cola/ht_dpp/cosine/True/50/0.6



Unnamed: 0,SST-2,MRPC,CoLA
baseline (max_prob),93.985±0.189,92.917±0.309,92.236±0.201
DPP_on_masks|last|bald,0.142±0.221,0.025±0.205,-0.022±0.225
DPP_on_masks|last|sampled_max_prob,0.033±0.071,0.051±0.103,0.009±0.018
DPP_on_masks|last|variance,0.095±0.203,0.014±0.174,-0.008±0.104


In [8]:
tasks = ['cola', 'mrpc', 'sst2']
mc_types = ['DPP_last', 'DC_MC']
path1 = "../workdir/results/dpp_masks_odd"
path2 = "../workdir/results/dpp_masks_calibrate"
path = "../workdir/results/deep_ensemble_calibrated"
path3 = "../workdir/results/dcmc_masks/"

eval_table = build_eval_table(
    {
        'SST-2' : [
            (f'{path3}/{tasks[2]}/{mc_types[1]}/all/20/0.1', 'DC_MC', 'all'),
            (f'{path3}/{tasks[2]}/{mc_types[1]}/last/20/0.5', 'DC_MC', 'last'),
            (f'{path1}/{tasks[2]}/ht_dpp/True/40/100/0.3', 'DPP_with_OOD', 'last'),
            (f'{path2}/{tasks[2]}/ht_dpp/rbf/True/50/0.3/', 'DPP_on_masks', 'last'),
            (f'{path}/sst-2/MC_last/0.63/', 'DeepEnsemble', '-'),
        ],
        'MRPC' : [
            (f'{path3}/{tasks[1]}/{mc_types[1]}/all/20/0.1', 'DC_MC', 'all'),
            (f'{path3}/{tasks[1]}/{mc_types[1]}/last/20/0.5', 'DC_MC', 'last'),
            (f'{path1}/{tasks[1]}/ht_dpp/True/40/100/0.3', 'DPP_with_OOD', 'last'),
            (f'{path2}/{tasks[1]}/ht_dpp/rbf/True/50/0.3/', 'DPP_on_masks', 'last'),
            (f'{path}/{tasks[1]}/MC_last/0.63/', 'DeepEnsemble', '-'),
        ],
        'CoLA' : [
            (f'{path3}/{tasks[0]}/{mc_types[1]}/all/20/0.1', 'DC_MC', 'all'),
            (f'{path3}/{tasks[0]}/{mc_types[1]}/last/20/0.5', 'DC_MC', 'last'),
            (f'{path1}/{tasks[0]}/ht_dpp/True/40/100/0.3', 'DPP_with_OOD', 'last'),
            (f'{path2}/{tasks[0]}/ht_dpp/cosine/True/50/0.6/', 'DPP_on_masks', 'last'),
            (f'{path}/{tasks[0]}/MC_last/0.63/', 'DeepEnsemble', '-'),
        ]
    },
    task_type='classification',
    metric='rejection-curve-auc'
)

eval_table

../workdir/results/dcmc_masks/sst2/DC_MC/all/20/0.1

../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5

../workdir/results/dpp_masks_odd/sst2/ht_dpp/True/40/100/0.3

../workdir/results/dpp_masks_calibrate/sst2/ht_dpp/rbf/True/50/0.3

../workdir/results/deep_ensemble_calibrated/sst-2/MC_last/0.63

../workdir/results/dcmc_masks/mrpc/DC_MC/all/20/0.1

../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.5

../workdir/results/dpp_masks_odd/mrpc/ht_dpp/True/40/100/0.3

../workdir/results/dpp_masks_calibrate/mrpc/ht_dpp/rbf/True/50/0.3

../workdir/results/deep_ensemble_calibrated/mrpc/MC_last/0.63

../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.1

../workdir/results/dcmc_masks/cola/DC_MC/last/20/0.5

../workdir/results/dpp_masks_odd/cola/ht_dpp/True/40/100/0.3

../workdir/results/dpp_masks_calibrate/cola/ht_dpp/cosine/True/50/0.6

../workdir/results/deep_ensemble_calibrated/cola/MC_last/0.63



Unnamed: 0,SST-2,MRPC,CoLA
baseline (max_prob),94.449±0.050,91.984±0.072,92.121±0.133
DC_MC|all|bald,0.035±0.138,0.120±0.123,0.500±0.384
DC_MC|all|sampled_max_prob,0.057±0.083,0.299±0.199,0.635±0.218
DC_MC|all|variance,0.061±0.134,0.219±0.115,0.525±0.371
DC_MC|last|bald,-0.165±0.209,0.006±0.134,-0.259±0.303
DC_MC|last|sampled_max_prob,-0.003±0.022,0.043±0.044,0.019±0.025
DC_MC|last|variance,-0.046±0.099,0.098±0.090,-0.120±0.185
DPP_with_OOD|last|bald,0.053±0.022,0.196±0.035,0.012±0.077
DPP_with_OOD|last|sampled_max_prob,0.034±0.053,0.218±0.103,0.070±0.017
DPP_with_OOD|last|variance,0.108±0.084,0.159±0.106,0.023±0.057


In [33]:
!ls ../workdir/results

conll2003   de	     dpp_oos  glue_nuq	oos	  oos_test_sn  sngp_oos
dcmc_masks  dpp_ood  exps     nuq_oos	oos_test  rostd


In [44]:
!ls 

DropConnect_mnist.ipynb			run_glue_for_model_series.py
Mahalanobis.ipynb			run_mnist.py
OOS.ipynb				run_oos.py
SNGP.ipynb				run_symptom_checker.py
__pycache__				run_tasks_on_multiple_gpus.py
alpaca_calibrator.py			run_train_ensemble_series.py
analyze_results.py			run_train_models.py
analyze_results_2.ipynb			runs
analyze_ue2.ipynb			symptom_checker_test.ipynb
analyze_ue3.ipynb			tmp4t8ffryvwandb-media
analyze_ue4.ipynb			tmp4wpvl15fwandb
analyze_ue_ner.ipynb			tmp9xoq4omvwandb-media
deep_ensemble.ipynb			tmp9yx5mqtuwandb-media
estimators_debug.py			tmpe1688i0hwandb
generate_misclassification_chart.ipynb	tmpelo2_k9_wandb-artifacts
generate_series_of_exps.ipynb		tmpetdhkrsawandb
nohup.out				tmpkmzzn7o9wandb-media
paper_results.ipynb			tmpmk6p06dnwandb-artifacts
paper_results_final.ipynb		ue4nlp
plot_error_detection.py			uncertainty_ratio_results.ipynb
project_results.ipynb			utils_electra.py
run_average_results.py			utils_exps.py
run_calc_ues_metrics.py			utils_

In [43]:
!tar -czvf ../workdir/results/dcmc_masks.tar.gz ../workdir/results/dcmc_masks/ 

tar: Removing leading `../' from member names
../workdir/results/dcmc_masks/
../workdir/results/dcmc_masks/mrpc/
../workdir/results/dcmc_masks/mrpc/DC_MC/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.2/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.2/102/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.2/102/51/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.2/102/51/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.2/102/42/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.2/102/42/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.2/102/101/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.2/102/101/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.2/102/17/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.2/102/17/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/last/20/0.2/102/91/
../workdir/re

../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/101/42/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/101/42/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/101/101/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/101/101/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/101/17/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/101/17/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/101/91/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/101/91/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/103/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/103/51/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/103/51/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/103/42/
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/103/42/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/last/10/0.5/103/101/
../workdir/results/dcmc_masks/mrpc/DC_

../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/102/
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/102/51/
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/102/51/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/102/42/
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/102/42/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/102/101/
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/102/101/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/102/17/
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/102/17/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/102/91/
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/102/91/dev_inference.json
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/101/
../workdir/results/dcmc_masks/mrpc/DC_MC/all/10/0.2/101/51/
../workdir/results/dcmc_masks/mr

../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/101/101/
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/101/101/dev_inference.json
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/101/17/
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/101/17/dev_inference.json
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/101/91/
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/101/91/dev_inference.json
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/103/
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/103/51/
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/103/51/dev_inference.json
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/103/42/
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/103/42/dev_inference.json
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/103/101/
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/103/101/dev_inference.json
../workdir/results/dcmc_masks/sst2/DC_MC/last/20/0.5/103/17/
../workdir/results/dcmc_masks/sst2/DC

../workdir/results/dcmc_masks/cola/DC_MC/last/20/0.5/103/91/
../workdir/results/dcmc_masks/cola/DC_MC/last/20/0.5/103/91/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/last/10/
../workdir/results/dcmc_masks/cola/DC_MC/last/10/0.2/
../workdir/results/dcmc_masks/cola/DC_MC/last/10/0.2/102/
../workdir/results/dcmc_masks/cola/DC_MC/last/10/0.2/102/51/
../workdir/results/dcmc_masks/cola/DC_MC/last/10/0.2/102/51/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/last/10/0.2/102/42/
../workdir/results/dcmc_masks/cola/DC_MC/last/10/0.2/102/42/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/last/10/0.2/102/101/
../workdir/results/dcmc_masks/cola/DC_MC/last/10/0.2/102/101/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/last/10/0.2/102/17/
../workdir/results/dcmc_masks/cola/DC_MC/last/10/0.2/102/17/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/last/10/0.2/102/91/
../workdir/results/dcmc_masks/cola/DC_MC/last/10/0.2/102/91/dev_inference.j

../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/101/42/
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/101/42/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/101/101/
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/101/101/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/101/17/
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/101/17/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/101/91/
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/101/91/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/103/
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/103/51/
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/103/51/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/103/42/
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/103/42/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/103/101/
../workdir/results/dcmc_masks/cola/DC_MC/all/20/0.5/

../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/102/
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/102/51/
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/102/51/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/102/42/
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/102/42/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/102/101/
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/102/101/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/102/17/
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/102/17/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/102/91/
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/102/91/dev_inference.json
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/101/
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/101/51/
../workdir/results/dcmc_masks/cola/DC_MC/all/10/0.1/101/51/dev_inference.json
../

In [52]:
!shopt -s globstar; tar -czvf ../workdir/results/glue_nuq.tar.gz ../workdir/results/glue_nuq/**/*.json

tar: Removing leading `../' from member names
../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/12/101/101/config.json
../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/12/101/101/dev_inference.json
../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/12/101/101/special_tokens_map.json
../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/12/101/101/tokenizer.json
../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/12/101/101/tokenizer_config.json
../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/12/101/17/config.json
../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/12/101/17/dev_inference.json
../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/12/101/17/special_tokens_map.json
../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/12/101/17/tokenizer.json
../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/12/101/17/tokenizer_config.json
../workdir/results/glue_nuq/cola/NUQ/bert-base-uncased/12/101/42/config.json
../workdir/results/glue_nuq/cola/NUQ/b

../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/12/101/91/tokenizer_config.json
../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/4/101/101/config.json
../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/4/101/101/dev_inference.json
../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/4/101/101/special_tokens_map.json
../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/4/101/101/tokenizer.json
../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/4/101/101/tokenizer_config.json
../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/4/101/17/config.json
../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/4/101/17/dev_inference.json
../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/4/101/17/special_tokens_map.json
../workdir/results/glue_nuq/cola/NUQ/google/electra-base-discriminator/4/101/17/tokenizer.json
../workdir

../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/4/101/42/tokenizer_config.json
../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/4/101/51/config.json
../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/4/101/51/dev_inference.json
../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/4/101/51/special_tokens_map.json
../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/4/101/51/tokenizer.json
../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/4/101/51/tokenizer_config.json
../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/4/101/91/config.json
../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/4/101/91/dev_inference.json
../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/4/101/91/special_tokens_map.json
../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/4/101/91/tokenizer.json
../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/4/101/91/tokenizer_config.json
../workdir/results/glue_nuq/mrpc/NUQ/bert-base-uncased/8/101/101/config.json
../workdir/re

../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/20/101/101/tokenizer_config.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/20/101/17/config.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/20/101/17/dev_inference.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/20/101/17/special_tokens_map.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/20/101/17/tokenizer.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/20/101/17/tokenizer_config.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/20/101/42/config.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/20/101/42/dev_inference.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/20/101/42/special_tokens_map.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/20/101/42/tokenizer.json
../wo

../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/40/101/51/tokenizer_config.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/40/101/91/config.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/40/101/91/dev_inference.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/40/101/91/special_tokens_map.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/40/101/91/tokenizer.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/40/101/91/tokenizer_config.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/8/101/101/config.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/8/101/101/dev_inference.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/8/101/101/special_tokens_map.json
../workdir/results/glue_nuq/mrpc/NUQ/google/electra-base-discriminator/8/101/101/tokenizer.json
../wor

../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/8/101/42/tokenizer_config.json
../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/8/101/51/config.json
../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/8/101/51/dev_inference.json
../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/8/101/51/special_tokens_map.json
../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/8/101/51/tokenizer.json
../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/8/101/51/tokenizer_config.json
../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/8/101/91/config.json
../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/8/101/91/dev_inference.json
../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/8/101/91/special_tokens_map.json
../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/8/101/91/tokenizer.json
../workdir/results/glue_nuq/sst2/NUQ/bert-base-uncased/8/101/91/tokenizer_config.json
../workdir/results/glue_nuq/sst2/NUQ/google/electra-base-discriminator/12/101/101/config.j

In [53]:
!shopt -s globstar; tar -czvf ../workdir/results/glue_sngp.tar.gz ../workdir/results/glue_sngp/**/*.json

tar: Removing leading `../' from member names
tar: ../workdir/results/glue_sngp/**/*.json: Cannot stat: No such file or directory
tar: Exiting with failure status due to previous errors


In [56]:
!ls ../workdir/results/

conll2003	   de	    exps	     nuq_oos   oos_test_sn
dcmc_masks	   dpp_ood  glue_nuq	     oos       rostd
dcmc_masks.tar.gz  dpp_oos  glue_nuq.tar.gz  oos_test  sngp_oos


In [62]:
eval_table.iloc[0]

SST-2    94.449±0.050
MRPC     91.984±0.072
CoLA     92.121±0.133
Name: baseline (max_prob), dtype: object

In [63]:
93.2 - 94.449, 93.8 - 94.449

(-1.2489999999999952, -0.6490000000000009)

In [64]:
91.8 - 91.984, 91.4 - 91.984

(-0.1839999999999975, -0.583999999999989)

In [65]:
90.0 - 92.121, 90.7 - 92.121

(-2.120999999999995, -1.4209999999999923)

In [66]:
determenistic_res = pd.DataFrame({'index': ['DUQ|-|-', 'SNGP|-|-'], 
                                  'SST-2': ['-1.248±2.100', '-0.649±0.200'], 
                                  'MRPC': ['-0.184±0.400', '-0.584±0.400'], 
                                  'CoLA': ['-2.120±1.200', '-1.421±0.200']}).set_index('index')

In [67]:
eval_table = pd.concat([eval_table, determenistic_res])

In [3]:
eval_table = eval_table_md

In [45]:
import numpy as np

def best_bold(val):
    try:
        value, std = val.split('±')
    except:
        value, std = 0, 1
    bold = 'bold' if value >= std else ''
    return 'font-weight: %s' % bold

def update_acquisition(table):
    def conv(val):
        label_map = {
            'bald': 'BALD',
            'sampled_max_prob': 'Sampled max. prob.',
            'variance': 'Variance'
        }
        
        return label_map.get(val, val)
        
    table['Acquisition'] = table['Acquisition'].apply(conv)

supp_cols = np.array([['No (baseline)', '-', 'Max. prob.']] + [e.split('|') for e in eval_table.index[1:]])

show_table = eval_table.copy()
show_table.insert(0, 'Layers', supp_cols[:, 1])
show_table.insert(0, 'Acquisition', supp_cols[:, 2])
dropout_type = ['_'.join(c.split('_')[:3]) for c in supp_cols[:, 0]]
calibration_type = [c.split('_')[4] if len(c.split('_')) > 3 else '-' for c in supp_cols[:, 0]]
show_table.insert(0, 'Calibration Dataset', calibration_type)
show_table.insert(0, 'Dropout Type', dropout_type)
update_acquisition(show_table)
show_table = show_table.style.applymap(best_bold)
show_table

Unnamed: 0,Dropout Type,Calibration Dataset,Acquisition,Layers,MRPC,COLA,SST2
baseline (max_prob),No (baseline),-,Max. prob.,-,92.792±0.177,92.251±0.264,93.88±0.206
DPP_on_masks_calibrated_train|last|bald,DPP_on_masks,train,BALD,last,0.158±0.111,-0.158±0.217,0.024±0.295
DPP_on_masks_calibrated_train|last|sampled_max_prob,DPP_on_masks,train,Sampled max. prob.,last,0.097±0.138,-0.022±0.023,0.036±0.057
DPP_on_masks_calibrated_train|last|variance,DPP_on_masks,train,Variance,last,0.143±0.115,-0.097±0.16,0.018±0.185
DPP_on_masks_calibrated_val|last|bald,DPP_on_masks,val,BALD,last,-0.042±0.535,-0.174±0.196,0.146±0.207
DPP_on_masks_calibrated_val|last|sampled_max_prob,DPP_on_masks,val,Sampled max. prob.,last,0.017±0.125,-0.044±0.037,0.057±0.101
DPP_on_masks_calibrated_val|last|variance,DPP_on_masks,val,Variance,last,-0.054±0.353,-0.11±0.165,0.123±0.195
DPP_on_masks|last|bald,DPP_on_masks,-,BALD,last,0.173±0.125,-0.154±0.212,0.026±0.306
DPP_on_masks|last|sampled_max_prob,DPP_on_masks,-,Sampled max. prob.,last,0.092±0.132,-0.02±0.024,0.021±0.054
DPP_on_masks|last|variance,DPP_on_masks,-,Variance,last,0.142±0.127,-0.096±0.16,0.03±0.17


In [43]:
print(str(show_table.to_latex(index=False)).replace('±', '$\pm$'))

AttributeError: 'Styler' object has no attribute 'to_latex'