In [1]:
from analyze_results import aggregate_runs_rejection_table, format_arc_table_results, default_methods, collect_configs, aggregate_runs_de, calc_rejection_curve_auc_seq
from pathlib import Path
import pandas as pd
from ue4nlp.ue_scores import *

def create_section_col(runs_dir, de):
    """Build the rejection-table section for one directory of runs.

    The runs found under ``runs_dir`` are aggregated with
    ``aggregate_runs_rejection_table`` for a fixed set of uncertainty
    estimates, formatted with ``format_arc_table_results`` against the
    ``max_prob`` baseline, and returned with the baseline row first.

    Args:
        runs_dir: Directory holding the runs (``str`` or ``Path``).
        de: Forwarded unchanged as the ``de`` argument of
            ``aggregate_runs_rejection_table``.

    Returns:
        The formatted table restricted to the ``max_prob`` row followed by
        one row per uncertainty estimate; the first row is relabelled
        ``'max_prob (baseline)'``.

    Raises:
        ValueError: If ``runs_dir`` is not an existing directory, or if the
            aggregation yields an empty result.
    """
    runs_dir = Path(runs_dir)
    if not runs_dir.is_dir():
        raise ValueError(f'Runs directory does not exist or is not a directory: {runs_dir}')
    print(runs_dir)

    # Named `methods` so the `default_methods` imported at the top of the
    # notebook is not shadowed inside this function.
    methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
        "var_ratio": var_ratio,
        "entropy": mean_entropy,
    }
    agg_res = aggregate_runs_rejection_table(runs_dir, methods=methods, de=de)

    if agg_res.empty:
        # The message carries the offending directory instead of a bare print.
        raise ValueError(f'Broken: no results could be aggregated from {runs_dir}')

    improvement = format_arc_table_results(agg_res, baseline_col='max_prob')
    # Keep only the baseline and the requested estimates, baseline first.
    improvement = improvement.loc[['max_prob', *methods]]
    improvement.index = ['max_prob (baseline)', *improvement.index[1:]]
    return improvement


def build_eval_table(dataset_paths, de = False):
    """Assemble one evaluation table from the run directories of several datasets.

    Args:
        dataset_paths: Mapping of dataset name to an iterable of
            ``(path, dropout_type, layer)`` triples. Each ``path`` is passed
            to ``create_section_col``; ``dropout_type`` and ``layer`` are only
            used to prefix the row labels as ``'<dropout_type>|<layer>|<row>'``.
        de: Forwarded unchanged to ``create_section_col`` for every path.

    Returns:
        The sections of each dataset stacked row-wise, with the datasets
        placed side by side (``pd.concat(..., axis=1)``).

    Raises:
        ValueError: If ``dataset_paths`` is empty or a dataset lists no run
            directories (previously this surfaced as pandas'
            "No objects to concatenate").
    """
    if not dataset_paths:
        raise ValueError('dataset_paths is empty: nothing to evaluate')

    columns = []
    names = []
    for name, paths in dataset_paths.items():
        method_results = []
        for path, dropout_type, layer in paths:
            method_batch = create_section_col(path, de)
            method_batch.index = [f'{dropout_type}|{layer}|{e}' for e in method_batch.index]
            method_results.append(method_batch)

        if not method_results:
            raise ValueError(f'No run directories given for dataset {name!r}')

        columns.append(pd.concat(method_results, axis=0))
        names.append(name)

    # The rename maps positional labels 0..n-1 to dataset names, so it only
    # takes effect when the concatenation produces integer column labels.
    # NOTE(review): the tables shown in this notebook keep '0%', '5%', ... as
    # columns, so the dataset names appear not to be applied - confirm intent.
    name_map = dict(enumerate(names))

    return pd.concat(columns, axis=1).rename(columns=name_map)

In [3]:
# Symptoms dataset: one table section per dropout placement ('last' and 'all').
tasks = ['symptoms']
mc_types = ['MC']
path = "../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_train_63/"

# Each entry is a (run directory, dropout type, layer) triple as expected by
# build_eval_table; the layer name is also the directory component.
symptoms_mc_runs = [
    (f'{path}/{tasks[0]}/{mc_types[0]}/{layer}/10', 'MC', layer)
    for layer in ('last', 'all')
]
eval_table_sy = build_eval_table({'symptoms': symptoms_mc_runs}, de=False)

eval_table_sy

../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_train_63/symptoms/MC/last/10
../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_train_63/symptoms/MC/all/10


Unnamed: 0,0%,5%,10%,20%,30%,40%,50%,60%,70%,80%,90%,99%
MC|last|max_prob (baseline),43.3±0.3,44.9±0.3,46.4±0.3,49.5±0.4,52.9±0.5,56.7±0.4,60.8±0.5,64.8±0.5,69.1±0.5,73.4±0.9,77.9±1.0,82.5±4.1
MC|last|bald,43.3±0.2,44.5±0.3,45.7±0.3,48.5±0.3,51.7±0.3,55.2±0.3,59.2±0.3,63.7±0.5,68.3±0.6,72.6±0.9,77.3±1.0,82.1±3.8
MC|last|sampled_max_prob,43.3±0.2,44.8±0.3,46.4±0.3,49.5±0.4,52.9±0.4,56.7±0.5,60.8±0.5,64.8±0.5,69.2±0.5,73.4±0.9,77.9±0.9,82.7±3.9
MC|last|variance,43.3±0.2,43.7±0.3,44.3±0.2,45.7±0.3,47.7±0.4,50.3±0.5,53.7±0.6,58.3±0.7,64.1±0.8,70.4±1.1,76.5±1.1,82.3±3.9
MC|last|var_ratio,43.3±0.2,44.5±0.3,45.8±0.3,47.6±0.5,47.6±0.6,47.5±0.6,47.5±0.7,47.4±0.8,47.4±0.7,47.3±0.6,47.2±0.9,47.5±3.0
MC|last|entropy,43.3±0.2,44.7±0.3,46.1±0.3,49.2±0.4,52.5±0.3,55.9±0.4,59.7±0.5,64.7±0.5,69.3±0.6,73.3±0.8,78.0±1.0,82.9±3.9
MC|all|max_prob (baseline),43.3±0.3,44.8±0.3,46.4±0.3,49.5±0.4,52.9±0.5,56.6±0.5,60.8±0.5,64.8±0.5,69.1±0.5,73.5±0.9,77.9±1.0,82.7±4.2
MC|all|bald,43.8±0.2,44.9±0.3,46.1±0.3,48.3±0.3,50.3±0.3,52.5±0.4,55.1±0.6,58.1±0.9,62.1±1.1,67.7±1.1,75.5±1.2,84.1±3.6
MC|all|sampled_max_prob,43.8±0.2,45.4±0.3,47.0±0.3,50.2±0.4,53.6±0.4,57.5±0.5,61.7±0.5,65.8±0.5,69.9±0.5,74.0±0.8,78.3±1.1,83.7±3.9
MC|all|variance,43.8±0.2,44.4±0.3,44.9±0.4,46.2±0.4,47.9±0.5,50.0±0.6,52.8±0.8,56.3±1.0,60.8±1.1,67.5±1.1,76.4±1.1,84.0±2.5


In [4]:
# Symptoms dataset again, this time aggregated with de=True.
tasks = ['symptoms']
mc_types = ['MC']
path = "../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_train_63/"

# Single run directory; '-' placeholders stand in for dropout type and layer,
# so the row labels come out as '-|-|<method>'.
ensemble_run_dir = f'{path}/{tasks[0]}/{mc_types[0]}/last/10'
eval_table_sde = build_eval_table(
    {'symptoms': [(ensemble_run_dir, '-', '-')]},
    de=True,
)

eval_table_sde

../../sbermed_ue/uncertainty-estimation/workdir/results/mc_symptoms_train_63/symptoms/MC/last/10


Unnamed: 0,0%,5%,10%,20%,30%,40%,50%,60%,70%,80%,90%,99%
-|-|max_prob (baseline),43.3±0.2,44.7±0.2,46.4±0.2,49.4±0.1,52.9±0.3,56.6±0.3,60.6±0.5,64.8±0.4,68.9±0.4,73.0±0.7,77.3±0.8,79.9±2.7
-|-|bald,44.8±0.2,45.9±0.3,47.2±0.3,49.3±0.3,51.4±0.3,53.4±0.5,55.7±0.5,58.7±0.5,62.7±0.4,69.2±0.4,77.0±0.3,84.1±4.5
-|-|sampled_max_prob,44.8±0.2,46.5±0.2,48.2±0.3,51.4±0.3,54.8±0.3,58.8±0.3,63.0±0.4,67.2±0.5,71.1±0.5,75.1±0.5,78.6±0.4,85.9±4.7
-|-|variance,44.8±0.2,45.5±0.3,45.9±0.3,46.8±0.4,48.6±0.4,50.5±0.4,53.0±0.4,56.1±0.4,60.9±0.4,68.5±0.2,76.4±0.7,86.3±3.7
-|-|var_ratio,44.8±0.2,46.3±0.2,47.7±0.3,50.4±0.2,53.2±0.4,56.3±0.4,58.1±0.7,58.0±1.0,58.0±1.0,58.2±1.1,57.5±1.5,57.3±1.7
-|-|entropy,44.8±0.2,46.3±0.3,47.9±0.3,51.2±0.3,54.3±0.3,57.9±0.4,61.4±0.5,66.4±0.6,71.4±0.7,74.9±0.5,78.7±0.8,86.1±3.5


In [7]:
# Relabel the rows as 'DeepEnsemble|<last label segment>', dropping whatever
# precedes the final '|' in each existing label.
eval_table_sde.index = eval_table_sde.index.map(
    lambda label: 'DeepEnsemble|' + label.rsplit('|', 1)[-1]
)

In [11]:
# Stack the de=True rows above the MC rows, then keep every column except the
# last one (the '99%' column in the tables displayed above).
combined = pd.concat([eval_table_sde, eval_table_sy])
res = combined[combined.columns[:-1]]

In [18]:
# Give the first row a prefix-free baseline label; all other labels are kept.
# Note: this reassigns the index of `res` in place.
res.index = ['max_prob (baseline)', *res.index[1:]]

In [20]:
# Positional row picks; in the output shown below they are, in order:
# the baseline, MC|last|sampled_max_prob, MC|all|sampled_max_prob and
# DeepEnsemble|sampled_max_prob.
selected_rows = [0, 8, 14, 2]
res.iloc[selected_rows]

Unnamed: 0,0%,5%,10%,20%,30%,40%,50%,60%,70%,80%,90%
max_prob (baseline),43.3±0.2,44.7±0.2,46.4±0.2,49.4±0.1,52.9±0.3,56.6±0.3,60.6±0.5,64.8±0.4,68.9±0.4,73.0±0.7,77.3±0.8
MC|last|sampled_max_prob,43.3±0.2,44.8±0.3,46.4±0.3,49.5±0.4,52.9±0.4,56.7±0.5,60.8±0.5,64.8±0.5,69.2±0.5,73.4±0.9,77.9±0.9
MC|all|sampled_max_prob,43.8±0.2,45.4±0.3,47.0±0.3,50.2±0.4,53.6±0.4,57.5±0.5,61.7±0.5,65.8±0.5,69.9±0.5,74.0±0.8,78.3±1.1
DeepEnsemble|sampled_max_prob,44.8±0.2,46.5±0.2,48.2±0.3,51.4±0.3,54.8±0.3,58.8±0.3,63.0±0.4,67.2±0.5,71.1±0.5,75.1±0.5,78.6±0.4


In [4]:
# Amazon dataset: a single run directory, labelled 'MD SN' with no layer.
tasks = ['amazon']
mc_types = ['mahalanobis']
path = "../workdir/run_glue_for_model_series/electra_raw_sn/"

# (run directory, dropout type, layer) triple as expected by build_eval_table.
amazon_run_dir = f'{path}/{tasks[0]}/0.0/{mc_types[0]}'
eval_table = build_eval_table(
    {'Amazon': [(amazon_run_dir, 'MD SN', '-')]},
    de=False,
)

eval_table

../workdir/run_glue_for_model_series/electra_raw_sn/amazon/0.0/mahalanobis


Unnamed: 0,0%,5%,10%,20%,30%,40%,50%,60%,70%,80%,90%,99%
MD SN|-|max_prob (baseline),0.73±0.0,0.75±0.0,0.76±0.0,0.8±0.0,0.83±0.0,0.86±0.0,0.89±0.0,0.91±0.0,0.93±0.0,0.95±0.0,0.97±0.0,0.98±0.01
MD SN|-|bald,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.01
MD SN|-|sampled_max_prob,0.03±0.0,0.03±0.0,0.03±0.0,0.04±0.0,0.04±0.0,0.04±0.01,0.04±0.01,0.05±0.01,0.05±0.01,0.06±0.02,0.08±0.01,0.14±0.02
MD SN|-|variance,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.01
MD SN|-|var_ratio,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.01
MD SN|-|entropy,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.0,0.03±0.01
