In [1]:
import sys
sys.path.insert(0,'..')

import yaml
import os
from yaml import Loader as Loader
from pathlib import Path
import pandas as pd
import numpy as np
import json
from sklearn.metrics import roc_auc_score

from analyze_results import (
    extract_result,
    aggregate_runs,
    from_model_outputs_calc_rcc_auc,
)
from analyze_results import (
    format_results2,
    improvement_over_baseline,
    from_model_outputs_calc_rpp_ner, 
    from_model_outputs_calc_rcc_auc_ner,
    extract_result_ner,
    load_and_preprocess_ner,
)

#from utils.utils_wandb import init_wandb, wandb
from ue4nlp.ue_scores import *

In [2]:
def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type#from_model_outputs_calc_arc_auc_ner_mc_maha#
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner#from_model_outputs_calc_rcc_auc_ner_mc_maha#
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner#from_model_outputs_calc_rpp_ner_mc_maha#
    else:
        raise ValueError("Wrong metric type!")
        
def get_one_table(runs_dir, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], level='token', baseline=None, methods=None):
    default_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
        #"sampled_entropy": mean_entropy,
        #"var.ratio": var_ratio,
    }
    if methods is None:
        methods = default_methods

    table = []
    
    avg_type = 'max' if ('nuq' in list(methods.keys())[0] or 
                         'mahalanobis' in list(methods.keys())[0] or 
                         'stds' in list(methods.keys())[0]) else 'sum'

    for metric_type in metric_types:
        metric = choose_metric(metric_type=metric_type)

        agg_res = aggregate_runs(
            runs_dir, methods=methods, metric=metric, task_type=f'ner-{level}', avg_type=avg_type
        )
    
        if agg_res.empty:
            print("Broken\n")
            continue

        if metric_type == "rcc-auc":
            final_score = format_results2(agg_res, percents=False)
        elif metric_type == "rpp":
            final_score = format_results2(agg_res, percents=True)
        else:
            final_score = improvement_over_baseline(agg_res, baseline_col="max_prob", metric=metric_type, percents=True, subtract=True, baseline=baseline)
        table.append(final_score)
    res_table = pd.concat(table, axis=1)
    res_table.columns = metric_types
    # fix for rcc-auc and rpp
    if 'baseline (max_prob)' not in res_table.index:
        res_table.loc['baseline (max_prob)'] = 0
    for metric in ['rcc-auc', 'rpp']:
        try:
            res_table[metric].loc['baseline (max_prob)'] = res_table[metric].loc['max_prob']
        except:
            pass
    try:
        res_table = res_table.drop(['max_prob', 'count'])
    except:
        res_table = res_table.drop(['max_prob'])
    return res_table


def collect_tables(run_dirs, names, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], level='token', baseline=None, methods=None):
    all_tables = []
    for run_dir, name in zip(run_dirs, names):
        buf_table = get_one_table(run_dir, metric_types, level, baseline, methods)
        #print(buf_table)
        # add name to index
        indices = [(name, ind) for ind in list(buf_table.index)]
        baseline_name = 'baseline|'+'|'.join(name.split('|')[-2:])
        buf_table.loc[baseline_name] = buf_table.loc['baseline (max_prob)']
        # add reindex
        indices = indices + [(baseline_name, 'max_prob')]
        
        index = pd.MultiIndex.from_tuples(indices, names=['Method', 'UE Score'])
        buf_table.index = index
        buf_table.drop((name, 'baseline (max_prob)'), inplace=True)
        # add buf_table to final_table
        all_tables.append(buf_table)
    return pd.concat(all_tables)


def collect_datasets(runs_dirs, names, dataset_names, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], level='token', baselines={}, methods=None):
    all_tables = []
    for run_dir, dataset_name in zip(runs_dirs, dataset_names):
        dataset_table = collect_tables(run_dir, names, metric_types, level, baselines.get(level, None), methods)
        columns = pd.MultiIndex.from_tuples([(dataset_name, ind) for ind in list(dataset_table.columns)])
        dataset_table.columns = columns
        all_tables.append(dataset_table)
    return pd.concat(all_tables, axis=1)

In [None]:
import json

with open('../../ner_metrics/conll2003-10-baseline-metrics/metrics_seq_rejection-curve-auc.json') as json_file:
    seq = json.load(json_file)
    
with open('../../ner_metrics/conll2003-10-baseline-metrics/metrics_token_rejection-curve-auc.json') as json_file:
    token = json.load(json_file)

In [None]:
tok_mp = []
for k in token['max_prob']:
    tok_mp.append(token['max_prob'][k])
seq_mp = []
for k in seq['max_prob']:
    seq_mp.append(token['max_prob'][k])

In [None]:
raw_baselines = {'seq': pd.DataFrame({'rejection-curve-auc': [np.mean(seq_mp)]}),
                 'token': pd.DataFrame({'rejection-curve-auc': [np.mean(tok_mp)]})}

# MC-DPP 

In [None]:
import os 

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.5, 0.6]
comsizes = [20]

dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []
for method in methods:
    for max_frac in max_fracs:
        for cs in comsizes:
            for reg in regs:
                for level in ['token', 'sequence']:
                    run_dirs = []
                    names = [f'ddpp_{method}|{max_frac}|{cs}']
                    for name in dataset_fnames:
                        model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/{name}/0.1/{method}_{max_frac}_{cs}'
                        run_dirs.append([model_series_dir])
                        print(model_series_dir)
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level)
                    res_df.columns = correct_cols(res_df.columns, level)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])

<<<<<<< local


empty dir ['../workdir/run_glue_for_model_series/electra-raw/conll2003/0.1/ddpp_ood_0.3_20']
Not exists one of this dirs: [['../workdir/run_glue_for_model_series/electra-raw/conll2003/0.1/ddpp_ood_0.3_20']]
empty dir ['../workdir/run_glue_for_model_series/electra-raw/conll2003/0.1/ddpp_ood_0.3_20']
Not exists one of this dirs: [['../workdir/run_glue_for_model_series/electra-raw/conll2003/0.1/ddpp_ood_0.3_20']]
empty dir ['../workdir/run_glue_for_model_series/electra-raw/conll2003/0.1/ddpp_ood_0.4_20']
Not exists one of this dirs: [['../workdir/run_glue_for_model_series/electra-raw/conll2003/0.1/ddpp_ood_0.4_20']]
empty dir ['../workdir/run_glue_for_model_series/electra-raw/conll2003/0.1/ddpp_ood_0.4_20']
Not exists one of this dirs: [['../workdir/run_glue_for_model_series/electra-raw/conll2003/0.1/ddpp_ood_0.4_20']]
empty dir ['../workdir/run_glue_for_model_series/electra-raw/conll2003/0.1/ddpp_ood_0.5_20']
Not exists one of this dirs: [['../workdir/run_glue_for_model_series/electra-ra



/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_0.3_20
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_0.3_20
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_0.4_20
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_0.4_20
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_0.5_20
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_0.5_20
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_0.6_20
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_0.6_20
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/elec

>>>>>>> remote


In [None]:
table_dpp = pd.concat([pd.concat(tables[::2]), pd.concat(tables[1::2])], axis=1)

ddpp_dpp - 0.6 (both) \
ddpp_ood - 0.6

In [None]:
table_dpp.sort_values(by=('CoNLL-2003 (sequence level)', 'rcc-auc')).iloc[:50]

Unnamed: 0_level_0,Unnamed: 1_level_0,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
ddpp_ddpp_ood|0.6|20,variance,0.01±0.02,11.27±5.38,0.16±0.07,0.87±0.29,11.18±2.40,1.45±0.28
ddpp_ddpp_ood|0.3|20,variance,-0.02±0.02,18.35±8.59,0.19±0.07,0.89±0.25,11.36±2.72,1.47±0.32
ddpp_ddpp_ood|0.5|20,variance,0.01±0.02,11.40±5.77,0.16±0.07,0.85±0.21,11.64±2.60,1.52±0.32
ddpp_ddpp_ood|0.6|20,sampled_max_prob,0.01±0.00,11.28±5.40,0.16±0.07,0.86±0.17,11.73±2.60,1.50±0.34
ddpp_ddpp_ood|0.4|20,variance,0.00±0.05,13.63±7.93,0.16±0.08,0.94±0.29,11.76±3.02,1.42±0.29
ddpp_ddpp_dpp|0.6|20,sampled_max_prob,0.00±0.00,11.77±5.11,0.17±0.07,0.85±0.18,11.86±2.78,1.53±0.35
ddpp_ddpp_dpp|0.6|20,variance,0.00±0.02,12.06±4.99,0.17±0.06,0.83±0.25,11.90±2.59,1.53±0.29
ddpp_ddpp_dpp|0.4|20,sampled_max_prob,-0.00±0.01,12.15±5.35,0.17±0.07,0.81±0.16,12.01±2.79,1.56±0.35
ddpp_ddpp_dpp|0.5|20,sampled_max_prob,-0.00±0.01,12.05±5.28,0.17±0.07,0.80±0.18,12.04±2.88,1.55±0.36
ddpp_ddpp_ood|0.5|20,sampled_max_prob,0.01±0.01,10.96±5.23,0.16±0.07,0.84±0.14,12.09±2.79,1.53±0.34


In [None]:
import os 

default_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
raw_baselines = {}
for ds_fname, ds_name in zip(dataset_fnames, dataset_names):
    for level in ['token', 'sequence']:
        model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/{ds_fname}/0.1/mahalanobis'
        table = []
        for metric_type in metric_types:
            metric = choose_metric(metric_type=metric_type)

            agg_res = aggregate_runs(
                model_series_dir, methods=default_methods, metric=metric, task_type=f'ner-{level}'
            )

            mean_res = agg_res.mean(axis=0)
            final_results = mean_res.T
            table.append(final_results.loc[['max_prob']])
        res_table = pd.concat(table, axis=1)
        res_table.columns = metric_types
        raw_baselines[level] = res_table#{k:v for k,v in zip(res_table.columns.values.tolist(), res_table.values[0].tolist())}

In [None]:
raw_baselines

{'token':           rejection-curve-auc  rcc-auc       rpp
 max_prob             0.949329  6.70994  0.001056,
 'sequence':           rejection-curve-auc    rcc-auc       rpp
 max_prob             0.919287  19.131262  0.023111}

In [None]:
import os 

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp_best', 'ddpp_ood_best', 'mc_all']
regs = ['raw', 'reg', 'metric']

dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for level in ['token', 'sequence']:
            run_dirs = []
            names = [f'{method}|{reg}']
            for name in dataset_fnames:
                model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_{reg}_no_sn/{name}/0.1/{method}'
                run_dirs.append([model_series_dir])
                print(model_series_dir)
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines=raw_baselines)
            res_df.columns = correct_cols(res_df.columns, level)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])

/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_best
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_best
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_no_sn/conll2003/0.1/ddpp_dpp_best
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_no_sn/conll2003/0.1/ddpp_dpp_best
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_metric_no_sn/conll2003/0.1/ddpp_dpp_best
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_metric_no_sn/conll2003/0.1/ddpp_dpp_best
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_ood_best
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_ood_best
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_no

In [None]:
import os 

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp_best', 'ddpp_ood_best', 'mc_all']
regs = ['raw', 'reg', 'metric']

dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
baselines = []
for method in methods:
    for reg in regs:
        run_dirs = []
        names = [f'{method}|{reg}']
        model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_{reg}_no_sn/{name}/0.1/{method}'
        output_dir = f'../workdir/run_calc_ues_metrics/electra_{reg}_no_sn/{name}/{method}'

        print(f'HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir={model_series_dir} extract_config=False output_dir={output_dir};', end='')

HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir=/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_best extract_config=False output_dir=../workdir/run_calc_ues_metrics/electra_raw_no_sn/conll2003/ddpp_dpp_best;HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir=/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_no_sn/conll2003/0.1/ddpp_dpp_best extract_config=False output_dir=../workdir/run_calc_ues_metrics/electra_reg_no_sn/conll2003/ddpp_dpp_best;HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir=/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_metric_no_sn/conll2003/0.1/ddpp_dpp_best extract_config=False output_dir=../workdir/run_calc_ues_metrics/electra_metric_no_sn/conll2003/ddpp_dpp_best;HYDRA_CONFIG_PATH=../configs/ru

In [None]:
def preproc_regs(x):
    reg = x.split('|')[-1].split('_')[0]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw' or reg == 'Deep Ensemble':
        return '-'
    else:
        return reg

def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in x:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'DE'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ues(x):
    if x == 'bald':
        return 'BALD'
    if x == 'sampled_max_prob':
        return 'SMP'
    if x == 'variance':
        return 'PV'
    if x == 'var.ratio':
        return 'VR'
    return 'MP'

#for tab in tables:
#    tab.index = pd.MultiIndex.from_tuples([(val[0].split('|')[0], val[1]) for val in tab.index], names=('Method', 'UE Score'))
table_dpp_reg = pd.concat([pd.concat(tables[0::2]), pd.concat(tables[1::2])], axis=1)#, pd.concat(baselines[-2:])])
table_dpp_reg = table_dpp_reg.reset_index()
table_dpp_reg['Reg. Type'] = table_dpp_reg.Method.apply(lambda x: preproc_regs(x))
table_dpp_reg['Method'] = table_dpp_reg['Method'].apply(lambda x: preproc_method(x))
table_dpp_reg['UE Score'] = table_dpp_reg['UE Score'].apply(lambda x: preproc_ues(x))
table_dpp_reg = table_dpp_reg[list(table_dpp_reg.columns[:1]) + list(table_dpp_reg.columns[-1:]) + list(table_dpp_reg.columns[1:-1])]

In [None]:
table_dpp_reg

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DDPP (+DPP) (ours),-,BALD,-0.01±0.03,7.11±2.58,0.11±0.04,-0.47±1.05,22.39±6.37,2.80±0.81
1,DDPP (+DPP) (ours),-,SMP,-0.00±0.03,6.75±2.54,0.11±0.03,-0.08±0.83,19.31±4.52,2.36±0.57
2,DDPP (+DPP) (ours),-,PV,-0.00±0.03,6.91±2.45,0.11±0.03,0.18±0.87,17.54±4.57,2.13±0.56
3,DDPP (+DPP) (ours),-,VR,-0.95±0.19,115.14±41.88,0.97±0.20,-3.98±1.38,55.63±16.55,6.01±1.08
4,DDPP (+DPP) (ours),CER,BALD,-0.01±0.02,7.21±1.43,0.12±0.02,-0.41±0.60,21.90±3.91,2.75±0.47
5,DDPP (+DPP) (ours),CER,SMP,0.00±0.01,6.40±1.33,0.10±0.02,0.02±0.57,19.20±3.34,2.30±0.48
6,DDPP (+DPP) (ours),CER,PV,-0.01±0.02,6.83±1.34,0.11±0.02,0.21±0.53,17.88±2.93,2.14±0.46
7,DDPP (+DPP) (ours),CER,VR,-0.87±0.10,103.25±33.84,0.91±0.12,-3.12±0.91,49.33±13.13,5.22±0.82
8,DDPP (+DPP) (ours),metric,BALD,-0.04±0.04,10.22±6.37,0.14±0.04,-0.81±1.09,26.03±8.57,3.27±0.64
9,DDPP (+DPP) (ours),metric,SMP,-0.02±0.02,7.58±1.85,0.12±0.02,-0.23±0.68,21.37±3.95,2.67±0.27


In [None]:
import os

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
levels = ['token', 'sequence']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []
for level in levels:
    run_dirs = []
    names = [f'Deep Ensemble']
    for name in dataset_fnames:
        model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_conll2003_for_ensemble_series/2021-11-08/12-13-11/final_results'
        print(model_series_dir)
        run_dirs.append([model_series_dir])
    ens_tab = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines=raw_baselines)
    ens_tab.columns = correct_cols(ens_tab.columns, level)
    baselines.append(ens_tab.iloc[-1:])
    tables.append(ens_tab.iloc[:-1])

/home/user/uncertainty-estimation/workdir/run_conll2003_for_ensemble_series/2021-11-08/12-13-11/final_results
/home/user/uncertainty-estimation/workdir/run_conll2003_for_ensemble_series/2021-11-08/12-13-11/final_results


In [None]:
import os 

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp_best', 'ddpp_ood_best', 'mc_all']
regs = ['raw', 'reg', 'metric']

dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
run_dirs = []
names = [f'{method}|{reg}']
model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_conll2003_for_ensemble_series/2021-11-08/12-13-11/final_results'
output_dir = f'../workdir/run_calc_ues_metrics/electra_raw_no_sn/conll2003/deepensemble'

print(f'HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir={model_series_dir} extract_config=False output_dir={output_dir};', end='')

HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir=/home/user/uncertainty-estimation/workdir/run_conll2003_for_ensemble_series/2021-11-08/12-13-11/final_results extract_config=False output_dir=../workdir/run_calc_ues_metrics/electra_raw_no_sn/conll2003/deepensemble;

In [None]:
def preproc_ues(x):
    if x == 'bald':
        return 'BALD'
    if x == 'sampled_max_prob':
        return 'SMP'
    if x == 'variance':
        return 'PV'
    if x == 'var.ratio':
        return 'VR'
    return 'MP'

def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

ens_tab = pd.concat(tables, axis=1)
ens_tab = ens_tab.reset_index()
ens_tab['Reg. Type'] = '-'
ens_tab['Method'] = ens_tab['Method'].apply(lambda x: x.split('|')[0])
ens_tab['UE Score'] = ens_tab['UE Score'].apply(lambda x: preproc_ues(x))
ens_tab = ens_tab[list(ens_tab.columns[:1]) + list(ens_tab.columns[-1:]) + list(ens_tab.columns[1:-1])]

In [None]:
ens_tab

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,Deep Ensemble,-,BALD,0.02±0.02,4.95±1.27,0.07±0.03,0.72±0.71,15.33±4.04,1.78±0.50
1,Deep Ensemble,-,SMP,0.02±0.02,5.00±1.61,0.07±0.02,0.80±0.67,15.07±3.99,1.71±0.51
2,Deep Ensemble,-,PV,0.02±0.02,5.10±0.90,0.07±0.02,0.72±0.67,15.33±3.97,1.80±0.50
3,Deep Ensemble,-,VR,-0.28±0.13,37.99±17.31,0.38±0.09,0.24±1.16,23.26±11.87,2.45±0.55


# Mahalanobis

In [None]:
import os 

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type

    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner

    elif metric_type == "pr-auc":
        return from_model_outputs_calc_pr_auc_ner

    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner

    else:
        raise ValueError("Wrong metric type!")
        
metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg', 'metric']

dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['sn', "no_sn"]:
            for level in ['token', 'sequence']:
                run_dirs = []
                names = [f'{method}|{reg}|{sn}']
                for name in dataset_fnames:
                    model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.1/{method}'
                    run_dirs.append([model_series_dir])
                    print(model_series_dir)
                if level == "token":
                    maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
                elif level == "seq":
                    maha_dist = lambda x: np.squeeze(np.expand_dims(x[:, 0], axis=1), axis=-1)
                agg_methods = {"mahalanobis_distance": maha_dist}
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines=raw_baselines, methods=agg_methods)
                res_df.columns = correct_cols(res_df.columns, level)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])

/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_sn/conll2003/0.1/mahalanobis
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_sn/conll2003/0.1/mahalanobis
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/mahalanobis
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/mahalanobis
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_sn/conll2003/0.1/mahalanobis
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_sn/conll2003/0.1/mahalanobis
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_no_sn/conll2003/0.1/mahalanobis
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_no_sn/conll2003/0.1/mahalanobis
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_metric_sn/conll2003/0.1/mahalanobis
/hom

In [None]:
import os 

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg', 'metric']

dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
for method in methods:
    for reg in regs:
        for sn in ['sn', "no_sn"]:
            for name in dataset_fnames:
                model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.1/{method}'      
                output_dir = f'../workdir/run_calc_ues_metrics/electra_{reg}_{sn}/{name}/{method}'
                print(f'HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir={model_series_dir} extract_config=False output_dir={output_dir};', end='')

HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir=/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_sn/conll2003/0.1/mahalanobis extract_config=False output_dir=../workdir/run_calc_ues_metrics/electra_raw_sn/conll2003/mahalanobis;HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir=/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/mahalanobis extract_config=False output_dir=../workdir/run_calc_ues_metrics/electra_raw_no_sn/conll2003/mahalanobis;HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir=/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_sn/conll2003/0.1/mahalanobis extract_config=False output_dir=../workdir/run_calc_ues_metrics/electra_reg_sn/conll2003/mahalanobis;HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python

In [None]:
def preproc_regs(x):
    reg = x.split('|')[1].split('_')[0]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw' or reg == 'Deep Ensemble':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'DE'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif x == 'mahalanobis':
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    return 'MD'

table_det = pd.concat([pd.concat(tables[0::2]), pd.concat(tables[1::2])], axis=1)#, pd.concat(baselines[-2:])])
table_det = table_det.reset_index()#.iloc[::2]
table_det['Reg. Type'] = table_det.Method.apply(lambda x: preproc_regs(x))
table_det['Method'] = table_det.Method.apply(lambda x: preproc_method(x))
table_det['UE Score'] = table_det['UE Score'].apply(lambda x: preproc_ue(x))
table_det = table_det[list(table_det.columns[:1]) + list(table_det.columns[-1:]) + list(table_det.columns[1:-1])].reset_index(drop=True)

In [None]:
table_det

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,MD SN (ours),-,MD,-0.04±0.04,9.34±3.19,0.14±0.04,0.01±0.85,18.97±5.08,2.22±0.58
1,MD,-,MD,-0.03±0.02,8.83±2.50,0.13±0.03,0.28±0.84,17.08±5.07,1.99±0.47
2,MD SN (ours),CER,MD,-0.01±0.02,7.53±1.88,0.12±0.02,0.22±0.54,18.02±3.25,2.01±0.35
3,MD,CER,MD,-0.03±0.04,9.46±3.50,0.14±0.04,0.19±0.58,17.99±3.89,2.17±0.32
4,MD SN (ours),metric,MD,-0.02±0.02,7.90±1.51,0.12±0.02,0.15±0.53,18.38±3.02,2.02±0.42
5,MD,metric,MD,-0.03±0.02,9.24±1.83,0.14±0.02,0.32±0.54,17.34±4.03,2.11±0.20


In [None]:
def preproc_regs(x):
    reg = x.split('|')[1].split('_')[0]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw' or reg == 'Deep Ensemble':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'DE'
    elif 'baseline|raw|no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif x == 'mahalanobis':
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    return 'MD'

baselines_tab = pd.concat([pd.concat(baselines[0::2]), pd.concat(baselines[1::2])], axis=1)
baselines_tab = baselines_tab.reset_index()
baselines_tab['Reg. Type'] = baselines_tab.Method.apply(lambda x: preproc_regs(x))
baselines_tab['Method'] = baselines_tab.Method.apply(lambda x: preproc_method(x))
baselines_tab['UE Score'] = baselines_tab['UE Score'].apply(lambda x: preproc_ue(x))
baselines_tab = baselines_tab[list(baselines_tab.columns[:1]) + list(baselines_tab.columns[-1:]) + list(baselines_tab.columns[1:-1])].reset_index(drop=True)
baselines_tab = baselines_tab.iloc[[2,3,4,5,0,1]].reset_index(drop=True)

In [None]:
baselines_tab

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,SR SN,CER,MD,94.92±0.02,7.56±1.60,0.12±0.02,91.30±0.59,23.19±3.87,2.85±0.43
1,SR,CER,MD,94.94±0.02,6.23±1.42,0.10±0.02,92.00±0.58,19.23±3.09,2.31±0.52
2,SR SN,metric,MD,94.91±0.02,7.85±1.84,0.12±0.03,91.36±0.53,23.17±3.84,2.75±0.41
3,SR,metric,MD,94.92±0.02,7.39±1.88,0.12±0.02,91.79±0.68,20.39±3.94,2.57±0.42
4,SR SN,-,MD,94.92±0.03,7.99±2.56,0.12±0.04,91.47±0.75,22.41±5.25,2.68±0.67
5,SR (baseline),-,MD,94.93±0.02,6.71±2.36,0.11±0.03,91.93±0.78,19.13±4.27,2.31±0.53


In [None]:
import os 

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

nuq_aleatoric = lambda x: np.squeeze(x[:, 0], axis=-1)
nuq_epistemic = lambda x: np.squeeze(x[:, 1], axis=-1)
nuq_total = lambda x: np.squeeze(x[:, 2], axis=-1)
agg_methods = {
    "nuq_aleatoric": nuq_aleatoric,
    "nuq_epistemic": nuq_epistemic,
    "nuq_total": nuq_total,
}

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type

    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner

    elif metric_type == "pr-auc":
        return from_model_outputs_calc_pr_auc_ner

    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner

    else:
        raise ValueError("Wrong metric type!")
        
metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['nuq']
regs = ['raw', 'reg', 'metric']

dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in ['sn', "no_sn"]:
            for level in ['token', 'sequence']:
                run_dirs = []
                names = [f'{method}|{reg}|{sn}']
                for name in dataset_fnames:
                    model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_{reg}_{sn}/{name}/0.1/{method}'
                    run_dirs.append([model_series_dir])
                    print(model_series_dir)
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines=raw_baselines, methods=agg_methods)
                res_df.columns = correct_cols(res_df.columns, level)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])

/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_sn/conll2003/0.1/nuq
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_sn/conll2003/0.1/nuq
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/nuq
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_raw_no_sn/conll2003/0.1/nuq
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_sn/conll2003/0.1/nuq
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_sn/conll2003/0.1/nuq
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_no_sn/conll2003/0.1/nuq
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_reg_no_sn/conll2003/0.1/nuq
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_metric_sn/conll2003/0.1/nuq
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra_

In [None]:
def preproc_regs(x):
    reg = x.split('|')[1].split('_')[0]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw' or reg == 'Deep Ensemble':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'DE'
    elif 'baseline|raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif x == 'mahalanobis':
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MD'

table_nuq = pd.concat([pd.concat(tables[0::2]), pd.concat(tables[1::2])], axis=1)#, pd.concat(baselines[-2:])])
table_nuq = table_nuq.reset_index()
table_nuq['Reg. Type'] = table_nuq.Method.apply(lambda x: preproc_regs(x))
table_nuq['Method'] = table_nuq.Method.apply(lambda x: preproc_method(x))
table_nuq['UE Score'] = table_nuq['UE Score'].apply(lambda x: preproc_ue(x))
table_nuq = table_nuq[list(table_nuq.columns[:1]) + list(table_nuq.columns[-1:]) + list(table_nuq.columns[1:-1])].reset_index(drop=True)

In [None]:
table_nuq

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,NUQ SN,-,aleatoric,-0.08±0.08,14.64±7.78,0.19±0.08,-0.18±0.86,20.60±5.88,2.44±0.56
1,NUQ SN,-,epistemic,-0.01±0.03,7.50±2.06,0.12±0.03,-0.40±0.75,20.02±4.92,2.65±0.60
2,NUQ SN,-,total,-0.01±0.03,7.35±2.60,0.12±0.04,-0.17±0.85,19.23±5.19,2.45±0.61
3,NUQ,-,aleatoric,-0.09±0.04,15.29±4.52,0.19±0.05,0.05±0.91,18.95±6.18,2.23±0.58
4,NUQ,-,epistemic,0.00±0.01,6.53±1.64,0.10±0.02,0.28±0.74,16.57±4.12,2.06±0.45
5,NUQ,-,total,0.01±0.02,5.88±1.83,0.09±0.02,0.41±0.78,15.92±4.52,1.92±0.47
6,NUQ SN,CER,aleatoric,-0.03±0.03,9.82±3.23,0.13±0.03,0.12±0.57,18.40±3.70,2.10±0.36
7,NUQ SN,CER,epistemic,-0.00±0.02,7.32±1.88,0.12±0.03,-0.40±0.79,20.43±4.33,2.68±0.60
8,NUQ SN,CER,total,-0.00±0.02,6.59±1.68,0.11±0.02,-0.00±0.68,18.47±4.07,2.28±0.51
9,NUQ,CER,aleatoric,-0.07±0.08,14.86±9.51,0.18±0.08,0.03±0.67,18.96±4.44,2.34±0.44


In [None]:
pd.concat([table_nuq, baselines_tab])

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,NUQ SN,-,aleatoric,-0.08±0.08,14.64±7.78,0.19±0.08,-0.18±0.86,20.60±5.88,2.44±0.56
1,NUQ SN,-,epistemic,-0.01±0.03,7.50±2.06,0.12±0.03,-0.40±0.75,20.02±4.92,2.65±0.60
2,NUQ SN,-,total,-0.01±0.03,7.35±2.60,0.12±0.04,-0.17±0.85,19.23±5.19,2.45±0.61
3,NUQ,-,aleatoric,-0.09±0.04,15.29±4.52,0.19±0.05,0.05±0.91,18.95±6.18,2.23±0.58
4,NUQ,-,epistemic,0.00±0.01,6.53±1.64,0.10±0.02,0.28±0.74,16.57±4.12,2.06±0.45
5,NUQ,-,total,0.01±0.02,5.88±1.83,0.09±0.02,0.41±0.78,15.92±4.52,1.92±0.47
6,NUQ SN,CER,aleatoric,-0.03±0.03,9.82±3.23,0.13±0.03,0.12±0.57,18.40±3.70,2.10±0.36
7,NUQ SN,CER,epistemic,-0.00±0.02,7.32±1.88,0.12±0.03,-0.40±0.79,20.43±4.33,2.68±0.60
8,NUQ SN,CER,total,-0.00±0.02,6.59±1.68,0.11±0.02,-0.00±0.68,18.47±4.07,2.28±0.51
9,NUQ,CER,aleatoric,-0.07±0.08,14.86±9.51,0.18±0.08,0.03±0.67,18.96±4.44,2.34±0.44


In [None]:
res_tab = pd.concat([table_dpp_reg, ens_tab, table_det, table_sngp[table_det.columns], baselines_tab]).reset_index(drop=True)
res_tab = res_tab[res_tab['UE Score'] != 'VR']

In [None]:
res_tab

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DDPP (+DPP) (ours),-,BALD,-0.01±0.03,7.11±2.58,0.11±0.04,-0.47±1.05,22.39±6.37,2.80±0.81
1,DDPP (+DPP) (ours),-,SMP,-0.00±0.03,6.75±2.54,0.11±0.03,-0.08±0.83,19.31±4.52,2.36±0.57
2,DDPP (+DPP) (ours),-,PV,-0.00±0.03,6.91±2.45,0.11±0.03,0.18±0.87,17.54±4.57,2.13±0.56
4,DDPP (+DPP) (ours),CER,BALD,-0.01±0.02,7.21±1.43,0.12±0.02,-0.41±0.60,21.90±3.91,2.75±0.47
5,DDPP (+DPP) (ours),CER,SMP,0.00±0.01,6.40±1.33,0.10±0.02,0.02±0.57,19.20±3.34,2.30±0.48
6,DDPP (+DPP) (ours),CER,PV,-0.01±0.02,6.83±1.34,0.11±0.02,0.21±0.53,17.88±2.93,2.14±0.46
8,DDPP (+DPP) (ours),metric,BALD,-0.04±0.04,10.22±6.37,0.14±0.04,-0.81±1.09,26.03±8.57,3.27±0.64
9,DDPP (+DPP) (ours),metric,SMP,-0.02±0.02,7.58±1.85,0.12±0.02,-0.23±0.68,21.37±3.95,2.67±0.27
10,DDPP (+DPP) (ours),metric,PV,-0.02±0.02,7.80±1.71,0.13±0.02,-0.09±0.67,20.27±3.85,2.55±0.27
12,DDPP (+OOD) (ours),-,BALD,-0.08±0.19,16.84±24.34,0.19±0.19,-0.77±1.05,25.24±8.62,3.05±0.74


# DE + Mahalanabis

In [None]:
import os 

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg', 'metric']

dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
baselines = []
model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_conll2003_for_mahalanobis_ensemble_series/2021-11-08/15-47-06/final_results'      
output_dir = f'../workdir/run_calc_ues_metrics/electra_raw_no_sn/conll2003/de_maha'
print(f'HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir={model_series_dir} extract_config=False output_dir={output_dir};', end='')

HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir=/home/user/uncertainty-estimation/workdir/run_conll2003_for_mahalanobis_ensemble_series/2021-11-08/15-47-06/final_results extract_config=False output_dir=../workdir/run_calc_ues_metrics/electra_raw_no_sn/conll2003/de_maha;

In [None]:
import os
<<<<<<< local
=======

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type

    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner

    elif metric_type == "pr-auc":
        return from_model_outputs_calc_pr_auc_ner

    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner

    else:
        raise ValueError("Wrong metric type!")
>>>>>>> remote
        
def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
levels = ['token', 'sequence']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []
for level in levels:
    run_dirs = []
    names = [f'Deep Ensemble']
    for name in dataset_fnames:
<<<<<<< local
        model_series_dir = f'../workdir/run_conll2003_for_mahalanobis_ensemble_series/2021-10-20/10-58-37/final_results/'
=======
        model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_conll2003_for_mahalanobis_ensemble_series/2021-11-08/15-47-06/final_results'
>>>>>>> remote
        print(model_series_dir)
        run_dirs.append([model_series_dir])
    if level == "token":
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
    elif level == "seq":
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
    agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    tab = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines=raw_baselines, methods=agg_methods)
    tab.columns = correct_cols(tab.columns, level)
    baselines.append(tab.iloc[-1:])
    tables.append(tab.iloc[:-1])

<<<<<<< local <modified: >


../workdir/run_conll2003_for_mahalanobis_ensemble_series/2021-10-20/10-58-37/final_results/
../workdir/run_conll2003_for_mahalanobis_ensemble_series/2021-10-20/10-58-37/final_results/




/home/user/uncertainty-estimation/workdir/run_conll2003_for_mahalanobis_ensemble_series/2021-11-08/15-47-06/final_results
/home/user/uncertainty-estimation/workdir/run_conll2003_for_mahalanobis_ensemble_series/2021-11-08/15-47-06/final_results


>>>>>>> remote <modified: >


<span style="color:red">**<<<<<<< local**</span>

In [None]:
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

maha_ens_tab = pd.concat(tables, axis=1).reset_index()
maha_ens_tab['Reg. Type'] = '-'
maha_ens_tab['Method'] = maha_ens_tab['Method'].apply(lambda x: 'DE + MD')
maha_ens_tab = maha_ens_tab[list(maha_ens_tab.columns[:1]) + list(maha_ens_tab.columns[-2:]) + list(maha_ens_tab.columns[1:-2])]

<span style="color:red">**=======**</span>

In [None]:
def preproc_ues(x):
    if x == 'bald':
        return 'BALD'
    if x == 'sampled_max_prob':
        return 'SMP'
    if x == 'variance':
        return 'PV'
    if x == 'var.ratio':
        return 'VR'
    return 'MP'

def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

ens_maha_tab = pd.concat(tables, axis=1)
ens_maha_tab = ens_maha_tab.reset_index()#.iloc[1:]
ens_maha_tab['Reg. Type'] = '-'
ens_maha_tab['Method'] = ens_maha_tab['Method'].apply(lambda x: x.split('|')[0])
ens_maha_tab['UE Score'] = 'SMD'
ens_maha_tab = ens_maha_tab[list(ens_maha_tab.columns[:1]) + list(ens_maha_tab.columns[-1:]) + list(ens_maha_tab.columns[1:-1])]

<span style="color:red">**>>>>>>> remote**</span>

In [None]:
<<<<<<< local
maha_ens_tab
=======
ens_maha_tab
>>>>>>> remote

<<<<<<< local


Unnamed: 0_level_0,Method,CoNLL-2003 (sequence level),Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,rpp,Unnamed: 3_level_1,Unnamed: 4_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc
0,DE + MD,1.70±0.31,-,mahalanobis_distance,2.08±0.02,14.47±3.12,0.18±0.02,9.13±1.89,21.29±6.06
1,DE + MD,7.02±0.99,-,sampled_mahalanobis_distance,1.27±0.31,87.76±25.34,1.06±0.31,3.90±1.13,56.48±12.07




Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,Deep Ensemble,-,SMD,-0.02±0.06,8.79±4.06,0.12±0.06,0.40±0.58,15.83±3.95,1.97±0.49


>>>>>>> remote


In [None]:
res_tab = pd.concat([table_dpp_reg[ens_tab.columns], ens_tab, table_det, table_sngp[table_det.columns], ens_maha_tab[ens_tab.columns], baselines_tab[ens_tab.columns]]).reset_index(drop=True)
res_tab = res_tab[res_tab['UE Score'] != 'VR'].reset_index(drop=True)

In [None]:
res_tab.iloc[:18]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DDPP (+DPP) (ours),-,BALD,-0.01±0.03,7.11±2.58,0.11±0.04,-0.47±1.05,22.39±6.37,2.80±0.81
1,DDPP (+DPP) (ours),-,SMP,-0.00±0.03,6.75±2.54,0.11±0.03,-0.08±0.83,19.31±4.52,2.36±0.57
2,DDPP (+DPP) (ours),-,PV,-0.00±0.03,6.91±2.45,0.11±0.03,0.18±0.87,17.54±4.57,2.13±0.56
3,DDPP (+DPP) (ours),CER,BALD,-0.01±0.02,7.21±1.43,0.12±0.02,-0.41±0.60,21.90±3.91,2.75±0.47
4,DDPP (+DPP) (ours),CER,SMP,0.00±0.01,6.40±1.33,0.10±0.02,0.02±0.57,19.20±3.34,2.30±0.48
5,DDPP (+DPP) (ours),CER,PV,-0.01±0.02,6.83±1.34,0.11±0.02,0.21±0.53,17.88±2.93,2.14±0.46
6,DDPP (+DPP) (ours),metric,BALD,-0.04±0.04,10.22±6.37,0.14±0.04,-0.81±1.09,26.03±8.57,3.27±0.64
7,DDPP (+DPP) (ours),metric,SMP,-0.02±0.02,7.58±1.85,0.12±0.02,-0.23±0.68,21.37±3.95,2.67±0.27
8,DDPP (+DPP) (ours),metric,PV,-0.02±0.02,7.80±1.71,0.13±0.02,-0.09±0.67,20.27±3.85,2.55±0.27
9,DDPP (+OOD) (ours),-,BALD,-0.08±0.19,16.84±24.34,0.19±0.19,-0.77±1.05,25.24±8.62,3.05±0.74


In [None]:
res_tab.iloc[18:]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
18,MC dropout,-,BALD,0.01±0.01,6.13±1.39,0.09±0.02,0.32±0.84,16.30±4.69,1.98±0.56
19,MC dropout,-,SMP,0.01±0.01,5.96±1.58,0.09±0.02,0.39±0.76,16.21±4.32,1.90±0.53
20,MC dropout,-,PV,0.01±0.01,6.05±1.39,0.09±0.02,0.35±0.84,15.91±4.66,1.93±0.56
21,MC dropout,CER,BALD,0.02±0.01,5.65±0.81,0.09±0.01,0.38±0.55,16.48±3.42,1.95±0.42
22,MC dropout,CER,SMP,0.02±0.01,5.30±0.85,0.08±0.01,0.49±0.45,16.11±2.74,1.87±0.38
23,MC dropout,CER,PV,0.02±0.01,5.48±0.80,0.08±0.01,0.38±0.55,16.45±3.61,1.95±0.46
24,MC dropout,metric,BALD,-0.01±0.02,7.50±2.09,0.11±0.02,0.43±0.51,16.40±2.66,2.02±0.22
25,MC dropout,metric,SMP,-0.00±0.01,6.93±1.84,0.11±0.02,0.41±0.49,16.75±2.35,2.04±0.24
26,MC dropout,metric,PV,-0.01±0.02,7.32±2.16,0.11±0.02,0.45±0.52,16.23±2.82,2.02±0.25
27,Deep Ensemble,-,BALD,0.02±0.02,4.95±1.27,0.07±0.03,0.72±0.71,15.33±4.04,1.78±0.50


In [None]:
res_tab.to_csv('../../new_conll2003.csv', index=False, header=True)

# Deep Ensemble

In [None]:
import os

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner

    else:
        raise ValueError("Wrong metric type!")

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
levels = ['token', 'sequence']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []
for level in levels:
    run_dirs = []
    names = [f'Deep Ensemble']
    for name in dataset_fnames:
        model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_conll2003_for_ensemble_series/2021-11-08/12-13-11/final_results'
        print(model_series_dir)
        run_dirs.append([model_series_dir])
    ens_tab = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines=raw_baselines)
    ens_tab.columns = correct_cols(ens_tab.columns, level)
    baselines.append(ens_tab.iloc[-1:])
    tables.append(ens_tab.iloc[:-1])

/home/user/uncertainty-estimation/workdir/run_conll2003_for_ensemble_series/2021-11-08/12-13-11/final_results
/home/user/uncertainty-estimation/workdir/run_conll2003_for_ensemble_series/2021-11-08/12-13-11/final_results


In [None]:
ens_tab = pd.concat(tables, axis=1)

In [None]:
def preproc_ues(x):
    if x == 'bald':
        return 'BALD'
    if x == 'sampled_max_prob':
        return 'SMP'
    if x == 'variance':
        return 'PV'
    if x == 'var.ratio':
        return 'VR'
    return 'MP'

def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

ens_tab = ens_tab.reset_index()
ens_tab['Reg. Type'] = '-'
ens_tab['Method'] = ens_tab['Method'].apply(lambda x: x.split('|')[0])
ens_tab['UE Score'] = ens_tab['UE Score'].apply(lambda x: preproc_ues(x))

ens_tab = ens_tab[list(ens_tab.columns[:1]) + list(ens_tab.columns[-1:]) + list(ens_tab.columns[1:-1])]

In [None]:
<<<<<<< REMOTE CELL DELETED >>>>>>>
def preproc_regs(x):
    regs = x.split('|')
    return regs[-1]

ens_tab = ens_tab.reset_index()
ens_tab['Reg. Type'] = '-'
ens_tab['Method'] = ens_tab['Method'].apply(lambda x: x.split('|')[0])
#ens_tab = ens_tab[list(ens_tab.columns[:1]) + list(ens_tab.columns[-2:]) + list(ens_tab.columns[1:-2])]

In [None]:
ens_tab

<<<<<<< local


Unnamed: 0_level_0,Method,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),Reg. Type
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,Unnamed: 9_level_1
0,Deep Ensemble,bald,2.02±0.06,22.64±6.06,0.30±0.08,4.74±1.23,33.03±5.51,2.86±0.23,-
1,Deep Ensemble,sampled_max_prob,0.16±0.22,242.47±21.03,2.20±0.10,2.27±1.16,56.72±6.47,5.80±0.64,-
2,Deep Ensemble,variance,1.88±0.06,32.68±7.92,0.46±0.10,3.78±1.10,38.23±8.72,3.69±0.43,-
3,Deep Ensemble,var.ratio,1.74±0.06,47.99±4.98,0.49±0.08,4.85±1.55,31.83±1.97,2.24±0.40,-




Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,Deep Ensemble,-,BALD,0.02±0.02,4.95±1.27,0.07±0.03,0.72±0.71,15.33±4.04,1.78±0.50
1,Deep Ensemble,-,SMP,0.02±0.02,5.00±1.61,0.07±0.02,0.80±0.67,15.07±3.99,1.71±0.51
2,Deep Ensemble,-,PV,0.02±0.02,5.10±0.90,0.07±0.02,0.72±0.67,15.33±3.97,1.80±0.50
3,Deep Ensemble,-,VR,-0.28±0.13,37.99±17.31,0.38±0.09,0.24±1.16,23.26±11.87,2.45±0.55


>>>>>>> remote


In [None]:
res = pd.concat([ens_tab.iloc[:3], maha_ens_tab.iloc[1:]]).reset_index(drop=True)

In [None]:
res['UE Score'] = ['BALD', 'SMP', 'PV', 'SMD']

In [None]:
res[list(res.columns[-3:]) + list(res.columns[:-3])]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rcc-auc,rejection-curve-auc,rpp,rcc-auc,rejection-curve-auc,rpp
0,Deep Ensemble,-,BALD,33.03±5.51,4.74±1.23,2.86±0.23,22.64±6.06,2.02±0.06,0.30±0.08
1,Deep Ensemble,-,SMP,56.72±6.47,2.27±1.16,5.80±0.64,242.47±21.03,0.16±0.22,2.20±0.10
2,Deep Ensemble,-,PV,38.23±8.72,3.78±1.10,3.69±0.43,32.68±7.92,1.88±0.06,0.46±0.10
3,DE + MD,-,SMD,56.48±12.07,3.90±1.13,7.02±0.99,87.76±25.34,1.27±0.31,1.06±0.31


In [None]:
print(str(ens_tab.to_latex(index=False)).replace('±', '$\pm$'))

# SNGP

In [None]:
from analyze_results import *

def calc_rcc_aucs_seq_sngp(probabilities, labels, predictions, stds):
    risk_binary = [1.0 * (l != p) for l, p in zip(labels, predictions)]

    results = {}

    ue_scores_stds = np.zeros(len(labels))
    for i in range(len(labels)):
        sent = np.asarray(stds[i])
        ue_scores_stds[i] = sent.max()
    results['sngp'] = rcc_auc(-ue_scores_stds, risk_binary)

    n_examples = len(risk_binary)
    ue_scores_max = np.zeros(n_examples)
    for i in range(n_examples):
        sent = probabilities[i]
        true_probs_max = np.asarray([np.max(proba) for proba in sent])
        ue_scores_max[i] = np.mean(true_probs_max)
    results["max_prob"] = rcc_auc(ue_scores_max, risk_binary)
    return results


def calc_rpp_seq_sngp(probabilities, labels, predictions, stds):
    risk_binary = [1.0 * (l != p) for l, p in zip(labels, predictions)]

    results = {}

    ue_scores_stds = np.zeros(len(labels))
    for i in range(len(labels)):
        sent = np.asarray(stds[i])
        ue_scores_stds[i] = sent.max()
    results['sngp'] = rpp(-ue_scores_stds, risk_binary)

    n_examples = len(risk_binary)
    ue_scores_max = np.zeros(n_examples)
    for i in range(n_examples):
        sent = probabilities[i]
        true_probs_max = np.asarray([np.max(proba) for proba in sent])
        ue_scores_max[i] = np.mean(true_probs_max)
    results["max_prob"] = rpp(ue_scores_max, risk_binary)
    return results

def from_model_outputs_calc_rpp_ner_sngp(model_outputs, methods, level="token"):
    probs = np.asarray(model_outputs["probabilities"])
    probs_toks = probs.reshape(-1, probs.shape[-1])
    
    sampled_probs = np.asarray(model_outputs["sampled_probabilities"])
    sampled_probs_toks = sampled_probs.reshape(
        sampled_probs.shape[0], sampled_probs.shape[1] * sampled_probs.shape[2], -1
    )

    stds = np.asarray(model_outputs["stds"]).mean(-1)
    stds_toks = stds.reshape(-1)

    labels = np.asarray(model_outputs["true_labels"])
    labels_toks = labels.reshape(-1)

    use_idx = labels_toks != -100
    if level == "token":
        
        res = calc_rpp_from_ue_scores([stds_toks[use_idx]], 
                                      ['sngp'], 
                                      probs_toks[use_idx], 
                                      labels_toks[use_idx])
    else:
        # sequence level
        _, _, predictions, _ = unpad_preds(
            probs, sampled_probs, np.argmax(probs, axis=-1), labels
        )
        
        sampled_probs, probs, stds, labels = unpad_preds(
            probs, sampled_probs, stds, labels
        )
        res = calc_rpp_seq_sngp(
            probs,
            labels,
            predictions,
            stds,
        )
    return res

def from_model_outputs_calc_rcc_auc_ner_sngp(model_outputs, methods, level="token"):
    probs = np.asarray(model_outputs["probabilities"])
    probs_toks = probs.reshape(-1, probs.shape[-1])

    stds = np.asarray(model_outputs["stds"]).mean(-1)
    stds_toks = stds.reshape(-1)
    
    sampled_probs = np.asarray(model_outputs["sampled_probabilities"])
    sampled_probs_toks = sampled_probs.reshape(
        sampled_probs.shape[0], sampled_probs.shape[1] * sampled_probs.shape[2], -1
    )

    labels = np.asarray(model_outputs["true_labels"])
    labels_toks = labels.reshape(-1)

    use_idx = labels_toks != -100
    if level == "token":
        
        res = calc_rcc_aucs_from_ue_scores([stds_toks[use_idx]], 
                                      ['sngp'], 
                                      probs_toks[use_idx], 
                                      labels_toks[use_idx])
    else:
        # sequence level
        _, _, predictions, _ = unpad_preds(
            probs, sampled_probs, np.argmax(probs, axis=-1), labels
        )
        
        sampled_probs, probs, stds, labels = unpad_preds(
            probs, sampled_probs, stds, labels
        )
        res = calc_rcc_aucs_seq_sngp(
            probs,
            labels,
            predictions,
            stds,
        )
    return res

In [None]:
def from_model_outputs_calc_arc_auc_ner_sngp(model_outputs, methods, level="token"):
    probs = np.asarray(model_outputs["probabilities"])
    probs_toks = probs.reshape(-1, probs.shape[-1])

    stds = np.asarray(model_outputs["stds"]).mean(-1)
    stds_toks = stds.reshape(-1)
    
    sampled_probs = np.asarray(model_outputs["sampled_probabilities"])
    sampled_probs_toks = sampled_probs.reshape(
        sampled_probs.shape[0], sampled_probs.shape[1] * sampled_probs.shape[2], -1
    )

    labels = np.asarray(model_outputs["true_labels"])
    labels_toks = labels.reshape(-1)

    use_idx = labels_toks != -100
    if level == "token":
        res = calc_aucs_sngp(labels_toks[use_idx], 
                              stds_toks[use_idx], 
                              probs_toks[use_idx])
    else:
        # sequence level
        _, _, predictions, _ = unpad_preds(
            probs, sampled_probs, np.argmax(probs, axis=-1), labels
        )
        
        sampled_probs, probs, stds, labels = unpad_preds(
            probs, sampled_probs, stds, labels
        )
        res = calc_arc_aucs_seq_sngp(
            probs,
            labels,
            predictions,
            stds,
        )
    return res

def calc_aucs_sngp(eval_labels, stds, probabilities):
    
    predictions = np.argmax(probabilities, axis=-1)
    errors = (eval_labels!=predictions).astype('uint8')

    results = {}
    ratio_list = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    sorted_indexes_ensemble = np.argsort(-stds)
    ens_scores = [get_score_ratio(sorted_indexes_ensemble, predictions, eval_labels, ratio) for ratio in ratio_list]
    results['sngp']  = auc(ratio_list, ens_scores)
    
    model_ues = 1 - np.max(probabilities, axis=1)
    sorted_indexes_model = np.argsort(-model_ues)
    model_scores = [get_score_ratio(sorted_indexes_model, predictions, eval_labels, ratio) for ratio in ratio_list]    
    results['max_prob'] = auc(ratio_list, model_scores)

    return results


def calc_arc_aucs_seq_sngp(probabilities, labels, predictions, stds):
    risk_binary = [1.0 * (l != p) for l, p in zip(labels, predictions)]

    results = {}
    ratio_list = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    
    ue_scores_stds = np.zeros(len(labels))
    for i in range(len(labels)):
        sent = np.asarray(stds[i])
        ue_scores_stds[i] = sent.max()
    sorted_indexes_ensemble = np.argsort(-ue_scores_stds)
    ens_scores = [get_score_ratio_seq(sorted_indexes_ensemble, predictions, labels, ratio) for ratio in ratio_list]
    results['sngp']  = auc(ratio_list, ens_scores)
    
    n_examples = len(risk_binary)
    ue_scores_max = np.zeros(n_examples)
    for i in range(n_examples):
        sent = probabilities[i]
        true_probs_max = np.asarray([np.max(proba) for proba in sent])
        ue_scores_max[i] = np.mean(true_probs_max)
    
    sorted_indexes_ensemble = np.argsort(-ue_scores_max)
    ens_scores = [get_score_ratio_seq(sorted_indexes_ensemble, predictions, labels, ratio) for ratio in ratio_list]
    results['max_prob'] = auc(ratio_list, ens_scores)
    return results

In [None]:
def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return from_model_outputs_calc_arc_auc_ner_sngp
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner_sngp
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner_sngp
    else:
        raise ValueError("Wrong metric type!")

In [None]:
import os

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
levels = ['token', 'sequence']
regs = ['raw']#, 'reg']
methods = ['sngp']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []

for method in methods:
    for reg in regs:
        for level in levels:
            run_dirs = []
            names = [f'{method}|{reg}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_ner_for_model_series/electra-{reg}-sngp/{name}/0.1/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines=raw_baselines)
            res_df.columns = correct_cols(res_df.columns, level)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])

In [None]:
table_sngp = pd.concat([pd.concat([tab for tab in tables[0::2]]), pd.concat([tab for tab in tables[1::2]])], axis=1)

In [None]:
table_sngp

Unnamed: 0_level_0,Unnamed: 1_level_0,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
sngp|raw,sngp,-1.03±7.74,340.93±800.82,1.99±3.37,-2.51±11.71,94.93±89.05,4.97±2.28


In [None]:
import os

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
levels = ['token', 'sequence']
regs = ['raw']
methods = ['sngp']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']

ridge_factors = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]
momentums = [0.999, 0.99, 0.9]

names = []
tables = []
baselines = []

for method in methods:
    for ridge_factor in ridge_factors:
            for momentum in momentums:
                for level in levels:
                    run_dirs = []
                    names = [f'{method}|{ridge_factor}_{momentum}_0.1']
                    for name in dataset_fnames:
                        model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/conll2003/{ridge_factor}_{momentum}_0.1/'
                        print(model_series_dir)
                        run_dirs.append([model_series_dir])
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines={})
                    res_df.columns = correct_cols(res_df.columns, level)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])

/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/conll2003/1e-05_0.999_0.1/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/conll2003/1e-05_0.999_0.1/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/conll2003/1e-05_0.99_0.1/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/conll2003/1e-05_0.99_0.1/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/conll2003/0.0001_0.999_0.1/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/conll2003/0.0001_0.999_0.1/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/conll2003/0.0001_0.99_0.1/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/conll2003/0.0001_0.99_0.1/
/home/user/uncertainty-estimation/workdir/run_glue_for_model_series/electra-raw-sngp/conll2003/0.0001_0.

In [None]:
table_sngp = pd.concat([pd.concat([tab for tab in tables[0::2]]), pd.concat([tab for tab in tables[1::2]])], axis=1)

In [None]:
table_sngp.iloc[::3]

Unnamed: 0_level_0,Unnamed: 1_level_0,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
sngp|1e-05_0.999_0.1,sngp,-0.66±0.57,281.26±197.93,2.04±1.64,6.47±4.06,109.89±108.08,4.44±1.78
sngp|0.0001_0.999_0.1,sngp,-0.59±0.54,264.66±186.94,1.85±1.54,6.49±3.67,96.78±101.57,4.44±1.60
sngp|0.001_0.999_0.1,sngp,-0.60±0.51,265.51±187.72,1.87±1.53,6.44±3.69,97.48±101.44,4.50±1.59
sngp|0.01_0.999_0.1,sngp,-0.58±0.53,264.43±187.84,1.85±1.54,6.43±3.68,97.43±102.19,4.48±1.61
sngp|0.1_0.999_0.1,sngp,-0.59±0.53,261.46±187.16,1.85±1.54,6.45±3.68,97.30±101.96,4.50±1.57
sngp|1_0.999_0.1,sngp,-0.59±0.52,262.59±185.27,1.85±1.53,6.44±3.67,97.87±102.68,4.49±1.53


# BEST

In [None]:
import os

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type

    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner

    elif metric_type == "pr-auc":
        return from_model_outputs_calc_pr_auc_ner

    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner

    else:
        raise ValueError("Wrong metric type!")
        
metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
levels = ['token', 'sequence']
regs = ['raw']
methods = ['sngp']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []

for method in methods:
    for reg in regs:
        for level in levels:
            run_dirs = []
            names = [f'{method}|{reg}']
            for name in dataset_fnames:
                model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_ner_for_model_series/electra-raw-sngp/conll2003/0.1'
                print(model_series_dir)
                run_dirs.append([model_series_dir]) 
            if level == "token":
                maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
            elif level == "seq":
                maha_dist = lambda x: np.squeeze(np.expand_dims(x[:, 0], axis=1), axis=-1)
            agg_methods = {"stds": maha_dist}
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines=raw_baselines, methods=agg_methods)
            res_df.columns = correct_cols(res_df.columns, level)
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])

/home/user/uncertainty-estimation/workdir/run_ner_for_model_series/electra-raw-sngp/conll2003/0.1
/home/user/uncertainty-estimation/workdir/run_ner_for_model_series/electra-raw-sngp/conll2003/0.1


In [None]:
table_sngp = pd.concat([pd.concat([tab for tab in tables[0::2]]), pd.concat([tab for tab in tables[1::2]])], axis=1)

In [None]:
table_sngp = table_sngp.reset_index()

In [None]:
table_sngp['Method'] = 'SNGP'
table_sngp['UE Score'] = 'SNGP'
table_sngp['Reg. Type'] = '-'

In [None]:
table_sngp

Unnamed: 0_level_0,Method,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),Reg. Type
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp,Unnamed: 9_level_1
0,SNGP,SNGP,-5.05±3.08,224.42±167.31,1.63±1.44,-27.28±3.03,110.89±103.49,5.05±1.97,-


In [None]:
import os 

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg', 'metric']

dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []
model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_ner_for_model_series/electra-raw-sngp/conll2003/0.1'      
output_dir = f'../workdir/run_calc_ues_metrics/electra_raw_no_sn/conll2003/sngp'
print(f'HYDRA_CONFIG_PATH=../configs/run_calc_ues_metrics.yaml python ./run_calc_ues_metrics_ner.py runs_dir={model_series_dir} extract_config=False output_dir={output_dir};', end='')

# DeBERTA MC-DPP

In [None]:
import os 

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']
max_fracs = [0.3, 0.4, 0.5, 0.6]
comsizes = [20]

dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []
for method in methods:
    for max_frac in max_fracs:
        for cs in comsizes:
            for reg in regs:
                for level in ['token', 'sequence']:
                    run_dirs = []
                    names = [f'ddpp_{method}|{max_frac}|{cs}']
                    for name in dataset_fnames:
                        model_series_dir = f'/home/user/uncertainty-estimation/workdir/run_conll2003_for_model_series_dpp_hp/deberta_raw_no_sn/{name}/0.1/{method}_{max_frac}_{cs}'
                        run_dirs.append([model_series_dir])
                        print(model_series_dir)
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level)
                    res_df.columns = correct_cols(res_df.columns, level)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])

/home/user/uncertainty-estimation/workdir/run_conll2003_for_model_series_dpp_hp/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp_0.3_20


FileNotFoundError: [Errno 2] No such file or directory: '/home/user/uncertainty-estimation/workdir/run_conll2003_for_model_series_dpp_hp/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp_0.3_20'

In [None]:
table_dpp = pd.concat([pd.concat(tables[::2]), pd.concat(tables[1::2])], axis=1)

ValueError: No objects to concatenate

In [None]:
table_dpp.sort_values(by=('CoNLL-2003 (token level)', 'rcc-auc')).iloc[:50]

NameError: name 'table_dpp' is not defined

In [None]:
{'token':{'ddpp_dpp': 0.4, 'ddpp_ood': 0.6},
 'sequence':{'ddpp_dpp': 0.4, 'ddpp_ood': 0.6}}

{'token': {'ddpp_dpp': 0.4, 'ddpp_ood': 0.6},
 'sequence': {'ddpp_dpp': 0.4, 'ddpp_ood': 0.6}}

In [None]:
pd.concat([pd.concat(baselines[::2]), pd.concat(baselines[1::2])], axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
baseline|0.3|20,max_prob,94.95±0.02,6.01±3.04,0.09±0.04,92.81±0.64,12.56±3.62,1.85±0.56
baseline|0.4|20,max_prob,94.95±0.02,4.72±1.58,0.07±0.02,92.95±0.71,11.70±3.86,1.72±0.66
baseline|0.5|20,max_prob,94.95±0.02,4.72±1.58,0.07±0.02,92.95±0.71,11.70±3.86,1.72±0.66
baseline|0.6|20,max_prob,94.95±0.02,4.72±1.58,0.07±0.02,92.95±0.71,11.70±3.86,1.72±0.66
baseline|0.3|20,max_prob,94.95±0.02,4.72±1.58,0.07±0.02,92.95±0.71,11.70±3.86,1.72±0.66
baseline|0.4|20,max_prob,94.95±0.02,4.72±1.58,0.07±0.02,92.95±0.71,11.70±3.86,1.72±0.66
baseline|0.5|20,max_prob,94.95±0.02,4.72±1.58,0.07±0.02,92.95±0.71,11.70±3.86,1.72±0.66
baseline|0.6|20,max_prob,94.95±0.02,4.72±1.58,0.07±0.02,92.95±0.71,11.70±3.86,1.72±0.66


# DeBERTA All

In [None]:
import os 

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type#from_model_outputs_calc_arc_auc_ner_mc_maha#
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner#from_model_outputs_calc_rcc_auc_ner_mc_maha#
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner#from_model_outputs_calc_rpp_ner_mc_maha#
    else:
        raise ValueError("Wrong metric type!")

def choose_agg_func(method, level):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[:, 0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[:, 1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        if level == "token":
            maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        elif level == "sequence":
            maha_dist = lambda x: np.squeeze(np.expand_dims(x[:, 0], axis=1), axis=-1)            
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)
    
metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
methods = ['ddpp_ood', 'ddpp_dpp', 'mc_all', 'mahalanobis', 'nuq', 'mc_mahalanobis']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn', 'no_sn']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for level in ['token', 'sequence']:
                if sn == 'sn' and method not in ['nuq', 'mahalanobis', 'mc_mahalanobis']:
                    continue
                run_dirs = []
                name_sn = ''
                names = [f'{method}|{reg}_{sn}']
                for name in dataset_fnames:
                    model_series_dir = f'../workdir/run_glue_for_model_series/deberta_{reg}_{sn}/{name}/0.1/{method}'
                    print(level, model_series_dir)
                    run_dirs.append([model_series_dir])
                agg_func = choose_agg_func(method, level)
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func, level=level)
                res_df.columns = correct_cols(res_df.columns, level)
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])

token ../workdir/run_glue_for_model_series/deberta_raw_no_sn/conll2003/0.1/ddpp_ood
sequence ../workdir/run_glue_for_model_series/deberta_raw_no_sn/conll2003/0.1/ddpp_ood
token ../workdir/run_glue_for_model_series/deberta_reg_no_sn/conll2003/0.1/ddpp_ood
sequence ../workdir/run_glue_for_model_series/deberta_reg_no_sn/conll2003/0.1/ddpp_ood
token ../workdir/run_glue_for_model_series/deberta_metric_no_sn/conll2003/0.1/ddpp_ood
sequence ../workdir/run_glue_for_model_series/deberta_metric_no_sn/conll2003/0.1/ddpp_ood
token ../workdir/run_glue_for_model_series/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp
sequence ../workdir/run_glue_for_model_series/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp
token ../workdir/run_glue_for_model_series/deberta_reg_no_sn/conll2003/0.1/ddpp_dpp
sequence ../workdir/run_glue_for_model_series/deberta_reg_no_sn/conll2003/0.1/ddpp_dpp
token ../workdir/run_glue_for_model_series/deberta_metric_no_sn/conll2003/0.1/ddpp_dpp
sequence ../workdir/run_glue_for_model_series/debert

In [None]:
table_all = pd.concat([pd.concat(tables[::2]), pd.concat(tables[1::2])], axis=1)

In [None]:
baseline_tab = pd.concat([pd.concat(baselines[::2]), pd.concat(baselines[1::2])], axis=1).iloc[-6:]

In [None]:
import os 

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)

metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
levels = ['token', 'sequence']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
tables = []
baselines = []
for level in levels:
    run_dirs = []
    names = [f'Deep Ensemble|raw_no_sn']
    for name in dataset_fnames:
        model_series_dir = f'../workdir/run_glue_for_ensemble_series/deberta/{name}/final_results/'
        print(level, model_series_dir)
        run_dirs.append([model_series_dir])
    ens_tab = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines={})
    ens_tab.columns = correct_cols(ens_tab.columns, level)
    baselines.append(ens_tab.iloc[-1:])
    tables.append(ens_tab.iloc[:-1])

token ../workdir/run_glue_for_ensemble_series/deberta/conll2003/final_results/
sequence ../workdir/run_glue_for_ensemble_series/deberta/conll2003/final_results/


In [None]:
ens_tab = pd.concat([pd.concat(tables[::2]), pd.concat(tables[1::2])], axis=1)

In [None]:
def preproc_regs(x):
    reg = x.split('|')[-1].split('_')[0]
    if reg == 'reg':
        return 'CER'
    elif reg == 'raw':
        return '-'
    else:
        return reg
    
def preproc_method(x):
    method = x.split('|')[0]
    sn = x.split('|')[-1].split('_', 1)[-1]
    if method == 'mahalanobis' and not 'no_sn' in sn:
        return 'MD SN (ours)'
    elif method == 'mahalanobis':
        return 'MD'
    elif method == 'mc_mahalanobis' and not 'no_sn' in sn:
        return 'SMD SN (ours)'
    elif method == 'mc_mahalanobis':
        return 'SMD'
    elif method == 'nuq' and not 'no_sn' in sn:
        return 'NUQ SN'
    elif method == 'nuq':
        return 'NUQ'
    elif 'ddpp_dpp' in method:
        return 'DDPP (+DPP) (ours)'
    elif 'ddpp_ood' in method:
        return 'DDPP (+OOD) (ours)'
    elif 'mc_all' in method:
        return 'MC dropout'
    elif 'Deep' in method:
        return 'Deep Ensemble'
    elif 'baseline' in x and 'raw_no_sn' in x:
        return 'SR (baseline)'
    elif 'baseline' in x and not 'no_sn' in x:
        return 'SR SN'
    return 'SR'

def preproc_ue(x):
    if x == 'bald':
        return 'BALD'
    elif 'sampled_mahalanobis_distance' in x:
        return 'SMD'
    elif 'mahalanobis_distance' in x:
        return 'MD'
    elif 'sampled_max_prob' in x:
        return 'SMP'
    elif 'variance' in x:
        return 'PV'
    elif 'aleatoric' in x:
        return 'aleatoric'
    elif 'epistemic' in x:
        return 'epistemic'
    elif 'total' in x:
        return 'total'
    return 'MP'

table_full = pd.concat([table_all, ens_tab, baseline_tab]).reset_index()
table_full['Reg. Type'] = table_full.Method.apply(lambda x: preproc_regs(x))
table_full['Method'] = table_full.Method.apply(lambda x: preproc_method(x))
table_full['UE Score'] = table_full['UE Score'].apply(lambda x: preproc_ue(x))
table_full = table_full[list(table_full.columns[:1]) + list(table_full.columns[-1:]) + list(table_full.columns[1:-1])].reset_index(drop=True)

In [None]:
table_full.iloc[:18]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,DDPP (+OOD) (ours),-,BALD,-0.01±0.02,6.13±1.55,0.09±0.03,-0.09±0.32,18.92±5.26,2.28±0.56
1,DDPP (+OOD) (ours),-,SMP,0.00±0.01,5.47±0.90,0.08±0.02,0.13±0.28,16.80±4.41,2.06±0.55
2,DDPP (+OOD) (ours),-,PV,-0.00±0.02,5.86±1.56,0.09±0.02,0.29±0.19,15.51±5.28,1.89±0.59
3,DDPP (+OOD) (ours),CER,BALD,-0.01±0.01,7.62±1.54,0.11±0.03,-0.20±0.47,23.04±5.37,2.86±0.65
4,DDPP (+OOD) (ours),CER,SMP,-0.01±0.01,7.06±1.26,0.11±0.02,0.09±0.55,20.98±4.43,2.59±0.57
5,DDPP (+OOD) (ours),CER,PV,-0.01±0.01,7.30±1.38,0.11±0.02,0.30±0.36,19.17±3.53,2.36±0.44
6,DDPP (+OOD) (ours),metric,BALD,-0.00±0.01,5.78±1.54,0.09±0.03,-0.32±0.28,20.70±4.25,2.59±0.40
7,DDPP (+OOD) (ours),metric,SMP,-0.00±0.01,5.65±1.35,0.09±0.02,-0.02±0.19,18.01±3.12,2.27±0.32
8,DDPP (+OOD) (ours),metric,PV,-0.00±0.01,5.69±1.55,0.09±0.03,0.24±0.10,16.19±3.05,2.03±0.24
9,DDPP (+DPP) (ours),-,BALD,-0.01±0.01,6.82±0.85,0.10±0.02,-0.34±0.45,20.92±5.30,2.54±0.71


In [None]:
table_full.iloc[18:]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
18,MC dropout,-,BALD,0.01±0.01,5.29±1.08,0.08±0.01,0.36±0.25,14.19±4.50,1.76±0.47
19,MC dropout,-,SMP,0.01±0.01,4.68±0.55,0.07±0.01,0.44±0.27,13.96±3.98,1.69±0.45
20,MC dropout,-,PV,0.01±0.01,5.27±0.97,0.08±0.01,0.34±0.25,14.15±4.37,1.75±0.46
21,MC dropout,CER,BALD,0.01±0.02,5.73±1.96,0.08±0.03,0.90±0.19,14.33±2.78,1.70±0.37
22,MC dropout,CER,SMP,0.02±0.01,5.11±1.22,0.08±0.02,0.89±0.24,14.24±2.77,1.69±0.38
23,MC dropout,CER,PV,0.01±0.01,5.56±1.72,0.08±0.03,0.95±0.20,13.81±2.73,1.66±0.35
24,MC dropout,metric,BALD,0.01±0.02,4.93±1.21,0.07±0.02,0.47±0.21,14.16±3.29,1.74±0.28
25,MC dropout,metric,SMP,0.01±0.01,4.62±0.93,0.07±0.01,0.53±0.22,14.21±2.87,1.67±0.26
26,MC dropout,metric,PV,0.01±0.02,4.89±1.17,0.07±0.02,0.48±0.19,14.14±3.44,1.70±0.32
27,MD SN (ours),-,MD,-0.01±0.01,5.36±1.24,0.08±0.01,0.23±0.18,16.72±5.36,1.90±0.20


In [None]:
table_final = table_full[table_full.columns[[0,1,2,4,5,7,8]]]
table_final = table_final.iloc[list(range(18,24))+list(range(3))+list(range(9, 12))+list(range(57, 60))+[27,28,29,63,65,61]].reset_index(drop=True)

In [None]:
table_final

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rcc-auc,rpp,rcc-auc,rpp
0,MC dropout,-,BALD,5.29±1.08,0.08±0.01,14.19±4.50,1.76±0.47
1,MC dropout,-,SMP,4.68±0.55,0.07±0.01,13.96±3.98,1.69±0.45
2,MC dropout,-,PV,5.27±0.97,0.08±0.01,14.15±4.37,1.75±0.46
3,MC dropout,CER,BALD,5.73±1.96,0.08±0.03,14.33±2.78,1.70±0.37
4,MC dropout,CER,SMP,5.11±1.22,0.08±0.02,14.24±2.77,1.69±0.38
5,MC dropout,CER,PV,5.56±1.72,0.08±0.03,13.81±2.73,1.66±0.35
6,DDPP (+OOD) (ours),-,BALD,6.13±1.55,0.09±0.03,18.92±5.26,2.28±0.56
7,DDPP (+OOD) (ours),-,SMP,5.47±0.90,0.08±0.02,16.80±4.41,2.06±0.55
8,DDPP (+OOD) (ours),-,PV,5.86±1.56,0.09±0.02,15.51±5.28,1.89±0.59
9,DDPP (+DPP) (ours),-,BALD,6.82±0.85,0.10±0.02,20.92±5.30,2.54±0.71


In [None]:
table_full.to_csv('../../deberta_all_conll2003.csv', header=True, index=False)

In [3]:
import os 

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner
    else:
        raise ValueError("Wrong metric type!")

def choose_agg_func(method, level):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[:, 0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[:, 1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        if level == "token":
            maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        elif level == "sequence":
            maha_dist = lambda x: np.squeeze(np.expand_dims(x[:, 0], axis=1), axis=-1)            
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)
    
metric_types=["rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['raw', 'reg', 'metric']
spectralnorm = ['sn']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
sn_values = [0.2, 0.4, 0.6, 0.8, 1, 2, 3]

names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for sn_value in sn_values:
                for level in ['token', 'sequence']:
                    run_dirs = []
                    name_sn = ''
                    names = [f'{method}|{reg}|{sn}_{sn_value}']
                    for name in dataset_fnames:
                        model_series_dir = f'../../workdir/run_tasks_for_model_series_sn_params/electra_{reg}_{sn}/{name}/0.1/{sn_value}/{method}'
                        print(level, model_series_dir)
                        run_dirs.append([model_series_dir])
                    try:
                        agg_func = choose_agg_func(method, level)
                        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func, level=level)
                        res_df.columns = correct_cols(res_df.columns, level)
                        baselines.append(res_df.iloc[-1:])
                        tables.append(res_df.iloc[:-1])
                    except:
                        pass

token ../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/conll2003/0.1/0.2/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/conll2003/0.1/0.2/mahalanobis
token ../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/conll2003/0.1/0.4/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/conll2003/0.1/0.4/mahalanobis
token ../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/conll2003/0.1/0.6/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/conll2003/0.1/0.6/mahalanobis
token ../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/conll2003/0.1/0.8/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/conll2003/0.1/0.8/mahalanobis
token ../../workdir/run_tasks_for_model_series_sn_params/electra_raw_sn/conll2003/0.1/1/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/electra_raw_

In [4]:
table_all = pd.concat([pd.concat(tables[::2]), pd.concat(tables[1::2])], axis=1)
baseline_tab = pd.concat([pd.concat(baselines[::2]), pd.concat(baselines[1::2])], axis=1).iloc[-18:]
table_full = pd.concat([table_all, baseline_tab]).reset_index()

def to_float(x):
    return float(x.split('±')[0])

table_full.sort_values(by=('CoNLL-2003 (token level)', 'rcc-auc'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp,rcc-auc,rpp
19,mahalanobis|metric|sn_2,mahalanobis_distance,8.89±3.35,0.12±0.04,9.46±2.00,1.11±0.20
20,mahalanobis|metric|sn_3,mahalanobis_distance,8.89±3.35,0.12±0.04,9.46±2.00,1.11±0.20
13,mahalanobis|reg|sn_3,mahalanobis_distance,8.94±3.09,0.12±0.04,9.33±1.63,1.14±0.15
12,mahalanobis|reg|sn_2,mahalanobis_distance,8.94±3.09,0.12±0.04,9.33±1.63,1.14±0.15
6,mahalanobis|raw|sn_3,mahalanobis_distance,9.18±4.10,0.13±0.06,9.94±2.17,1.26±0.21
5,mahalanobis|raw|sn_2,mahalanobis_distance,9.20±4.05,0.13±0.06,10.05±2.07,1.29±0.14
4,mahalanobis|raw|sn_1,mahalanobis_distance,9.60±3.24,0.14±0.04,9.91±2.23,1.19±0.21
18,mahalanobis|metric|sn_1,mahalanobis_distance,9.85±3.80,0.13±0.05,9.78±1.76,1.18±0.22
11,mahalanobis|reg|sn_1,mahalanobis_distance,9.91±3.77,0.14±0.05,9.54±1.64,1.17±0.09
17,mahalanobis|metric|sn_0.8,mahalanobis_distance,10.45±3.69,0.14±0.05,10.44±2.64,1.25±0.26


# Tune SN values

In [31]:
import os 

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner
    else:
        raise ValueError("Wrong metric type!")

def choose_agg_func(method, level):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[:, 0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[:, 1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        if level == "token":
            maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        elif level == "sequence":
            maha_dist = lambda x: np.squeeze(np.expand_dims(x[:, 0], axis=1), axis=-1)            
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)
    
metric_types=["rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['metric']#'raw', 'reg', 
spectralnorm = ['sn']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
sn_values = [0.2, 0.4, 0.6, 0.8, 1, 2, 3]

names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for sn_value in sn_values:
                for level in ['token', 'sequence']:
                    run_dirs = []
                    name_sn = ''
                    names = [f'{method}|{reg}|{sn}_{sn_value}']
                    for name in dataset_fnames:
                        model_series_dir = f'../../workdir/run_tasks_for_model_series_sn_params/electra_{reg}_{sn}/{name}/0.1/{sn_value}/{method}'
                        print(level, model_series_dir)
                        run_dirs.append([model_series_dir])
                    try:
                        agg_func = choose_agg_func(method, level)
                        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func, level=level)
                        res_df.columns = correct_cols(res_df.columns, level)
                        baselines.append(res_df.iloc[-1:])
                        tables.append(res_df.iloc[:-1])
                    except:
                        pass

token ../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/conll2003/0.1/0.2/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/conll2003/0.1/0.2/mahalanobis
token ../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/conll2003/0.1/0.4/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/conll2003/0.1/0.4/mahalanobis
token ../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/conll2003/0.1/0.6/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/conll2003/0.1/0.6/mahalanobis
token ../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/conll2003/0.1/0.8/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/conll2003/0.1/0.8/mahalanobis
token ../../workdir/run_tasks_for_model_series_sn_params/electra_metric_sn/conll2003/0.1/1/mahalanobis
sequence ../../workdir/run_tasks_for_model_se

In [32]:
table_all = pd.concat([pd.concat(tables[::2]), pd.concat(tables[1::2])], axis=1)
baseline_tab = pd.concat([pd.concat(baselines[::2]), pd.concat(baselines[1::2])], axis=1).iloc[-18:]
table_full = pd.concat([table_all, baseline_tab]).reset_index()

def to_float(x):
    return float(x.split('±')[0])

table_full.sort_values(by=('CoNLL-2003 (token level)', 'rcc-auc'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp,rcc-auc,rpp
5,mahalanobis|metric|sn_2,mahalanobis_distance,6.96±2.27,0.10±0.03,9.16±1.39,1.15±0.15
6,mahalanobis|metric|sn_3,mahalanobis_distance,6.96±2.27,0.10±0.03,9.16±1.39,1.15±0.15
4,mahalanobis|metric|sn_1,mahalanobis_distance,9.66±4.69,0.14±0.06,9.74±1.48,1.28±0.27
2,mahalanobis|metric|sn_0.6,mahalanobis_distance,10.37±4.59,0.14±0.05,11.08±2.02,1.34±0.23
12,baseline|metric|sn_2,max_prob,11.26±3.96,0.16±0.06,17.86±1.72,2.56±0.16
13,baseline|metric|sn_3,max_prob,11.26±3.94,0.16±0.05,17.86±1.72,2.57±0.16
3,mahalanobis|metric|sn_0.8,mahalanobis_distance,11.51±4.70,0.16±0.06,9.81±2.24,1.23±0.25
1,mahalanobis|metric|sn_0.4,mahalanobis_distance,12.33±3.20,0.17±0.05,11.47±2.39,1.39±0.19
11,baseline|metric|sn_1,max_prob,14.06±6.87,0.19±0.08,19.55±2.43,2.84±0.37
0,mahalanobis|metric|sn_0.2,mahalanobis_distance,14.51±4.23,0.20±0.06,14.21±2.88,1.71±0.22


In [6]:
sn_values = {
    'conll2003': {
        'metric': 3,
        'reg': 1,
        'raw': 2,
    },
}

## Deberta

In [3]:
import os 

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner
    else:
        raise ValueError("Wrong metric type!")

def choose_agg_func(method, level):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[:, 0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[:, 1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        if level == "token":
            maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        elif level == "sequence":
            maha_dist = lambda x: np.squeeze(np.expand_dims(x[:, 0], axis=1), axis=-1)            
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)
    
metric_types=["rcc-auc", 'rpp']
methods = ['mahalanobis']
regs = ['metric', 'raw', 'reg']
spectralnorm = ['sn']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
sn_values = [0.4, 0.6, 0.8, 1, 2, 3]

names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for sn_value in sn_values:
                for level in ['token', 'sequence']:
                    run_dirs = []
                    name_sn = ''
                    names = [f'{method}|{reg}|{sn}_{sn_value}']
                    for name in dataset_fnames:
                        model_series_dir = f'../../workdir/run_tasks_for_model_series_sn_params/deberta_{reg}_{sn}/{name}/0.1/{sn_value}/{method}'
                        print(level, model_series_dir)
                        run_dirs.append([model_series_dir])
                    try:
                        agg_func = choose_agg_func(method, level)
                        res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func, level=level)
                        res_df.columns = correct_cols(res_df.columns, level)
                        baselines.append(res_df.iloc[-1:])
                        tables.append(res_df.iloc[:-1])
                    except:
                        pass

token ../../workdir/run_tasks_for_model_series_sn_params/deberta_metric_sn/conll2003/0.1/0.2/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/deberta_metric_sn/conll2003/0.1/0.2/mahalanobis
token ../../workdir/run_tasks_for_model_series_sn_params/deberta_metric_sn/conll2003/0.1/0.4/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/deberta_metric_sn/conll2003/0.1/0.4/mahalanobis
token ../../workdir/run_tasks_for_model_series_sn_params/deberta_metric_sn/conll2003/0.1/0.6/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/deberta_metric_sn/conll2003/0.1/0.6/mahalanobis
token ../../workdir/run_tasks_for_model_series_sn_params/deberta_metric_sn/conll2003/0.1/0.8/mahalanobis
sequence ../../workdir/run_tasks_for_model_series_sn_params/deberta_metric_sn/conll2003/0.1/0.8/mahalanobis
token ../../workdir/run_tasks_for_model_series_sn_params/deberta_metric_sn/conll2003/0.1/1/mahalanobis
sequence ../../workdir/run_tasks_for_model_se

In [5]:
table_all = pd.concat([pd.concat(tables[::2]), pd.concat(tables[1::2])], axis=1)
baseline_tab = pd.concat([pd.concat(baselines[::2]), pd.concat(baselines[1::2])], axis=1).iloc[-18:]
table_full = pd.concat([table_all, baseline_tab]).reset_index()

def to_float(x):
    return float(x.split('±')[0])

table_full.sort_values(by=('CoNLL-2003 (sequence level)', 'rcc-auc'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp,rcc-auc,rpp
4,mahalanobis|metric|sn_2,mahalanobis_distance,4.22±1.62,0.06±0.02,6.80±1.71,0.96±0.25
5,mahalanobis|metric|sn_3,mahalanobis_distance,4.22±1.62,0.06±0.02,6.80±1.71,0.96±0.25
9,mahalanobis|raw|sn_1,mahalanobis_distance,3.97±1.33,0.06±0.02,6.95±1.60,0.98±0.15
3,mahalanobis|metric|sn_1,mahalanobis_distance,4.26±2.51,0.06±0.03,7.04±2.35,0.89±0.32
1,mahalanobis|metric|sn_0.6,mahalanobis_distance,5.04±2.65,0.07±0.03,7.34±1.69,1.00±0.22
8,mahalanobis|raw|sn_0.8,mahalanobis_distance,4.62±1.83,0.06±0.03,7.34±2.66,0.97±0.39
15,mahalanobis|reg|sn_1,mahalanobis_distance,3.91±1.29,0.06±0.02,7.37±1.67,1.04±0.21
10,mahalanobis|raw|sn_2,mahalanobis_distance,4.18±1.64,0.06±0.02,7.52±1.65,1.08±0.24
11,mahalanobis|raw|sn_3,mahalanobis_distance,4.18±1.64,0.06±0.02,7.52±1.65,1.08±0.24
2,mahalanobis|metric|sn_0.8,mahalanobis_distance,4.55±2.59,0.06±0.03,7.55±1.87,0.94±0.17


In [3]:
import os 

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner
    else:
        raise ValueError("Wrong metric type!")

def choose_agg_func(method, level):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[:, 0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[:, 1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        if level == "token":
            maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        elif level == "sequence":
            maha_dist = lambda x: np.squeeze(np.expand_dims(x[:, 0], axis=1), axis=-1)            
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)
    
metric_types=["rcc-auc", 'rpp']
spectralnorm = ['no_sn']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']
max_fracs = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]
comsizes = [20]
kernels = ['rbf']

names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for max_frac in max_fracs:
                for cs in comsizes:
                    for kernel in kernels:
                        for level in ['token', 'sequence']:
                            run_dirs = []
                            name_sn = ''
                            names = [f'{method}|{reg}|{kernel}_{max_frac}']
                            for name in dataset_fnames:
                                model_series_dir = f'../../workdir/run_tasks_for_model_series_dpp_hp/electra_{reg}_{sn}/{name}/0.1/{method}_{kernel}_{max_frac}_{cs}'
                                print(level, model_series_dir)
                                run_dirs.append([model_series_dir])
                            try:
                                agg_func = choose_agg_func(method, level)
                                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func, level=level)
                                res_df.columns = correct_cols(res_df.columns, level)
                                baselines.append(res_df.iloc[-1:])
                                tables.append(res_df.iloc[:-1])
                            except:
                                pass

token ../../workdir/run_tasks_for_model_series_dpp_hp/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.3_20
sequence ../../workdir/run_tasks_for_model_series_dpp_hp/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.3_20
token ../../workdir/run_tasks_for_model_series_dpp_hp/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.35_20
sequence ../../workdir/run_tasks_for_model_series_dpp_hp/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.35_20
token ../../workdir/run_tasks_for_model_series_dpp_hp/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.4_20
sequence ../../workdir/run_tasks_for_model_series_dpp_hp/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.4_20
token ../../workdir/run_tasks_for_model_series_dpp_hp/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.45_20
sequence ../../workdir/run_tasks_for_model_series_dpp_hp/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.45_20
token ../../workdir/run_tasks_for_model_series_dpp_hp/electra_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.5_20
sequence ../../workdir/run_tas

In [4]:
table_all = pd.concat([pd.concat(tables[::2]), pd.concat(tables[1::2])], axis=1)
baseline_tab = pd.concat([pd.concat(baselines[::2]), pd.concat(baselines[1::2])], axis=1).iloc[-18:]
table_full = pd.concat([table_all, baseline_tab]).reset_index()

In [22]:
ds_to_mf = {
    'conll2003': {'ddpp_ood': 0.6, 'ddpp_dpp': 0.6}
}

In [21]:
def to_float(x):
    return float(x.split('±')[0])

table_full.sort_values(by=('CoNLL-2003 (sequence level)', 'rcc-auc'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp,rcc-auc,rpp
41,ddpp_ood|raw|rbf_0.6,variance,11.67±4.69,0.16±0.06,10.99±1.83,1.35±0.25
29,ddpp_ood|raw|rbf_0.4,variance,12.11±4.96,0.16±0.06,11.01±1.82,1.35±0.26
20,ddpp_dpp|raw|rbf_0.6,variance,11.38±4.22,0.16±0.06,11.03±1.76,1.35±0.22
17,ddpp_dpp|raw|rbf_0.55,variance,11.40±4.20,0.16±0.06,11.10±1.50,1.36±0.21
26,ddpp_ood|raw|rbf_0.35,variance,11.35±4.25,0.16±0.06,11.12±1.76,1.38±0.27
11,ddpp_dpp|raw|rbf_0.45,variance,11.88±4.23,0.17±0.06,11.22±1.56,1.40±0.22
32,ddpp_ood|raw|rbf_0.45,variance,11.36±4.03,0.16±0.06,11.24±1.55,1.39±0.23
28,ddpp_ood|raw|rbf_0.4,sampled_max_prob,11.44±4.33,0.16±0.06,11.25±1.39,1.39±0.23
25,ddpp_ood|raw|rbf_0.35,sampled_max_prob,11.36±4.07,0.16±0.05,11.26±1.39,1.40±0.23
40,ddpp_ood|raw|rbf_0.6,sampled_max_prob,11.44±4.38,0.16±0.06,11.28±1.60,1.40±0.25


In [6]:
import os 

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner
    else:
        raise ValueError("Wrong metric type!")

def choose_agg_func(method, level):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[:, 0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[:, 1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        if level == "token":
            maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        elif level == "sequence":
            maha_dist = lambda x: np.squeeze(np.expand_dims(x[:, 0], axis=1), axis=-1)            
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)
    
metric_types=["rcc-auc", 'rpp']
spectralnorm = ['no_sn']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw']
max_fracs = [0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]
comsizes = [20]
kernels = ['rbf']

names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for max_frac in max_fracs:
                for cs in comsizes:
                    for kernel in kernels:
                        for level in ['token', 'sequence']:
                            run_dirs = []
                            name_sn = ''
                            names = [f'{method}|{reg}|{kernel}_{max_frac}']
                            for name in dataset_fnames:
                                model_series_dir = f'../../workdir/run_tasks_for_model_series_dpp_hp/deberta_{reg}_{sn}/{name}/0.1/{method}_{kernel}_{max_frac}_{cs}'
                                print(level, model_series_dir)
                                run_dirs.append([model_series_dir])
                            try:
                                agg_func = choose_agg_func(method, level)
                                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func, level=level)
                                res_df.columns = correct_cols(res_df.columns, level)
                                baselines.append(res_df.iloc[-1:])
                                tables.append(res_df.iloc[:-1])
                            except:
                                pass

token ../../workdir/run_tasks_for_model_series_dpp_hp/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.3_20
sequence ../../workdir/run_tasks_for_model_series_dpp_hp/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.3_20
token ../../workdir/run_tasks_for_model_series_dpp_hp/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.35_20
sequence ../../workdir/run_tasks_for_model_series_dpp_hp/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.35_20
token ../../workdir/run_tasks_for_model_series_dpp_hp/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.4_20
sequence ../../workdir/run_tasks_for_model_series_dpp_hp/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.4_20
token ../../workdir/run_tasks_for_model_series_dpp_hp/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.45_20
sequence ../../workdir/run_tasks_for_model_series_dpp_hp/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.45_20
token ../../workdir/run_tasks_for_model_series_dpp_hp/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp_rbf_0.5_20
sequence ../../workdir/run_tas

In [None]:
table_all_deberta = pd.concat([pd.concat(tables[::2]), pd.concat(tables[1::2])], axis=1)
baseline_tab_deberta = pd.concat([pd.concat(baselines[::2]), pd.concat(baselines[1::2])], axis=1).iloc[-18:]
table_full_deberta = pd.concat([table_all_deberta, baseline_tab_deberta]).reset_index()

In [None]:
deberta_ds_to_mf = {
    'conll2003': {'ddpp_ood': 0.45, 'ddpp_dpp': 0.3}
}

In [None]:
def to_float(x):
    return float(x.split('±')[0])

table_full_deberta.sort_values(by=('CoNLL-2003 (sequence level)', 'rcc-auc'), key=lambda x: x.apply(to_float))

Unnamed: 0_level_0,Method,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp,rcc-auc,rpp
31,ddpp_ood|raw|rbf_0.45,sampled_max_prob,5.98±2.54,0.09±0.03,7.57±1.80,1.07±0.20
34,ddpp_ood|raw|rbf_0.5,sampled_max_prob,5.80±2.22,0.09±0.03,7.59±1.81,1.08±0.21
35,ddpp_ood|raw|rbf_0.5,variance,6.00±2.33,0.09±0.03,7.60±1.76,1.05±0.22
38,ddpp_ood|raw|rbf_0.55,variance,6.43±1.80,0.09±0.02,7.61±1.70,1.05±0.20
2,ddpp_dpp|raw|rbf_0.3,variance,9.42±4.96,0.11±0.04,7.64±1.67,1.08±0.20
37,ddpp_ood|raw|rbf_0.55,sampled_max_prob,5.80±2.13,0.09±0.03,7.64±1.82,1.07±0.21
28,ddpp_ood|raw|rbf_0.4,sampled_max_prob,6.04±2.77,0.09±0.04,7.64±1.79,1.07±0.19
40,ddpp_ood|raw|rbf_0.6,sampled_max_prob,5.73±2.14,0.09±0.03,7.66±1.83,1.07±0.20
7,ddpp_dpp|raw|rbf_0.4,sampled_max_prob,6.03±2.56,0.09±0.03,7.66±1.93,1.08±0.22
1,ddpp_dpp|raw|rbf_0.3,sampled_max_prob,7.20±2.91,0.10±0.03,7.71±1.83,1.11±0.20


# Final results

## Electra

In [25]:
import os 

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner
    else:
        raise ValueError("Wrong metric type!")

def choose_agg_func(method, level):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[:, 0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[:, 1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        if level == "token":
            maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        elif level == "sequence":
            maha_dist = lambda x: np.squeeze(np.expand_dims(x[:, 0], axis=1), axis=-1)            
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)
    
metric_types=["rcc-auc", 'rpp']
spectralnorm = ['no_sn']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw', 'reg', 'metric']

names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for level in ['token', 'sequence']:
                run_dirs = []
                name_sn = ''
                names = [f'{method}|{reg}|{kernel}_{max_frac}']
                for name in dataset_fnames:
                    model_series_dir = f'../../workdir/run_tasks_for_model_series/electra_{reg}_{sn}/{name}/0.1/{method}'
                    print(level, model_series_dir)
                    run_dirs.append([model_series_dir])
                try:
                    agg_func = choose_agg_func(method, level)
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func, level=level)
                    res_df.columns = correct_cols(res_df.columns, level)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    pass

token ../../workdir/run_tasks_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp
sequence ../../workdir/run_tasks_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_dpp
token ../../workdir/run_tasks_for_model_series/electra_reg_no_sn/conll2003/0.1/ddpp_dpp
sequence ../../workdir/run_tasks_for_model_series/electra_reg_no_sn/conll2003/0.1/ddpp_dpp
token ../../workdir/run_tasks_for_model_series/electra_metric_no_sn/conll2003/0.1/ddpp_dpp
sequence ../../workdir/run_tasks_for_model_series/electra_metric_no_sn/conll2003/0.1/ddpp_dpp
token ../../workdir/run_tasks_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_ood
sequence ../../workdir/run_tasks_for_model_series/electra_raw_no_sn/conll2003/0.1/ddpp_ood
token ../../workdir/run_tasks_for_model_series/electra_reg_no_sn/conll2003/0.1/ddpp_ood
sequence ../../workdir/run_tasks_for_model_series/electra_reg_no_sn/conll2003/0.1/ddpp_ood
token ../../workdir/run_tasks_for_model_series/electra_metric_no_sn/conll2003/0.1/ddpp_ood
sequence

In [26]:
table_res = pd.concat([pd.concat(tables[::2]), pd.concat(tables[1::2])], axis=1)
baseline_res = pd.concat([pd.concat(baselines[::2]), pd.concat(baselines[1::2])], axis=1).iloc[-18:]
table_full_res = pd.concat([table_res, baseline_res]).reset_index()

In [27]:
table_full_res

Unnamed: 0_level_0,Method,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp,rcc-auc,rpp
0,ddpp_dpp|raw|rbf_0.6,bald,6.39±0.64,0.10±0.01,21.53±4.77,2.63±0.45
1,ddpp_dpp|raw|rbf_0.6,sampled_max_prob,6.08±0.62,0.10±0.01,17.71±2.77,2.05±0.23
2,ddpp_dpp|raw|rbf_0.6,variance,6.12±0.71,0.10±0.01,16.78±2.44,1.93±0.20
3,ddpp_dpp|reg|rbf_0.6,bald,7.90±1.95,0.12±0.01,26.20±6.41,3.11±0.56
4,ddpp_dpp|reg|rbf_0.6,sampled_max_prob,6.91±1.13,0.11±0.02,20.66±1.53,2.31±0.08
5,ddpp_dpp|reg|rbf_0.6,variance,6.98±0.98,0.11±0.02,19.44±1.15,2.13±0.17
6,ddpp_dpp|metric|rbf_0.6,bald,8.01±2.08,0.13±0.03,22.44±4.78,2.67±0.49
7,ddpp_dpp|metric|rbf_0.6,sampled_max_prob,6.92±1.32,0.11±0.02,19.11±2.14,2.16±0.22
8,ddpp_dpp|metric|rbf_0.6,variance,7.33±1.53,0.12±0.02,18.93±2.09,2.11±0.25
9,ddpp_ood|raw|rbf_0.6,bald,6.59±0.69,0.11±0.01,20.56±3.09,2.50±0.26


## Deberta

In [None]:
import os 

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner
    else:
        raise ValueError("Wrong metric type!")

def choose_agg_func(method, level):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[:, 0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[:, 1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        if level == "token":
            maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        elif level == "sequence":
            maha_dist = lambda x: np.squeeze(np.expand_dims(x[:, 0], axis=1), axis=-1)            
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)
    
metric_types=["rcc-auc", 'rpp']
spectralnorm = ['no_sn']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
methods = ['ddpp_dpp', 'ddpp_ood']
regs = ['raw', 'reg', 'metric']

names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for level in ['token', 'sequence']:
                run_dirs = []
                name_sn = ''
                names = [f'{method}|{reg}']
                for name in dataset_fnames:
                    model_series_dir = f'../../workdir/run_tasks_for_model_series/deberta_{reg}_{sn}/{name}/0.1/{method}'
                    print(level, model_series_dir)
                    run_dirs.append([model_series_dir])
                try:
                    agg_func = choose_agg_func(method, level)
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func, level=level)
                    res_df.columns = correct_cols(res_df.columns, level)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    pass

token ../../workdir/run_tasks_for_model_series/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp
sequence ../../workdir/run_tasks_for_model_series/deberta_raw_no_sn/conll2003/0.1/ddpp_dpp
token ../../workdir/run_tasks_for_model_series/deberta_reg_no_sn/conll2003/0.1/ddpp_dpp
sequence ../../workdir/run_tasks_for_model_series/deberta_reg_no_sn/conll2003/0.1/ddpp_dpp
token ../../workdir/run_tasks_for_model_series/deberta_metric_no_sn/conll2003/0.1/ddpp_dpp
sequence ../../workdir/run_tasks_for_model_series/deberta_metric_no_sn/conll2003/0.1/ddpp_dpp
token ../../workdir/run_tasks_for_model_series/deberta_raw_no_sn/conll2003/0.1/ddpp_ood
sequence ../../workdir/run_tasks_for_model_series/deberta_raw_no_sn/conll2003/0.1/ddpp_ood
token ../../workdir/run_tasks_for_model_series/deberta_reg_no_sn/conll2003/0.1/ddpp_ood
sequence ../../workdir/run_tasks_for_model_series/deberta_reg_no_sn/conll2003/0.1/ddpp_ood


In [None]:
table_res = pd.concat([pd.concat(tables[::2]), pd.concat(tables[1::2])], axis=1)
baseline_res = pd.concat([pd.concat(baselines[::2]), pd.concat(baselines[1::2])], axis=1).iloc[-18:]
table_full_res = pd.concat([table_res, baseline_res]).reset_index()

In [None]:
table_full_res

In [3]:
import os 

def choose_metric(metric_type):
    if metric_type in ["rejection-curve-auc", "roc-auc"]:
        return metric_type
    elif metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner
    elif metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner
    else:
        raise ValueError("Wrong metric type!")

def choose_agg_func(method, level):
    agg_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
    }
    if method=='nuq':
        nuq_aleatoric = lambda x: np.squeeze(x[:, 0], axis=-1)
        nuq_epistemic = lambda x: np.squeeze(x[:, 1], axis=-1)
        nuq_total = lambda x: np.squeeze(x[:, 2], axis=-1)
        agg_methods = {
            "nuq_aleatoric": nuq_aleatoric,
            "nuq_epistemic": nuq_epistemic,
            "nuq_total": nuq_total,
        }
    elif method=='mahalanobis':
        if level == "token":
            maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        elif level == "sequence":
            maha_dist = lambda x: np.squeeze(np.expand_dims(x[:, 0], axis=1), axis=-1)            
        agg_methods = {"mahalanobis_distance": maha_dist}
    elif method=='sngp':
        if level == "token":
            maha_dist = lambda x: np.squeeze(x[:, 0], axis=-1)
        elif level == "sequence":
            maha_dist = lambda x: np.squeeze(np.expand_dims(x[:, 0], axis=1), axis=-1)            
        agg_methods = {"stds": maha_dist}
    elif method=='mc_mahalanobis':
        sm_maha_dist = lambda x: np.squeeze(x[:, 1:], axis=-1).max(1)
        agg_methods = {"sampled_mahalanobis_distance": sm_maha_dist}
    return agg_methods

def correct_cols(cols, level):
    new_names = []
    for col in cols:
        new_names.append((col[0]+f' ({level} level)', col[1]))
    return pd.MultiIndex.from_tuples(new_names)
    
metric_types=["rcc-auc", 'rpp']
spectralnorm = ['sngp']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
methods = ['sngp']
regs = ['raw']

names = []
tables = []
baselines = []
for method in methods:
    for reg in regs:
        for sn in spectralnorm:
            for level in ['token', 'sequence']:
                run_dirs = []
                name_sn = ''
                names = [f'{method}|{reg}']
                for name in dataset_fnames:
                    model_series_dir = f'../../workdir/run_tasks_for_model_series/electra_{reg}_{sn}/{name}/{method}'
                    print(level, model_series_dir)
                    run_dirs.append([model_series_dir])
                try:
                    agg_func = choose_agg_func(method, level)
                    res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, baselines={}, methods=agg_func, level=level)
                    res_df.columns = correct_cols(res_df.columns, level)
                    baselines.append(res_df.iloc[-1:])
                    tables.append(res_df.iloc[:-1])
                except:
                    pass

token ../../workdir/run_tasks_for_model_series/electra_raw_sngp/conll2003/sngp
sequence ../../workdir/run_tasks_for_model_series/electra_raw_sngp/conll2003/sngp


In [4]:
table_res = pd.concat([pd.concat(tables[::2]), pd.concat(tables[1::2])], axis=1)
baseline_res = pd.concat([pd.concat(baselines[::2]), pd.concat(baselines[1::2])], axis=1).iloc[-18:]
table_full_res = pd.concat([table_res, baseline_res]).reset_index()

In [5]:
table_full_res

Unnamed: 0_level_0,Method,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,rcc-auc,rpp,rcc-auc,rpp
0,sngp|raw,stds,8.15±3.11,0.12±0.04,27.34±5.59,3.47±0.53
1,baseline|sngp|raw,max_prob,11.90±3.47,0.17±0.03,26.29±3.63,3.32±0.33
