In [2]:
import yaml
import os
from yaml import Loader as Loader
from pathlib import Path
import pandas as pd
import numpy as np
import json
from sklearn.metrics import roc_auc_score

from analyze_results import (
    extract_result,
    aggregate_runs,
    from_model_outputs_calc_rcc_auc,
)
from analyze_results import (
    format_results2,
    improvement_over_baseline,
    from_model_outputs_calc_rpp_ner, 
    from_model_outputs_calc_rcc_auc_ner,
    extract_result_ner,
    load_and_preprocess_ner,
    from_model_outputs_calc_rcc_auc_ner_mc_maha,
    from_model_outputs_calc_rpp_ner_mc_maha,
    from_model_outputs_calc_arc_auc_ner_mc_maha
)

from utils.utils_wandb import init_wandb, wandb
from ue4nlp.ue_scores import *

In [2]:
def choose_metric(metric_type):
    """Map a metric name to the object handed to ``aggregate_runs`` as ``metric``.

    "rejection-curve-auc" and "roc-auc" are passed through as plain strings;
    "rcc-auc" and "rpp" resolve to the NER calculators imported from
    ``analyze_results``. (The ``*_mc_maha`` variants imported at the top of the
    notebook were previously swapped in here by hand.)

    Raises:
        ValueError: for any other metric name.
    """
    if metric_type in ("rejection-curve-auc", "roc-auc"):
        return metric_type
    if metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner
    if metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner
    raise ValueError("Wrong metric type!")


def get_one_table(runs_dir, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], level='token', baseline=None):
    """Build one results table (rows: UE scores, columns: metrics) for a run series.

    For every metric in ``metric_types`` the runs under ``runs_dir`` are
    aggregated with ``aggregate_runs`` (task type ``ner-<level>``) and formatted:
    "rcc-auc" and "rpp" through ``format_results2``, everything else through
    ``improvement_over_baseline`` against the "max_prob" column.

    Args:
        runs_dir: forwarded unchanged to ``aggregate_runs``.
        metric_types: metric names understood by ``choose_metric``.
        level: 'token' or 'sequence'; only used to build the task type string.
        baseline: forwarded to ``improvement_over_baseline``.

    Returns:
        A DataFrame with one column per metric that produced results and a
        'baseline (max_prob)' row; the 'max_prob' and 'count' rows are removed.

    Raises:
        ValueError: if ``choose_metric`` rejects a metric name, or if no metric
            produced any results.
    """
    default_methods = {
        "bald": bald,
        "sampled_max_prob": sampled_max_prob,
        "variance": probability_variance,
        "sampled_entropy": mean_entropy,
        "var.ratio": var_ratio,
    }

    table = []
    # Names of the metrics that actually made it into `table`; metrics skipped
    # as "Broken" must not get a column label.
    used_metrics = []
    for metric_type in metric_types:
        metric = choose_metric(metric_type=metric_type)

        agg_res = aggregate_runs(
            runs_dir, methods=default_methods, metric=metric, task_type=f'ner-{level}'
        )

        if agg_res.empty:
            print("Broken\n")
            continue

        if metric_type == "rcc-auc":
            final_score = format_results2(agg_res, percents=False)
        elif metric_type == "rpp":
            final_score = format_results2(agg_res, percents=True)
        else:
            final_score = improvement_over_baseline(agg_res, baseline_col="max_prob", metric=metric_type, percents=True, subtract=True, baseline=baseline)
        table.append(final_score)
        used_metrics.append(metric_type)

    if not table:
        raise ValueError(f"No results could be aggregated from {runs_dir!r}")

    res_table = pd.concat(table, axis=1)
    res_table.columns = used_metrics

    # rcc-auc and rpp are not expressed relative to the baseline, so their
    # baseline row is copied from the raw 'max_prob' row.
    if 'baseline (max_prob)' not in res_table.index:
        res_table.loc['baseline (max_prob)'] = 0
    if 'max_prob' in res_table.index:
        for metric in ('rcc-auc', 'rpp'):
            if metric in res_table.columns:
                # Single .loc call: chained `df[col].loc[row] = ...` does not
                # write through under pandas copy-on-write.
                res_table.loc['baseline (max_prob)', metric] = res_table.loc['max_prob', metric]

    helper_rows = [label for label in ('max_prob', 'count') if label in res_table.index]
    return res_table.drop(helper_rows)


def collect_tables(run_dirs, names, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], level='token', baseline=None):
    """Stack the per-method tables from ``get_one_table`` under a (Method, UE Score) index.

    ``run_dirs`` and ``names`` are consumed pairwise. For each pair the table's
    'baseline (max_prob)' row is re-keyed as ``('baseline|<rest>', 'max_prob')``,
    where ``<rest>`` is everything after the second '|' in ``name``, and moved
    to the end of that method's block.
    """
    per_method = []
    for run_dir, name in zip(run_dirs, names):
        method_table = get_one_table(run_dir, metric_types, level, baseline)

        # Row keys are captured before the extra baseline row is appended.
        row_keys = [(name, ue_score) for ue_score in method_table.index]
        baseline_key = 'baseline|' + '|'.join(name.split('|')[2:])
        method_table.loc[baseline_key] = method_table.loc['baseline (max_prob)']
        row_keys.append((baseline_key, 'max_prob'))

        method_table.index = pd.MultiIndex.from_tuples(row_keys, names=['Method', 'UE Score'])
        # The original baseline row is now redundant with the re-keyed copy.
        per_method.append(method_table.drop((name, 'baseline (max_prob)')))
    return pd.concat(per_method)


def collect_datasets(runs_dirs, names, dataset_names, metric_types=["rejection-curve-auc", "roc-auc", "rcc-auc", "pr-auc", "rpp"], level='token', baselines={}):
    """Join the ``collect_tables`` output of several datasets side by side.

    ``runs_dirs`` and ``dataset_names`` are consumed pairwise; each dataset's
    columns become ``(dataset_name, metric)``. The baseline forwarded to
    ``collect_tables`` is ``baselines.get(level)``.
    """
    per_dataset = []
    for dataset_runs, dataset_name in zip(runs_dirs, dataset_names):
        tab = collect_tables(dataset_runs, names, metric_types, level, baselines.get(level, None))
        tab.columns = pd.MultiIndex.from_tuples(
            [(dataset_name, metric) for metric in tab.columns]
        )
        per_dataset.append(tab)
    return pd.concat(per_dataset, axis=1)

In [3]:
# Pre-computed max_prob rejection-curve AUCs of the baseline model, one file per
# evaluation level. raw_baselines[level]['rejection-curve-auc'] holds the mean
# over all entries stored under 'max_prob' as a one-element Series.
raw_baselines = {}
for level_key, max_prob_path in (
    ('sequence', '../../ner_metrics/conll2003-10-baseline-metrics/metrics_seq_rejection-curve-auc.json'),
    ('token', '../../ner_metrics/conll2003-10-baseline-metrics/metrics_token_rejection-curve-auc.json'),
):
    with open(Path(max_prob_path)) as f:
        model_outputs = json.load(f)

    max_prob_scores = model_outputs['max_prob']
    raw_baselines[level_key] = {
        'rejection-curve-auc': pd.Series([np.mean([max_prob_scores[k] for k in max_prob_scores])])
    }

# Deep Ensemble

In [4]:
import os

def correct_cols(cols, level):
    """Append ' (<level> level)' to the first element of every two-part column label.

    Returns a new ``pd.MultiIndex`` built from the relabelled pairs.
    """
    suffix = f' ({level} level)'
    return pd.MultiIndex.from_tuples([(col[0] + suffix, col[1]) for col in cols])

# Deep Ensemble: build one table per evaluation level for CoNLL-2003.
metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
levels = ['token', 'sequence']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
# Filled in the loop below and consumed by the following cells.
tables = []
baselines = []
for level in levels:
    run_dirs = []
    names = [f'Deep Ensemble']
    for name in dataset_fnames:
        # NOTE(review): `name` is not used; the series directory is hard-coded
        # to the conll2003 ensemble runs.
        model_series_dir = f'../workdir/run_conll2003_for_ensemble_series/'
        # Descend into the last entry of the sorted listing, twice
        # (date/time sub-directories in the recorded output).
        model_series_dir += np.sort(os.listdir(model_series_dir))[-1]
        model_series_dir += f'/{np.sort(os.listdir(model_series_dir))[-1]}/final_results/'
        print(model_series_dir)
        run_dirs.append([model_series_dir])
    ens_tab = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines=raw_baselines)
    ens_tab.columns = correct_cols(ens_tab.columns, level)
    # collect_tables puts the re-keyed baseline row last; split it off.
    baselines.append(ens_tab.iloc[-1:])
    tables.append(ens_tab.iloc[:-1])

../workdir/run_conll2003_for_ensemble_series/2021-10-04/14-52-04/final_results/
../workdir/run_conll2003_for_ensemble_series/2021-10-04/14-52-04/final_results/


In [5]:
# Place the per-level tables (token, then sequence) side by side.
ens_tab = pd.concat(tables, axis=1)

In [6]:
# Baseline (max_prob) rows of both levels, side by side, for display.
pd.concat(baselines, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
baseline|,max_prob,93.35±0.28,191.86±27.05,1.74±0.29,86.66±0.90,51.80±10.33,5.84±0.48


In [7]:
def preproc_regs(x):
    """Return the text after the last '|' in ``x`` (all of ``x`` if there is none)."""
    return x.rpartition('|')[2]

# Flatten the (Method, UE Score) index into columns and add descriptive columns.
# NOTE(review): this cell rebinds `ens_tab`, so re-running it without re-running
# the cell that builds `ens_tab` will not reproduce the same frame.
ens_tab = ens_tab.reset_index()
ens_tab['Reg. Type'] = 'raw'
ens_tab['Dropout Layers'] = '-'
# Keep only the part of the method name before the first '|'.
ens_tab['Method'] = ens_tab['Method'].apply(lambda x: x.split('|')[0])
# Reorder: first column, then the two columns just added, then the rest.
ens_tab = ens_tab[list(ens_tab.columns[:1]) + list(ens_tab.columns[-2:]) + list(ens_tab.columns[1:-2])]

In [8]:
# Display the Deep Ensemble results table.
ens_tab

Unnamed: 0_level_0,Method,Reg. Type,Dropout Layers,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,Deep Ensemble,raw,-,bald,1.82±0.11,24.90±1.93,0.36±0.06,4.89±1.45,37.37±8.69,3.53±0.55
1,Deep Ensemble,raw,-,sampled_max_prob,0.50±0.29,221.74±40.94,2.06±0.43,4.01±0.40,52.43±12.09,5.37±0.74
2,Deep Ensemble,raw,-,variance,1.44±0.14,47.45±1.22,0.66±0.08,1.64±1.31,63.40±5.13,6.59±0.28
3,Deep Ensemble,raw,-,sampled_entropy,1.17±0.10,84.61±20.83,0.87±0.12,3.85±0.34,53.65±9.51,5.40±0.49
4,Deep Ensemble,raw,-,var.ratio,1.68±0.16,57.72±17.14,0.59±0.10,6.21±0.70,30.16±8.67,2.43±0.44


In [8]:
# NOTE(review): duplicate of the previous display cell. The two recorded outputs
# differ in one value (var.ratio, sequence-level rejection-curve-auc), so the
# table was rebuilt between the two executions.
ens_tab

Unnamed: 0_level_0,Method,Reg. Type,Dropout Layers,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
0,Deep Ensemble,raw,-,bald,1.82±0.11,24.90±1.93,0.36±0.06,4.89±1.45,37.37±8.69,3.53±0.55
1,Deep Ensemble,raw,-,sampled_max_prob,0.50±0.29,221.74±40.94,2.06±0.43,4.01±0.40,52.43±12.09,5.37±0.74
2,Deep Ensemble,raw,-,variance,1.44±0.14,47.45±1.22,0.66±0.08,1.64±1.31,63.40±5.13,6.59±0.28
3,Deep Ensemble,raw,-,sampled_entropy,1.17±0.10,84.61±20.83,0.87±0.12,3.85±0.34,53.65±9.51,5.40±0.49
4,Deep Ensemble,raw,-,var.ratio,1.68±0.16,57.72±17.14,0.59±0.10,6.44±0.67,30.16±8.67,2.43±0.44


In [9]:
# Emit the table as LaTeX. The replacement is a raw string because '\p' is not
# a valid escape sequence and triggers a warning in a normal string literal.
print(str(ens_tab.to_latex(index=False)).replace('±', r'$\pm$'))

\begin{tabular}{llllllllll}
\toprule
        Method & Reg. Type & Dropout Layers &          UE Score & \multicolumn{3}{l}{CoNLL-2003 (token level)} & \multicolumn{3}{l}{CoNLL-2003 (sequence level)} \\
               &      rejection-curve-auc &       rcc-auc &        rpp &         rejection-curve-auc &      rcc-auc &        rpp \\
\midrule
 Deep Ensemble &       raw &              - &              bald &                1.82$\pm$0.11 &    24.90$\pm$1.93 &  0.36$\pm$0.06 &                   4.89$\pm$1.45 &   37.37$\pm$8.69 &  3.53$\pm$0.55 \\
 Deep Ensemble &       raw &              - &  sampled\_max\_prob &                0.50$\pm$0.29 &  221.74$\pm$40.94 &  2.06$\pm$0.43 &                   4.01$\pm$0.40 &  52.43$\pm$12.09 &  5.37$\pm$0.74 \\
 Deep Ensemble &       raw &              - &          variance &                1.44$\pm$0.14 &    47.45$\pm$1.22 &  0.66$\pm$0.08 &                   1.64$\pm$1.31 &   63.40$\pm$5.13 &  6.59$\pm$0.28 \\
 Deep Ensemble &       raw &             

# MC Mahalanobis

In [5]:
import os

def correct_cols(cols, level):
    """Append ' (<level> level)' to the first element of every two-part column label.

    Returns a new ``pd.MultiIndex`` built from the relabelled pairs.
    """
    suffix = f' ({level} level)'
    return pd.MultiIndex.from_tuples([(col[0] + suffix, col[1]) for col in cols])

# MC Mahalanobis: one table per (regularization, spectral norm, level) combination.
metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
levels = ['token', 'sequence']
regs = ['raw', 'reg']
methods = ['mc_mahalanobis']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
# Filled in the loop below; entries alternate token / sequence because `levels`
# is the innermost configuration loop.
tables = []
baselines = []

for method in methods:
    for reg in regs:
        # `sn` is the directory suffix; '-True' adds '|spectral_norm' to the name.
        for sn in ['-False', '-True']:
            for level in levels:
                run_dirs = []
                name_sn = '|spectral_norm' if sn == '-True' else ''
                # Name layout '<method>|last|<reg>[|spectral_norm]'; collect_tables
                # derives the baseline key from everything after the second '|'.
                names = [f'{method}|last|{reg}{name_sn}']
                for name in dataset_fnames:
                    model_series_dir = f'../workdir/run_glue_for_model_series/electra-{reg}{sn}/{name}/0.1/{method}/'
                    print(model_series_dir)
                    run_dirs.append([model_series_dir])
                res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines=raw_baselines)
                res_df.columns = correct_cols(res_df.columns, level)
                # collect_tables puts the re-keyed baseline row last; split it off.
                baselines.append(res_df.iloc[-1:])
                tables.append(res_df.iloc[:-1])

../workdir/run_glue_for_model_series/electra-raw-False/conll2003/0.1/mc_mahalanobis/
../workdir/run_glue_for_model_series/electra-raw-False/conll2003/0.1/mc_mahalanobis/
../workdir/run_glue_for_model_series/electra-raw-True/conll2003/0.1/mc_mahalanobis/
../workdir/run_glue_for_model_series/electra-raw-True/conll2003/0.1/mc_mahalanobis/
../workdir/run_glue_for_model_series/electra-reg-False/conll2003/0.1/mc_mahalanobis/
../workdir/run_glue_for_model_series/electra-reg-False/conll2003/0.1/mc_mahalanobis/
../workdir/run_glue_for_model_series/electra-reg-True/conll2003/0.1/mc_mahalanobis/
../workdir/run_glue_for_model_series/electra-reg-True/conll2003/0.1/mc_mahalanobis/


In [12]:
# `tables` alternates token / sequence: stack the even entries (token) and the
# odd entries (sequence) vertically, then join the two stacks side by side.
table_mc_det = pd.concat([pd.concat([tab for tab in tables[0::2]]), pd.concat([tab for tab in tables[1::2]])], axis=1)

In [13]:
def preproc_regs(x):
    """Turn a '|'-separated method name into a short regularization label.

    The first two segments are skipped (only the first one when the name
    contains 'baseline'). A trailing 'spectral_norm' segment is shortened to
    'SN'; a leading 'raw' is dropped when exactly two segments remain; the
    rest is joined with '+'.

    Returns an empty string when no segments remain after skipping, instead
    of failing on the empty list.
    """
    skip = 1 if 'baseline' in x else 2
    regs = x.split('|')[skip:]
    if not regs:
        # e.g. 'sngp|raw' or 'baseline': nothing left to describe.
        return ''
    if regs[-1] == 'spectral_norm':
        regs[-1] = 'SN'
    if len(regs) == 2 and regs[0] == 'raw':
        return regs[-1]
    return '+'.join(regs)
        
# Flatten the (Method, UE Score) index into columns and derive display labels.
# NOTE(review): this cell rebinds `table_mc_det`; re-running it on its own will
# not reproduce the same frame.
table_mc_det = table_mc_det.reset_index()
# Must run before 'Method' is shortened below: preproc_regs needs the full name.
table_mc_det['Reg. Type'] = table_mc_det.Method.apply(lambda x: preproc_regs(x))
# For names containing 'maha': upper-case the first two characters of the first
# segment, skip the third, capitalize the rest ('mc_mahalanobis' -> 'MC Mahalanobis').
# Other names keep their first segment unchanged.
table_mc_det['Method'] = table_mc_det['Method'].apply(lambda x: x.split('|')[0][:2].upper() + ' ' + x.split('|')[0][3:].capitalize() if 'maha' in x else x.split('|')[0])
# Reorder: first column, then the column just added, then the rest.
table_mc_det = table_mc_det[list(table_mc_det.columns[:1]) + list(table_mc_det.columns[-1:]) + list(table_mc_det.columns[1:-1])]

In [19]:
# Relabel the UE score of rows 1, 3, 5 and 7 as 'SMD'.
# NOTE(review): the row positions are hard-coded; confirm they still select the
# intended rows if the set of configurations above changes.
table_mc_det.loc[list(range(1, 8, 2)), ('UE Score', '')] = 'SMD'

In [20]:
# Display only the rows relabelled as 'SMD' (index labels 1, 3, 5, 7).
table_mc_det.loc[list(range(1, 8, 2))]

Unnamed: 0_level_0,Method,Reg. Type,UE Score,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
1,MC Mahalanobis,raw,SMD,-12.82±23.31,26.76±7.19,0.38±0.10,1.74±1.55,61.05±12.90,5.25±0.98
3,MC Mahalanobis,SN,SMD,-5.29±18.43,24.56±8.06,0.35±0.12,2.67±1.38,53.55±10.54,4.89±1.01
5,MC Mahalanobis,reg,SMD,-5.29±18.43,61.35±26.08,0.84±0.34,-5.29±2.02,107.71±16.77,8.56±0.78
7,MC Mahalanobis,reg+SN,SMD,-5.29±18.43,46.92±8.88,0.64±0.13,-4.52±2.53,99.54±17.15,8.31±1.32


In [21]:
# Emit rows 1, 3, 5, 7 as LaTeX. The replacement is a raw string because '\p'
# is not a valid escape sequence and triggers a warning in a normal string literal.
print(str(table_mc_det.iloc[list(range(1, 8, 2))].to_latex(index=False)).replace('±', r'$\pm$'))

\begin{tabular}{lllllllll}
\toprule
         Method & Reg. Type & UE Score & \multicolumn{3}{l}{CoNLL-2003 (token level)} & \multicolumn{3}{l}{CoNLL-2003 (sequence level)} \\
                &      rejection-curve-auc &      rcc-auc &        rpp &         rejection-curve-auc &       rcc-auc &        rpp \\
\midrule
 MC Mahalanobis &       raw &      SMD &             -12.82$\pm$23.31 &   26.76$\pm$7.19 &  0.38$\pm$0.10 &                   1.74$\pm$1.55 &   61.05$\pm$12.90 &  5.25$\pm$0.98 \\
 MC Mahalanobis &        SN &      SMD &              -5.29$\pm$18.43 &   24.56$\pm$8.06 &  0.35$\pm$0.12 &                   2.67$\pm$1.38 &   53.55$\pm$10.54 &  4.89$\pm$1.01 \\
 MC Mahalanobis &       reg &      SMD &              -5.29$\pm$18.43 &  61.35$\pm$26.08 &  0.84$\pm$0.34 &                  -5.29$\pm$2.02 &  107.71$\pm$16.77 &  8.56$\pm$0.78 \\
 MC Mahalanobis &    reg+SN &      SMD &              -5.29$\pm$18.43 &   46.92$\pm$8.88 &  0.64$\pm$0.13 &                  -4.52$\pm$2.53 &  

# SNGP

In [4]:
from analyze_results import *

def calc_rcc_aucs_seq_sngp(probabilities, labels, predictions, stds):
    """Sequence-level ``rcc_auc`` for the SNGP score and the max_prob baseline.

    A sequence counts as an error when its labels differ from its predictions.
    The SNGP score of a sequence is the largest value in ``stds[i]`` (negated
    before it is passed to ``rcc_auc``); the baseline score is the mean over
    the sequence of each token's highest probability.

    Returns:
        dict with keys 'sngp' and 'max_prob'.
    """
    risk_binary = [1.0 * (gold != pred) for gold, pred in zip(labels, predictions)]

    seq_std = np.array(
        [np.asarray(stds[idx]).max() for idx in range(len(labels))], dtype=float
    )
    scores = {'sngp': rcc_auc(-seq_std, risk_binary)}

    seq_conf = np.array(
        [
            np.mean([np.max(proba) for proba in probabilities[idx]])
            for idx in range(len(risk_binary))
        ],
        dtype=float,
    )
    scores["max_prob"] = rcc_auc(seq_conf, risk_binary)
    return scores


def calc_rpp_seq_sngp(probabilities, labels, predictions, stds):
    """Sequence-level ``rpp`` for the SNGP score and the max_prob baseline.

    A sequence counts as an error when its labels differ from its predictions.
    The SNGP score of a sequence is the largest value in ``stds[i]`` (negated
    before it is passed to ``rpp``); the baseline score is the mean over the
    sequence of each token's highest probability.

    Returns:
        dict with keys 'sngp' and 'max_prob'.
    """
    risk_binary = [1.0 * (gold != pred) for gold, pred in zip(labels, predictions)]

    seq_std = np.array(
        [np.asarray(stds[idx]).max() for idx in range(len(labels))], dtype=float
    )
    scores = {'sngp': rpp(-seq_std, risk_binary)}

    seq_conf = np.array(
        [
            np.mean([np.max(proba) for proba in probabilities[idx]])
            for idx in range(len(risk_binary))
        ],
        dtype=float,
    )
    scores["max_prob"] = rpp(seq_conf, risk_binary)
    return scores

def from_model_outputs_calc_rpp_ner_sngp(model_outputs, methods, level="token"):
    """Compute RPP for SNGP uncertainty from a saved model-output dict.

    Args:
        model_outputs: mapping with "probabilities", "stds" and "true_labels";
            "sampled_probabilities" is additionally read for sequence level.
        methods: unused; presumably kept so the signature matches what
            ``aggregate_runs`` calls -- TODO confirm.
        level: "token" selects the token-level path, anything else the
            sequence-level path.

    Returns:
        The result of ``calc_rpp_from_ue_scores`` (token level) or
        ``calc_rpp_seq_sngp`` (sequence level).
    """
    probs = np.asarray(model_outputs["probabilities"])
    # stds are averaged over their last axis to get one value per token.
    stds = np.asarray(model_outputs["stds"]).mean(-1)
    labels = np.asarray(model_outputs["true_labels"])

    if level == "token":
        labels_toks = labels.reshape(-1)
        # Positions labelled -100 are excluded from the evaluation.
        use_idx = labels_toks != -100
        probs_toks = probs.reshape(-1, probs.shape[-1])
        stds_toks = stds.reshape(-1)
        return calc_rpp_from_ue_scores(
            [stds_toks[use_idx]],
            ['sngp'],
            probs_toks[use_idx],
            labels_toks[use_idx],
        )

    # Sequence level: sampled probabilities are only needed as an argument of
    # unpad_preds, so they are loaded on this path only.
    sampled_probs = np.asarray(model_outputs["sampled_probabilities"])
    _, _, predictions, _ = unpad_preds(
        probs, sampled_probs, np.argmax(probs, axis=-1), labels
    )
    _, probs, stds, labels = unpad_preds(probs, sampled_probs, stds, labels)
    return calc_rpp_seq_sngp(probs, labels, predictions, stds)

def from_model_outputs_calc_rcc_auc_ner_sngp(model_outputs, methods, level="token"):
    """Compute RCC-AUC for SNGP uncertainty from a saved model-output dict.

    Args:
        model_outputs: mapping with "probabilities", "stds" and "true_labels";
            "sampled_probabilities" is additionally read for sequence level.
        methods: unused; presumably kept so the signature matches what
            ``aggregate_runs`` calls -- TODO confirm.
        level: "token" selects the token-level path, anything else the
            sequence-level path.

    Returns:
        The result of ``calc_rcc_aucs_from_ue_scores`` (token level) or
        ``calc_rcc_aucs_seq_sngp`` (sequence level).
    """
    probs = np.asarray(model_outputs["probabilities"])
    # stds are averaged over their last axis to get one value per token.
    stds = np.asarray(model_outputs["stds"]).mean(-1)
    labels = np.asarray(model_outputs["true_labels"])

    if level == "token":
        labels_toks = labels.reshape(-1)
        # Positions labelled -100 are excluded from the evaluation.
        use_idx = labels_toks != -100
        probs_toks = probs.reshape(-1, probs.shape[-1])
        stds_toks = stds.reshape(-1)
        return calc_rcc_aucs_from_ue_scores(
            [stds_toks[use_idx]],
            ['sngp'],
            probs_toks[use_idx],
            labels_toks[use_idx],
        )

    # Sequence level: sampled probabilities are only needed as an argument of
    # unpad_preds, so they are loaded on this path only.
    sampled_probs = np.asarray(model_outputs["sampled_probabilities"])
    _, _, predictions, _ = unpad_preds(
        probs, sampled_probs, np.argmax(probs, axis=-1), labels
    )
    _, probs, stds, labels = unpad_preds(probs, sampled_probs, stds, labels)
    return calc_rcc_aucs_seq_sngp(probs, labels, predictions, stds)

In [5]:
def from_model_outputs_calc_arc_auc_ner_sngp(model_outputs, methods, level="token"):
    """Compute the rejection-curve AUC for SNGP uncertainty from a saved model-output dict.

    Args:
        model_outputs: mapping with "probabilities", "stds" and "true_labels";
            "sampled_probabilities" is additionally read for sequence level.
        methods: unused; presumably kept so the signature matches what
            ``aggregate_runs`` calls -- TODO confirm.
        level: "token" selects the token-level path, anything else the
            sequence-level path.

    Returns:
        The result of ``calc_aucs_sngp`` (token level) or
        ``calc_arc_aucs_seq_sngp`` (sequence level).
    """
    probs = np.asarray(model_outputs["probabilities"])
    # stds are averaged over their last axis to get one value per token.
    stds = np.asarray(model_outputs["stds"]).mean(-1)
    labels = np.asarray(model_outputs["true_labels"])

    if level == "token":
        labels_toks = labels.reshape(-1)
        # Positions labelled -100 are excluded from the evaluation.
        use_idx = labels_toks != -100
        probs_toks = probs.reshape(-1, probs.shape[-1])
        stds_toks = stds.reshape(-1)
        return calc_aucs_sngp(
            labels_toks[use_idx],
            stds_toks[use_idx],
            probs_toks[use_idx],
        )

    # Sequence level: sampled probabilities are only needed as an argument of
    # unpad_preds, so they are loaded on this path only.
    sampled_probs = np.asarray(model_outputs["sampled_probabilities"])
    _, _, predictions, _ = unpad_preds(
        probs, sampled_probs, np.argmax(probs, axis=-1), labels
    )
    _, probs, stds, labels = unpad_preds(probs, sampled_probs, stds, labels)
    return calc_arc_aucs_seq_sngp(probs, labels, predictions, stds)

def calc_aucs_sngp(eval_labels, stds, probabilities):
    """Token-level rejection-curve AUC for the SNGP score and the max_prob baseline.

    Tokens are ranked by descending uncertainty (``stds`` for SNGP,
    ``1 - max probability`` for the baseline). ``get_score_ratio`` is evaluated
    at each rejection ratio and the resulting curve is integrated with ``auc``.

    Args:
        eval_labels: array of gold labels, one per token.
        stds: array of SNGP uncertainty values, one per token.
        probabilities: array of class probabilities; argmax over the last axis
            gives the prediction.

    Returns:
        dict with keys 'sngp' and 'max_prob'.
    """
    predictions = np.argmax(probabilities, axis=-1)
    ratio_list = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

    results = {}

    sorted_indexes_sngp = np.argsort(-stds)
    sngp_scores = [
        get_score_ratio(sorted_indexes_sngp, predictions, eval_labels, ratio)
        for ratio in ratio_list
    ]
    results['sngp'] = auc(ratio_list, sngp_scores)

    model_ues = 1 - np.max(probabilities, axis=1)
    sorted_indexes_model = np.argsort(-model_ues)
    model_scores = [
        get_score_ratio(sorted_indexes_model, predictions, eval_labels, ratio)
        for ratio in ratio_list
    ]
    results['max_prob'] = auc(ratio_list, model_scores)

    return results


def calc_arc_aucs_seq_sngp(probabilities, labels, predictions, stds):
    """Sequence-level rejection-curve AUC for the SNGP score and the max_prob baseline.

    Both scores rank sequences by descending uncertainty before
    ``get_score_ratio_seq`` is evaluated at each rejection ratio and the curve
    is integrated with ``auc``:

    * 'sngp': the largest value in ``stds[i]``.
    * 'max_prob': the mean over the sequence of ``1 - max probability``.

    The baseline previously ranked by descending mean *confidence*, i.e. in the
    opposite direction to the SNGP score here and to the token-level
    ``calc_aucs_sngp``; it now uses the same direction.
    NOTE(review): this assumes ``get_score_ratio_seq`` treats the first sorted
    indices as the ones to reject, as its token-level counterpart is used in
    ``calc_aucs_sngp`` -- confirm against ``analyze_results``.

    Returns:
        dict with keys 'sngp' and 'max_prob'.
    """
    ratio_list = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    results = {}

    ue_scores_stds = np.zeros(len(labels))
    for i in range(len(labels)):
        ue_scores_stds[i] = np.asarray(stds[i]).max()
    sorted_indexes_sngp = np.argsort(-ue_scores_stds)
    sngp_scores = [
        get_score_ratio_seq(sorted_indexes_sngp, predictions, labels, ratio)
        for ratio in ratio_list
    ]
    results['sngp'] = auc(ratio_list, sngp_scores)

    # Same count as the original zip(labels, predictions) produced.
    n_examples = min(len(labels), len(predictions))
    ue_scores_max = np.zeros(n_examples)
    for i in range(n_examples):
        true_probs_max = np.asarray([np.max(proba) for proba in probabilities[i]])
        # Uncertainty, not confidence: higher means less sure.
        ue_scores_max[i] = np.mean(1 - true_probs_max)
    sorted_indexes_model = np.argsort(-ue_scores_max)
    model_scores = [
        get_score_ratio_seq(sorted_indexes_model, predictions, labels, ratio)
        for ratio in ratio_list
    ]
    results['max_prob'] = auc(ratio_list, model_scores)
    return results

In [6]:
def choose_metric(metric_type):
    """Map a metric name to its SNGP-specific calculator.

    This redefinition replaces the earlier ``choose_metric`` for every
    subsequent call made through ``get_one_table``.

    Raises:
        ValueError: for any metric name other than "rejection-curve-auc",
            "roc-auc", "rcc-auc" or "rpp".
    """
    if metric_type in ("rejection-curve-auc", "roc-auc"):
        return from_model_outputs_calc_arc_auc_ner_sngp
    if metric_type == "rcc-auc":
        return from_model_outputs_calc_rcc_auc_ner_sngp
    if metric_type == "rpp":
        return from_model_outputs_calc_rpp_ner_sngp
    raise ValueError("Wrong metric type!")

In [7]:
import os

def correct_cols(cols, level):
    """Append ' (<level> level)' to the first element of every two-part column label.

    Returns a new ``pd.MultiIndex`` built from the relabelled pairs.
    """
    suffix = f' ({level} level)'
    return pd.MultiIndex.from_tuples([(col[0] + suffix, col[1]) for col in cols])

# SNGP: one table per evaluation level. get_one_table resolves choose_metric at
# call time, so this cell relies on the SNGP redefinition of choose_metric
# having been executed first.
metric_types=["rejection-curve-auc", "rcc-auc", 'rpp']
levels = ['token', 'sequence']
regs = ['raw']#, 'reg']
methods = ['sngp']
dataset_names = ['CoNLL-2003']
dataset_fnames = ['conll2003']
names = []
# Filled in the loop below; entries alternate token / sequence.
tables = []
baselines = []

for method in methods:
    for reg in regs:
        for level in levels:
            run_dirs = []
            names = [f'{method}|{reg}']
            for name in dataset_fnames:
                model_series_dir = f'../workdir/run_ner_for_model_series/electra-{reg}-sngp/{name}/0.1/'
                print(model_series_dir)
                run_dirs.append([model_series_dir])
            res_df = collect_datasets(run_dirs, names, dataset_names, metric_types=metric_types, level=level, baselines=raw_baselines)
            res_df.columns = correct_cols(res_df.columns, level)
            # collect_tables puts the re-keyed baseline row last; split it off.
            baselines.append(res_df.iloc[-1:])
            tables.append(res_df.iloc[:-1])

../workdir/run_ner_for_model_series/electra-raw-sngp/conll2003/0.1/
../workdir/run_ner_for_model_series/electra-raw-sngp/conll2003/0.1/


In [8]:
# `tables` alternates token / sequence: stack the even entries (token) and the
# odd entries (sequence) vertically, then join the two stacks side by side.
table_sngp = pd.concat([pd.concat([tab for tab in tables[0::2]]), pd.concat([tab for tab in tables[1::2]])], axis=1)

In [9]:
# Display the SNGP results table.
table_sngp

Unnamed: 0_level_0,Unnamed: 1_level_0,CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (token level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level),CoNLL-2003 (sequence level)
Unnamed: 0_level_1,Unnamed: 1_level_1,rejection-curve-auc,rcc-auc,rpp,rejection-curve-auc,rcc-auc,rpp
Method,UE Score,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
sngp|raw,sngp,-1.03±7.74,340.93±800.82,1.99±3.37,-2.51±11.71,94.93±89.05,4.97±2.28
