In [1]:
# Notebook for new results table

In [2]:
import pandas as pd
import numpy as np
import json

In [3]:
def to_underline(text):
    return '\\underline{' + text + '}'

def to_bf(text):
    return '\\textbf{' + text + '}'

In [4]:
def calc_exp_mean_and_std(fname, ue_methods, metric, highlight_baseline=False):
    perc_metrics = ['rejection-curve-auc', 'rpp']
    with open(fname, 'r') as f:
        curr_metrics = json.loads(f.read())
    metric_results = {}
    for ue_method in ue_methods:
        mean, std = np.mean(list(curr_metrics[ue_method].values())), np.std(list(curr_metrics[ue_method].values()), ddof=1)
        if metric in perc_metrics:
            mean, std = mean * 100, std * 100
        if ue_method == 'max_prob':
            baseline = mean
        value = '{:.{prec}f}'.format(mean, prec=2) + '$\\pm$' + '{:.{prec}f}'.format(std, prec=2)
        if highlight_baseline and ue_method == 'max_prob':
            value = '\\textit{' + value + '}'
        metric_results[ue_method] = value
    return metric_results


def build_method_model_block(path, ue, dataset, ue_methods, metrics, ner_prefix, highlight_baseline=False):
    raw_dict = {}
    df_dict = {}
    # enter row level
    raw_dict[ue] = {}
    df_dict[ue] = {}
    raw_dict[ue][dataset] = {}
    for metric in metrics:
        ue_path = path + dataset + '/' + ue + '/'
        if ner_prefix is not None:
            fname = ue_path + f'metrics_{ner_prefix}_{metric}.json'
        else:
            fname = ue_path + f'metrics_{metric}.json'
        metric_results = calc_exp_mean_and_std(fname, ue_methods, metric, highlight_baseline)
        # so we obtained two dict for one metric
        raw_dict[ue][dataset][metric] = metric_results
        # make buf dataframe
    type_df = pd.DataFrame.from_dict(raw_dict[ue][dataset])
    df_dict[ue][dataset] = type_df
    return df_dict


def build_section(pathes, regs, ues, dataset, ue_methods, metrics, method_name, ner_prefix=None):
    section = {}
    for idx, path in enumerate(pathes):
        highlight_baseline = False
        ue = ues[idx]
        if regs[idx] == '-' and not('MD SN' in method_name):
            highlight_baseline = True
        buf = build_method_model_block(path, ue, dataset, ue_methods, metrics, ner_prefix, highlight_baseline)
        section[regs[idx]] = buf
    df = pd.concat([section[reg][ue][dataset] for reg, ue in zip(regs, ues)])
    df = underline_best(df, metrics)
    method_names = [method_name] * (len(ue_methods) * len(regs))
    if 'max_prob' in ue_methods:
        start_idx = ue_methods.index('max_prob')
        for idx in range(start_idx, len(method_names), len(ue_methods)):
            if 'MD SN' in method_name:
                method_names[idx] = 'SR SN'
            else:
                method_names[idx] = 'SR'
                if regs[(idx - start_idx) // len(ue_methods)] == '-':
                    method_names[idx] = 'SR (baseline)'
    reg_names = []
    for reg in regs:
        reg_names += [reg] * len(ue_methods)
    return df, method_names, reg_names


def underline_best(df, metrics):
    for metric in metrics:
        # find smaller value
        values = [float(x.split('$\\pm$')[0]) for x in df[metric] if not('\\textit{' in x)]
        values_ids = [idx for idx, x in enumerate(df[metric]) if not('\\textit{' in x)]
        min_idx = values_ids[np.argmin(values)]
        #print(values)
        #print(df.index)
        #print(min_idx, df[metric].iloc[min_idx], values[min_idx])
        #min_idx = np.argmin(df[metric])
        df[metric].iloc[min_idx] = to_underline(df[metric].iloc[min_idx])
    return df


def bold_and_underline_best(df, datasets, metrics):
    for metric in metrics:
        # find smaller value
        values = [float(x.split('$\\pm$')[0]) for x in best_res[dataset, metric] if not('\\textit{' in x)]
        min_idx = np.argmin(values)
        #min_idx = np.argmin(df[metric])
        df[metric].iloc[min_idx] = to_bf(df[metric].iloc[min_idx])
    df = underline_best(df, metrics)
    return df


def add_multiindex(dfs, names, metrics):
    # add multiindex to columns
    for idx, df in enumerate(dfs):
        df.columns = pd.MultiIndex.from_tuples([(names[idx], metric) for metric in metrics])
        dfs[idx] = df
    #token_df.columns = pd.MultiIndex.from_tuples([('CoNNL-2003 (10%, token level)', metric) for metric in metrics])
    #seq_df.columns = pd.MultiIndex.from_tuples([('CoNNL-2003 (10%, seq. level)', metric) for metric in metrics])
    new_df = pd.concat(dfs, axis=1)
    return new_df

def add_labels(df, method_names, reg_names):
    names_df = pd.DataFrame()
    names_df['Method'] = method_names
    names_df['Reg. Type'] = reg_names
    names_df['UE Score'] = df.index
    names_df.index = df.index
    df = pd.concat([names_df, df], axis=1)
    # Now reindex and move SR rows
    df['index'] = list(range(len(df)))
    df = df.set_index('index')
    sr_indices = np.where(df['Method'].str.contains('SR'))[0]
    baseline_indices = np.where(df['Method'].str.contains('SR '))[0]
    new_index = [idx for idx in range(len(df)) if idx not in sr_indices] + list([idx for idx in sr_indices if idx not in baseline_indices]) + list(baseline_indices)
    df = df.reindex(new_index).reset_index()
    df = df.drop('index', axis=1)
    return df


def prepare_latex_table(table):
    latex_table = table.to_latex(bold_rows=False, index=False)
    latex_table = latex_table.replace('\\$\\textbackslash pm\\$', '$\pm$')
    latex_table = latex_table.replace('variance', 'PV')
    latex_table = latex_table.replace('var\_ratio', 'VR')
    latex_table = latex_table.replace('sampled\_entropy', 'SE')
    latex_table = latex_table.replace('sampled\_max\_prob', 'SMP')
    latex_table = latex_table.replace('mahalanobis\_distance', 'MD')
    latex_table = latex_table.replace('max\_prob', 'MP')
    latex_table = latex_table.replace('bald', 'BALD')
    latex_table = latex_table.replace('mixup', 'DS')
    latex_table = latex_table.replace('stds', 'STD')
    latex_table = latex_table.replace('\\textbackslash ', '\\')
    latex_table = latex_table.replace('\\{', '{')
    latex_table = latex_table.replace('\\}', '}')
    
    header = """\\begin{table*}[!ht]
        \\resizebox{\\textwidth}{!}{
        \\begin{tabular}{l|l|l||c|c||c|c||c|c||c|c||c|c}
        \\toprule
        \\multirow{2}{*}{\\textbf{Method}} &          \\multirow{2}{*}{\\textbf{\\multirowcell{Reg.\\\\Type}}} & 
        \\multirow{2}{*}{\\textbf{\multirowcell{UE\\\\Score}}} & \\multicolumn{2}{c||}{\\textbf{MRPC}} & \\multicolumn{2}{c||}{\\textbf{SST-2}} & \\multicolumn{2}{c||}{\\textbf{CoLA}} & \\multicolumn{2}{c||}{\\textbf{CoNLL-2003 (token level)}} & \\multicolumn{2}{c}{\\textbf{CoNLL-2003 (seq. level)}}\\\\

              \\cline{4-13}
                 & & &   \\textbf{RCC-AUC $\\downarrow$} &      \\textbf{RPP $\\downarrow$} &  \\textbf{RCC-AUC $\\downarrow$} &      \\textbf{RPP $\\downarrow$} &   \\textbf{RCC-AUC $\\downarrow$} &      \\textbf{RPP $\\downarrow$}&   \\textbf{RCC-AUC $\\downarrow$} &      \\textbf{RPP $\\downarrow$} &   \\textbf{RCC-AUC $\\downarrow$} &      \\textbf{RPP $\\downarrow$}\\\\

        \\midrule \\hline """
    latex_table = header + latex_table.split('midrule')[1]
    bottom = """
    \\bottomrule
    \\end{tabular}
    }
    \\caption{\\label{tab:distilbert}ELECTRA results.}
    \\end{table*}"""
    latex_table = latex_table.split('\\bottomrule')[0] + bottom
    return latex_table


def select_best(df, metrics, select_metrics, datasets):
    cond = df['Method'].str.contains('SR')
    for dataset in datasets:
        for metric in select_metrics:
            cond = cond + df[dataset, metric].str.contains('underline')
    best_res = df.where(cond).dropna()
    for dataset in datasets:
        for metric in metrics:
            best_res[dataset, metric] = best_res[dataset, metric].str.replace('\\\\underline{', '')
            best_res[dataset, metric] = best_res[dataset, metric].str.replace('}', '')
    # drop bald and pv for de
    '''
    c1 = best_res.where((best_res['Method'] == 'Deep Ensemble') * (best_res['UE Score'] == 'variance')).dropna().index
    c2 = best_res.where((best_res['Method'] == 'Deep Ensemble') * (best_res['UE Score'] == 'bald')).dropna().index
    drop_ids = list(c1) + list(c2)
    best_res = best_res.drop(drop_ids)
    '''
    # split on comp eff and other
    dpp = best_res.where((best_res['Method'].str.contains('DDPP'))).dropna()
    md = best_res.where((best_res['Method'].str.contains('MD'))).dropna()
    baseline = best_res.where((best_res['Method'].str.contains('SR '))).dropna()
    sr = best_res.drop(list(baseline.index)).where((best_res.drop(list(baseline.index))['Method'].str.contains('SR'))).dropna()
    comp_eff_part = pd.concat([dpp, md, sr])
    non_eff = best_res.drop(list(dpp.index) + list(md.index) + list(sr.index) + list(baseline.index))
    best_res = pd.concat([non_eff, comp_eff_part, baseline])
    for dataset in datasets:
        for metric in metrics:
            # find smaller value
            values = [float(x.split('$\\pm$')[0]) for x in best_res[dataset, metric] if not('\\textit{' in x)]
            values_ids = [idx for idx, x in enumerate(best_res[dataset, metric]) if not('\\textit{' in x)]
            min_idx = values_ids[np.argmin(values)]
            #min_idx = np.argmin(best_res[dataset, metric])
            
            if dataset == 'CoNNL-2003 (token level)':
                values = [float(x.split('$\\pm$')[0]) for x in best_res[dataset, metric] if not('\\textit{' in x)]
                print(values)
                val_min_idx = np.argmin(values)
                print(min_idx, val_min_idx)
                print(best_res[dataset, metric].iloc[min_idx])
                print(best_res[dataset, metric].iloc[val_min_idx])
            best_res[dataset, metric].iloc[min_idx] = to_bf(best_res[dataset, metric].iloc[min_idx])
            # now find second smaller value, but only in comp_eff part
            #min_idx = np.argmin(comp_eff_part[dataset, metric])
            values = [float(x.split('$\\pm$')[0]) for x in comp_eff_part[dataset, metric]]
            #values_ids = [idx for idx, x in enumerate(comp_eff_part[dataset, metric]) if not('\\textit{' in x)]
            #min_idx = values_ids[np.argmin(values)]
            min_idx = list(comp_eff_part.index)[int(np.argmin(values))]
            best_res[dataset, metric].loc[min_idx] = to_underline(best_res[dataset, metric].loc[min_idx])
    return best_res


def highlight_best(df, metrics, datasets):
    # bold best results for 1 and 2 tables
    for dataset in datasets:
        for metric in metrics:
            # find smaller value
            underlined_values = [float(x.split('\\underline{')[1].split('$\\pm$')[0]) for x in df[dataset, metric] if ('\\underline{' in x)]
            underlined_values_ids = [idx for idx, x in enumerate(df[dataset, metric]) if ('\\underline{' in x)]
            min_idx = underlined_values_ids[np.argmin(underlined_values)]
            df[dataset, metric].iloc[min_idx] = to_bf(df[dataset, metric].iloc[min_idx])
    return df

# DEBERTA on new data

In [5]:
import copy

datasets = ['mrpc', 'sst2', 'cola']
cls_dfs = []
cls_dfs1 = []
cls_dfs2 = []
for dataset in datasets:
    # mrpc electra MC all
    pathes = ['../../workdir/run_calc_ues_metrics/deberta_raw_no_sn/',
              '../../workdir/run_calc_ues_metrics/deberta_reg_no_sn/',
              '../../workdir/run_calc_ues_metrics/deberta_metric_no_sn/',]
    regs = ['-', 'CER', 'metric']
    ues = ['mc_all', 'mc_all', 'mc_all']
    metrics = ['rcc-auc', 'rpp']
    #dataset = 'mrpc'
    ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
    df_mc, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'MC', ner_prefix=None)
    method_names = copy.deepcopy(method_buf)
    reg_names = reg_buf
    method1 = method_buf
    reg1 = reg_buf
    score_names = []
    # mrpc electra dpp
    ues = ['ddpp_dpp', 'ddpp_dpp', 'ddpp_dpp']
    ue_methods = ['variance', 'bald', 'sampled_max_prob']
    df_dpp, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+DPP) (Ours)', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf
    reg1 += reg_buf
    # mrpc electra dpp + ood
    ues = ['ddpp_ood', 'ddpp_ood', 'ddpp_ood']
    df_dpp_ood, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+OOD) (Ours)', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf
    reg1 += reg_buf
    # mrpc electra de
    de_pathes = ['../../workdir/run_calc_ues_metrics/deberta_raw_no_sn/',]
    de_regs = ['-']
    de_ues = ['ensemble']
    df_de, method_buf, reg_buf = build_section(de_pathes, de_regs, de_ues, dataset, ue_methods, metrics, 'Deep Ensemble', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra md
    ues = ['mahalanobis', 'mahalanobis', 'mahalanobis']
    ue_methods = ['mahalanobis_distance']
    df_maha, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'MD', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method2 = method_buf
    reg2 = reg_buf
    # mrpc electra md sn
    maha_sn_pathes = ['../../workdir/run_calc_ues_metrics/deberta_raw_sn/',
                      '../../workdir/run_calc_ues_metrics/deberta_reg_sn/',
                      '../../workdir/run_calc_ues_metrics/deberta_metric_sn/',]
    #maha_sn_pathes = ['/home/jovyan/uncertainty-estimation/workdir/new_final_results/electra/50_md_sn/electra_raw_sn/',
    #                  '/home/jovyan/uncertainty-estimation/workdir/new_final_results/electra/50_md_sn/electra_reg_sn/',
    #                  '/home/jovyan/uncertainty-estimation/workdir/new_final_results/electra/50_md_sn/electra_metric_sn/',]
    #ues = ['mahalanobis', 'mahalanobis', 'maha_sn']
    ues = ['mahalanobis', 'mahalanobis', 'mahalanobis']
    #ue_methods = ['max_prob', 'mahalanobis_distance']
    #df_maha_sn_2, method_buf, reg_buf = build_section(maha_sn_pathes, regs, ues, dataset, ue_methods, metrics, 'MD SN (Ours)', ner_prefix=None)
    #method2 = method_buf
    #reg2 = reg_buf
    ue_methods = ['max_prob', 'mahalanobis_distance']
    df_maha_sn, method_buf, reg_buf = build_section(maha_sn_pathes, regs, ues, dataset, ue_methods, metrics, 'MD SN (Ours)', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra sngp
    #sngp_pathes = ['/home/jovyan/uncertainty-estimation/workdir/new_final_results/electra/old_paper_res/electra_raw_sn/',]
    #sngp_regs = ['-']
    #sngp_ues = ['sngp']
    #sngp_ue_methods = ['stds']
    #df_sngp, method_buf, reg_buf = build_section(sngp_pathes, sngp_regs, sngp_ues, dataset, sngp_ue_methods, metrics, 'SNGP', ner_prefix=None)
    #method2 = method_buf
    #reg2 = reg_buf
    #method_names += method_buf
    #reg_names += reg_buf
    # mrpc electra msd
    msd_regs = ['MSD']
    msd_pathes = ['../../workdir/run_calc_ues_metrics/mixup_deberta_fix_repro_fix/',]
    msd_ues = ['msd/all']
    msd_ue_methods = ['mixup']#['max_prob', 'mixup']
    df_msd, method_buf, reg_buf = build_section(msd_pathes, msd_regs, msd_ues, dataset, msd_ue_methods, metrics, 'MSD', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    #method2 = method_buf
    #reg2 = reg_buf
    # df for best results
    full_df = pd.concat([df_mc, df_dpp, df_dpp_ood, df_de, df_maha, df_maha_sn, df_msd])#
    # df for table 1
    df_1 = pd.concat([df_mc, df_dpp, df_dpp_ood])#df_dpp_ood
    df_2 = pd.concat([df_maha, df_maha_sn])
    cls_dfs.append(full_df)
    cls_dfs1.append(df_1)
    cls_dfs2.append(df_2)

In [6]:
# same for ner
dataset = ''
ner_prefixes = ['token', 'seq']
ner_dfs = []
ner_dfs1 = []
ner_dfs2 = []
for ner_prefix in ner_prefixes:
    # mrpc electra MC all
    pathes = ['../../workdir/run_calc_ues_metrics/deberta_raw_no_sn/conll2003/',
              '../../workdir/run_calc_ues_metrics/deberta_reg_no_sn/conll2003/',
              '../../workdir/run_calc_ues_metrics/deberta_metric_no_sn/conll2003/',]
    regs = ['-', 'CER', 'metric']
    ues = ['mc_all', 'mc_all', 'mc_all']
    metrics = ['rcc-auc', 'rpp']
    ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
    df_mc, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'MC', ner_prefix)
    method_names = copy.deepcopy(method_buf)
    reg_names = reg_buf
    method1 = method_buf.copy()
    reg1 = reg_buf.copy()
    score_names = []
    # mrpc electra dpp
    ues = ['ddpp_dpp', 'ddpp_dpp', 'ddpp_dpp']
    ue_methods = ['variance', 'bald', 'sampled_max_prob']
    df_dpp, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+DPP) (Ours)', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf.copy()
    reg1 += reg_buf.copy()
    # mrpc electra dpp + ood
    ues = ['ddpp_ood', 'ddpp_ood', 'ddpp_ood']
    df_dpp_ood, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+OOD) (Ours)', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf.copy()
    reg1 += reg_buf.copy()
    # mrpc electra de
    de_pathes = ['../../workdir/run_calc_ues_metrics/deberta_raw_no_sn/conll2003/',]
    de_regs = ['-']
    de_ues = ['ensemble']
    df_de, method_buf, reg_buf = build_section(de_pathes, de_regs, de_ues, dataset, ue_methods, metrics, 'Deep Ensemble', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra md
    ues = ['mahalanobis', 'mahalanobis', 'mahalanobis']
    ue_methods = ['mahalanobis_distance']
    df_maha, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'MD', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    method2 = method_buf.copy()
    reg2 = reg_buf.copy()
    # mrpc electra md sn
    maha_sn_pathes = ['../../workdir/run_calc_ues_metrics/deberta_raw_sn/conll2003/',
                      '../../workdir/run_calc_ues_metrics/deberta_reg_sn/conll2003/',
                      '../../workdir/run_calc_ues_metrics/deberta_metric_sn/conll2003/',]
    ues = ['mahalanobis', 'mahalanobis', 'mahalanobis']
    ue_methods = ['max_prob', 'mahalanobis_distance']
    df_maha_sn_2, method_buf, reg_buf = build_section(maha_sn_pathes, regs, ues, dataset, ue_methods, metrics, 'MD SN (Ours)', ner_prefix)
    method2 += method_buf.copy()
    reg2 += reg_buf.copy()
    #ue_methods = ['mahalanobis_distance']
    #df_maha_sn, method_buf, reg_buf = build_section(maha_sn_pathes, regs, ues, dataset, ue_methods, metrics, 'MD SN (Ours)', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra sngp
    #sngp_pathes = ['/home/jovyan/uncertainty-estimation/workdir/new_final_results/old_conll/electra_raw_sn/conll2003/',]
    #sngp_regs = ['-']
    #sngp_ues = ['sngp']
    #sngp_ue_methods = ['stds']
    #df_sngp, method_buf, reg_buf = build_section(sngp_pathes, sngp_regs, sngp_ues, dataset, sngp_ue_methods, metrics, 'SNGP', ner_prefix)
    #method_names += method_buf
    #reg_names += reg_buf
    #method2 += method_buf.copy()
    #reg2 += reg_buf.copy()
    # mrpc electra msd
    msd_regs = ['MSD']
    msd_pathes = ['../../workdir/run_calc_ues_metrics/mixup_deberta_fix_repro_fix/conll/',]
    msd_ues = ['mixup/all']
    msd_ue_methods = ['mixup'] #['max_prob', 'mixup']
    df_msd, method_buf, reg_buf = build_section(msd_pathes, msd_regs, msd_ues, dataset, msd_ue_methods, metrics, 'MSD', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    #method2 += method_buf.copy()
    #reg2 += reg_buf.copy()
    full_df = pd.concat([df_mc, df_dpp, df_dpp_ood, df_de, df_maha, df_maha_sn_2, df_msd])
    # df for table 1
    df_1 = pd.concat([df_mc, df_dpp, df_dpp_ood])
    df_2 = pd.concat([df_maha, df_maha_sn_2])
    ner_dfs.append(full_df)
    ner_dfs1.append(df_1)
    ner_dfs2.append(df_2)

In [7]:
# build table 1
dataset_names = ['MRPC', 'SST2', 'CoLA'] + ['CoNNL-2003 (token level)', 'CoNNL-2003 (seq. level)']
all_dfs1 = cls_dfs1 + ner_dfs1
full_df1 = add_multiindex(all_dfs1, dataset_names, metrics)

dff1 = add_labels(full_df1, method1, reg1)
dff1 = highlight_best(dff1, metrics, dataset_names)
#best_res = select_best(dff1, metrics, ['rcc-auc'], dataset_names).dropna()

latex_table = prepare_latex_table(dff1)
print(latex_table)

\begin{table*}[!ht]
        \resizebox{\textwidth}{!}{
        \begin{tabular}{l|l|l||c|c||c|c||c|c||c|c||c|c}
        \toprule
        \multirow{2}{*}{\textbf{Method}} &          \multirow{2}{*}{\textbf{\multirowcell{Reg.\\Type}}} & 
        \multirow{2}{*}{\textbf{\multirowcell{UE\\Score}}} & \multicolumn{2}{c||}{\textbf{MRPC}} & \multicolumn{2}{c||}{\textbf{SST-2}} & \multicolumn{2}{c||}{\textbf{CoLA}} & \multicolumn{2}{c||}{\textbf{CoNLL-2003 (token level)}} & \multicolumn{2}{c}{\textbf{CoNLL-2003 (seq. level)}}\\

              \cline{4-13}
                 & & &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &  \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}&   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}\\

        \midrule \hline 
                MC &         - &         PV &                     

In [8]:
# build table 2
dataset_names = ['MRPC', 'SST2', 'CoLA'] + ['CoNNL-2003 (token level)', 'CoNNL-2003 (seq. level)']
all_dfs1 = cls_dfs2 + ner_dfs2
full_df1 = add_multiindex(all_dfs1, dataset_names, metrics)

dff1 = add_labels(full_df1, method2, reg2)
dff1 = highlight_best(dff1, metrics, dataset_names)
#best_res = select_best(dff1, metrics, ['rcc-auc'], dataset_names).dropna()

latex_table = prepare_latex_table(dff1)
print(latex_table)

\begin{table*}[!ht]
        \resizebox{\textwidth}{!}{
        \begin{tabular}{l|l|l||c|c||c|c||c|c||c|c||c|c}
        \toprule
        \multirow{2}{*}{\textbf{Method}} &          \multirow{2}{*}{\textbf{\multirowcell{Reg.\\Type}}} & 
        \multirow{2}{*}{\textbf{\multirowcell{UE\\Score}}} & \multicolumn{2}{c||}{\textbf{MRPC}} & \multicolumn{2}{c||}{\textbf{SST-2}} & \multicolumn{2}{c||}{\textbf{CoLA}} & \multicolumn{2}{c||}{\textbf{CoNLL-2003 (token level)}} & \multicolumn{2}{c}{\textbf{CoNLL-2003 (seq. level)}}\\

              \cline{4-13}
                 & & &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &  \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}&   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}\\

        \midrule \hline 
          MD &         - & MD &                      14.66$\pm$3.6

In [9]:
dataset_names = ['MRPC', 'SST2', 'CoLA'] + ['CoNNL-2003 (token level)', 'CoNNL-2003 (seq. level)']
all_dfs = cls_dfs + ner_dfs
full_df = add_multiindex(all_dfs, dataset_names, metrics)
#cls_df = pd.concat(all_dfs, axis=1)
dff = add_labels(full_df, method_names, reg_names)
best_res = select_best(dff, metrics, ['rcc-auc'], dataset_names).dropna()

latex_table = prepare_latex_table(dff)
print(latex_table)

[4.91, 5.1, 5.21, 4.56, 5.91, 4.3, 3.5, 7.01, 5.61, 5.48, 5.73, 5.22, 7.3, 6.14, 5.65, 5.51, 8.07, 4.83, 4.78, 5.42, 5.19, 4.95, 5.74, 5.19, 6.8, 4.91, 5.25, 8.25]
6 6
3.50$\pm$1.72
3.50$\pm$1.72
[0.07, 0.07, 0.08, 0.07, 0.09, 0.05, 0.05, 0.1, 0.08, 0.08, 0.09, 0.09, 0.11, 0.09, 0.08, 0.09, 0.12, 0.07, 0.07, 0.08, 0.08, 0.08, 0.09, 0.08, 0.11, 0.07, 0.08, 0.14]
5 5
0.05$\pm$0.03
0.05$\pm$0.03
\begin{table*}[!ht]
        \resizebox{\textwidth}{!}{
        \begin{tabular}{l|l|l||c|c||c|c||c|c||c|c||c|c}
        \toprule
        \multirow{2}{*}{\textbf{Method}} &          \multirow{2}{*}{\textbf{\multirowcell{Reg.\\Type}}} & 
        \multirow{2}{*}{\textbf{\multirowcell{UE\\Score}}} & \multicolumn{2}{c||}{\textbf{MRPC}} & \multicolumn{2}{c||}{\textbf{SST-2}} & \multicolumn{2}{c||}{\textbf{CoLA}} & \multicolumn{2}{c||}{\textbf{CoNLL-2003 (token level)}} & \multicolumn{2}{c}{\textbf{CoNLL-2003 (seq. level)}}\\

              \cline{4-13}
                 & & &   \textbf{RCC-AUC $\downarrow



In [10]:
latex_table = prepare_latex_table(best_res)
print(latex_table)

\begin{table*}[!ht]
        \resizebox{\textwidth}{!}{
        \begin{tabular}{l|l|l||c|c||c|c||c|c||c|c||c|c}
        \toprule
        \multirow{2}{*}{\textbf{Method}} &          \multirow{2}{*}{\textbf{\multirowcell{Reg.\\Type}}} & 
        \multirow{2}{*}{\textbf{\multirowcell{UE\\Score}}} & \multicolumn{2}{c||}{\textbf{MRPC}} & \multicolumn{2}{c||}{\textbf{SST-2}} & \multicolumn{2}{c||}{\textbf{CoLA}} & \multicolumn{2}{c||}{\textbf{CoNLL-2003 (token level)}} & \multicolumn{2}{c}{\textbf{CoNLL-2003 (seq. level)}}\\

              \cline{4-13}
                 & & &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &  \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}&   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}\\

        \midrule \hline 
                MC &         - &     SMP &             15.06$\pm$3

# ELECTRA ablation study

In [55]:
def calc_exp_mean_and_std(fname, ue_methods, metric, highlight_baseline=False):
    perc_metrics = ['rejection-curve-auc', 'rpp']
    with open(fname, 'r') as f:
        curr_metrics = json.loads(f.read())
    metric_results = {}
    for ue_method in ue_methods:
        mean, std = np.mean(list(curr_metrics[ue_method].values())), np.std(list(curr_metrics[ue_method].values()), ddof=1)
        if metric in perc_metrics:
            mean, std = mean * 100, std * 100
        if ue_method == 'max_prob':
            baseline = mean
        value = '{:.{prec}f}'.format(mean, prec=2) + '$\\pm$' + '{:.{prec}f}'.format(std, prec=2)
        if highlight_baseline and ue_method == 'max_prob':
            value = '\\textit{' + value + '}'
        metric_results[ue_method] = value
    return metric_results


def build_method_model_block(path, ue, dataset, ue_methods, metrics, ner_prefix, highlight_baseline=False):
    raw_dict = {}
    df_dict = {}
    # enter row level
    raw_dict[ue] = {}
    df_dict[ue] = {}
    raw_dict[ue][dataset] = {}
    for metric in metrics:
        ue_path = path + dataset + '/' + ue + '/'
        if ner_prefix is not None:
            fname = ue_path + f'metrics_{ner_prefix}_{metric}.json'
        else:
            fname = ue_path + f'metrics_{metric}.json'
        metric_results = calc_exp_mean_and_std(fname, ue_methods, metric, highlight_baseline)
        # so we obtained two dict for one metric
        raw_dict[ue][dataset][metric] = metric_results
        # make buf dataframe
    type_df = pd.DataFrame.from_dict(raw_dict[ue][dataset])
    df_dict[ue][dataset] = type_df
    return df_dict


def build_section(pathes, regs, ues, dataset, ue_methods, metrics, method_name, ner_prefix=None):
    section = {}
    for idx, path in enumerate(pathes):
        highlight_baseline = False
        ue = ues[idx]
        if regs[idx] == '-' and not('MD SN' in method_name):
            highlight_baseline = True
        buf = build_method_model_block(path, ue, dataset, ue_methods, metrics, ner_prefix, highlight_baseline=False)
        section[regs[idx]] = buf
    df = pd.concat([section[reg][ue][dataset] for reg, ue in zip(regs, ues)])
    df = underline_best(df, metrics)
    method_names = [method_name] * (len(ue_methods) * len(regs))
    if 'max_prob' in ue_methods:
        start_idx = ue_methods.index('max_prob')
        for idx in range(start_idx, len(method_names), len(ue_methods)):
            if 'MD SN' in method_name:
                method_names[idx] = 'SR SN'
            else:
                method_names[idx] = 'SR'
                if regs[(idx - start_idx) // len(ue_methods)] == '-':
                    method_names[idx] = 'SR (baseline)'
    reg_names = []
    for reg in regs:
        reg_names += [reg] * len(ue_methods)
    return df, method_names, reg_names


def underline_best(df, metrics):
    for metric in metrics:
        # find smaller value
        values = [float(x.split('$\\pm$')[0]) for x in df[metric] if not('\\textit{' in x)]
        values_ids = [idx for idx, x in enumerate(df[metric]) if not('\\textit{' in x)]
        min_idx = values_ids[np.argmin(values)]
        #print(values)
        #print(df.index)
        #print(min_idx, df[metric].iloc[min_idx], values[min_idx])
        #min_idx = np.argmin(df[metric])
        df[metric].iloc[min_idx] = to_underline(df[metric].iloc[min_idx])
    return df


def bold_and_underline_best(df, datasets, metrics):
    for metric in metrics:
        # find smaller value
        values = [float(x.split('$\\pm$')[0]) for x in best_res[dataset, metric] if not('\\textit{' in x)]
        min_idx = np.argmin(values)
        #min_idx = np.argmin(df[metric])
        df[metric].iloc[min_idx] = to_bf(df[metric].iloc[min_idx])
    df = underline_best(df, metrics)
    return df


def add_multiindex(dfs, names, metrics):
    # add multiindex to columns
    for idx, df in enumerate(dfs):
        df.columns = pd.MultiIndex.from_tuples([(names[idx], metric) for metric in metrics])
        dfs[idx] = df
    #token_df.columns = pd.MultiIndex.from_tuples([('CoNNL-2003 (10%, token level)', metric) for metric in metrics])
    #seq_df.columns = pd.MultiIndex.from_tuples([('CoNNL-2003 (10%, seq. level)', metric) for metric in metrics])
    new_df = pd.concat(dfs, axis=1)
    return new_df

def add_labels(df, method_names, reg_names):
    names_df = pd.DataFrame()
    names_df['Method'] = method_names
    names_df['Reg. Type'] = reg_names
    names_df['UE Score'] = df.index
    names_df.index = df.index
    df = pd.concat([names_df, df], axis=1)
    # Now reindex and move SR rows
    df['index'] = list(range(len(df)))
    df = df.set_index('index')
    sr_indices = np.where(df['Method'].str.contains('SR'))[0]
    baseline_indices = np.where(df['Method'].str.contains('SR '))[0]
    new_index = [idx for idx in range(len(df)) if idx not in sr_indices] + list([idx for idx in sr_indices if idx not in baseline_indices]) + list(baseline_indices)
    df = df.reindex(new_index).reset_index()
    df = df.drop('index', axis=1)
    return df


def prepare_latex_table(table):
    latex_table = table.to_latex(bold_rows=False, index=False)
    latex_table = latex_table.replace('\\$\\textbackslash pm\\$', '$\pm$')
    latex_table = latex_table.replace('variance', 'PV')
    latex_table = latex_table.replace('var\_ratio', 'VR')
    latex_table = latex_table.replace('sampled\_entropy', 'SE')
    latex_table = latex_table.replace('sampled\_max\_prob', 'SMP')
    latex_table = latex_table.replace('mahalanobis\_distance', 'MD')
    latex_table = latex_table.replace('max\_prob', 'MP')
    latex_table = latex_table.replace('bald', 'BALD')
    latex_table = latex_table.replace('mixup', 'DS')
    latex_table = latex_table.replace('stds', 'STD')
    latex_table = latex_table.replace('\\textbackslash ', '\\')
    latex_table = latex_table.replace('\\{', '{')
    latex_table = latex_table.replace('\\}', '}')
    
    header = """\\begin{table*}[!ht]
        \\resizebox{\\textwidth}{!}{
        \\begin{tabular}{l|l|l||c|c||c|c||c|c||c|c||c|c}
        \\toprule
        \\multirow{2}{*}{\\textbf{Method}} &          \\multirow{2}{*}{\\textbf{\\multirowcell{Reg.\\\\Type}}} & 
        \\multirow{2}{*}{\\textbf{\multirowcell{UE\\\\Score}}} & \\multicolumn{2}{c||}{\\textbf{MRPC}} & \\multicolumn{2}{c||}{\\textbf{SST-2}} & \\multicolumn{2}{c||}{\\textbf{CoLA}} & \\multicolumn{2}{c||}{\\textbf{CoNLL-2003 (token level)}} & \\multicolumn{2}{c}{\\textbf{CoNLL-2003 (seq. level)}}\\\\

              \\cline{4-13}
                 & & &   \\textbf{RCC-AUC $\\downarrow$} &      \\textbf{RPP $\\downarrow$} &  \\textbf{RCC-AUC $\\downarrow$} &      \\textbf{RPP $\\downarrow$} &   \\textbf{RCC-AUC $\\downarrow$} &      \\textbf{RPP $\\downarrow$}&   \\textbf{RCC-AUC $\\downarrow$} &      \\textbf{RPP $\\downarrow$} &   \\textbf{RCC-AUC $\\downarrow$} &      \\textbf{RPP $\\downarrow$}\\\\

        \\midrule \\hline """
    latex_table = header + latex_table.split('midrule')[1]
    bottom = """
    \\bottomrule
    \\end{tabular}
    }
    \\caption{\\label{tab:distilbert}ELECTRA results.}
    \\end{table*}"""
    latex_table = latex_table.split('\\bottomrule')[0] + bottom
    return latex_table


def select_best(df, metrics, select_metrics, datasets):
    cond = df['Method'].str.contains('SR')
    for dataset in datasets:
        for metric in select_metrics:
            cond = cond + df[dataset, metric].str.contains('underline')
    best_res = df#.where(cond).dropna()
    for dataset in datasets:
        for metric in metrics:
            best_res[dataset, metric] = best_res[dataset, metric].str.replace('\\\\underline{', '')
            best_res[dataset, metric] = best_res[dataset, metric].str.replace('}', '')
    # drop bald and pv for de
    '''
    c1 = best_res.where((best_res['Method'] == 'Deep Ensemble') * (best_res['UE Score'] == 'variance')).dropna().index
    c2 = best_res.where((best_res['Method'] == 'Deep Ensemble') * (best_res['UE Score'] == 'bald')).dropna().index
    drop_ids = list(c1) + list(c2)
    best_res = best_res.drop(drop_ids)
    '''
    # split on comp eff and other
    dpp = best_res.where((best_res['Method'].str.contains('DPP'))).dropna()
    md = best_res.where((best_res['Method'].str.contains('MD'))).dropna()
    baseline = best_res.where((best_res['Method'].str.contains('SR '))).dropna()
    sr = best_res.drop(list(baseline.index)).where((best_res.drop(list(baseline.index))['Method'].str.contains('SR'))).dropna()
    comp_eff_part = pd.concat([dpp, md, sr])
    non_eff = best_res.drop(list(dpp.index) + list(md.index) + list(sr.index) + list(baseline.index))
    best_res = pd.concat([non_eff, comp_eff_part, baseline])
    for dataset in datasets:
        for metric in metrics:
            # find smaller value
            values = [float(x.split('$\\pm$')[0]) for x in best_res[dataset, metric] if not('\\textit{' in x)]
            values_ids = [idx for idx, x in enumerate(best_res[dataset, metric]) if not('\\textit{' in x)]
            min_idx = values_ids[np.argmin(values)]
            #min_idx = np.argmin(best_res[dataset, metric])
            
            if dataset == 'CoNNL-2003 (token level)':
                values = [float(x.split('$\\pm$')[0]) for x in best_res[dataset, metric] if not('\\textit{' in x)]
                print(values)
                val_min_idx = np.argmin(values)
                print(min_idx, val_min_idx)
                print(best_res[dataset, metric].iloc[min_idx])
                print(best_res[dataset, metric].iloc[val_min_idx])
            best_res[dataset, metric].iloc[min_idx] = to_bf(best_res[dataset, metric].iloc[min_idx])
            # now find second smaller value, but only in comp_eff part
            #min_idx = np.argmin(comp_eff_part[dataset, metric])
            values = [float(x.split('$\\pm$')[0]) for x in comp_eff_part[dataset, metric]]
            #values_ids = [idx for idx, x in enumerate(comp_eff_part[dataset, metric]) if not('\\textit{' in x)]
            #min_idx = values_ids[np.argmin(values)]
            min_idx = list(comp_eff_part.index)[int(np.argmin(values))]
            #best_res[dataset, metric].loc[min_idx] = to_underline(best_res[dataset, metric].loc[min_idx])
    return best_res


def highlight_best(df, metrics, datasets):
    # bold best results for 1 and 2 tables
    for dataset in datasets:
        for metric in metrics:
            # find smaller value
            underlined_values = [float(x.split('\\underline{')[1].split('$\\pm$')[0]) for x in df[dataset, metric] if ('\\underline{' in x)]
            underlined_values_ids = [idx for idx, x in enumerate(df[dataset, metric]) if ('\\underline{' in x)]
            min_idx = underlined_values_ids[np.argmin(underlined_values)]
            df[dataset, metric].iloc[min_idx] = to_bf(df[dataset, metric].iloc[min_idx])
    return df

In [56]:
import copy

datasets = ['mrpc', 'sst2', 'cola']
cls_dfs = []
cls_dfs1 = []
cls_dfs2 = []
for dataset in datasets:
    # mrpc electra MC all
    pathes = ['../../workdir/run_calc_ues_metrics/electra_raw_no_sn/']
    regs = ['-']
    ues = ['ddpp_dpp', 'ddpp_dpp', 'ddpp_dpp']
    metrics = ['rcc-auc', 'rpp']
    # mrpc electra dpp
    ues = ['ddpp_dpp', 'ddpp_dpp', 'ddpp_dpp']
    ue_methods = ['variance', 'bald', 'sampled_max_prob']
    df_ddpp_dpp, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+DPP) (Ours)', ner_prefix=None)
    method_names = copy.deepcopy(method_buf)
    reg_names = reg_buf
    method1 = method_buf.copy()
    reg1 = reg_buf.copy()
    # mrpc electra dpp + ood
    ues = ['ddpp_ood', 'ddpp_ood', 'ddpp_ood']
    df_ddpp_ood, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+OOD) (Ours)', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf
    reg1 += reg_buf
    # mrpc electra dpp + ood
    ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
    ues = ['ddpp_raw', 'ddpp_raw', 'ddpp_raw']
    df_dpp, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DPP', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf
    reg1 += reg_buf
    
    full_df = pd.concat([df_ddpp_dpp, df_ddpp_ood, df_dpp])#
    # df for table 1
    df_1 = full_df.copy()
    cls_dfs.append(full_df)
    cls_dfs1.append(df_1)

In [57]:
# same for ner
dataset = ''
ner_prefixes = ['token', 'seq']
ner_dfs = []
ner_dfs1 = []
ner_dfs2 = []
for ner_prefix in ner_prefixes:
    # mrpc electra MC all
    pathes = ['../../workdir/run_calc_ues_metrics/electra_raw_no_sn/conll2003/']
    regs = ['-']
    metrics = ['rcc-auc', 'rpp']
    # mrpc electra dpp
    ues = ['ddpp_dpp', 'ddpp_dpp', 'ddpp_dpp']
    ue_methods = ['variance', 'bald', 'sampled_max_prob']
    df_ddpp_dpp, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+DPP) (Ours)', ner_prefix)
    method_names = copy.deepcopy(method_buf)
    reg_names = reg_buf
    method1 = method_buf.copy()
    reg1 = reg_buf.copy()
    # mrpc electra dpp + ood
    ues = ['ddpp_ood', 'ddpp_ood', 'ddpp_ood']
    df_ddpp_ood, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+OOD) (Ours)', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf.copy()
    reg1 += reg_buf.copy()
    
    # mrpc electra dpp + ood
    ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
    ues = ['ddpp_raw', 'ddpp_raw', 'ddpp_raw']
    df_dpp, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DPP', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf.copy()
    reg1 += reg_buf.copy()
    
    full_df = pd.concat([df_ddpp_dpp, df_ddpp_ood, df_dpp])#
    # df for table 1
    df_1 = full_df
    ner_dfs.append(full_df)
    ner_dfs1.append(df_1)

In [63]:
# build table 1
dataset_names = ['MRPC', 'SST2', 'CoLA'] + ['CoNNL-2003 (token level)', 'CoNNL-2003 (seq. level)']
all_dfs1 = cls_dfs1 + ner_dfs1
full_df1 = add_multiindex(all_dfs1, dataset_names, metrics)

dff1 = add_labels(full_df1, method1, reg1)
#dff1 = highlight_best(dff1, metrics, dataset_names)
best_res = select_best(dff1, metrics, ['rcc-auc'], dataset_names).dropna()

latex_table = prepare_latex_table(best_res.drop(columns=['Reg. Type']))
print(latex_table)

[6.12, 6.39, 6.08, 6.32, 6.59, 6.09, 6.31, 6.49, 6.18, 6.08]
2 2
6.08$\pm$0.62
6.08$\pm$0.62
[0.1, 0.1, 0.1, 0.1, 0.11, 0.1, 0.1, 0.1, 0.1, 0.1]
0 0
0.10$\pm$0.01
0.10$\pm$0.01
\begin{table*}[!ht]
        \resizebox{\textwidth}{!}{
        \begin{tabular}{l|l|l||c|c||c|c||c|c||c|c||c|c}
        \toprule
        \multirow{2}{*}{\textbf{Method}} &          \multirow{2}{*}{\textbf{\multirowcell{Reg.\\Type}}} & 
        \multirow{2}{*}{\textbf{\multirowcell{UE\\Score}}} & \multicolumn{2}{c||}{\textbf{MRPC}} & \multicolumn{2}{c||}{\textbf{SST-2}} & \multicolumn{2}{c||}{\textbf{CoLA}} & \multicolumn{2}{c||}{\textbf{CoNLL-2003 (token level)}} & \multicolumn{2}{c}{\textbf{CoNLL-2003 (seq. level)}}\\

              \cline{4-13}
                 & & &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &  \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}&   \textbf{RCC-AUC $\downarrow$} &      \textbf{RP



In [60]:
best_res

Unnamed: 0,Method,Reg. Type,UE Score,"(MRPC, rcc-auc)","(MRPC, rpp)","(SST2, rcc-auc)","(SST2, rpp)","(CoLA, rcc-auc)","(CoLA, rpp)","(CoNNL-2003 (token level), rcc-auc)","(CoNNL-2003 (token level), rpp)","(CoNNL-2003 (seq. level), rcc-auc)","(CoNNL-2003 (seq. level), rpp)"
0,DDPP (+DPP) (Ours),-,variance,22.30$\pm$7.15,2.58$\pm$0.65,16.70$\pm$1.38,1.12$\pm$0.12,49.75$\pm$3.96,2.44$\pm$0.29,6.12$\pm$0.71,\textbf{0.10$\pm$0.01},16.78$\pm$2.44,1.93$\pm$0.20
1,DDPP (+DPP) (Ours),-,bald,23.08$\pm$7.00,2.63$\pm$0.63,\textbf{16.08$\pm$2.37},\textbf{1.05$\pm$0.18},49.59$\pm$5.40,2.48$\pm$0.31,6.39$\pm$0.64,0.10$\pm$0.01,21.53$\pm$4.77,2.63$\pm$0.45
2,DDPP (+DPP) (Ours),-,sampled_max_prob,\textbf{21.79$\pm$7.72},\textbf{2.57$\pm$0.68},17.55$\pm$3.03,1.19$\pm$0.23,\textbf{47.86$\pm$5.51},2.39$\pm$0.31,\textbf{6.08$\pm$0.62},0.10$\pm$0.01,17.71$\pm$2.77,2.05$\pm$0.23
3,DDPP (+OOD) (Ours),-,variance,22.73$\pm$7.45,2.65$\pm$0.59,19.05$\pm$2.95,1.29$\pm$0.23,51.11$\pm$12.03,2.37$\pm$0.34,6.32$\pm$0.72,0.10$\pm$0.01,16.75$\pm$2.31,1.94$\pm$0.21
4,DDPP (+OOD) (Ours),-,bald,23.85$\pm$8.39,2.69$\pm$0.58,18.27$\pm$3.05,1.22$\pm$0.23,52.59$\pm$12.08,2.42$\pm$0.34,6.59$\pm$0.69,0.11$\pm$0.01,20.56$\pm$3.09,2.50$\pm$0.26
5,DDPP (+OOD) (Ours),-,sampled_max_prob,22.31$\pm$7.80,2.60$\pm$0.65,19.86$\pm$3.83,1.36$\pm$0.29,50.14$\pm$9.73,\textbf{2.32$\pm$0.30},6.09$\pm$0.67,0.10$\pm$0.01,17.76$\pm$2.75,2.06$\pm$0.23
6,DPP,-,variance,23.96$\pm$9.77,2.63$\pm$0.60,18.60$\pm$3.59,1.20$\pm$0.23,53.49$\pm$4.30,2.43$\pm$0.26,6.31$\pm$0.56,0.10$\pm$0.01,\textbf{16.23$\pm$2.23},\textbf{1.87$\pm$0.21}
7,DPP,-,bald,24.94$\pm$10.22,2.68$\pm$0.58,19.39$\pm$4.99,1.21$\pm$0.31,54.59$\pm$4.09,2.49$\pm$0.26,6.49$\pm$0.56,0.10$\pm$0.01,19.09$\pm$3.59,2.27$\pm$0.32
8,DPP,-,sampled_max_prob,21.83$\pm$7.92,2.59$\pm$0.65,18.19$\pm$3.44,1.23$\pm$0.25,51.06$\pm$4.51,2.40$\pm$0.28,6.18$\pm$0.54,0.10$\pm$0.00,17.28$\pm$2.53,1.98$\pm$0.21
9,SR (baseline),-,max_prob,22.32$\pm$8.08,2.58$\pm$0.65,17.93$\pm$3.84,1.22$\pm$0.28,49.48$\pm$3.71,2.35$\pm$0.25,6.08$\pm$0.62,0.10$\pm$0.01,18.81$\pm$3.35,2.21$\pm$0.29


In [150]:
datasets = ['mrpc', 'sst2', 'cola']
cls_dfs = []
cls_dfs1 = []
cls_dfs2 = []
for dataset in datasets:
    # mrpc electra MC all
    pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/current_paper_results/electra_raw_no_sn/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/current_paper_results/electra_reg_no_sn/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/metric_opt_electra_param_last_fix_171/',]
    regs = ['-', 'CER', 'metric']
    ues = ['mc_all', 'mc_all', 'all']
    metrics = ['rcc-auc', 'rpp']
    #dataset = 'mrpc'
    ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
    df_mc, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'MC', ner_prefix=None)
    method_names = method_buf
    reg_names = reg_buf
    method1 = method_buf
    reg1 = reg_buf
    score_names = []
    # mrpc electra dpp
    ues = ['ddpp_dpp', 'ddpp_dpp', 'dpp']
    ue_methods = ['variance', 'bald', 'sampled_max_prob']
    df_dpp, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+DPP) (Ours)', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf
    reg1 += reg_buf
    # mrpc electra dpp + ood
    ues = ['ddpp_ood', 'ddpp_ood', 'dpp_with_ood']
    df_dpp_ood, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+OOD) (Ours)', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf
    reg1 += reg_buf
    # mrpc electra de
    de_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/current_paper_results/electra_raw_no_sn/',]
    de_regs = ['-']
    de_ues = ['deep_ensemble']
    df_de, method_buf, reg_buf = build_section(de_pathes, de_regs, de_ues, dataset, ue_methods, metrics, 'Deep Ensemble', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra md
    ues = ['maha', 'maha', 'maha']
    ue_methods = ['mahalanobis_distance']
    df_maha, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'MD', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method2 = method_buf
    reg2 = reg_buf
    # mrpc electra md sn
    maha_sn_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/current_paper_results/electra_raw_sn/',
                      '/home/jovyan/uncertainty-estimation/workdir/final_results/current_paper_results/electra_reg_sn/',
                      '/home/jovyan/uncertainty-estimation/workdir/final_results/metric_opt_electra_param_last_fix_171/',]
    ues = ['mahalanobis', 'mahalanobis', 'maha_sn']
    ue_methods = ['max_prob', 'mahalanobis_distance']
    df_maha_sn_2, method_buf, reg_buf = build_section(maha_sn_pathes, regs, ues, dataset, ue_methods, metrics, 'MD SN (Ours)', ner_prefix=None)
    method2 = method_buf
    reg2 = reg_buf
    ue_methods = ['mahalanobis_distance']
    df_maha_sn, method_buf, reg_buf = build_section(maha_sn_pathes, regs, ues, dataset, ue_methods, metrics, 'MD SN (Ours)', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra sngp
    sngp_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/current_paper_results/electra_raw_sn/',]
    sngp_regs = ['-']
    sngp_ues = ['sngp']
    sngp_ue_methods = ['stds']
    df_sngp, method_buf, reg_buf = build_section(sngp_pathes, sngp_regs, sngp_ues, dataset, sngp_ue_methods, metrics, 'SNGP', ner_prefix=None)
    method2 = method_buf
    reg2 = reg_buf
    #method_names += method_buf
    #reg_names += reg_buf
    # mrpc electra msd
    msd_regs = ['MSD']
    msd_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/mixup_electra_fix_repro_fix/',]
    msd_ues = ['msd/all']
    msd_ue_methods = ['mixup']#['max_prob', 'mixup']
    df_msd, method_buf, reg_buf = build_section(msd_pathes, msd_regs, msd_ues, dataset, msd_ue_methods, metrics, 'MSD', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method2 = method_buf
    reg2 = reg_buf
    # df for best results
    full_df = pd.concat([df_mc, df_dpp, df_dpp_ood, df_de, df_maha, df_maha_sn, df_msd])
    # df for table 1
    df_1 = pd.concat([df_mc, df_dpp, df_dpp_ood])
    df_2 = pd.concat([df_maha, df_maha_sn_2, df_sngp, df_msd])
    cls_dfs.append(full_df)
    cls_dfs1.append(df_1)
    cls_dfs2.append(df_2)

In [152]:
# same for ner
dataset = ''
ner_prefixes = ['token', 'seq']
ner_dfs = []
ner_dfs1 = []
ner_dfs2 = []
for ner_prefix in ner_prefixes:
    # mrpc electra MC all
    pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/new_fixed_conll/electra_raw_no_sn/conll2003/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/new_fixed_conll/electra_reg_no_sn/conll2003/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/metric_opt_electra_param_last_fix_171/conll/',]
    regs = ['-', 'CER', 'metric']
    ues = ['mc_all', 'mc_all', 'all']
    metrics = ['rcc-auc', 'rpp']
    ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
    df_mc, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'MC', ner_prefix)
    method_names = method_buf
    reg_names = reg_buf
    method1 = method_buf.copy()
    reg1 = reg_buf.copy()
    score_names = []
    # mrpc electra dpp
    ues = ['ddpp_dpp_best', 'ddpp_dpp_best', 'dpp']
    ue_methods = ['variance', 'bald', 'sampled_max_prob']
    df_dpp, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+DPP) (Ours)', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf.copy()
    reg1 += reg_buf.copy()
    # mrpc electra dpp + ood
    ues = ['ddpp_ood_best', 'ddpp_ood_best', 'dpp_with_ood']
    df_dpp_ood, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+OOD) (Ours)', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf.copy()
    reg1 += reg_buf.copy()
    # mrpc electra de
    de_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/new_fixed_conll/electra_raw_no_sn/conll2003/',]
    de_regs = ['-']
    de_ues = ['deep_ensemble']
    df_de, method_buf, reg_buf = build_section(de_pathes, de_regs, de_ues, dataset, ue_methods, metrics, 'Deep Ensemble', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra md
    ues = ['maha', 'maha', 'maha']
    ue_methods = ['mahalanobis_distance']
    df_maha, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'MD', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    method2 = method_buf.copy()
    reg2 = reg_buf.copy()
    # mrpc electra md sn
    maha_sn_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/new_fixed_conll/electra_raw_sn/conll2003/',
                      '/home/jovyan/uncertainty-estimation/workdir/final_results/new_fixed_conll/electra_reg_sn/conll2003/',
                      '/home/jovyan/uncertainty-estimation/workdir/final_results/metric_opt_electra_param_last_fix_171/conll/',]
    ues = ['mahalanobis', 'mahalanobis', 'maha_sn']
    ue_methods = ['max_prob', 'mahalanobis_distance']
    df_maha_sn_2, method_buf, reg_buf = build_section(maha_sn_pathes, regs, ues, dataset, ue_methods, metrics, 'MD SN (Ours)', ner_prefix)
    method2 += method_buf.copy()
    reg2 += reg_buf.copy()
    ue_methods = ['mahalanobis_distance']
    df_maha_sn, method_buf, reg_buf = build_section(maha_sn_pathes, regs, ues, dataset, ue_methods, metrics, 'MD SN (Ours)', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra sngp
    sngp_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/new_fixed_conll/electra_raw_sn/conll2003/',]
    sngp_regs = ['-']
    sngp_ues = ['sngp']
    sngp_ue_methods = ['stds']
    df_sngp, method_buf, reg_buf = build_section(sngp_pathes, sngp_regs, sngp_ues, dataset, sngp_ue_methods, metrics, 'SNGP', ner_prefix)
    #method_names += method_buf
    #reg_names += reg_buf
    method2 += method_buf.copy()
    reg2 += reg_buf.copy()
    # mrpc electra msd
    msd_regs = ['MSD']
    msd_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/mixup_electra_fix_repro_fix/conll/',]
    msd_ues = ['mixup/all']
    msd_ue_methods = ['mixup'] #['max_prob', 'mixup']
    df_msd, method_buf, reg_buf = build_section(msd_pathes, msd_regs, msd_ues, dataset, msd_ue_methods, metrics, 'MSD', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    method2 += method_buf.copy()
    reg2 += reg_buf.copy()
    full_df = pd.concat([df_mc, df_dpp, df_dpp_ood, df_de, df_maha, df_maha_sn, df_msd])
    # df for table 1
    df_1 = pd.concat([df_mc, df_dpp, df_dpp_ood])
    df_2 = pd.concat([df_maha, df_maha_sn_2, df_sngp, df_msd])
    ner_dfs.append(full_df)
    ner_dfs1.append(df_1)
    ner_dfs2.append(df_2)

In [153]:
# build table 1
dataset_names = ['MRPC', 'SST2', 'CoLA'] + ['CoNNL-2003 (token level)', 'CoNNL-2003 (seq. level)']
all_dfs1 = cls_dfs1 + ner_dfs1
full_df1 = add_multiindex(all_dfs1, dataset_names, metrics)

dff1 = add_labels(full_df1, method1, reg1)
dff1 = highlight_best(dff1, metrics, dataset_names)
#best_res = select_best(dff1, metrics, ['rcc-auc'], dataset_names).dropna()

latex_table = prepare_latex_table(dff1)
print(latex_table)

\begin{table*}
        \resizebox{\textwidth}{!}{
        \begin{tabular}{l|l|l||c|c||c|c||c|c||c|c||c|c}
        \toprule
        \multirow{2}{*}{\textbf{Method}} &          \multirow{2}{*}{\textbf{\multirowcell{Reg.\\Type}}} & 
        \multirow{2}{*}{\textbf{\multirowcell{UE\\Score}}} & \multicolumn{2}{c||}{\textbf{MRPC}} & \multicolumn{2}{c||}{\textbf{SST-2}} & \multicolumn{2}{c||}{\textbf{CoLA}} & \multicolumn{2}{c||}{\textbf{CoNLL-2003 (token level)}} & \multicolumn{2}{c}{\textbf{CoNLL-2003 (seq. level)}}\\

              \cline{4-13}
                 & & &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &  \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}&   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}\\

        \midrule \hline 
                MC &         - &         PV &                      14.6

In [154]:
# build table 2
dataset_names = ['MRPC', 'SST2', 'CoLA'] + ['CoNNL-2003 (token level)', 'CoNNL-2003 (seq. level)']
all_dfs1 = cls_dfs2 + ner_dfs2
full_df1 = add_multiindex(all_dfs1, dataset_names, metrics)

dff1 = add_labels(full_df1, method2, reg2)
dff1 = highlight_best(dff1, metrics, dataset_names)
#best_res = select_best(dff1, metrics, ['rcc-auc'], dataset_names).dropna()

latex_table = prepare_latex_table(dff1)
print(latex_table)

\begin{table*}
        \resizebox{\textwidth}{!}{
        \begin{tabular}{l|l|l||c|c||c|c||c|c||c|c||c|c}
        \toprule
        \multirow{2}{*}{\textbf{Method}} &          \multirow{2}{*}{\textbf{\multirowcell{Reg.\\Type}}} & 
        \multirow{2}{*}{\textbf{\multirowcell{UE\\Score}}} & \multicolumn{2}{c||}{\textbf{MRPC}} & \multicolumn{2}{c||}{\textbf{SST-2}} & \multicolumn{2}{c||}{\textbf{CoLA}} & \multicolumn{2}{c||}{\textbf{CoNLL-2003 (token level)}} & \multicolumn{2}{c}{\textbf{CoNLL-2003 (seq. level)}}\\

              \cline{4-13}
                 & & &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &  \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}&   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}\\

        \midrule \hline 
          MD &         - & MD &                      13.21$\pm$1.53 &  

In [155]:
dataset_names = ['MRPC', 'SST2', 'CoLA'] + ['CoNNL-2003 (token level)', 'CoNNL-2003 (seq. level)']
all_dfs = cls_dfs + ner_dfs
full_df = add_multiindex(all_dfs, dataset_names, metrics)
#cls_df = pd.concat(all_dfs, axis=1)
#full_df

In [156]:
# TODO:
# +move sr to end
# +add column names
# +rename UE scores
# +func for drop all not underlined samples
# +func for finding best samples
# +add ner
# +collect full table
dff = add_labels(full_df, method_names, reg_names)
best_res = select_best(dff, metrics, ['rcc-auc'], dataset_names).dropna()

latex_table = prepare_latex_table(dff)
#print(latex_table)

[6.05, 5.65, 5.3, 6.76, 4.95, 5.0, 10.4, 6.91, 6.4, 7.9, 7.46, 6.33, 6.16, 7.44, 7.21, 8.98, 9.63, 12.84, 9.41, 7.54, 6.53, 6.23, 7.11]
4 4
4.95$\pm$1.13
4.95$\pm$1.13
[0.09, 0.09, 0.08, 0.1, 0.07, 0.07, 0.15, 0.11, 0.1, 0.12, 0.12, 0.1, 0.1, 0.12, 0.11, 0.13, 0.14, 0.18, 0.14, 0.12, 0.1, 0.1, 0.11]
4 4
0.07$\pm$0.02
0.07$\pm$0.02




In [157]:
# And also add table with best results
# +remove sngp
# +remove sr msd
# +use only rcc-auc
# +only smp for de
# +underline - for best comp eff method
# fix choose by str
latex_table = prepare_latex_table(best_res)
print(latex_table)

\begin{table*}
        \resizebox{\textwidth}{!}{
        \begin{tabular}{l|l|l||c|c||c|c||c|c||c|c||c|c}
        \toprule
        \multirow{2}{*}{\textbf{Method}} &          \multirow{2}{*}{\textbf{\multirowcell{Reg.\\Type}}} & 
        \multirow{2}{*}{\textbf{\multirowcell{UE\\Score}}} & \multicolumn{2}{c||}{\textbf{MRPC}} & \multicolumn{2}{c||}{\textbf{SST-2}} & \multicolumn{2}{c||}{\textbf{CoLA}} & \multicolumn{2}{c||}{\textbf{CoNLL-2003 (token level)}} & \multicolumn{2}{c}{\textbf{CoNLL-2003 (seq. level)}}\\

              \cline{4-13}
                 & & &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &  \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}&   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}\\

        \midrule \hline 
                MC &         - &             PV &                      

# DEBERTA

In [82]:
datasets = ['mrpc', 'sst2', 'cola']
cls_dfs = []
for dataset in datasets:
    # mrpc electra MC all
    pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_no_sn/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_reg_no_sn/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_metric_no_sn/',]
    regs = ['-', 'CER', 'metric']
    ues = ['mc_all', 'mc_all', 'mc_all']
    metrics = ['rcc-auc', 'rpp']
    #dataset = 'mrpc'
    ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
    df_mc, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'MC', ner_prefix=None)
    method_names = method_buf
    reg_names = reg_buf
    method1 = method_buf
    reg1 = reg_buf
    score_names = []
    # mrpc electra dpp
    ues = ['ddpp_dpp', 'ddpp_dpp', 'ddpp_dpp']
    ue_methods = ['variance', 'bald', 'sampled_max_prob']
    df_dpp, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+DPP) (Ours)', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf
    reg1 += reg_buf
    # mrpc electra dpp + ood
    ues = ['ddpp_ood', 'ddpp_ood', 'ddpp_ood']
    df_dpp_ood, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+OOD) (Ours)', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method1 += method_buf
    reg1 += reg_buf
    # mrpc electra de
    de_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_no_sn/',]
    de_regs = ['-']
    de_ues = ['deep_ensemble']
    df_de, method_buf, reg_buf = build_section(de_pathes, de_regs, de_ues, dataset, ue_methods, metrics, 'Deep Ensemble', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra md
    ues = ['mahalanobis', 'mahalanobis', 'mahalanobis']
    ue_methods = ['mahalanobis_distance']
    df_maha, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'MD', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method2 = method_buf
    reg2 = reg_buf
    # mrpc electra md sn
    maha_sn_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_sn/',
                      '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_reg_sn/',
                      '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_metric_sn/',]
    ues = ['mahalanobis', 'mahalanobis', 'mahalanobis']
    ue_methods = ['mahalanobis_distance']
    df_maha_sn, method_buf, reg_buf = build_section(maha_sn_pathes, regs, ues, dataset, ue_methods, metrics, 'MD SN (Ours)', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method2 = method_buf
    reg2 = reg_buf
    # mrpc electra sngp
    '''
    sngp_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/current_paper_results/electra_raw_sn/',]
    sngp_regs = ['-']
    sngp_ues = ['sngp']
    sngp_ue_methods = ['stds']
    df_sngp, method_buf, reg_buf = build_section(sngp_pathes, sngp_regs, sngp_ues, dataset, sngp_ue_methods, metrics, 'SNGP', ner_prefix=None)
    method2 = method_buf
    reg2 = reg_buf
    '''
    #method_names += method_buf
    #reg_names += reg_buf
    # mrpc electra msd
    msd_regs = ['MSD']
    msd_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/mixup_deberta_fix_repro_fix/',]
    msd_ues = ['msd/all']
    msd_ue_methods = ['mixup']#['max_prob', 'mixup']
    df_msd, method_buf, reg_buf = build_section(msd_pathes, msd_regs, msd_ues, dataset, msd_ue_methods, metrics, 'MSD', ner_prefix=None)
    method_names += method_buf
    reg_names += reg_buf
    method2 = method_buf
    reg2 = reg_buf
    # df for best results
    full_df = pd.concat([df_mc, df_dpp, df_dpp_ood, df_de, df_maha, df_maha_sn, df_msd])
    # df for table 1
    df_1 = pd.concat([df_mc, df_dpp, df_dpp_ood])
    df_2 = pd.concat([df_maha, df_maha_sn, df_msd])
    cls_dfs.append(full_df)

In [83]:
# same for ner
dataset = ''
ner_prefixes = ['token', 'seq']
ner_dfs = []
for ner_prefix in ner_prefixes:
    # mrpc electra MC all
    pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_no_sn/conll2003/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_reg_no_sn/conll2003/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_metric_no_sn/conll2003/',]
    regs = ['-', 'CER', 'metric']
    ues = ['mc_all', 'mc_all', 'mc_all']
    metrics = ['rcc-auc', 'rpp']
    ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
    df_mc, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'MC', ner_prefix)
    method_names = method_buf
    reg_names = reg_buf
    score_names = []
    # mrpc electra dpp
    ues = ['ddpp_dpp', 'ddpp_dpp', 'ddpp_dpp']
    ue_methods = ['variance', 'bald', 'sampled_max_prob']
    df_dpp, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+DPP) (Ours)', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra dpp + ood
    ues = ['ddpp_ood', 'ddpp_ood', 'ddpp_ood']
    df_dpp_ood, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'DDPP (+OOD) (Ours)', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra de
    de_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_no_sn/conll2003/',]
    de_regs = ['-']
    de_ues = ['deep_ensemble']
    df_de, method_buf, reg_buf = build_section(de_pathes, de_regs, de_ues, dataset, ue_methods, metrics, 'Deep Ensemble', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra md
    ues = ['mahalanobis', 'mahalanobis', 'mahalanobis']
    ue_methods = ['mahalanobis_distance']
    df_maha, method_buf, reg_buf = build_section(pathes, regs, ues, dataset, ue_methods, metrics, 'MD', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra md sn
    maha_sn_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_sn/conll2003/',
                      '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_reg_sn/conll2003/',
                      '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_metric_sn/conll2003/',]
    ues = ['mahalanobis', 'mahalanobis', 'mahalanobis']
    ue_methods = ['mahalanobis_distance']
    df_maha_sn, method_buf, reg_buf = build_section(maha_sn_pathes, regs, ues, dataset, ue_methods, metrics, 'MD SN (Ours)', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    # mrpc electra sngp
    '''
    sngp_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/new_fixed_conll/electra_raw_sn/conll2003/',]
    sngp_regs = ['-']
    sngp_ues = ['sngp']
    sngp_ue_methods = ['stds']
    df_sngp, method_buf, reg_buf = build_section(sngp_pathes, sngp_regs, sngp_ues, dataset, sngp_ue_methods, metrics, 'SNGP', ner_prefix)
    #method_names += method_buf
    #reg_names += reg_buf
    '''
    # mrpc electra msd
    msd_regs = ['MSD']
    msd_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/mixup_deberta_fix_repro_fix/conll/',]
    msd_ues = ['mixup/all']
    msd_ue_methods = ['mixup'] #['max_prob', 'mixup']
    df_msd, method_buf, reg_buf = build_section(msd_pathes, msd_regs, msd_ues, dataset, msd_ue_methods, metrics, 'MSD', ner_prefix)
    method_names += method_buf
    reg_names += reg_buf
    full_df = pd.concat([df_mc, df_dpp, df_dpp_ood, df_de, df_maha, df_maha_sn, df_msd])
    ner_dfs.append(full_df)

In [84]:
dataset_names = ['MRPC', 'SST2', 'CoLA'] + ['CoNNL-2003 (token level)', 'CoNNL-2003 (seq. level)']
all_dfs = cls_dfs + ner_dfs
full_df = add_multiindex(all_dfs, dataset_names, metrics)
#cls_df = pd.concat(all_dfs, axis=1)
#full_df

In [86]:
# TODO:
# +move sr to end
# +add column names
# +rename UE scores
# +func for drop all not underlined samples
# +func for finding best samples
# +add ner
# +collect full table
dff = add_labels(full_df, method_names, reg_names)
best_res = select_best(dff, metrics, ['rcc-auc'], dataset_names).dropna()

latex_table = prepare_latex_table(dff)
print(latex_table)

[20.8, 20.62, 17.46, 20.63, 19.94, 18.44, 18.53, 16.56, 13.14, 13.08, 12.28, 13.1, 17.69, 18.72]
Int64Index([9, 11, 17, 18, 20, 23, 26, 30, 31, 32, 33, 34, 37, 38], dtype='int64') Int64Index([3, 5, 8, 27, 29, 36, 9, 11, 17, 18, 20, 23, 26, 30, 31, 32, 33, 34,
            37, 38, 39],
           dtype='int64')
33 10
33 \textbf{12.28$\pm$0.74} 12.28
[2.49, 2.51, 2.2, 2.54, 2.52, 2.36, 2.42, 2.21, 1.83, 1.85, 1.73, 1.87, 2.12, 2.21]
Int64Index([9, 11, 17, 18, 20, 23, 26, 30, 31, 32, 33, 34, 37, 38], dtype='int64') Int64Index([3, 5, 8, 27, 29, 36, 9, 11, 17, 18, 20, 23, 26, 30, 31, 32, 33, 34,
            37, 38, 39],
           dtype='int64')
33 10
33 1.73$\pm$0.11 1.73
[18.36, 17.84, 13.82, 21.56, 18.61, 19.45, 22.63, 13.0, 11.85, 12.99, 12.46, 10.5, 15.34, 15.36]
Int64Index([9, 11, 17, 18, 20, 23, 26, 30, 31, 32, 33, 34, 37, 38], dtype='int64') Int64Index([3, 5, 8, 27, 29, 36, 9, 11, 17, 18, 20, 23, 26, 30, 31, 32, 33, 34,
            37, 38, 39],
           dtype='int64')
34 11
34 \tex



In [87]:
# And also add table with best results
# +remove sngp
# +remove sr msd
# +use only rcc-auc
# +only smp for de
# +underline - for best comp eff method
# fix choose by str
latex_table = prepare_latex_table(best_res)
print(latex_table)

\begin{table*}
        \resizebox{\textwidth}{!}{
        \begin{tabular}{l|l|l||c|c||c|c||c|c||c|c||c|c}
        \toprule
        \multirow{2}{*}{\textbf{Method}} &          \multirow{2}{*}{\textbf{\multirowcell{Reg.\\Type}}} & 
        \multirow{2}{*}{\textbf{\multirowcell{UE\\Score}}} & \multicolumn{2}{c||}{\textbf{MRPC}} & \multicolumn{2}{c||}{\textbf{SST-2}} & \multicolumn{2}{c||}{\textbf{CoLA}} & \multicolumn{2}{c||}{\textbf{CoNLL-2003 (token level)}} & \multicolumn{2}{c}{\textbf{CoNLL-2003 (seq. level)}}\\

              \cline{4-13}
                 & & &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &  \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}&   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$} &   \textbf{RCC-AUC $\downarrow$} &      \textbf{RPP $\downarrow$}\\

        \midrule \hline 
                MC &       CER &             PV &                      

In [None]:
# Maha SN, Distilbert
reg_types = ['-', 'metric', 'CER']
cls_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert/raw_sn/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert/metric_171_sn/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert/cer_sn/',]
ner_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert/raw_sn/conll/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert/metric_171_sn/conll/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert/cer_sn/conll/',]
for idx in range(len(reg_types)):
    reg_type = reg_types[idx]
    cls_path = cls_pathes[idx]
    ner_path = ner_pathes[idx]
    print(f'Model with regularization: {reg_type}')
    #reg_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_reg_01_fix/'
    ues = ['maha_sn']
    ues_names = ['MD SN (Ours)']
    ues_layers = ['all']
    metrics = ['rcc-auc', 'rpp']
    metric_names = ['rcc-auc', 'rpp']
    types = ['mrpc', 'sst2', 'cola']
    types_names = ['MRPC', 'SST2', 'CoLA']
    ue_methods = ['max_prob', 'mahalanobis_distance']
    perc_metrics = ['rejection-curve-auc', 'rpp']
    diff_metrics = ['rejection-curve-auc', 'roc-auc']


    # copied from table
    baselines_dict = {'mrpcrejection-curve-aucmax_prob': 0.9208435457516339 * 100,
                      'mrpcrcc-aucmax_prob': 23.279293481630972,
                      'mrpcrppmax_prob': 0.026788574907087016 * 100,
                      'colarejection-curve-aucmax_prob': 0.9203619367209971 * 100,
                      'colarcc-aucmax_prob': 59.03726591032054,
                      'colarppmax_prob': 0.02631936969193335 * 100,
                      'sst2rejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      'sst2rcc-aucmax_prob': 18.067838464295736,
                      'sst2rppmax_prob': 0.012349462026204303 * 100,
                      '20ngrejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      '20ngrcc-aucmax_prob': 18.067838464295736,
                      '20ngrppmax_prob': 0.012349462026204303 * 100}
    raw_df, baselines_dict = get_df(cls_path, reg_type, baselines_dict, True)
    miscl_df = raw_df
    miscl_df.reset_index(inplace=True, drop=True)

    # NER
    types = ['token', 'seq']
    baselines_dict = {'tokenrejection-curve-aucmax_prob': 93.184446145,
                      'tokenrcc-aucmax_prob': 230.81709306328332,
                      'tokenrppmax_prob': 1.8920894383333335,
                      'seqrejection-curve-aucmax_prob': 85.96980676333334,
                      'seqrcc-aucmax_prob': 69.59317634405001,
                      'seqrppmax_prob': 7.4613176516666675}
    reg_df, baselines_dict = get_df_ner(ner_path, reg_type, baselines_dict, 1)
    ner_df = pd.concat([reg_df])
    ner_df.reset_index(inplace=True, drop=True)

    miscl_df.drop(['Method', 'Reg. Type', 'UE Score'], axis=1, inplace=True)
    miscl_df = pd.concat([ner_df, miscl_df], axis=1)

    latex_table = prepare_latex_table(miscl_df)
    print(latex_table)

Model with regularization: -
\begin{tabular}{lllllllllllll}
\toprule
       Method & Reg. Type &             UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) & (MRPC, rcc-auc) &   (MRPC, rpp) & (SST2, rcc-auc) &   (SST2, rpp) & (CoLA, rcc-auc) &   (CoLA, rpp) \\
\midrule
 MD SN (Ours) &         - & MD &                       7.28$\pm$2.30 &                   0.10$\pm$0.03 &                         19.68$\pm$5.47 &                      1.95$\pm$0.33 &  29.89$\pm$2.22 & 3.48$\pm$0.27 &  32.99$\pm$3.74 & 2.03$\pm$0.20 & 104.40$\pm$5.39 & 4.32$\pm$0.16 \\
SR (baseline) &         - &             MP &                       7.45$\pm$1.72 &                   0.11$\pm$0.02 &                         22.55$\pm$5.36 &                      2.35$\pm$0.38 &  42.41$\pm$4.64 & 4.37$\pm$0.48 &  46.26$\pm$9.40 & 2.74$\pm$0.54 & 109.89$\pm$4.13 & 4.48$\pm$0.22 \\
\bottomrule
\end{tabular}

Model w

In [None]:
# Maha, Distilbert
reg_types = ['-', 'metric', 'CER']
cls_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert/raw/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert/metric_171/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert/cer/',]
ner_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert/raw/conll/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert/metric_171/conll/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert/cer/conll/',]
for idx in range(len(reg_types)):
    reg_type = reg_types[idx]
    cls_path = cls_pathes[idx]
    ner_path = ner_pathes[idx]
    print(f'Model with regularization: {reg_type}')
    #reg_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_reg_01_fix/'
    ues = ['maha']
    ues_names = ['MD']
    ues_layers = ['all']
    metrics = ['rcc-auc', 'rpp']
    metric_names = ['rcc-auc', 'rpp']
    types = ['mrpc', 'sst2', 'cola']
    types_names = ['MRPC', 'SST2', 'CoLA']
    ue_methods = ['max_prob', 'mahalanobis_distance']
    perc_metrics = ['rejection-curve-auc', 'rpp']
    diff_metrics = ['rejection-curve-auc', 'roc-auc']


    # copied from table
    baselines_dict = {'mrpcrejection-curve-aucmax_prob': 0.9208435457516339 * 100,
                      'mrpcrcc-aucmax_prob': 23.279293481630972,
                      'mrpcrppmax_prob': 0.026788574907087016 * 100,
                      'colarejection-curve-aucmax_prob': 0.9203619367209971 * 100,
                      'colarcc-aucmax_prob': 59.03726591032054,
                      'colarppmax_prob': 0.02631936969193335 * 100,
                      'sst2rejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      'sst2rcc-aucmax_prob': 18.067838464295736,
                      'sst2rppmax_prob': 0.012349462026204303 * 100,
                      '20ngrejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      '20ngrcc-aucmax_prob': 18.067838464295736,
                      '20ngrppmax_prob': 0.012349462026204303 * 100}
    raw_df, baselines_dict = get_df(cls_path, reg_type, baselines_dict, True)
    miscl_df = raw_df
    miscl_df.reset_index(inplace=True, drop=True)

    # NER
    types = ['token', 'seq']
    baselines_dict = {'tokenrejection-curve-aucmax_prob': 93.184446145,
                      'tokenrcc-aucmax_prob': 230.81709306328332,
                      'tokenrppmax_prob': 1.8920894383333335,
                      'seqrejection-curve-aucmax_prob': 85.96980676333334,
                      'seqrcc-aucmax_prob': 69.59317634405001,
                      'seqrppmax_prob': 7.4613176516666675}
    reg_df, baselines_dict = get_df_ner(ner_path, reg_type, baselines_dict, 1)
    ner_df = pd.concat([reg_df])
    ner_df.reset_index(inplace=True, drop=True)

    miscl_df.drop(['Method', 'Reg. Type', 'UE Score'], axis=1, inplace=True)
    miscl_df = pd.concat([ner_df, miscl_df], axis=1)

    latex_table = prepare_latex_table(miscl_df)
    print(latex_table)

Model with regularization: -
\begin{tabular}{lllllllllllll}
\toprule
       Method & Reg. Type &             UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) & (MRPC, rcc-auc) &   (MRPC, rpp) & (SST2, rcc-auc) &   (SST2, rpp) &  (CoLA, rcc-auc) &   (CoLA, rpp) \\
\midrule
           MD &         - & MD &                       7.24$\pm$2.46 &                   0.10$\pm$0.03 &                         20.01$\pm$5.71 &                      1.98$\pm$0.22 &  26.24$\pm$2.13 & 3.12$\pm$0.23 &  29.69$\pm$3.10 & 1.79$\pm$0.18 & 120.22$\pm$10.12 & 4.67$\pm$0.30 \\
SR (baseline) &         - &             MP &                       7.53$\pm$2.14 &                   0.11$\pm$0.03 &                         22.43$\pm$5.16 &                      2.25$\pm$0.24 &  31.21$\pm$3.80 & 3.51$\pm$0.29 &  37.07$\pm$5.39 & 2.18$\pm$0.26 & 135.69$\pm$14.42 & 4.99$\pm$0.42 \\
\bottomrule
\end{tabular}

Mode

In [74]:
# Distilbert, Deep Ensemble
reg_types = ['-']
cls_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert_ensemble/',]
ner_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/distilbert_ensemble/conll2003/',]
for idx in range(len(reg_types)):
    reg_type = reg_types[idx]
    cls_path = cls_pathes[idx]
    ner_path = ner_pathes[idx]
    print(f'Model with regularization: {reg_type}')
    #reg_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_reg_01_fix/'
    ues = ['de']
    ues_names = ['DE']
    ues_layers = ['-']
    metrics = ['rcc-auc', 'rpp']
    metric_names = ['rcc-auc', 'rpp']
    types = ['mrpc', 'sst2', 'cola']
    types_names = ['MRPC', 'SST2', 'CoLA']
    ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
    perc_metrics = ['rejection-curve-auc', 'rpp']
    diff_metrics = ['rejection-curve-auc', 'roc-auc']


    # copied from table
    baselines_dict = {'mrpcrejection-curve-aucmax_prob': 0.9208435457516339 * 100,
                      'mrpcrcc-aucmax_prob': 23.279293481630972,
                      'mrpcrppmax_prob': 0.026788574907087016 * 100,
                      'colarejection-curve-aucmax_prob': 0.9203619367209971 * 100,
                      'colarcc-aucmax_prob': 59.03726591032054,
                      'colarppmax_prob': 0.02631936969193335 * 100,
                      'sst2rejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      'sst2rcc-aucmax_prob': 18.067838464295736,
                      'sst2rppmax_prob': 0.012349462026204303 * 100,
                      '20ngrejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      '20ngrcc-aucmax_prob': 18.067838464295736,
                      '20ngrppmax_prob': 0.012349462026204303 * 100}
    raw_df, baselines_dict = get_df(cls_path, reg_type, baselines_dict, True)
    miscl_df = raw_df
    miscl_df.reset_index(inplace=True, drop=True)

    # NER
    types = ['token', 'seq']
    baselines_dict = {'tokenrejection-curve-aucmax_prob': 93.184446145,
                      'tokenrcc-aucmax_prob': 230.81709306328332,
                      'tokenrppmax_prob': 1.8920894383333335,
                      'seqrejection-curve-aucmax_prob': 85.96980676333334,
                      'seqrcc-aucmax_prob': 69.59317634405001,
                      'seqrppmax_prob': 7.4613176516666675}
    reg_df, baselines_dict = get_df_ner(ner_path, reg_type, baselines_dict, 1)
    ner_df = pd.concat([reg_df])
    ner_df.reset_index(inplace=True, drop=True)

    miscl_df.drop(['Method', 'Reg. Type', 'UE Score'], axis=1, inplace=True)
    miscl_df = pd.concat([ner_df, miscl_df], axis=1)

    latex_table = prepare_latex_table(miscl_df)
    print(latex_table)

Model with regularization: -
\begin{tabular}{lllllllllllll}
\toprule
       Method & Reg. Type &         UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) & (MRPC, rcc-auc) &   (MRPC, rpp) & (SST2, rcc-auc) &   (SST2, rpp) &  (CoLA, rcc-auc) &   (CoLA, rpp) \\
\midrule
           DE &         - &         PV &                       6.30$\pm$2.54 &                   0.08$\pm$0.03 &                         17.17$\pm$2.38 &                      1.59$\pm$0.30 &  34.20$\pm$3.28 & 3.36$\pm$0.08 &  29.69$\pm$4.80 & 1.61$\pm$0.20 & 122.00$\pm$12.58 & 4.40$\pm$0.27 \\
           DE &         - &             BALD &                       5.89$\pm$1.76 &                   0.08$\pm$0.03 &                         17.46$\pm$2.44 &                      1.60$\pm$0.31 &  35.41$\pm$3.94 & 3.44$\pm$0.08 &  30.38$\pm$4.89 & 1.64$\pm$0.20 & 123.73$\pm$13.09 & 4.45$\pm$0.27 \\
           DE &         -

In [97]:
# Distilbert, Mixup
reg_types = ['MSD']
cls_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/mixup_distilbert_fix_repro_fix/',]
ner_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/mixup_distilbert_fix_repro_fix/conll/',]
for idx in range(len(reg_types)):
    reg_type = reg_types[idx]
    cls_path = cls_pathes[idx]
    ner_path = ner_pathes[idx]
    print(f'Model with regularization: {reg_type}')
    #reg_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_reg_01_fix/'
    ues = ['msd/all', 'msd/last']
    ues_names = ['MSD-all', 'MSD-last']
    ues_layers = ['all', 'last']
    metrics = ['rcc-auc', 'rpp']
    metric_names = ['rcc-auc', 'rpp']
    types = ['mrpc', 'sst2', 'cola']
    types_names = ['MRPC', 'SST2', 'CoLA']
    ue_methods = ['max_prob', 'mixup']
    perc_metrics = ['rejection-curve-auc', 'rpp']
    diff_metrics = ['rejection-curve-auc', 'roc-auc']


    # copied from table
    baselines_dict = {'mrpcrejection-curve-aucmax_prob': 0.9208435457516339 * 100,
                      'mrpcrcc-aucmax_prob': 23.279293481630972,
                      'mrpcrppmax_prob': 0.026788574907087016 * 100,
                      'colarejection-curve-aucmax_prob': 0.9203619367209971 * 100,
                      'colarcc-aucmax_prob': 59.03726591032054,
                      'colarppmax_prob': 0.02631936969193335 * 100,
                      'sst2rejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      'sst2rcc-aucmax_prob': 18.067838464295736,
                      'sst2rppmax_prob': 0.012349462026204303 * 100,
                      '20ngrejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      '20ngrcc-aucmax_prob': 18.067838464295736,
                      '20ngrppmax_prob': 0.012349462026204303 * 100}
    raw_df, baselines_dict = get_df(cls_path, reg_type, baselines_dict, True)
    miscl_df = raw_df
    miscl_df.reset_index(inplace=True, drop=True)

    # NER
    ues = ['mixup/all', 'mixup/last']
    types = ['token', 'seq']
    baselines_dict = {'tokenrejection-curve-aucmax_prob': 93.184446145,
                      'tokenrcc-aucmax_prob': 230.81709306328332,
                      'tokenrppmax_prob': 1.8920894383333335,
                      'seqrejection-curve-aucmax_prob': 85.96980676333334,
                      'seqrcc-aucmax_prob': 69.59317634405001,
                      'seqrppmax_prob': 7.4613176516666675}
    reg_df, baselines_dict = get_df_ner(ner_path, reg_type, baselines_dict, 1)
    ner_df = pd.concat([reg_df])
    ner_df.reset_index(inplace=True, drop=True)

    miscl_df.drop(['Method', 'Reg. Type', 'UE Score'], axis=1, inplace=True)
    miscl_df = pd.concat([ner_df, miscl_df], axis=1)

    latex_table = prepare_latex_table(miscl_df)
    print(latex_table)

Model with regularization: MSD
\begin{tabular}{lllllllllllll}
\toprule
       Method & Reg. Type & UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) & (MRPC, rcc-auc) &   (MRPC, rpp) & (SST2, rcc-auc) &   (SST2, rpp) &  (CoLA, rcc-auc) &   (CoLA, rpp) \\
\midrule
      MSD-all &       MSD &    DS &                       6.39$\pm$1.67 &                   0.09$\pm$0.02 &                         17.70$\pm$3.57 &                      1.85$\pm$0.09 &  58.41$\pm$6.88 & 6.87$\pm$1.16 &  29.97$\pm$2.81 & 1.84$\pm$0.19 & 118.91$\pm$27.91 & 4.97$\pm$1.26 \\
     MSD-last &       MSD &    DS &                       6.46$\pm$1.67 &                   0.09$\pm$0.02 &                         17.76$\pm$3.54 &                      1.86$\pm$0.09 &  58.49$\pm$6.85 & 6.89$\pm$1.15 &  30.08$\pm$2.82 & 1.85$\pm$0.19 & 120.80$\pm$27.37 & 5.04$\pm$1.17 \\
SR (baseline) &       MSD & MP &               

In [106]:
# MC-all, dpp, dpp with ood, Distilbert
reg_types = ['-', 'metric', 'CER']
cls_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_no_sn/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_metric_no_sn/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_reg_no_sn/',]
ner_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_no_sn/conll2003/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_metric_no_sn/conll2003/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_reg_no_sn/conll2003/',]
for idx in range(len(reg_types)):
    reg_type = reg_types[idx]
    cls_path = cls_pathes[idx]
    ner_path = ner_pathes[idx]
    print(f'Model with regularization: {reg_type}')
    #reg_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_reg_01_fix/'
    ues = ['mc_all', 'ddpp_dpp', 'ddpp_ood']
    ues_names = ['MC', 'DDPP (+DPP) (Ours)', 'DDPP (+OOD) (Ours)']
    ues_layers = ['all', 'all', 'all']
    metrics = ['rcc-auc', 'rpp']
    metric_names = ['rcc-auc', 'rpp']
    types = ['mrpc', 'sst2', 'cola']
    types_names = ['MRPC', 'SST2', 'CoLA']
    ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
    perc_metrics = ['rejection-curve-auc', 'rpp']
    diff_metrics = ['rejection-curve-auc', 'roc-auc']


    # copied from table
    baselines_dict = {'mrpcrejection-curve-aucmax_prob': 0.9208435457516339 * 100,
                      'mrpcrcc-aucmax_prob': 23.279293481630972,
                      'mrpcrppmax_prob': 0.026788574907087016 * 100,
                      'colarejection-curve-aucmax_prob': 0.9203619367209971 * 100,
                      'colarcc-aucmax_prob': 59.03726591032054,
                      'colarppmax_prob': 0.02631936969193335 * 100,
                      'sst2rejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      'sst2rcc-aucmax_prob': 18.067838464295736,
                      'sst2rppmax_prob': 0.012349462026204303 * 100,
                      '20ngrejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      '20ngrcc-aucmax_prob': 18.067838464295736,
                      '20ngrppmax_prob': 0.012349462026204303 * 100}
    raw_df, baselines_dict = get_df(cls_path, reg_type, baselines_dict, True)
    miscl_df = raw_df
    miscl_df.reset_index(inplace=True, drop=True)

    # NER
    types = ['token', 'seq']
    baselines_dict = {'tokenrejection-curve-aucmax_prob': 93.184446145,
                      'tokenrcc-aucmax_prob': 230.81709306328332,
                      'tokenrppmax_prob': 1.8920894383333335,
                      'seqrejection-curve-aucmax_prob': 85.96980676333334,
                      'seqrcc-aucmax_prob': 69.59317634405001,
                      'seqrppmax_prob': 7.4613176516666675}
    reg_df, baselines_dict = get_df_ner(ner_path, reg_type, baselines_dict, 1)
    ner_df = pd.concat([reg_df])
    ner_df.reset_index(inplace=True, drop=True)

    miscl_df.drop(['Method', 'Reg. Type', 'UE Score'], axis=1, inplace=True)
    miscl_df = pd.concat([ner_df, miscl_df], axis=1)

    latex_table = prepare_latex_table(miscl_df)
    print(latex_table)

Model with regularization: -
There are several rows for SR, used first one
                    MRPC                           SST2                 \
                 rcc-auc            rpp         rcc-auc            rpp   
max_prob  22.65$\pm$3.97  2.67$\pm$0.36  18.25$\pm$1.80  1.23$\pm$0.12   
max_prob  22.33$\pm$3.84  2.66$\pm$0.35  18.25$\pm$1.80  1.23$\pm$0.12   

                    CoLA                 
                 rcc-auc            rpp  
max_prob  69.61$\pm$4.25  3.32$\pm$0.17  
max_prob  69.49$\pm$4.17  3.32$\pm$0.17  
\begin{tabular}{lllllllllllll}
\toprule
            Method & Reg. Type &         UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) & (MRPC, rcc-auc) &   (MRPC, rpp) & (SST2, rcc-auc) &   (SST2, rpp) & (CoLA, rcc-auc) &   (CoLA, rpp) \\
\midrule
                MC &         - &         PV &                       5.27$\pm$0.89 &                   0.08

In [79]:
# Maha SN, Deberta
reg_types = ['-', 'metric', 'CER']
cls_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_sn/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_metric_sn/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_reg_sn/',]
ner_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_sn/conll2003/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_metric_sn/conll2003/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_reg_sn/conll2003/',]
for idx in range(len(reg_types)):
    reg_type = reg_types[idx]
    cls_path = cls_pathes[idx]
    ner_path = ner_pathes[idx]
    print(f'Model with regularization: {reg_type}')
    #reg_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_reg_01_fix/'
    ues = ['mahalanobis']
    ues_names = ['MD SN (Ours)']
    ues_layers = ['all']
    metrics = ['rcc-auc', 'rpp']
    metric_names = ['rcc-auc', 'rpp']
    types = ['mrpc', 'sst2', 'cola']
    types_names = ['MRPC', 'SST2', 'CoLA']
    ue_methods = ['max_prob', 'mahalanobis_distance']
    perc_metrics = ['rejection-curve-auc', 'rpp']
    diff_metrics = ['rejection-curve-auc', 'roc-auc']


    # copied from table
    baselines_dict = {'mrpcrejection-curve-aucmax_prob': 0.9208435457516339 * 100,
                      'mrpcrcc-aucmax_prob': 23.279293481630972,
                      'mrpcrppmax_prob': 0.026788574907087016 * 100,
                      'colarejection-curve-aucmax_prob': 0.9203619367209971 * 100,
                      'colarcc-aucmax_prob': 59.03726591032054,
                      'colarppmax_prob': 0.02631936969193335 * 100,
                      'sst2rejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      'sst2rcc-aucmax_prob': 18.067838464295736,
                      'sst2rppmax_prob': 0.012349462026204303 * 100,
                      '20ngrejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      '20ngrcc-aucmax_prob': 18.067838464295736,
                      '20ngrppmax_prob': 0.012349462026204303 * 100}
    raw_df, baselines_dict = get_df(cls_path, reg_type, baselines_dict, True)
    miscl_df = raw_df
    miscl_df.reset_index(inplace=True, drop=True)

    # NER
    types = ['token', 'seq']
    baselines_dict = {'tokenrejection-curve-aucmax_prob': 93.184446145,
                      'tokenrcc-aucmax_prob': 230.81709306328332,
                      'tokenrppmax_prob': 1.8920894383333335,
                      'seqrejection-curve-aucmax_prob': 85.96980676333334,
                      'seqrcc-aucmax_prob': 69.59317634405001,
                      'seqrppmax_prob': 7.4613176516666675}
    reg_df, baselines_dict = get_df_ner(ner_path, reg_type, baselines_dict, 1)
    ner_df = pd.concat([reg_df])
    ner_df.reset_index(inplace=True, drop=True)

    miscl_df.drop(['Method', 'Reg. Type', 'UE Score'], axis=1, inplace=True)
    miscl_df = pd.concat([ner_df, miscl_df], axis=1)

    latex_table = prepare_latex_table(miscl_df)
    print(latex_table)

Model with regularization: -
\begin{tabular}{lllllllllllll}
\toprule
       Method & Reg. Type &             UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) & (MRPC, rcc-auc) &   (MRPC, rpp) & (SST2, rcc-auc) &   (SST2, rpp) & (CoLA, rcc-auc) &   (CoLA, rpp) \\
\midrule
 MD SN (Ours) &         - & MD &                       5.36$\pm$1.14 &                   0.08$\pm$0.01 &                         16.72$\pm$4.89 &                      1.90$\pm$0.18 &  12.28$\pm$0.74 & 1.73$\pm$0.11 &  12.46$\pm$1.94 & 0.90$\pm$0.13 &  54.40$\pm$4.10 & 2.66$\pm$0.18 \\
SR (baseline) &         - &             MP &                       4.74$\pm$0.88 &                   0.07$\pm$0.01 &                         18.01$\pm$3.76 &                      2.08$\pm$0.31 &  22.60$\pm$5.09 & 2.47$\pm$0.43 &  18.07$\pm$4.99 & 1.22$\pm$0.30 &  63.14$\pm$4.09 & 3.07$\pm$0.26 \\
\bottomrule
\end{tabular}

Model w

In [80]:
# Maha, Deberta
reg_types = ['-', 'metric', 'CER']
cls_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_no_sn/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_metric_no_sn/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_reg_no_sn/',]
ner_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_no_sn/conll2003/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_metric_no_sn/conll2003/',
              '/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_reg_no_sn/conll2003/',]
for idx in range(len(reg_types)):
    reg_type = reg_types[idx]
    cls_path = cls_pathes[idx]
    ner_path = ner_pathes[idx]
    print(f'Model with regularization: {reg_type}')
    #reg_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_reg_01_fix/'
    ues = ['mahalanobis']
    ues_names = ['MD']
    ues_layers = ['all']
    metrics = ['rcc-auc', 'rpp']
    metric_names = ['rcc-auc', 'rpp']
    types = ['mrpc', 'sst2', 'cola']
    types_names = ['MRPC', 'SST2', 'CoLA']
    ue_methods = ['max_prob', 'mahalanobis_distance']
    perc_metrics = ['rejection-curve-auc', 'rpp']
    diff_metrics = ['rejection-curve-auc', 'roc-auc']


    # copied from table
    baselines_dict = {'mrpcrejection-curve-aucmax_prob': 0.9208435457516339 * 100,
                      'mrpcrcc-aucmax_prob': 23.279293481630972,
                      'mrpcrppmax_prob': 0.026788574907087016 * 100,
                      'colarejection-curve-aucmax_prob': 0.9203619367209971 * 100,
                      'colarcc-aucmax_prob': 59.03726591032054,
                      'colarppmax_prob': 0.02631936969193335 * 100,
                      'sst2rejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      'sst2rcc-aucmax_prob': 18.067838464295736,
                      'sst2rppmax_prob': 0.012349462026204303 * 100,
                      '20ngrejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      '20ngrcc-aucmax_prob': 18.067838464295736,
                      '20ngrppmax_prob': 0.012349462026204303 * 100}
    raw_df, baselines_dict = get_df(cls_path, reg_type, baselines_dict, True)
    miscl_df = raw_df
    miscl_df.reset_index(inplace=True, drop=True)

    # NER
    types = ['token', 'seq']
    baselines_dict = {'tokenrejection-curve-aucmax_prob': 93.184446145,
                      'tokenrcc-aucmax_prob': 230.81709306328332,
                      'tokenrppmax_prob': 1.8920894383333335,
                      'seqrejection-curve-aucmax_prob': 85.96980676333334,
                      'seqrcc-aucmax_prob': 69.59317634405001,
                      'seqrppmax_prob': 7.4613176516666675}
    reg_df, baselines_dict = get_df_ner(ner_path, reg_type, baselines_dict, 1)
    ner_df = pd.concat([reg_df])
    ner_df.reset_index(inplace=True, drop=True)

    miscl_df.drop(['Method', 'Reg. Type', 'UE Score'], axis=1, inplace=True)
    miscl_df = pd.concat([ner_df, miscl_df], axis=1)

    latex_table = prepare_latex_table(miscl_df)
    print(latex_table)

Model with regularization: -
\begin{tabular}{lllllllllllll}
\toprule
       Method & Reg. Type &             UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) & (MRPC, rcc-auc) &   (MRPC, rpp) & (SST2, rcc-auc) &   (SST2, rpp) & (CoLA, rcc-auc) &   (CoLA, rpp) \\
\midrule
           MD &         - & MD &                       4.87$\pm$0.89 &                   0.07$\pm$0.01 &                         14.44$\pm$4.09 &                      1.78$\pm$0.47 &  16.56$\pm$3.44 & 2.21$\pm$0.37 &  13.00$\pm$3.11 & 0.89$\pm$0.20 &  58.02$\pm$4.50 & 2.80$\pm$0.18 \\
SR (baseline) &         - &             MP &                       5.37$\pm$0.44 &                   0.08$\pm$0.01 &                         17.31$\pm$5.20 &                      2.12$\pm$0.58 &  22.65$\pm$3.97 & 2.67$\pm$0.36 &  18.25$\pm$1.80 & 1.23$\pm$0.12 &  69.61$\pm$4.25 & 3.32$\pm$0.17 \\
\bottomrule
\end{tabular}

Model w

In [123]:
# Deberta, Deep Ensemble
reg_types = ['-']
cls_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_no_sn/',]
ner_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/deberta/deberta_raw_no_sn/conll2003/',]
for idx in range(len(reg_types)):
    reg_type = reg_types[idx]
    cls_path = cls_pathes[idx]
    ner_path = ner_pathes[idx]
    print(f'Model with regularization: {reg_type}')
    #reg_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_reg_01_fix/'
    ues = ['deep_ensemble']
    ues_names = ['DE']
    ues_layers = ['-']
    metrics = ['rcc-auc', 'rpp']
    metric_names = ['rcc-auc', 'rpp']
    types = ['mrpc', 'sst2', 'cola']
    types_names = ['MRPC', 'SST2', 'CoLA']
    ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
    perc_metrics = ['rejection-curve-auc', 'rpp']
    diff_metrics = ['rejection-curve-auc', 'roc-auc']


    # copied from table
    baselines_dict = {'mrpcrejection-curve-aucmax_prob': 0.9208435457516339 * 100,
                      'mrpcrcc-aucmax_prob': 23.279293481630972,
                      'mrpcrppmax_prob': 0.026788574907087016 * 100,
                      'colarejection-curve-aucmax_prob': 0.9203619367209971 * 100,
                      'colarcc-aucmax_prob': 59.03726591032054,
                      'colarppmax_prob': 0.02631936969193335 * 100,
                      'sst2rejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      'sst2rcc-aucmax_prob': 18.067838464295736,
                      'sst2rppmax_prob': 0.012349462026204303 * 100,
                      '20ngrejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      '20ngrcc-aucmax_prob': 18.067838464295736,
                      '20ngrppmax_prob': 0.012349462026204303 * 100}
    raw_df, baselines_dict = get_df(cls_path, reg_type, baselines_dict, True)
    miscl_df = raw_df
    miscl_df.reset_index(inplace=True, drop=True)

    # NER
    types = ['token', 'seq']
    baselines_dict = {'tokenrejection-curve-aucmax_prob': 93.184446145,
                      'tokenrcc-aucmax_prob': 230.81709306328332,
                      'tokenrppmax_prob': 1.8920894383333335,
                      'seqrejection-curve-aucmax_prob': 85.96980676333334,
                      'seqrcc-aucmax_prob': 69.59317634405001,
                      'seqrppmax_prob': 7.4613176516666675}
    reg_df, baselines_dict = get_df_ner(ner_path, reg_type, baselines_dict, 1)
    ner_df = pd.concat([reg_df])
    ner_df.reset_index(inplace=True, drop=True)

    miscl_df.drop(['Method', 'Reg. Type', 'UE Score'], axis=1, inplace=True)
    miscl_df = pd.concat([ner_df, miscl_df], axis=1)

    latex_table = prepare_latex_table(miscl_df)
    print(latex_table)

Model with regularization: -
There are several rows for SR, used first one
MRPC  rcc-auc    23.59$\pm$6.34
      rpp         2.66$\pm$0.39
SST2  rcc-auc    17.12$\pm$1.87
      rpp         1.13$\pm$0.09
CoLA  rcc-auc    82.36$\pm$4.37
      rpp         3.88$\pm$0.17
Name: max_prob, dtype: object
\begin{tabular}{lllllllllllll}
\toprule
       Method & Reg. Type &         UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) & (MRPC, rcc-auc) &    (MRPC, rpp) & (SST2, rcc-auc) &    (SST2, rpp) & (CoLA, rcc-auc) &    (CoLA, rpp) \\
\midrule
           DE &         - &         PV &                       3.82$\pm$1.86 &                   0.05$\pm$0.03 &                         11.85$\pm$3.05 &                      1.39$\pm$0.48 &  18.71$\pm$2.55 &  2.02$\pm$0.23 &  11.90$\pm$1.02 &  0.75$\pm$0.03 & 81.47$\pm$13.66 &  3.32$\pm$0.23 \\
           DE &         - &             BALD &        

In [98]:
# Deberta, Mixup
reg_types = ['MSD']

cls_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/mixup_deberta_fix_repro_fix/',]
ner_pathes = ['/home/jovyan/uncertainty-estimation/workdir/final_results/mixup_deberta_fix_repro_fix/conll/',]
for idx in range(len(reg_types)):
    reg_type = reg_types[idx]
    cls_path = cls_pathes[idx]
    ner_path = ner_pathes[idx]
    print(f'Model with regularization: {reg_type}')
    #reg_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_reg_01_fix/'
    ues = ['msd/all', 'msd/last']
    ues_names = ['MSD-all', 'MSD-last']
    ues_layers = ['all', 'last']
    metrics = ['rcc-auc', 'rpp']
    metric_names = ['rcc-auc', 'rpp']
    types = ['mrpc', 'sst2', 'cola']
    types_names = ['MRPC', 'SST2', 'CoLA']
    ue_methods = ['max_prob', 'mixup']
    perc_metrics = ['rejection-curve-auc', 'rpp']
    diff_metrics = ['rejection-curve-auc', 'roc-auc']


    # copied from table
    baselines_dict = {'mrpcrejection-curve-aucmax_prob': 0.9208435457516339 * 100,
                      'mrpcrcc-aucmax_prob': 23.279293481630972,
                      'mrpcrppmax_prob': 0.026788574907087016 * 100,
                      'colarejection-curve-aucmax_prob': 0.9203619367209971 * 100,
                      'colarcc-aucmax_prob': 59.03726591032054,
                      'colarppmax_prob': 0.02631936969193335 * 100,
                      'sst2rejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      'sst2rcc-aucmax_prob': 18.067838464295736,
                      'sst2rppmax_prob': 0.012349462026204303 * 100,
                      '20ngrejection-curve-aucmax_prob': 0.9379778287461774 * 100,
                      '20ngrcc-aucmax_prob': 18.067838464295736,
                      '20ngrppmax_prob': 0.012349462026204303 * 100}
    raw_df, baselines_dict = get_df(cls_path, reg_type, baselines_dict, True)
    miscl_df = raw_df
    miscl_df.reset_index(inplace=True, drop=True)

    # NER
    ues = ['mixup/all', 'mixup/last']
    types = ['token', 'seq']
    baselines_dict = {'tokenrejection-curve-aucmax_prob': 93.184446145,
                      'tokenrcc-aucmax_prob': 230.81709306328332,
                      'tokenrppmax_prob': 1.8920894383333335,
                      'seqrejection-curve-aucmax_prob': 85.96980676333334,
                      'seqrcc-aucmax_prob': 69.59317634405001,
                      'seqrppmax_prob': 7.4613176516666675}
    reg_df, baselines_dict = get_df_ner(ner_path, reg_type, baselines_dict, 1)
    ner_df = pd.concat([reg_df])
    ner_df.reset_index(inplace=True, drop=True)

    miscl_df.drop(['Method', 'Reg. Type', 'UE Score'], axis=1, inplace=True)
    miscl_df = pd.concat([ner_df, miscl_df], axis=1)

    latex_table = prepare_latex_table(miscl_df)
    print(latex_table)

Model with regularization: MSD
\begin{tabular}{lllllllllllll}
\toprule
       Method & Reg. Type & UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) & (MRPC, rcc-auc) &   (MRPC, rpp) & (SST2, rcc-auc) &   (SST2, rpp) &  (CoLA, rcc-auc) &   (CoLA, rpp) \\
\midrule
      MSD-all &       MSD &    DS &                       6.84$\pm$2.28 &                   0.10$\pm$0.03 &                         14.68$\pm$3.92 &                      1.82$\pm$0.45 &  13.08$\pm$1.25 & 1.88$\pm$0.19 &  11.66$\pm$2.60 & 0.81$\pm$0.12 &   53.42$\pm$4.73 & 2.61$\pm$0.20 \\
     MSD-last &       MSD &    DS &                       6.84$\pm$2.28 &                   0.10$\pm$0.03 &                         14.68$\pm$3.94 &                      1.82$\pm$0.45 &  13.08$\pm$1.25 & 1.88$\pm$0.19 &  11.66$\pm$2.60 & 0.81$\pm$0.12 &   53.42$\pm$4.73 & 2.61$\pm$0.20 \\
SR (baseline) &       MSD & MP &               