In [2]:
# Notebook for building the NER results tables

In [2]:
import pandas as pd
import numpy as np
import json

In [3]:
# Run directories with the precomputed metric JSON files
# (raw vs. regularized model, judging by the directory names).
raw_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_raw_01/'
reg_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_reg_01_fix/'
# The lists below are read as globals by get_df.
# Sub-directory names under a run directory, one per UE setup.
ues = ['last', 'all', 'dpp', 'dpp_with_ood']
# 'Method' label for each entry of `ues` (same order).
ues_names = ['MC', 'MC', 'DPP_on_masks', 'DPP_with_ood']
# Dropout-layer label for each entry of `ues` (same order).
ues_layers = ['last', 'all', 'last', 'last']
# Metric names; they are part of the JSON file names.
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
# NOTE(review): not referenced by any code visible in this part of the notebook.
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
# Granularity levels; also part of the JSON file names.
types = ['token', 'seq']
# UE scores read from every JSON file; 'max_prob' serves as the baseline.
ue_methods = ['max_prob', 'variance', 'var_ratio', 'sampled_entropy', 'bald', 'sampled_max_prob']
# Metrics whose mean/std get_df multiplies by 100.
perc_metrics = ['rejection-curve-auc', 'rpp']
# Metrics get_df reports as the difference to the max_prob baseline.
# 'roc-auc' is not in `metrics`, so it has no effect with this configuration.
diff_metrics = ['rejection-curve-auc', 'roc-auc']

In [4]:
def get_df(raw_path, reg_type, baselines_dict=None, baselines=None):
    """Assemble the formatted results table for one run directory.

    Besides its arguments the function reads the notebook-level lists
    ``ues``, ``ues_names``, ``ues_layers``, ``types``, ``metrics``,
    ``ue_methods``, ``perc_metrics`` and ``diff_metrics``.

    For every ``ue`` in ``ues``, ``ue_type`` in ``types`` and ``metric`` in
    ``metrics`` it loads ``<raw_path><ue>/metrics_<ue_type>_<metric>.json``,
    which must map each name in ``ue_methods`` to a dict of numbers. The mean
    and standard deviation of those numbers become one table cell, formatted
    with two decimals and joined by a LaTeX plus-minus sign. Metrics listed in
    ``perc_metrics`` are multiplied by 100; for metrics listed in
    ``diff_metrics`` every method except ``max_prob`` is shown as the
    difference to the ``max_prob`` baseline.

    Args:
        raw_path: Run directory; concatenated as-is, so it must end with '/'.
        reg_type: Label written to the 'Reg. Type' column of every row.
        baselines_dict: Dict of baseline means keyed by
            ``ue_type + metric + 'max_prob'``. When ``baselines`` is None it
            is filled from this run; otherwise it is only read. A new dict is
            created when the argument is omitted.
        baselines: None to compute the baselines from this run; any other
            value to take them from ``baselines_dict``.

    Returns:
        A ``(table, baselines_dict)`` tuple. ``table`` holds the 'Method',
        'Reg. Type' and 'UE Score' columns followed by the token-level and
        sequence-level metric columns, with a single ``max_prob`` row moved
        to the bottom.

    Raises:
        KeyError: If ``baselines`` is given but ``baselines_dict`` lacks a
            required key, or a JSON file lacks one of ``ue_methods``.
    """
    # A `{}` default would be a single dict shared by every call, letting the
    # baselines of one run leak into later calls; create a fresh one instead.
    if baselines_dict is None:
        baselines_dict = {}

    raw_dict = {}
    df_dict = {}
    for ue in ues:
        ue_path = raw_path + ue + '/'
        raw_dict[ue] = {}
        df_dict[ue] = {}
        for ue_type in types:
            raw_dict[ue][ue_type] = {}
            for metric in metrics:
                fname = ue_path + f'metrics_{ue_type}_{metric}.json'
                with open(fname, 'r') as f:
                    curr_metrics = json.load(f)
                metric_results = {}
                for ue_method in ue_methods:
                    scores = list(curr_metrics[ue_method].values())
                    mean, std = np.mean(scores), np.std(scores)
                    if metric in perc_metrics:
                        mean, std = mean * 100, std * 100
                    if ue_method == 'max_prob':
                        # `baseline` is only assigned here, so 'max_prob' has
                        # to come first in `ue_methods`.
                        key = ue_type + metric + ue_method
                        if baselines is None:
                            baseline = mean
                            baselines_dict[key] = baseline
                        else:
                            baseline = baselines_dict[key]
                    if metric in diff_metrics and ue_method != 'max_prob':
                        mean = mean - baseline
                    metric_results[ue_method] = (
                        '{:.2f}'.format(mean) + '$\\pm$' + '{:.2f}'.format(std)
                    )
                raw_dict[ue][ue_type][metric] = metric_results
            # One frame per (ue, type): rows are UE methods, columns are metrics.
            df_dict[ue][ue_type] = pd.DataFrame.from_dict(raw_dict[ue][ue_type])

    token_df = pd.concat([df_dict[ue]['token'] for ue in ues])
    seq_df = pd.concat([df_dict[ue]['seq'] for ue in ues])
    # Two-level column header: dataset/level on top, metric below.
    token_df.columns = pd.MultiIndex.from_tuples(
        [('CoNLL-2003 (token level)', metric) for metric in metrics])
    seq_df.columns = pd.MultiIndex.from_tuples(
        [('CoNLL-2003 (sequence level)', metric) for metric in metrics])
    raw_df = pd.concat([token_df, seq_df], axis=1)

    # Keep one 'max_prob' row (the first) and move it to the bottom.
    # Selecting with a list always returns a DataFrame, so there is no need to
    # guess from len() whether one or several rows matched.
    max_prob_row = raw_df.loc[['max_prob']].iloc[0]
    raw_df = raw_df.drop('max_prob')
    raw_df.loc['max_prob'] = max_prob_row

    # Every UE setup contributes len(ue_methods) - 1 rows (all but max_prob);
    # the trailing entry labels the shared baseline row.
    rows_per_ue = len(ue_methods) - 1
    methods = []
    for ue_name in ues_names:
        methods += [ue_name] * rows_per_ue
    methods += ['SR (baseline)']
    # Only needed by the optional 'Dropout Layers' column below.
    layers = []
    for ue_layer in ues_layers:
        layers += [ue_layer] * rows_per_ue
    layers += ['-']

    names_df = pd.DataFrame()
    names_df['Method'] = methods
    names_df['Reg. Type'] = [reg_type] * len(raw_df)
    #names_df['Dropout Layers'] = layers
    names_df['UE Score'] = raw_df.index
    names_df.index = raw_df.index
    raw_df = pd.concat([names_df, raw_df], axis=1)
    return raw_df, baselines_dict

In [309]:
# Raw model: baselines is None, so get_df records this run's max_prob means
# in the returned baselines_dict.
raw_df, baselines_dict = get_df(raw_path, 'raw')
#reg_df = get_df(reg_path, 'reg')
# Regularized model: differences are taken against the raw model's baselines.
reg_df, baselines_dict = get_df(reg_path, 'reg', baselines_dict, 1)

In [311]:
# Stack the raw and regularized tables and renumber the rows from 0.
ner_df = pd.concat([raw_df, reg_df], ignore_index=True)

In [312]:
ner_df

Unnamed: 0,Method,Reg. Type,Dropout Layers,UE Score,"(CoNNL-2003 (10%, token level), rejection-curve-auc)","(CoNNL-2003 (10%, token level), rcc-auc)","(CoNNL-2003 (10%, token level), rpp)","(CoNNL-2003 (10%, sequence level), rejection-curve-auc)","(CoNNL-2003 (10%, sequence level), rcc-auc)","(CoNNL-2003 (10%, sequence level), rpp)"
0,MC,raw,last,variance,-0.08$\pm$0.40,398.39$\pm$70.16,2.40$\pm$0.37,-2.99$\pm$1.36,101.22$\pm$12.14,9.95$\pm$0.74
1,MC,raw,last,var_ratio,0.68$\pm$0.29,166.12$\pm$44.83,1.59$\pm$0.25,1.85$\pm$1.56,65.10$\pm$15.56,5.03$\pm$0.62
2,MC,raw,last,sampled_entropy,0.83$\pm$0.28,159.42$\pm$35.98,1.45$\pm$0.26,1.36$\pm$1.33,63.85$\pm$8.79,5.65$\pm$0.63
3,MC,raw,last,bald,-0.42$\pm$0.40,439.98$\pm$75.20,2.76$\pm$0.38,0.02$\pm$1.27,79.65$\pm$10.86,6.98$\pm$0.57
4,MC,raw,last,sampled_max_prob,0.00$\pm$0.33,263.22$\pm$46.64,2.32$\pm$0.33,1.44$\pm$1.35,62.16$\pm$9.15,5.58$\pm$0.70
5,MC,raw,all,variance,1.19$\pm$0.08,111.91$\pm$44.15,1.21$\pm$0.30,-2.53$\pm$1.79,82.66$\pm$10.51,8.56$\pm$0.57
6,MC,raw,all,var_ratio,1.63$\pm$0.14,62.84$\pm$19.34,0.65$\pm$0.14,1.68$\pm$1.61,46.32$\pm$12.22,4.03$\pm$0.64
7,MC,raw,all,sampled_entropy,1.06$\pm$0.09,115.63$\pm$36.03,1.24$\pm$0.27,-0.76$\pm$1.92,64.18$\pm$8.67,5.69$\pm$0.66
8,MC,raw,all,bald,1.61$\pm$0.06,63.89$\pm$24.84,0.80$\pm$0.22,0.06$\pm$1.93,62.28$\pm$9.52,6.03$\pm$0.70
9,MC,raw,all,sampled_max_prob,-0.32$\pm$0.39,256.63$\pm$49.04,2.37$\pm$0.37,-0.85$\pm$1.94,62.44$\pm$9.37,5.63$\pm$0.70


In [313]:
# Move the raw max_prob baseline row to the bottom of the table.
# Assuming the first config cell (4 `ues`, 6 `ue_methods`), raw_df has
# 4 * 5 + 1 = 21 rows, so label 20 is the raw max_prob row.
# NOTE(review): not idempotent - re-running this cell moves a different row.
raw_baseline = ner_df.loc[20]
ner_df.drop(20, inplace=True)
ner_df.loc[-1] = raw_baseline  # temporary label -1, renumbered just below
ner_df.reset_index(inplace=True, drop=True)

ner_df

Unnamed: 0,Method,Reg. Type,Dropout Layers,UE Score,"(CoNNL-2003 (10%, token level), rejection-curve-auc)","(CoNNL-2003 (10%, token level), rcc-auc)","(CoNNL-2003 (10%, token level), rpp)","(CoNNL-2003 (10%, sequence level), rejection-curve-auc)","(CoNNL-2003 (10%, sequence level), rcc-auc)","(CoNNL-2003 (10%, sequence level), rpp)"
0,MC,raw,last,variance,-0.08$\pm$0.40,398.39$\pm$70.16,2.40$\pm$0.37,-2.99$\pm$1.36,101.22$\pm$12.14,9.95$\pm$0.74
1,MC,raw,last,var_ratio,0.68$\pm$0.29,166.12$\pm$44.83,1.59$\pm$0.25,1.85$\pm$1.56,65.10$\pm$15.56,5.03$\pm$0.62
2,MC,raw,last,sampled_entropy,0.83$\pm$0.28,159.42$\pm$35.98,1.45$\pm$0.26,1.36$\pm$1.33,63.85$\pm$8.79,5.65$\pm$0.63
3,MC,raw,last,bald,-0.42$\pm$0.40,439.98$\pm$75.20,2.76$\pm$0.38,0.02$\pm$1.27,79.65$\pm$10.86,6.98$\pm$0.57
4,MC,raw,last,sampled_max_prob,0.00$\pm$0.33,263.22$\pm$46.64,2.32$\pm$0.33,1.44$\pm$1.35,62.16$\pm$9.15,5.58$\pm$0.70
5,MC,raw,all,variance,1.19$\pm$0.08,111.91$\pm$44.15,1.21$\pm$0.30,-2.53$\pm$1.79,82.66$\pm$10.51,8.56$\pm$0.57
6,MC,raw,all,var_ratio,1.63$\pm$0.14,62.84$\pm$19.34,0.65$\pm$0.14,1.68$\pm$1.61,46.32$\pm$12.22,4.03$\pm$0.64
7,MC,raw,all,sampled_entropy,1.06$\pm$0.09,115.63$\pm$36.03,1.24$\pm$0.27,-0.76$\pm$1.92,64.18$\pm$8.67,5.69$\pm$0.66
8,MC,raw,all,bald,1.61$\pm$0.06,63.89$\pm$24.84,0.80$\pm$0.22,0.06$\pm$1.93,62.28$\pm$9.52,6.03$\pm$0.70
9,MC,raw,all,sampled_max_prob,-0.32$\pm$0.39,256.63$\pm$49.04,2.37$\pm$0.37,-0.85$\pm$1.94,62.44$\pm$9.37,5.63$\pm$0.70


In [314]:
latex_table = ner_df.to_latex(bold_rows=False, index=False)
# Replace the auto-generated header (everything up to \midrule) with a
# hand-written two-level one. The dataset name is spelled "CoNLL" here, matching
# the caption below.
# NOTE(review): this header declares 10 columns including "Dropout Layers",
# while get_df currently has that column commented out - confirm they match.
new_header = """\\begin{table*}[]
\\scalebox{0.45}{
\\begin{tabular}{l|l|l|l||l|l|l||l|l|l}
\\toprule
\\multirow{2}{*}{\\textbf{Method}} &          \\multirow{2}{*}{\\textbf{Reg. Type}} & 
\\multirow{2}{*}{\\textbf{Dropout. Layers}} & 
\\multirow{2}{*}{\\textbf{UE Score}} & \\multicolumn{3}{l}{\\textbf{CoNLL-2003 (10\\%, token level)}} & \\multicolumn{3}{l}{\\textbf{CoNLL-2003 (10\\%, sequence level)}} \\\\

      \\cline{5-10}
         & & & & rejection-curve-auc &   rcc-auc &      rpp & rejection-curve-auc &    rcc-auc &      rpp \\\\

\\midrule \\hline \\hline
"""
latex_table = new_header + latex_table.split('\\midrule')[1]

# Replace everything from \bottomrule onwards with a footer that closes the
# scalebox and the table* environment and adds the caption.
table_end = """\\bottomrule
\\end{tabular}
}
\\caption{\\label{tab:ner_10_res} Comparison of results for all methods for CoNLL-2003.}
\\end{table*}
"""

latex_table = latex_table.split('\\bottomrule')[0] + table_end

In [315]:
# Positions (in the list of '\\'-separated chunks) whose preceding row
# terminator gets a double horizontal rule.
double_hline_indices = [7, 12, 17, 22, 27, 32, 37, 42]
new_table = ''
for idx, parts in enumerate(latex_table.split('\\\\')):
    if idx in double_hline_indices:
        new_table = "\\\\ \\hline \\hline".join([new_table, parts])
    else:
        new_table = "\\\\".join([new_table, parts])
# Drop the two backslashes that the first join put in front of the table.
new_table = new_table[2:]
# fix pm: undo the escaping to_latex applied to '$\pm$'.
# Raw string: '\p' is not a valid escape sequence in a normal literal.
new_table = new_table.replace('\\$\\textbackslash pm\\$', r'$\pm$')
# add italic at last string for raw baseline
# NOTE(review): get_df labels the baseline row 'SR (baseline)', so this
# 'Baseline' marker may no longer occur in new_table; [1] then raises IndexError.
split_str = "Baseline &       raw &              - &          max\\_prob &"
to_italic = new_table.split(split_str)[1]
#new_table = split_str.join([new_table.split(split_str)[0], ])

In [316]:
# Inspect the individual cells of the text that follows the baseline marker.
to_italic.split('&')

['                                     92.77$\\pm$0.33 ',
 '                         263.30$\\pm$46.91 ',
 '                        2.31$\\pm$0.33 ',
 '                                     84.00$\\pm$1.75 ',
 '                             65.87$\\pm$11.75 ',
 '                           7.13$\\pm$0.87 \\\\\n\\bottomrule\n\\end{tabular}\n}\n\\caption{\\label{tab:ner_10_res} Comparison of results for all methods for CoNLL-2003.}\n\\end{table*}\n']

In [317]:
# Print the assembled LaTeX table.
print(new_table)

\begin{table*}[]
\scalebox{0.45}{
\begin{tabular}{l|l|l|l||l|l|l||l|l|l}
\toprule
\multirow{2}{*}{\textbf{Method}} &          \multirow{2}{*}{\textbf{Reg. Type}} & 
\multirow{2}{*}{\textbf{Dropout. Layers}} & 
\multirow{2}{*}{\textbf{UE Score}} & \multicolumn{3}{l}{\textbf{CoNNL-2003 (10\%, token level)}} & \multicolumn{3}{l}{\textbf{CoNNL-2003 (10\%, sequence level)}} \\

      \cline{5-10}
         & & & & rejection-curve-auc &   rcc-auc &      rpp & rejection-curve-auc &    rcc-auc &      rpp \\

\midrule \hline \hline

           MC &       raw &           last &          variance &                                     -0.08$\pm$0.40 &                         398.39$\pm$70.16 &                        2.40$\pm$0.37 &                                     -2.99$\pm$1.36 &                            101.22$\pm$12.14 &                           9.95$\pm$0.74 \\
           MC &       raw &           last &         var\_ratio &                                      0.68$\pm$0.29 &           

In [361]:
# Repeat the table construction for the Mahalanobis-distance ('maha') results.
# Run directories; the names suggest raw, regularized, spectral-norm and
# regularized + spectral-norm models - TODO confirm.
raw_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_raw_01/'
reg_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_reg_01_fix/'
sn_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_sn_last_01/'
sn_reg_path = '/data/gkuzmin/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_sn_last_reg_01/'

# Rebind the globals read by get_df: a single UE setup with two scores
# (the max_prob baseline and the Mahalanobis distance).
ues = ['maha']
ues_names = ['Mahalanobis distance']
ues_layers = ['-']
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'mahalanobis_distance']
perc_metrics = ['rejection-curve-auc', 'rpp']
diff_metrics = ['rejection-curve-auc', 'roc-auc']

In [362]:
# The raw run defines the max_prob baselines; the other three runs reuse them
# (a non-None fourth argument makes get_df read baselines_dict instead of filling it).
raw_df, baselines_dict = get_df(raw_path, 'raw')
#reg_df = get_df(reg_path, 'reg')
reg_df, _ = get_df(reg_path, 'reg', baselines_dict, 1)
sn_df, _ = get_df(sn_path, 'spectral_norm', baselines_dict, 1)
sn_reg_df, _ = get_df(sn_reg_path, 'reg+spectral_norm', baselines_dict, 1)
# Stack the four tables and renumber the rows from 0.
ner_df = pd.concat([raw_df, reg_df, sn_df, sn_reg_df])
ner_df.reset_index(inplace=True, drop=True)

                     CoNNL-2003 (10%, token level)                    \
                               rejection-curve-auc           rcc-auc   
max_prob                            92.77$\pm$0.33  263.30$\pm$46.91   
mahalanobis_distance                 2.23$\pm$0.00    15.59$\pm$4.26   

                                    CoNNL-2003 (10%, sequence level)  \
                                rpp              rejection-curve-auc   
max_prob              2.31$\pm$0.33                   84.00$\pm$1.75   
mahalanobis_distance  0.19$\pm$0.05                    4.33$\pm$1.29   

                                                      
                              rcc-auc            rpp  
max_prob              65.87$\pm$11.75  7.13$\pm$0.87  
mahalanobis_distance   37.67$\pm$8.31  2.68$\pm$0.46  
                     CoNNL-2003 (10%, token level)                    \
                               rejection-curve-auc           rcc-auc   
max_prob                            92.71$\pm$0.45  259.24

In [363]:
# Each of the four stacked tables has two rows (mahalanobis_distance, then
# max_prob), so the odd labels 1, 3, 5, 7 are the max_prob baseline rows.
raw_baseline = ner_df.loc[1]
reg_baseline = ner_df.loc[3]
sn_baseline = ner_df.loc[5]
sn_reg_baseline = ner_df.loc[7]
# Remove the baselines, then append them again at the bottom in the order
# reg, spectral_norm, reg+spectral_norm, raw. Each append uses the temporary
# label -1, which the following reset_index renumbers.
# NOTE(review): not idempotent - re-running this cell reshuffles other rows.
ner_df.drop([1,3,5,7], inplace=True)
ner_df.reset_index(inplace=True, drop=True)
ner_df.loc[-1] = reg_baseline
ner_df.reset_index(inplace=True, drop=True)
ner_df.loc[-1] = sn_baseline
ner_df.reset_index(inplace=True, drop=True)
ner_df.loc[-1] = sn_reg_baseline
ner_df.reset_index(inplace=True, drop=True)
ner_df.loc[-1] = raw_baseline
ner_df.reset_index(inplace=True, drop=True)

ner_df

Unnamed: 0,Method,Reg. Type,Dropout Layers,UE Score,"(CoNNL-2003 (10%, token level), rejection-curve-auc)","(CoNNL-2003 (10%, token level), rcc-auc)","(CoNNL-2003 (10%, token level), rpp)","(CoNNL-2003 (10%, sequence level), rejection-curve-auc)","(CoNNL-2003 (10%, sequence level), rcc-auc)","(CoNNL-2003 (10%, sequence level), rpp)"
0,Mahalanobis distance,raw,-,mahalanobis_distance,2.23$\pm$0.00,15.59$\pm$4.26,0.19$\pm$0.05,4.33$\pm$1.29,37.67$\pm$8.31,2.68$\pm$0.46
1,Mahalanobis distance,reg,-,mahalanobis_distance,2.23$\pm$0.00,19.68$\pm$7.03,0.23$\pm$0.09,3.17$\pm$1.53,44.87$\pm$9.27,2.71$\pm$0.39
2,Mahalanobis distance,spectral_norm,-,mahalanobis_distance,2.23$\pm$0.00,12.96$\pm$4.63,0.16$\pm$0.06,4.96$\pm$1.42,34.16$\pm$8.46,2.57$\pm$0.47
3,Mahalanobis distance,reg+spectral_norm,-,mahalanobis_distance,2.23$\pm$0.00,56.38$\pm$57.03,0.36$\pm$0.24,3.23$\pm$1.33,49.15$\pm$13.13,3.14$\pm$0.60
4,Baseline,reg,-,max_prob,92.71$\pm$0.45,259.24$\pm$57.19,2.36$\pm$0.45,83.31$\pm$1.81,70.37$\pm$12.53,6.69$\pm$0.66
5,Baseline,spectral_norm,-,max_prob,92.58$\pm$0.46,282.61$\pm$55.74,2.51$\pm$0.46,83.71$\pm$1.97,68.77$\pm$13.23,7.96$\pm$0.98
6,Baseline,reg+spectral_norm,-,max_prob,92.27$\pm$0.36,313.58$\pm$50.63,2.82$\pm$0.36,82.21$\pm$1.70,78.59$\pm$11.46,8.26$\pm$0.74
7,Baseline,raw,-,max_prob,92.77$\pm$0.33,263.30$\pm$46.91,2.31$\pm$0.33,84.00$\pm$1.75,65.87$\pm$11.75,7.13$\pm$0.87


In [364]:
latex_table = ner_df.to_latex(bold_rows=False, index=False)
# Replace the auto-generated header (everything up to \midrule) with a
# hand-written two-level one. The dataset name is spelled "CoNLL" here, matching
# the caption below.
# NOTE(review): this header declares 10 columns including "Dropout Layers",
# while get_df currently has that column commented out - confirm they match.
new_header = """\\begin{table*}[]
\\scalebox{0.45}{
\\begin{tabular}{l|l|l|l||l|l|l||l|l|l}
\\toprule
\\multirow{2}{*}{\\textbf{Method}} &          \\multirow{2}{*}{\\textbf{Reg. Type}} & 
\\multirow{2}{*}{\\textbf{Dropout. Layers}} & 
\\multirow{2}{*}{\\textbf{UE Score}} & \\multicolumn{3}{l}{\\textbf{CoNLL-2003 (10\\%, token level)}} & \\multicolumn{3}{l}{\\textbf{CoNLL-2003 (10\\%, sequence level)}} \\\\

      \\cline{5-10}
         & & & & rejection-curve-auc &   rcc-auc &      rpp & rejection-curve-auc &    rcc-auc &      rpp \\\\

\\midrule \\hline \\hline
"""
latex_table = new_header + latex_table.split('\\midrule')[1]

# Replace everything from \bottomrule onwards with a footer that closes the
# scalebox and the table* environment and adds the caption.
table_end = """\\bottomrule
\\end{tabular}
}
\\caption{\\label{tab:ner_maha} Comparison of different regularization types for Mahalanobis distance for CoNLL-2003.}
\\end{table*}
"""

latex_table = latex_table.split('\\bottomrule')[0] + table_end

In [365]:
# Position (in the list of '\\'-separated chunks) whose preceding row
# terminator gets a double horizontal rule.
double_hline_indices = [6]
new_table = ''
for idx, parts in enumerate(latex_table.split('\\\\')):
    if idx in double_hline_indices:
        new_table = "\\\\ \\hline \\hline".join([new_table, parts])
    else:
        new_table = "\\\\".join([new_table, parts])
# Drop the two backslashes that the first join put in front of the table.
new_table = new_table[2:]
# fix pm: undo the escaping to_latex applied to '$\pm$'.
# Raw string: '\p' is not a valid escape sequence in a normal literal.
new_table = new_table.replace('\\$\\textbackslash pm\\$', r'$\pm$')
# add italic at last string for raw baseline
#split_str = "Baseline &       raw &              - &          max\\_prob &"
#to_italic = new_table.split(split_str)[1]
#new_table = split_str.join([new_table.split(split_str)[0], ])

In [366]:
# Print the assembled LaTeX table.
print(new_table)

\begin{table*}[]
\scalebox{0.45}{
\begin{tabular}{l|l|l|l||l|l|l||l|l|l}
\toprule
\multirow{2}{*}{\textbf{Method}} &          \multirow{2}{*}{\textbf{Reg. Type}} & 
\multirow{2}{*}{\textbf{Dropout. Layers}} & 
\multirow{2}{*}{\textbf{UE Score}} & \multicolumn{3}{l}{\textbf{CoNNL-2003 (10\%, token level)}} & \multicolumn{3}{l}{\textbf{CoNNL-2003 (10\%, sequence level)}} \\

      \cline{5-10}
         & & & & rejection-curve-auc &   rcc-auc &      rpp & rejection-curve-auc &    rcc-auc &      rpp \\

\midrule \hline \hline

 Mahalanobis distance &                raw &              - &  mahalanobis\_distance &                                      2.23$\pm$0.00 &                           15.59$\pm$4.26 &                        0.19$\pm$0.05 &                                      4.33$\pm$1.29 &                              37.67$\pm$8.31 &                           2.68$\pm$0.46 \\
 Mahalanobis distance &                reg &              - &  mahalanobis\_distance &                     

## Final table, with optimal hyperparameters

In [93]:
# Calc blocks for raw model
reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_opt_hyp/raw/'

# Globals read by get_df.
ues = ['all', 'dpp', 'dpp_with_ood']
ues_names = ['MC', 'DDPP (+DPP) (Ours)', 'DDPP (+OOD) (Ours)']
ues_layers = ['all', 'last', 'last']
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'variance', 'var_ratio', 'bald', 'sampled_max_prob']
perc_metrics = ['rejection-curve-auc', 'rpp']
diff_metrics = ['rejection-curve-auc', 'roc-auc']


# NOTE(review): this literal is never used - the get_df call below does not
# pass it in, and its return value rebinds baselines_dict right away.
baselines_dict = {'tokenrejection-curve-aucmax_prob': 92.77110214266666,
                  'tokenrcc-aucmax_prob': 263.29672836862,
                  'tokenrppmax_prob': 2.3132998839999996,
                  'seqrejection-curve-aucmax_prob': 84.00144927533333,
                  'seqrcc-aucmax_prob': 65.86561601085333,
                  'seqrppmax_prob': 7.133683399333333}
reg_df, baselines_dict = get_df(reg_path, '-')
print(baselines_dict)
ner_df = pd.concat([reg_df])
ner_df.reset_index(inplace=True, drop=True)

latex_table = ner_df.to_latex(bold_rows=False, index=False)
# Raw strings below: '\p' and '\_' are not valid escape sequences in normal
# literals. Longer names are replaced before the names they contain.
latex_table = latex_table.replace('\\$\\textbackslash pm\\$', r'$\pm$')
latex_table = latex_table.replace('variance', 'PV')
latex_table = latex_table.replace(r'var\_ratio', 'VR')
latex_table = latex_table.replace(r'sampled\_entropy', 'SE')
latex_table = latex_table.replace(r'sampled\_max\_prob', 'SMP')
latex_table = latex_table.replace(r'mahalanobis\_distance', 'MD')
latex_table = latex_table.replace(r'max\_prob', 'MP')
latex_table = latex_table.replace('bald', 'BALD')
print(latex_table)

{'tokenrejection-curve-aucmax_prob': 93.184446145, 'tokenrcc-aucmax_prob': 230.81709306328332, 'tokenrppmax_prob': 1.8920894383333335, 'seqrejection-curve-aucmax_prob': 85.96980676333334, 'seqrcc-aucmax_prob': 69.59317634405001, 'seqrppmax_prob': 7.4613176516666675}
\begin{tabular}{lllllllll}
\toprule
            Method & Reg. Type &         UE Score & (CoNLL-2003 (token level), rejection-curve-auc) & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rejection-curve-auc) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
                MC &         - &         PV &                                   1.25$\pm$0.14 &                     49.54$\pm$11.49 &                   0.53$\pm$0.13 &                                      2.50$\pm$1.33 &                         38.42$\pm$8.85 &                      4.42$\pm$1.17 \\
                MC &         - &        VR &                                   1.1

In [94]:
# Same for reg
reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_opt_hyp/reg/'

# Globals read by get_df.
ues = ['all', 'dpp', 'dpp_with_ood']
ues_names = ['MC', 'DDPP (+DPP) (Ours)', 'DDPP (+OOD) (Ours)']
ues_layers = ['all', 'last', 'last']
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'variance', 'var_ratio', 'bald', 'sampled_max_prob']
perc_metrics = ['rejection-curve-auc', 'rpp']
diff_metrics = ['rejection-curve-auc', 'roc-auc']



# Differences are taken against the baselines already stored in baselines_dict,
# which must have been set by an earlier cell.
reg_df, _ = get_df(reg_path, 'CER', baselines_dict, 1)
print(baselines_dict)
ner_df = pd.concat([reg_df])
ner_df.reset_index(inplace=True, drop=True)

latex_table = ner_df.to_latex(bold_rows=False, index=False)
# Raw strings below: '\p' and '\_' are not valid escape sequences in normal
# literals. Longer names are replaced before the names they contain.
latex_table = latex_table.replace('\\$\\textbackslash pm\\$', r'$\pm$')
latex_table = latex_table.replace('variance', 'PV')
latex_table = latex_table.replace(r'var\_ratio', 'VR')
latex_table = latex_table.replace(r'sampled\_entropy', 'SE')
latex_table = latex_table.replace(r'sampled\_max\_prob', 'SMP')
latex_table = latex_table.replace(r'mahalanobis\_distance', 'MD')
latex_table = latex_table.replace(r'max\_prob', 'MP')
latex_table = latex_table.replace('bald', 'BALD')
print(latex_table)

{'tokenrejection-curve-aucmax_prob': 93.184446145, 'tokenrcc-aucmax_prob': 230.81709306328332, 'tokenrppmax_prob': 1.8920894383333335, 'seqrejection-curve-aucmax_prob': 85.96980676333334, 'seqrcc-aucmax_prob': 69.59317634405001, 'seqrppmax_prob': 7.4613176516666675}
\begin{tabular}{lllllllll}
\toprule
            Method & Reg. Type &         UE Score & (CoNLL-2003 (token level), rejection-curve-auc) & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rejection-curve-auc) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
                MC &       CER &         PV &                                   1.34$\pm$0.19 &                     41.14$\pm$10.92 &                   0.48$\pm$0.13 &                                      2.45$\pm$1.31 &                         39.20$\pm$7.91 &                      4.22$\pm$0.65 \\
                MC &       CER &        VR &                                   1.2

In [95]:
# Same for metric
reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/metric_opt_electra_3hyp/conll/metric/'
#reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_opt_hyp/metric/'

# Globals read by get_df.
ues = ['all', 'dpp', 'dpp_with_ood']
ues_names = ['MC', 'DDPP (+DPP) (Ours)', 'DDPP (+OOD) (Ours)']
ues_layers = ['all', 'last', 'last']
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'variance', 'var_ratio', 'bald', 'sampled_max_prob']
perc_metrics = ['rejection-curve-auc', 'rpp']
diff_metrics = ['rejection-curve-auc', 'roc-auc']



# Differences are taken against the baselines already stored in baselines_dict,
# which must have been set by an earlier cell.
reg_df, _ = get_df(reg_path, 'metric', baselines_dict, 1)
print(baselines_dict)
ner_df = pd.concat([reg_df])
ner_df.reset_index(inplace=True, drop=True)

latex_table = ner_df.to_latex(bold_rows=False, index=False)
# Raw strings below: '\p' and '\_' are not valid escape sequences in normal
# literals. Longer names are replaced before the names they contain.
latex_table = latex_table.replace('\\$\\textbackslash pm\\$', r'$\pm$')
latex_table = latex_table.replace('variance', 'PV')
latex_table = latex_table.replace(r'var\_ratio', 'VR')
latex_table = latex_table.replace(r'sampled\_entropy', 'SE')
latex_table = latex_table.replace(r'sampled\_max\_prob', 'SMP')
latex_table = latex_table.replace(r'mahalanobis\_distance', 'MD')
latex_table = latex_table.replace(r'max\_prob', 'MP')
latex_table = latex_table.replace('bald', 'BALD')
print(latex_table)

{'tokenrejection-curve-aucmax_prob': 93.184446145, 'tokenrcc-aucmax_prob': 230.81709306328332, 'tokenrppmax_prob': 1.8920894383333335, 'seqrejection-curve-aucmax_prob': 85.96980676333334, 'seqrcc-aucmax_prob': 69.59317634405001, 'seqrppmax_prob': 7.4613176516666675}
\begin{tabular}{lllllllll}
\toprule
            Method & Reg. Type &         UE Score & (CoNLL-2003 (token level), rejection-curve-auc) & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rejection-curve-auc) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
                MC &    metric &         PV &                                   0.36$\pm$0.58 &                   170.17$\pm$119.63 &                   1.69$\pm$0.88 &                                     -2.92$\pm$1.87 &                        84.02$\pm$22.42 &                      8.00$\pm$1.72 \\
                MC &    metric &        VR &                                   0.5

In [63]:
# Now make the same for maha raw
reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_opt_hyp/raw/'

# Globals read by get_df.
ues = ['maha_mc']
ues_names = ['MD']
ues_layers = ['-']
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'mahalanobis_distance', 'sampled_mahalanobis_distance']
perc_metrics = ['rejection-curve-auc', 'rpp']
diff_metrics = ['rejection-curve-auc', 'roc-auc']



# Differences are taken against the baselines already stored in baselines_dict,
# which must have been set by an earlier cell.
reg_df, baselines_dict = get_df(reg_path, '-', baselines_dict, 1)
print(baselines_dict)
ner_df = pd.concat([reg_df])
ner_df.reset_index(inplace=True, drop=True)

latex_table = ner_df.to_latex(bold_rows=False, index=False)
# Raw strings below: '\p' and '\_' are not valid escape sequences in normal
# literals. Longer names are replaced before the names they contain.
latex_table = latex_table.replace('\\$\\textbackslash pm\\$', r'$\pm$')
latex_table = latex_table.replace('variance', 'PV')
latex_table = latex_table.replace(r'var\_ratio', 'VR')
latex_table = latex_table.replace(r'sampled\_entropy', 'SE')
latex_table = latex_table.replace(r'sampled\_max\_prob', 'SMP')
latex_table = latex_table.replace(r'sampled\_mahalanobis\_distance', 'SMD')

latex_table = latex_table.replace(r'mahalanobis\_distance', 'MD')
latex_table = latex_table.replace(r'max\_prob', 'MP')
latex_table = latex_table.replace('bald', 'BALD')
print(latex_table)

{'tokenrejection-curve-aucmax_prob': 93.24473404, 'tokenrcc-aucmax_prob': 222.95371713808333, 'tokenrppmax_prob': 1.8318425833333336, 'seqrejection-curve-aucmax_prob': 86.05555555666666, 'seqrcc-aucmax_prob': 66.95221565415001, 'seqrppmax_prob': 7.459637331666667}
\begin{tabular}{lllllllll}
\toprule
       Method & Reg. Type &                     UE Score & (CoNLL-2003 (token level), rejection-curve-auc) & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rejection-curve-auc) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
           MD &         - &         MD &                                   1.64$\pm$0.03 &                      12.14$\pm$3.53 &                   0.15$\pm$0.03 &                                      4.70$\pm$0.51 &                         27.04$\pm$4.10 &                      2.65$\pm$0.25 \\
           MD &         - & SMD &                                   1.65$\pm$0.03 

In [64]:
# Now make the same for maha reg
reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_opt_hyp/reg/'

# Globals read by get_df.
ues = ['maha_mc']
ues_names = ['MD']
ues_layers = ['-']
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'mahalanobis_distance', 'sampled_mahalanobis_distance']
perc_metrics = ['rejection-curve-auc', 'rpp']
diff_metrics = ['rejection-curve-auc', 'roc-auc']



# Differences are taken against the baselines already stored in baselines_dict,
# which must have been set by an earlier cell.
reg_df, baselines_dict = get_df(reg_path, 'CER', baselines_dict, 1)
print(baselines_dict)
ner_df = pd.concat([reg_df])
ner_df.reset_index(inplace=True, drop=True)

latex_table = ner_df.to_latex(bold_rows=False, index=False)
# Raw strings below: '\p' and '\_' are not valid escape sequences in normal
# literals. Longer names are replaced before the names they contain.
latex_table = latex_table.replace('\\$\\textbackslash pm\\$', r'$\pm$')
latex_table = latex_table.replace('variance', 'PV')
latex_table = latex_table.replace(r'var\_ratio', 'VR')
latex_table = latex_table.replace(r'sampled\_entropy', 'SE')
latex_table = latex_table.replace(r'sampled\_max\_prob', 'SMP')
latex_table = latex_table.replace(r'sampled\_mahalanobis\_distance', 'SMD')

latex_table = latex_table.replace(r'mahalanobis\_distance', 'MD')
latex_table = latex_table.replace(r'max\_prob', 'MP')
latex_table = latex_table.replace('bald', 'BALD')
print(latex_table)

{'tokenrejection-curve-aucmax_prob': 93.24473404, 'tokenrcc-aucmax_prob': 222.95371713808333, 'tokenrppmax_prob': 1.8318425833333336, 'seqrejection-curve-aucmax_prob': 86.05555555666666, 'seqrcc-aucmax_prob': 66.95221565415001, 'seqrppmax_prob': 7.459637331666667}
\begin{tabular}{lllllllll}
\toprule
       Method & Reg. Type &                     UE Score & (CoNLL-2003 (token level), rejection-curve-auc) & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rejection-curve-auc) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
           MD &       CER &         MD &                                   1.65$\pm$0.02 &                      10.14$\pm$2.48 &                   0.13$\pm$0.03 &                                      4.28$\pm$1.00 &                         29.34$\pm$6.29 &                      2.78$\pm$0.47 \\
           MD &       CER & SMD &                                   1.65$\pm$0.04 

In [96]:
# Now make the same for maha metric
reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/metric_opt_electra_3hyp/conll/metric/'

# Globals read by get_df.
ues = ['maha_mc']
ues_names = ['MD']
ues_layers = ['-']
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'mahalanobis_distance', 'sampled_mahalanobis_distance']
perc_metrics = ['rejection-curve-auc', 'rpp']
diff_metrics = ['rejection-curve-auc', 'roc-auc']



# Differences are taken against the baselines already stored in baselines_dict,
# which must have been set by an earlier cell.
reg_df, baselines_dict = get_df(reg_path, 'metric', baselines_dict, 1)
print(baselines_dict)
ner_df = pd.concat([reg_df])
ner_df.reset_index(inplace=True, drop=True)

latex_table = ner_df.to_latex(bold_rows=False, index=False)
# Raw strings below: '\p' and '\_' are not valid escape sequences in normal
# literals. Longer names are replaced before the names they contain.
latex_table = latex_table.replace('\\$\\textbackslash pm\\$', r'$\pm$')
latex_table = latex_table.replace('variance', 'PV')
latex_table = latex_table.replace(r'var\_ratio', 'VR')
latex_table = latex_table.replace(r'sampled\_entropy', 'SE')
latex_table = latex_table.replace(r'sampled\_max\_prob', 'SMP')
latex_table = latex_table.replace(r'sampled\_mahalanobis\_distance', 'SMD')

latex_table = latex_table.replace(r'mahalanobis\_distance', 'MD')
latex_table = latex_table.replace(r'max\_prob', 'MP')
latex_table = latex_table.replace('bald', 'BALD')
print(latex_table)

{'tokenrejection-curve-aucmax_prob': 93.184446145, 'tokenrcc-aucmax_prob': 230.81709306328332, 'tokenrppmax_prob': 1.8920894383333335, 'seqrejection-curve-aucmax_prob': 85.96980676333334, 'seqrcc-aucmax_prob': 69.59317634405001, 'seqrppmax_prob': 7.4613176516666675}
\begin{tabular}{lllllllll}
\toprule
       Method & Reg. Type &                     UE Score & (CoNLL-2003 (token level), rejection-curve-auc) & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rejection-curve-auc) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
           MD &    metric &         MD &                                   0.94$\pm$0.40 &                     86.55$\pm$34.50 &                   0.91$\pm$0.38 &                                      1.94$\pm$1.77 &                        42.71$\pm$11.88 &                      3.05$\pm$0.46 \\
           MD &    metric & SMD &                                  -1.29$\pm$0.9

In [66]:
# Now make the same for maha raw sn
reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_opt_hyp/raw_sn/'

# Globals read by get_df.
ues = ['maha_mc']
ues_names = ['MD SN']
ues_layers = ['-']
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'mahalanobis_distance', 'sampled_mahalanobis_distance']
perc_metrics = ['rejection-curve-auc', 'rpp']
diff_metrics = ['rejection-curve-auc', 'roc-auc']



# Differences are taken against the baselines already stored in baselines_dict,
# which must have been set by an earlier cell.
reg_df, baselines_dict = get_df(reg_path, '-', baselines_dict, 1)
print(baselines_dict)
ner_df = pd.concat([reg_df])
ner_df.reset_index(inplace=True, drop=True)

latex_table = ner_df.to_latex(bold_rows=False, index=False)
# Raw strings below: '\p' and '\_' are not valid escape sequences in normal
# literals. Longer names are replaced before the names they contain.
latex_table = latex_table.replace('\\$\\textbackslash pm\\$', r'$\pm$')
latex_table = latex_table.replace('variance', 'PV')
latex_table = latex_table.replace(r'var\_ratio', 'VR')
latex_table = latex_table.replace(r'sampled\_entropy', 'SE')
latex_table = latex_table.replace(r'sampled\_max\_prob', 'SMP')
latex_table = latex_table.replace(r'sampled\_mahalanobis\_distance', 'SMD')

latex_table = latex_table.replace(r'mahalanobis\_distance', 'MD')
latex_table = latex_table.replace(r'max\_prob', 'MP')
latex_table = latex_table.replace('bald', 'BALD')
print(latex_table)

{'tokenrejection-curve-aucmax_prob': 93.24473404, 'tokenrcc-aucmax_prob': 222.95371713808333, 'tokenrppmax_prob': 1.8318425833333336, 'seqrejection-curve-aucmax_prob': 86.05555555666666, 'seqrcc-aucmax_prob': 66.95221565415001, 'seqrppmax_prob': 7.459637331666667}
\begin{tabular}{lllllllll}
\toprule
       Method & Reg. Type &                     UE Score & (CoNLL-2003 (token level), rejection-curve-auc) & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rejection-curve-auc) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
        MD SN &         - &         MD &                                   1.66$\pm$0.03 &                       8.79$\pm$2.47 &                   0.12$\pm$0.03 &                                      5.32$\pm$0.81 &                         22.99$\pm$5.18 &                      2.27$\pm$0.42 \\
        MD SN &         - & SMD &                                   1.65$\pm$0.03 

In [67]:
# Build the table block for the Mahalanobis-distance results of the reg_sn run
# (reported with regularization type 'CER').
reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_opt_hyp/reg_sn/'

# Module-level configuration read by get_df (it iterates over these globals).
ues = ['maha_mc']
ues_names = ['MD SN']
ues_layers = ['-']
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'mahalanobis_distance', 'sampled_mahalanobis_distance']
perc_metrics = ['rejection-curve-auc', 'rpp']  # get_df scales these by 100
diff_metrics = ['rejection-curve-auc', 'roc-auc']

# Passing a non-None `baselines` (here 1) makes get_df read the max_prob
# baselines from `baselines_dict` instead of recomputing them.
# NOTE(review): `baselines_dict` must already exist from an earlier cell, so
# this cell depends on execution order - confirm which baselines are intended.
reg_df, baselines_dict = get_df(reg_path, 'CER', baselines_dict, 1)
print(baselines_dict)
ner_df = pd.concat([reg_df]).reset_index(drop=True)

# Render as LaTeX, then swap the verbose UE-method names for short labels.
latex_table = ner_df.to_latex(bold_rows=False, index=False)

# Raw strings keep the LaTeX backslashes literal; the previous non-raw
# literals ('\p', '\_') were invalid escape sequences and trigger
# DeprecationWarning / SyntaxWarning on recent Python versions.
# Order matters: the 'sampled\_*' names must be replaced before their
# unprefixed counterparts, otherwise the shorter pattern matches first.
latex_replacements = [
    (r'\$\textbackslash pm\$', r'$\pm$'),
    ('variance', 'PV'),
    (r'var\_ratio', 'VR'),
    (r'sampled\_entropy', 'SE'),
    (r'sampled\_max\_prob', 'SMP'),
    (r'sampled\_mahalanobis\_distance', 'SMD'),
    (r'mahalanobis\_distance', 'MD'),
    (r'max\_prob', 'MP'),
    ('bald', 'BALD'),
]
for escaped_name, short_label in latex_replacements:
    latex_table = latex_table.replace(escaped_name, short_label)
print(latex_table)

{'tokenrejection-curve-aucmax_prob': 93.24473404, 'tokenrcc-aucmax_prob': 222.95371713808333, 'tokenrppmax_prob': 1.8318425833333336, 'seqrejection-curve-aucmax_prob': 86.05555555666666, 'seqrcc-aucmax_prob': 66.95221565415001, 'seqrppmax_prob': 7.459637331666667}
\begin{tabular}{lllllllll}
\toprule
       Method & Reg. Type &                     UE Score & (CoNLL-2003 (token level), rejection-curve-auc) & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rejection-curve-auc) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
        MD SN &       CER &         MD &                                   1.59$\pm$0.12 &                      14.19$\pm$7.57 &                   0.20$\pm$0.13 &                                      3.92$\pm$1.12 &                         30.21$\pm$6.25 &                      2.66$\pm$0.48 \\
        MD SN &       CER & SMD &                                   1.49$\pm$0.18 

In [97]:
# Build the table block for the Mahalanobis-distance results of the
# metric-regularized run (reported with regularization type 'metric').
reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/metric_opt_electra_3hyp/conll/metric/'

# Module-level configuration read by get_df (it iterates over these globals).
ues = ['maha_mc']
ues_names = ['MD SN']
ues_layers = ['-']
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'mahalanobis_distance', 'sampled_mahalanobis_distance']
perc_metrics = ['rejection-curve-auc', 'rpp']  # get_df scales these by 100
diff_metrics = ['rejection-curve-auc', 'roc-auc']

# Passing a non-None `baselines` (here 1) makes get_df read the max_prob
# baselines from `baselines_dict` instead of recomputing them.
# NOTE(review): `baselines_dict` must already exist from an earlier cell, so
# this cell depends on execution order - confirm which baselines are intended.
reg_df, baselines_dict = get_df(reg_path, 'metric', baselines_dict, 1)
print(baselines_dict)
ner_df = pd.concat([reg_df]).reset_index(drop=True)

# Render as LaTeX, then swap the verbose UE-method names for short labels.
latex_table = ner_df.to_latex(bold_rows=False, index=False)

# Raw strings keep the LaTeX backslashes literal; the previous non-raw
# literals ('\p', '\_') were invalid escape sequences and trigger
# DeprecationWarning / SyntaxWarning on recent Python versions.
# Order matters: the 'sampled\_*' names must be replaced before their
# unprefixed counterparts, otherwise the shorter pattern matches first.
latex_replacements = [
    (r'\$\textbackslash pm\$', r'$\pm$'),
    ('variance', 'PV'),
    (r'var\_ratio', 'VR'),
    (r'sampled\_entropy', 'SE'),
    (r'sampled\_max\_prob', 'SMP'),
    (r'sampled\_mahalanobis\_distance', 'SMD'),
    (r'mahalanobis\_distance', 'MD'),
    (r'max\_prob', 'MP'),
    ('bald', 'BALD'),
]
for escaped_name, short_label in latex_replacements:
    latex_table = latex_table.replace(escaped_name, short_label)
print(latex_table)

{'tokenrejection-curve-aucmax_prob': 93.184446145, 'tokenrcc-aucmax_prob': 230.81709306328332, 'tokenrppmax_prob': 1.8920894383333335, 'seqrejection-curve-aucmax_prob': 85.96980676333334, 'seqrcc-aucmax_prob': 69.59317634405001, 'seqrppmax_prob': 7.4613176516666675}
\begin{tabular}{lllllllll}
\toprule
       Method & Reg. Type &                     UE Score & (CoNLL-2003 (token level), rejection-curve-auc) & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rejection-curve-auc) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
        MD SN &    metric &         MD &                                   0.94$\pm$0.40 &                     86.55$\pm$34.50 &                   0.91$\pm$0.38 &                                      1.94$\pm$1.77 &                        42.71$\pm$11.88 &                      3.05$\pm$0.46 \\
        MD SN &    metric & SMD &                                  -1.29$\pm$0.9

In [75]:
# Build the table block for the Deep Ensemble (DE) results.
reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_opt_hyp/de/'

# Module-level configuration read by get_df (it iterates over these globals).
ues = ['last']
ues_names = ['Deep Ensemble']
# NOTE(review): three layer entries for a single UE - looks like a leftover
# from another cell; confirm how get_df consumes `ues_layers` before trimming.
ues_layers = ['all', 'last', 'last']
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'variance', 'var_ratio', 'bald', 'sampled_max_prob']
perc_metrics = ['rejection-curve-auc', 'rpp']  # get_df scales these by 100
diff_metrics = ['rejection-curve-auc', 'roc-auc']

# Passing a non-None `baselines` (here 1) makes get_df read the max_prob
# baselines from `baselines_dict` instead of recomputing them. The second
# return value is discarded here, so the name `baselines_dict` is not rebound.
# NOTE(review): `baselines_dict` must already exist from an earlier cell, so
# this cell depends on execution order - confirm which baselines are intended.
reg_df, _ = get_df(reg_path, '-', baselines_dict, 1)
print(baselines_dict)
ner_df = pd.concat([reg_df]).reset_index(drop=True)

# Render as LaTeX, then swap the verbose UE-method names for short labels.
latex_table = ner_df.to_latex(bold_rows=False, index=False)

# Raw strings keep the LaTeX backslashes literal; the previous non-raw
# literals ('\p', '\_') were invalid escape sequences and trigger
# DeprecationWarning / SyntaxWarning on recent Python versions.
# Order matters: 'sampled\_max\_prob' must be replaced before 'max\_prob',
# otherwise the shorter pattern matches first.
latex_replacements = [
    (r'\$\textbackslash pm\$', r'$\pm$'),
    ('variance', 'PV'),
    (r'var\_ratio', 'VR'),
    (r'sampled\_entropy', 'SE'),
    (r'sampled\_max\_prob', 'SMP'),
    (r'mahalanobis\_distance', 'MD'),
    (r'max\_prob', 'MP'),
    ('bald', 'BALD'),
]
for escaped_name, short_label in latex_replacements:
    latex_table = latex_table.replace(escaped_name, short_label)
print(latex_table)

{'tokenrejection-curve-aucmax_prob': 93.24473404, 'tokenrcc-aucmax_prob': 222.95371713808333, 'tokenrppmax_prob': 1.8318425833333336, 'seqrejection-curve-aucmax_prob': 86.05555555666666, 'seqrcc-aucmax_prob': 66.95221565415001, 'seqrppmax_prob': 7.459637331666667}
\begin{tabular}{lllllllll}
\toprule
       Method & Reg. Type &         UE Score & (CoNLL-2003 (token level), rejection-curve-auc) & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rejection-curve-auc) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
Deep Ensemble &         - &         PV &                                   1.41$\pm$0.06 &                      32.68$\pm$7.09 &                   0.46$\pm$0.09 &                                      3.65$\pm$0.61 &                         38.23$\pm$7.80 &                      3.69$\pm$0.39 \\
Deep Ensemble &         - &        VR &                                   1.26$\pm$0.06 &     

In [73]:
# Build the table block for the raw SNGP results.
reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/conll2003_electra_opt_hyp/sngp/'

# Module-level configuration read by get_df (it iterates over these globals).
ues = ['raw']
ues_names = ['SNGP']
ues_layers = ['-']
metrics = ['rejection-curve-auc', 'rcc-auc', 'rpp']
metric_names = ['rejection-curve-auc', 'rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'stds']
perc_metrics = ['rejection-curve-auc', 'rpp']  # get_df scales these by 100
diff_metrics = ['rejection-curve-auc', 'roc-auc']

# Passing a non-None `baselines` (here 1) makes get_df read the max_prob
# baselines from `baselines_dict` instead of recomputing them.
# NOTE(review): `baselines_dict` must already exist from an earlier cell, so
# this cell depends on execution order - confirm which baselines are intended.
reg_df, baselines_dict = get_df(reg_path, '-', baselines_dict, 1)
print(baselines_dict)
ner_df = pd.concat([reg_df]).reset_index(drop=True)

# Render as LaTeX, then swap the verbose UE-method names for short labels.
latex_table = ner_df.to_latex(bold_rows=False, index=False)

# Raw strings keep the LaTeX backslashes literal; the previous non-raw
# literals ('\p', '\_') were invalid escape sequences and trigger
# DeprecationWarning / SyntaxWarning on recent Python versions.
# Order matters: the 'sampled\_*' names must be replaced before their
# unprefixed counterparts, otherwise the shorter pattern matches first.
latex_replacements = [
    (r'\$\textbackslash pm\$', r'$\pm$'),
    ('variance', 'PV'),
    (r'var\_ratio', 'VR'),
    (r'sampled\_entropy', 'SE'),
    (r'sampled\_max\_prob', 'SMP'),
    (r'sampled\_mahalanobis\_distance', 'SMD'),
    (r'mahalanobis\_distance', 'MD'),
    (r'max\_prob', 'MP'),
    ('bald', 'BALD'),
]
for escaped_name, short_label in latex_replacements:
    latex_table = latex_table.replace(escaped_name, short_label)
print(latex_table)

{'tokenrejection-curve-aucmax_prob': 93.24473404, 'tokenrcc-aucmax_prob': 222.95371713808333, 'tokenrppmax_prob': 1.8318425833333336, 'seqrejection-curve-aucmax_prob': 86.05555555666666, 'seqrcc-aucmax_prob': 66.95221565415001, 'seqrppmax_prob': 7.459637331666667}
\begin{tabular}{lllllllll}
\toprule
       Method & Reg. Type & UE Score & (CoNLL-2003 (token level), rejection-curve-auc) & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rejection-curve-auc) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
         SNGP &         - &     stds &                                  -1.93$\pm$1.64 &                     87.09$\pm$51.27 &                   0.90$\pm$0.54 &                                    -24.90$\pm$1.26 &                        59.49$\pm$30.09 &                      5.29$\pm$0.60 \\
SR (baseline) &         - & MP &                                  92.97$\pm$0.38 &                    26

### Build table blocks for NER-metric

# Mixup

In [109]:
# Build the table block for the mixup results (MC dropout on all / last layers),
# reported with regularization type 'MSD'.
reg_path = '/notebook/ue/uncertainty-estimation/workdir/run_calc_ues_metrics/mixup_electra/conll/mixup/'

# Module-level configuration read by get_df (it iterates over these globals).
ues = ['all', 'last']
ues_names = ['MC-all', 'MC-last']
ues_layers = ['all', 'last']
metrics = ['rcc-auc', 'rpp']
metric_names = ['rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'mixup']
perc_metrics = ['rejection-curve-auc', 'rpp']  # get_df scales these by 100
diff_metrics = ['rejection-curve-auc', 'roc-auc']


# Fixed max_prob baselines, keyed as ue_type + metric + ue_method. They are
# pinned here so the cell does not depend on whichever cell ran before it.
baselines_dict = {'tokenrejection-curve-aucmax_prob': 93.184446145,
                  'tokenrcc-aucmax_prob': 230.81709306328332,
                  'tokenrppmax_prob': 1.8920894383333335,
                  'seqrejection-curve-aucmax_prob': 85.96980676333334,
                  'seqrcc-aucmax_prob': 69.59317634405001,
                  'seqrppmax_prob': 7.4613176516666675}
# Passing a non-None `baselines` (here 1) makes get_df read the max_prob
# baselines from `baselines_dict` instead of recomputing them.
reg_df, baselines_dict = get_df(reg_path, 'MSD', baselines_dict, 1)
print(baselines_dict)
ner_df = pd.concat([reg_df]).reset_index(drop=True)

# Render as LaTeX, then swap the verbose UE-method names for short labels.
latex_table = ner_df.to_latex(bold_rows=False, index=False)

# Raw strings keep the LaTeX backslashes literal; the previous non-raw
# literals ('\p', '\_') were invalid escape sequences and trigger
# DeprecationWarning / SyntaxWarning on recent Python versions.
# Order matters: 'sampled\_max\_prob' must be replaced before 'max\_prob',
# otherwise the shorter pattern matches first.
latex_replacements = [
    (r'\$\textbackslash pm\$', r'$\pm$'),
    ('variance', 'PV'),
    (r'var\_ratio', 'VR'),
    (r'sampled\_entropy', 'SE'),
    (r'sampled\_max\_prob', 'SMP'),
    (r'mahalanobis\_distance', 'MD'),
    (r'max\_prob', 'MP'),
    ('bald', 'BALD'),
]
for escaped_name, short_label in latex_replacements:
    latex_table = latex_table.replace(escaped_name, short_label)
print(latex_table)

{'tokenrejection-curve-aucmax_prob': 93.184446145, 'tokenrcc-aucmax_prob': 230.81709306328332, 'tokenrppmax_prob': 1.8920894383333335, 'seqrejection-curve-aucmax_prob': 85.96980676333334, 'seqrcc-aucmax_prob': 69.59317634405001, 'seqrppmax_prob': 7.4613176516666675}
\begin{tabular}{lllllll}
\toprule
       Method & Reg. Type & UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
       MC-all &       MSD &    mixup &                    231.84$\pm$38.89 &                   2.26$\pm$0.34 &                        61.57$\pm$11.44 &                      5.88$\pm$0.73 \\
      MC-last &       MSD &    mixup &                    217.64$\pm$38.18 &                   2.10$\pm$0.35 &                        59.30$\pm$10.70 &                      5.70$\pm$0.71 \\
SR (baseline) &       MSD & MP &                    230.25$\pm$40.15 &                   2.15$\pm$0.35 &                

In [1]:
# Changed conll

In [19]:
# Build the table block for MC dropout (all layers) on the
# electra_metric_no_sn run.
# NOTE(review): the path says "metric" but the reported regularization type is
# 'CER' - confirm which label is intended.
reg_path = '/home/jovyan/uncertainty-estimation/workdir/run_calc_ues_metrics/workdir/run_calc_ues_metrics/electra_metric_no_sn/conll2003/'

# Module-level configuration read by get_df (it iterates over these globals).
ues = ['mc_all']
ues_names = ['MC']
ues_layers = ['all']
metrics = ['rcc-auc', 'rpp']
metric_names = ['rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
perc_metrics = ['rejection-curve-auc', 'rpp']  # get_df scales these by 100
diff_metrics = ['rejection-curve-auc', 'roc-auc']


# With baselines=None, get_df computes the max_prob baselines from this run
# and stores them in the (fresh, empty) dict passed in. The hard-coded
# baselines literal that used to precede this call was never read - it was
# overwritten by the return value - so it has been dropped.
reg_df, baselines_dict = get_df(reg_path, 'CER', {}, None)
print(baselines_dict)
ner_df = pd.concat([reg_df]).reset_index(drop=True)

# Render as LaTeX, then swap the verbose UE-method names for short labels.
latex_table = ner_df.to_latex(bold_rows=False, index=False)

# Raw strings keep the LaTeX backslashes literal; the previous non-raw
# literals ('\p', '\_') were invalid escape sequences and trigger
# DeprecationWarning / SyntaxWarning on recent Python versions.
# Order matters: 'sampled\_max\_prob' must be replaced before 'max\_prob',
# otherwise the shorter pattern matches first.
latex_replacements = [
    (r'\$\textbackslash pm\$', r'$\pm$'),
    ('variance', 'PV'),
    (r'var\_ratio', 'VR'),
    (r'sampled\_entropy', 'SE'),
    (r'sampled\_max\_prob', 'SMP'),
    (r'mahalanobis\_distance', 'MD'),
    (r'max\_prob', 'MP'),
    ('bald', 'BALD'),
]
for escaped_name, short_label in latex_replacements:
    latex_table = latex_table.replace(escaped_name, short_label)
print(latex_table)

{'tokenrcc-aucmax_prob': 7.390677315349998, 'tokenrppmax_prob': 0.11931852833333334, 'seqrcc-aucmax_prob': 20.39109464188333, 'seqrppmax_prob': 2.574529161666667}
\begin{tabular}{lllllll}
\toprule
       Method & Reg. Type &         UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
           MC &       CER &         PV &                       7.32$\pm$1.97 &                   0.11$\pm$0.02 &                         16.23$\pm$2.57 &                      2.02$\pm$0.22 \\
           MC &       CER &             BALD &                       7.50$\pm$1.91 &                   0.11$\pm$0.02 &                         16.40$\pm$2.43 &                      2.02$\pm$0.20 \\
           MC &       CER & SMP &                       6.93$\pm$1.68 &                   0.11$\pm$0.02 &                         16.75$\pm$2.15 &                      2.04$\pm$0.22 \\
SR (baseline) &      

In [18]:
# Build the table block for the 'deepensemble' results of the raw model
# (electra_raw_no_sn run).
reg_path = '/home/jovyan/uncertainty-estimation/workdir/run_calc_ues_metrics/workdir/run_calc_ues_metrics/electra_raw_no_sn/conll2003/'

# Module-level configuration read by get_df (it iterates over these globals).
ues = ['deepensemble']
# NOTE(review): the display label says DDPP while the UE directory is
# 'deepensemble' - possibly copy-pasted; confirm before publishing the table.
ues_names = ['DDPP (+DPP) (Ours)']
ues_layers = ['all']
metrics = ['rcc-auc', 'rpp']
metric_names = ['rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'variance', 'bald', 'sampled_max_prob']
perc_metrics = ['rejection-curve-auc', 'rpp']  # get_df scales these by 100
diff_metrics = ['rejection-curve-auc', 'roc-auc']


# With baselines=None, get_df computes the max_prob baselines from this run
# and stores them in the (fresh, empty) dict passed in. The hard-coded
# baselines literal that used to precede this call was never read - it was
# overwritten by the return value - so it has been dropped.
reg_df, baselines_dict = get_df(reg_path, '-', {}, None)
print(baselines_dict)
ner_df = pd.concat([reg_df]).reset_index(drop=True)

# Render as LaTeX, then swap the verbose UE-method names for short labels.
latex_table = ner_df.to_latex(bold_rows=False, index=False)

# Raw strings keep the LaTeX backslashes literal; the previous non-raw
# literals ('\p', '\_') were invalid escape sequences and trigger
# DeprecationWarning / SyntaxWarning on recent Python versions.
# Order matters: 'sampled\_max\_prob' must be replaced before 'max\_prob',
# otherwise the shorter pattern matches first.
latex_replacements = [
    (r'\$\textbackslash pm\$', r'$\pm$'),
    ('variance', 'PV'),
    (r'var\_ratio', 'VR'),
    (r'sampled\_entropy', 'SE'),
    (r'sampled\_max\_prob', 'SMP'),
    (r'mahalanobis\_distance', 'MD'),
    (r'max\_prob', 'MP'),
    ('bald', 'BALD'),
]
for escaped_name, short_label in latex_replacements:
    latex_table = latex_table.replace(escaped_name, short_label)
print(latex_table)

{'tokenrcc-aucmax_prob': 10.25596100548, 'tokenrppmax_prob': 0.147355794, 'seqrcc-aucmax_prob': 23.12924720744, 'seqrppmax_prob': 2.9672757819999998}
\begin{tabular}{lllllll}
\toprule
            Method & Reg. Type &         UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
DDPP (+DPP) (Ours) &         - &         PV &                       5.10$\pm$0.81 &                   0.07$\pm$0.02 &                         15.33$\pm$3.55 &                      1.80$\pm$0.45 \\
DDPP (+DPP) (Ours) &         - &             BALD &                       4.95$\pm$1.13 &                   0.07$\pm$0.02 &                         15.33$\pm$3.61 &                      1.78$\pm$0.45 \\
DDPP (+DPP) (Ours) &         - & SMP &                       5.00$\pm$1.44 &                   0.07$\pm$0.02 &                         15.07$\pm$3.56 &                      1.71$\pm$0.46 \\
     SR (basel

In [23]:
# Build the table block for the Mahalanobis-distance results of the
# electra_reg_sn run.
# NOTE(review): the path says "reg" but the reported regularization type is
# '-' - confirm which label is intended.
reg_path = '/home/jovyan/uncertainty-estimation/workdir/run_calc_ues_metrics/workdir/run_calc_ues_metrics/electra_reg_sn/conll2003/'

# Module-level configuration read by get_df (it iterates over these globals).
ues = ['mahalanobis']
# NOTE(review): the display label says DDPP while the UE directory is
# 'mahalanobis' - possibly copy-pasted; confirm before publishing the table.
ues_names = ['DDPP (+DPP) (Ours)']
ues_layers = ['all']
metrics = ['rcc-auc', 'rpp']
metric_names = ['rcc-auc', 'rpp']
types = ['token', 'seq']
ue_methods = ['max_prob', 'mahalanobis_distance']
perc_metrics = ['rejection-curve-auc', 'rpp']  # get_df scales these by 100
diff_metrics = ['rejection-curve-auc', 'roc-auc']


# With baselines=None, get_df computes the max_prob baselines from this run
# and stores them in the (fresh, empty) dict passed in. The hard-coded
# baselines literal that used to precede this call was never read - it was
# overwritten by the return value - so it has been dropped.
reg_df, baselines_dict = get_df(reg_path, '-', {}, None)
print(baselines_dict)
ner_df = pd.concat([reg_df]).reset_index(drop=True)

# Render as LaTeX, then swap the verbose UE-method names for short labels.
latex_table = ner_df.to_latex(bold_rows=False, index=False)

# Raw strings keep the LaTeX backslashes literal; the previous non-raw
# literals ('\p', '\_') were invalid escape sequences and trigger
# DeprecationWarning / SyntaxWarning on recent Python versions.
# Order matters: 'sampled\_max\_prob' must be replaced before 'max\_prob',
# otherwise the shorter pattern matches first.
latex_replacements = [
    (r'\$\textbackslash pm\$', r'$\pm$'),
    ('variance', 'PV'),
    (r'var\_ratio', 'VR'),
    (r'sampled\_entropy', 'SE'),
    (r'sampled\_max\_prob', 'SMP'),
    (r'mahalanobis\_distance', 'MD'),
    (r'max\_prob', 'MP'),
    ('bald', 'BALD'),
]
for escaped_name, short_label in latex_replacements:
    latex_table = latex_table.replace(escaped_name, short_label)
print(latex_table)

{'tokenrcc-aucmax_prob': 7.561215305199998, 'tokenrppmax_prob': 0.12058965166666666, 'seqrcc-aucmax_prob': 23.18829577556667, 'seqrppmax_prob': 2.8516418116666666}
\begin{tabular}{lllllll}
\toprule
            Method & Reg. Type &             UE Score & (CoNLL-2003 (token level), rcc-auc) & (CoNLL-2003 (token level), rpp) & (CoNLL-2003 (sequence level), rcc-auc) & (CoNLL-2003 (sequence level), rpp) \\
\midrule
DDPP (+DPP) (Ours) &         - & MD &                       7.54$\pm$1.71 &                   0.12$\pm$0.02 &                         18.05$\pm$3.00 &                      2.02$\pm$0.33 \\
     SR (baseline) &         - &             MP &                       7.56$\pm$1.46 &                   0.12$\pm$0.02 &                         23.19$\pm$3.53 &                      2.85$\pm$0.39 \\
\bottomrule
\end{tabular}

