In [1]:
import os
from pathlib import Path
import json
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score

from ue4nlp.ue_scores import *


def calc_roc_aucs(probabilities, labels, sampled_probabilities, methods):
    """Score how well each uncertainty estimate separates wrong predictions.

    The predicted label is the argmax of ``probabilities`` over the last
    axis; an example counts as a positive (1) when that label differs from
    ``labels``. Every uncertainty score is then evaluated with
    ``roc_auc_score`` against this error indicator.

    Args:
        probabilities: array of predicted probabilities; reduced over its
            last axis.
        labels: array of reference labels, compared element-wise with the
            argmax predictions.
        sampled_probabilities: passed untouched to every callable in
            ``methods``.
        methods: mapping ``name -> callable``; each callable receives
            ``sampled_probabilities`` and returns one score per example.

    Returns:
        dict mapping every method name, plus ``'max_prob'``, to its ROC-AUC.
        A method that is itself called ``'max_prob'`` is overwritten by the
        built-in entry.
    """
    predicted_labels = np.argmax(probabilities, axis=-1)
    is_error = (labels != predicted_labels).astype('uint8')

    aucs = {
        method_name: roc_auc_score(is_error, score_fn(sampled_probabilities))
        for method_name, score_fn in methods.items()
    }

    # Baseline uncertainty: one minus the largest predicted probability.
    aucs['max_prob'] = roc_auc_score(is_error, 1. - np.max(probabilities, axis=-1))
    return aucs


def extract_result(time_dir, methods):
    """Load one run's ``dev_inference.json`` and compute its ROC-AUC scores.

    Args:
        time_dir: directory (str or Path) containing ``dev_inference.json``.
        methods: mapping ``name -> callable`` forwarded to ``calc_roc_aucs``.

    Returns:
        Whatever ``calc_roc_aucs`` returns for this run.
    """
    inference_file = Path(time_dir) / 'dev_inference.json'
    with open(inference_file) as f:
        outputs = json.load(f)

    probabilities = np.asarray(outputs['probabilities'])
    true_labels = np.asarray(outputs['true_labels'])
    # Swap the first two axes of the 3-D sampled array before scoring
    # (presumably stored sample-major and needed example-major -- confirm
    # against the code that writes the JSON).
    sampled = np.asarray(outputs['sampled_probabilities']).transpose(1, 0, 2)

    return calc_roc_aucs(probabilities, true_labels, sampled, methods=methods)


def extract_all_results(data_path, mc_types, frac, methods):
    """Collect the per-run ROC-AUC scores for every requested MC type.

    Runs are looked up under
    ``data_path / <mc_type> / str(frac) / <suffix> / <date> / <time>``,
    and every ``<time>`` directory is handed to ``extract_result``.

    Directory entries are visited in sorted order and non-directory entries
    are skipped. The file-system listing order is arbitrary, while the
    resulting row order matters: ``format_results`` subtracts the baseline
    row by row, so an unstable order would pair different runs on different
    machines and change the reported standard deviations.

    Args:
        data_path: root directory of the results (str or Path).
        mc_types: iterable of MC type directory names.
        frac: value whose ``str()`` names the directory below each MC type.
        methods: mapping ``name -> callable`` forwarded to ``extract_result``.

    Returns:
        dict mapping each MC type to a DataFrame with one row per run and
        one column per score returned by ``extract_result``.

    Raises:
        FileNotFoundError: if an expected MC type directory does not exist.
    """
    def sorted_subdirs(directory):
        # Deterministic traversal; stray files (notes, OS metadata) are ignored.
        return sorted(entry for entry in directory.iterdir() if entry.is_dir())

    data_path = Path(data_path)
    results = {}

    for mc_type in mc_types:
        mc_type_dir = data_path / mc_type / str(frac)

        mc_type_results = [
            extract_result(time_dir, methods=methods)
            for suffix_dir in sorted_subdirs(mc_type_dir)
            for date_dir in sorted_subdirs(suffix_dir)
            for time_dir in sorted_subdirs(date_dir)
        ]

        # A list of dicts becomes one row per run, one column per score.
        results[mc_type] = pd.DataFrame(mc_type_results)

    return results


def aggregate_results(data_path, frac, mc_types=None):
    """Gather per-run ROC-AUC tables using the default uncertainty methods.

    Args:
        data_path: root directory of the results; converted to ``Path``.
        frac: forwarded unchanged to ``extract_all_results``.
        mc_types: MC type names to load; ``None`` selects
            ``['DPP_last', 'MC_last', 'MC_all']``.

    Returns:
        The dict produced by ``extract_all_results`` for the methods
        ``'bald'``, ``'sampled_max_prob'`` and ``'variance'``.
    """
    results_root = Path(data_path)
    selected_mc_types = ['DPP_last', 'MC_last', 'MC_all'] if mc_types is None else mc_types

    # Insertion order here determines the column order of the result tables.
    ue_methods = {}
    ue_methods['bald'] = bald
    ue_methods['sampled_max_prob'] = sampled_max_prob
    ue_methods['variance'] = probability_variance

    return extract_all_results(results_root,
                               mc_types=selected_mc_types,
                               frac=frac,
                               methods=ue_methods)


def format_results(all_results, baseline_coords=('DPP_last', 'max_prob')):
    """Summarise per-run scores as ``mean±std`` strings relative to a baseline.

    The baseline is the column ``baseline_coords[1]`` of the table
    ``all_results[baseline_coords[0]]``. For every MC type the baseline
    column is dropped from its own table, the baseline values are subtracted
    row by row (aligned on the row index), and the mean and standard
    deviation of the differences are reported in percent with one decimal.

    Values that round to zero are printed as ``0.0`` rather than ``-0.0``.

    Args:
        all_results: dict mapping MC type to a DataFrame of per-run scores.
        baseline_coords: ``(mc_type, column)`` locating the baseline scores.

    Returns:
        dict mapping MC type to a Series of formatted strings, indexed by
        method name plus a final ``'baseline (max_prob)'`` entry holding the
        baseline's own mean and std in percent.

    Raises:
        KeyError: if the baseline table or column is missing.
    """
    def one_decimal(value):
        # '{:.1f}' renders tiny negative numbers as '-0.0'; show a plain zero.
        text = '{:.1f}'.format(value)
        return '0.0' if text == '-0.0' else text

    def mean_std_str(row):
        return '{}±{}'.format(one_decimal(row[0]), one_decimal(row[1]))

    baseline_row = baseline_coords[0]
    baseline_column = baseline_coords[1]
    baseline = all_results[baseline_row][baseline_column]

    # The baseline summary does not depend on the MC type: format it once.
    baseline_percent = baseline * 100
    baseline_str = mean_std_str([baseline_percent.mean(), baseline_percent.std()])

    all_formatted_result = {}
    for mc_type, results in all_results.items():
        diff_res = results.drop(columns=baseline_column).subtract(baseline, axis='index')

        # One row per method, columns 'mean' and 'std', expressed in percent.
        diff_final_res = pd.DataFrame({'mean': diff_res.mean(axis=0),
                                       'std': diff_res.std(axis=0)})
        diff_final_res *= 100.

        formatted_results = diff_final_res.apply(mean_std_str, raw=True, axis=1)
        formatted_results.loc['baseline (max_prob)'] = baseline_str
        all_formatted_result[mc_type] = formatted_results

    return all_formatted_result


def covert_into_series(formatted_results):
    """Flatten the per-MC-type formatted results into a single Series.

    The first entry is ``'baseline (max_prob)'``, taken from the
    ``'DPP_last'`` results. It is followed by one entry per
    (MC type, method) pair labelled ``'<mc_type>_<method>'``, in the
    iteration order of ``formatted_results`` and of each Series' index.
    The per-MC-type baseline entries are skipped.

    The Series is built in one pass with an explicit object dtype, instead
    of growing dtype-less empty Series label by label.

    Args:
        formatted_results: dict mapping MC type to a Series of formatted
            strings; must contain the key ``'DPP_last'``.

    Returns:
        pandas Series (dtype object) with the flattened labels as index.

    Raises:
        KeyError: if ``'DPP_last'`` or its baseline entry is missing.
    """
    baseline_label = 'baseline (max_prob)'

    labels = [baseline_label]
    values = [formatted_results['DPP_last'][baseline_label]]

    for mc_type, res in formatted_results.items():
        for method_name, cell in res.items():
            if method_name == baseline_label:
                continue
            labels.append(f'{mc_type}_{method_name}')
            values.append(cell)

    return pd.Series(values, index=labels, dtype=object)


def build_eval_table(dataset_paths, mc_types=None):
    """Assemble the evaluation table with one column per dataset.

    Args:
        dataset_paths: iterable of ``(path, frac, name)`` tuples; ``path``
            and ``frac`` go to ``aggregate_results`` and ``name`` becomes
            the column header.
        mc_types: forwarded to ``aggregate_results``.

    Returns:
        DataFrame whose rows are the labels produced by
        ``covert_into_series`` and whose columns are the dataset names.
    """
    dataset_names = []
    dataset_columns = []
    for path, frac, name in dataset_paths:
        aggregated = aggregate_results(path, frac=frac, mc_types=mc_types)
        formatted = format_results(aggregated)
        dataset_columns.append(covert_into_series(formatted))
        dataset_names.append(name)

    # Records are built dataset-per-row, then transposed to dataset-per-column.
    table = pd.DataFrame.from_records(dataset_columns, index=dataset_names)
    return table.T

In [6]:
# Monte-Carlo variants to compare; each is a sub-directory of a dataset's results.
mc_types = ['DPP_last', 'MC_last', 'MC_all']

# One entry per dataset: (results directory, frac sub-directory value, column name).
dataset_paths = [
    ('../workdir/runs_results/SST-2/', 0.2, 'SST-2'),
    ('../workdir/runs_results/CoLA/', 0.0, 'CoLA'),
    ('../workdir/runs_results/MRPC/', 0.0, 'MRPC'),
]

eval_table = build_eval_table(dataset_paths, mc_types=mc_types)
eval_table

Unnamed: 0,SST-2,CoLA,MRPC
baseline (max_prob),82.3±1.8,78.2±1.7,79.9±3.1
DPP_last_bald,2.5±3.2,0.3±1.7,-3.2±1.2
DPP_last_sampled_max_prob,2.7±1.7,1.5±1.0,1.2±5.0
DPP_last_variance,2.4±2.6,0.7±1.5,-0.8±3.9
MC_last_bald,-5.2±2.1,-4.1±2.1,-3.4±2.3
MC_last_sampled_max_prob,-0.6±0.7,-0.0±0.1,1.7±1.7
MC_last_variance,-3.0±1.2,-1.9±1.1,-0.9±2.6
MC_all_bald,3.0±2.3,1.9±2.1,4.2±1.4
MC_all_sampled_max_prob,2.8±1.5,2.6±1.6,4.1±1.2
MC_all_variance,2.9±2.2,2.1±2.0,4.3±1.0


In [None]:
# Placeholder row-label mapping; not referenced by any other visible cell.
index_map = {'DPP_last_bald': ''}

In [9]:
# Print the table as LaTeX, swapping the Unicode '±' for the math-mode macro.
# The replacement is a raw string so that `\p` is not parsed as an (invalid)
# escape sequence; `to_latex()` already returns a str when no buffer is given.
print(eval_table.to_latex().replace('±', r'$\pm$'))

\begin{tabular}{llll}
\toprule
{} &     SST-2 &      CoLA &      MRPC \\
\midrule
baseline (max\_prob)       &  82.3$\pm$1.8 &  78.2$\pm$1.7 &  79.9$\pm$3.1 \\
DPP\_last\_bald             &   2.5$\pm$3.2 &   0.3$\pm$1.7 &  -3.2$\pm$1.2 \\
DPP\_last\_sampled\_max\_prob &   2.7$\pm$1.7 &   1.5$\pm$1.0 &   1.2$\pm$5.0 \\
DPP\_last\_variance         &   2.4$\pm$2.6 &   0.7$\pm$1.5 &  -0.8$\pm$3.9 \\
MC\_last\_bald              &  -5.2$\pm$2.1 &  -4.1$\pm$2.1 &  -3.4$\pm$2.3 \\
MC\_last\_sampled\_max\_prob  &  -0.6$\pm$0.7 &  -0.0$\pm$0.1 &   1.7$\pm$1.7 \\
MC\_last\_variance          &  -3.0$\pm$1.2 &  -1.9$\pm$1.1 &  -0.9$\pm$2.6 \\
MC\_all\_bald               &   3.0$\pm$2.3 &   1.9$\pm$2.1 &   4.2$\pm$1.4 \\
MC\_all\_sampled\_max\_prob   &   2.8$\pm$1.5 &   2.6$\pm$1.6 &   4.1$\pm$1.2 \\
MC\_all\_variance           &   2.9$\pm$2.2 &   2.1$\pm$2.0 &   4.3$\pm$1.0 \\
\bottomrule
\end{tabular}

