In [103]:
import os
from pathlib import Path
import json
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score

from ue4nlp.ue_scores import *


def calc_roc_aucs(probabilities, labels, sampled_probabilities, methods):
    """Compute the ROC-AUC of several uncertainty scores for spotting wrong predictions.

    A sample is marked as an error when the argmax of ``probabilities`` along
    the last axis differs from its entry in ``labels``. Each callable in
    ``methods`` is applied to ``sampled_probabilities`` and its output is scored
    against that error indicator with ``roc_auc_score``. The quantity
    ``1 - max(probabilities)`` is scored the same way under the key
    ``'max_prob'``.

    Args:
        probabilities: array whose last axis is reduced with argmax / max.
        labels: array compared element-wise with the predicted classes.
        sampled_probabilities: passed unchanged to every method callable.
        methods: mapping ``name -> callable(sampled_probabilities)``.

    Returns:
        dict mapping every method name, followed by ``'max_prob'``, to its
        ROC-AUC value.
    """
    predicted_classes = np.argmax(probabilities, axis=-1)
    # 1 where the prediction is wrong, 0 where it is right.
    is_error = (labels != predicted_classes).astype('uint8')

    aucs = {
        method_name: roc_auc_score(is_error, score_fn(sampled_probabilities))
        for method_name, score_fn in methods.items()
    }

    # Baseline score: the lower the top probability, the higher the uncertainty.
    aucs['max_prob'] = roc_auc_score(is_error, 1. - np.max(probabilities, axis=-1))
    return aucs


def extract_result(time_dir, methods):
    """Load ``dev_inference.json`` from ``time_dir`` and score it with ``calc_roc_aucs``.

    The JSON document must provide the keys ``'probabilities'``,
    ``'true_labels'`` and ``'sampled_probabilities'``.

    Args:
        time_dir: directory (str or Path) that contains ``dev_inference.json``.
        methods: mapping of method name to callable, forwarded to ``calc_roc_aucs``.

    Returns:
        Whatever ``calc_roc_aucs`` returns for the loaded arrays.

    Raises:
        FileNotFoundError: if ``dev_inference.json`` does not exist in ``time_dir``.
    """
    inference_path = Path(time_dir) / 'dev_inference.json'
    with open(inference_path) as f:
        outputs = json.load(f)

    probabilities = np.asarray(outputs['probabilities'])
    true_labels = np.asarray(outputs['true_labels'])
    # The stored samples are 3-D; the first two axes are swapped before scoring.
    # Presumably (samples, examples, classes) -> (examples, samples, classes) - TODO confirm.
    sampled = np.asarray(outputs['sampled_probabilities']).transpose(1, 0, 2)

    return calc_roc_aucs(probabilities, true_labels, sampled, methods=methods)


def extract_all_results(data_path, mc_types, frac, methods):
    """Collect per-run ROC-AUC results for every requested ``mc_type``.

    For each ``mc_type`` the tree
    ``<data_path>/<mc_type>/<frac>/<suffix>/<date>/<time>/`` is walked and
    ``extract_result`` is called on every ``<time>`` directory.

    Args:
        data_path: root directory (str or Path).
        mc_types: iterable of sub-directory names under ``data_path``.
        frac: value whose ``str()`` names the directory below each mc_type.
        methods: mapping of method name to callable, forwarded to ``extract_result``.

    Returns:
        dict mapping each mc_type to a DataFrame with one row per run that
        had a ``dev_inference.json`` (empty if none did).

    Raises:
        FileNotFoundError: if ``<data_path>/<mc_type>/<frac>`` does not exist.
    """
    def _subdirs(parent):
        # Directory listings come back in arbitrary order; sorting keeps the
        # row order of the resulting frames reproducible. Plain files
        # (notes, .DS_Store, ...) are not run directories and are skipped.
        return sorted(child for child in Path(parent).iterdir() if child.is_dir())

    results = {}

    for mc_type in mc_types:
        mc_type_dir = Path(data_path) / mc_type / str(frac)

        mc_type_results = []
        for suffix_dir in _subdirs(mc_type_dir):
            for date_dir in _subdirs(suffix_dir):
                for time_dir in _subdirs(date_dir):
                    try:
                        mc_type_results.append(extract_result(time_dir, methods=methods))
                    except FileNotFoundError:
                        # Run directory without dev_inference.json: deliberately ignored.
                        continue

        results[mc_type] = pd.DataFrame.from_dict(mc_type_results, orient='columns')

    return results


def aggregate_results(data_path, frac, mc_types=None):
    """Run ``extract_all_results`` with the default set of uncertainty scorers.

    Args:
        data_path: root results directory (str or Path).
        frac: forwarded to ``extract_all_results``.
        mc_types: sub-directory names to evaluate; defaults to
            ``['DPP_last', 'MC_last', 'MC_all']`` when ``None``.

    Returns:
        The dict produced by ``extract_all_results``.
    """
    root = Path(data_path)
    selected_mc_types = ['DPP_last', 'MC_last', 'MC_all'] if mc_types is None else mc_types

    # Scorers evaluated next to the max_prob baseline that calc_roc_aucs adds itself.
    scorers = dict(
        bald=bald,
        sampled_max_prob=sampled_max_prob,
        variance=probability_variance,
    )

    return extract_all_results(root, mc_types=selected_mc_types, frac=frac, methods=scorers)


def format_results(all_results, baseline_coords=('DPP_last', 'max_prob')):
    """Summarise every score as ``mean±std`` improvement over a shared baseline.

    The baseline is the column ``baseline_coords[1]`` of the frame stored under
    ``all_results[baseline_coords[0]]``. For every frame in ``all_results`` the
    baseline is subtracted row by row (aligned on the row index) from each
    remaining column, and the mean and standard deviation of the differences
    are reported multiplied by 100.

    Args:
        all_results: mapping ``mc_type -> DataFrame``; every frame must contain
            the baseline column.
        baseline_coords: ``(key in all_results, column name)`` of the baseline.

    Returns:
        dict mapping each mc_type to a Series of ``'mean±std'`` strings, one
        per non-baseline column, plus an entry ``'baseline (max_prob)'`` with
        the mean±std of the baseline itself (times 100). That label is fixed,
        whatever ``baseline_coords`` names.
    """
    def mean_std_str(mean, std):
        return '{:.1f}±{:.1f}'.format(mean, std)

    baseline_row = baseline_coords[0]
    baseline_column = baseline_coords[1]
    baseline = all_results[baseline_row][baseline_column]

    # The baseline summary is identical for every mc_type, so format it once.
    baseline_percent = baseline * 100
    baseline_summary = mean_std_str(baseline_percent.mean(), baseline_percent.std())

    all_formatted_result = {}
    for mc_type, results in all_results.items():
        diff_res = results.drop(columns=baseline_column).subtract(baseline, axis='rows')
        mean_percent = diff_res.mean(axis=0) * 100.
        std_percent = diff_res.std(axis=0) * 100.

        # Strings are built with a plain loop and stored as objects, so a
        # frame holding only the baseline column is handled too.
        formatted_results = pd.Series(
            [mean_std_str(mean, std) for mean, std in zip(mean_percent, std_percent)],
            index=mean_percent.index,
            dtype=object,
        )
        formatted_results.loc['baseline (max_prob)'] = baseline_summary
        all_formatted_result[mc_type] = formatted_results

    return all_formatted_result


def improvement_over_baseline(results, baseline_col):
    """Summarise each column of ``results`` as ``mean±std`` difference to a baseline column.

    ``results[baseline_col]`` is subtracted row by row from every other column.
    The mean and standard deviation of those differences are multiplied by 100
    and rendered with one decimal.

    Args:
        results: DataFrame with one row per run and one column per score.
        baseline_col: name of the column used as the baseline.

    Returns:
        Series of ``'mean±std'`` strings indexed by the non-baseline column
        names, plus an entry ``'baseline (max_prob)'`` holding the mean±std of
        the baseline column itself (times 100). The label is fixed regardless
        of ``baseline_col``. A frame holding only the baseline column yields
        just that entry.
    """
    def mean_std_str(mean, std):
        return '{:.1f}±{:.1f}'.format(mean, std)

    baseline = results[baseline_col]
    diff_res = results.drop(columns=baseline_col).subtract(baseline, axis='rows')

    mean_percent = diff_res.mean(axis=0) * 100.
    std_percent = diff_res.std(axis=0) * 100.

    # Build the strings with a plain loop and store them as objects; this also
    # works when there is no non-baseline column at all.
    formatted_results = pd.Series(
        [mean_std_str(mean, std) for mean, std in zip(mean_percent, std_percent)],
        index=mean_percent.index,
        dtype=object,
    )

    baseline_percent = baseline * 100
    formatted_results.loc['baseline (max_prob)'] = mean_std_str(
        baseline_percent.mean(), baseline_percent.std()
    )
    return formatted_results


def covert_into_series(formatted_results):
    """Flatten the per-mc_type summaries into one Series.

    The result starts with a single ``'baseline (max_prob)'`` entry, followed
    by every other entry of every summary, relabelled ``'<mc_type>_<label>'``.

    Args:
        formatted_results: mapping ``mc_type -> Series`` as produced by
            ``format_results``; each Series carries a ``'baseline (max_prob)'``
            entry.

    Returns:
        Series of strings (object dtype) in the order described above.

    Raises:
        KeyError: if ``formatted_results`` is empty or the chosen summary has
            no ``'baseline (max_prob)'`` entry.
    """
    baseline_label = 'baseline (max_prob)'
    if not formatted_results:
        raise KeyError('formatted_results is empty: no baseline entry available')

    # 'DPP_last' is preferred for backward compatibility. When the caller
    # evaluated other mc_types only, fall back to the first summary
    # (format_results writes the same baseline entry into each of them).
    if 'DPP_last' in formatted_results:
        baseline_source = formatted_results['DPP_last']
    else:
        baseline_source = next(iter(formatted_results.values()))

    labels = [baseline_label]
    values = [baseline_source[baseline_label]]
    for mc_type, res in formatted_results.items():
        for label in res.index:
            if label == baseline_label:
                continue
            labels.append(f'{mc_type}_{label}')
            values.append(res[label])

    # Built in one go instead of growing empty Series element by element.
    return pd.Series(values, index=labels, dtype=object)


def build_eval_table(dataset_paths, mc_types=None):
    """Assemble one table with a column per dataset and a row per summary entry.

    Args:
        dataset_paths: iterable of ``(path, frac, name)`` triples. ``path`` and
            ``frac`` are forwarded to ``aggregate_results``; ``name`` labels the
            dataset's column.
        mc_types: forwarded to ``aggregate_results``.

    Returns:
        DataFrame whose columns are the dataset names and whose rows are the
        labels produced by ``covert_into_series``.
    """
    dataset_names = []
    dataset_columns = []
    for path, frac, name in dataset_paths:
        formatted = format_results(aggregate_results(path, frac=frac, mc_types=mc_types))
        dataset_columns.append(covert_into_series(formatted))
        dataset_names.append(name)

    # Records are stacked as rows (one per dataset) and then transposed.
    per_dataset_rows = pd.DataFrame.from_records(dataset_columns, index=dataset_names)
    return per_dataset_rows.T

In [6]:
# Each entry is (results directory, frac, display name); build_eval_table
# unpacks the triples in that order. frac becomes a directory name
# (<path>/<mc_type>/<frac>/...) inside extract_all_results.
dataset_paths = [
    ('../workdir/runs_results/SST-2/', 0.2, 'SST-2'),
    ('../workdir/runs_results/CoLA/', 0., 'CoLA'),
    ('../workdir/runs_results/MRPC/', 0., 'MRPC')
]

# Sub-directories of each results directory to evaluate.
mc_types = ['DPP_last', 'MC_last', 'MC_all']
eval_table = build_eval_table(dataset_paths, mc_types=mc_types)
eval_table

Unnamed: 0,SST-2,CoLA,MRPC
baseline (max_prob),82.3±1.8,78.2±1.7,79.9±3.1
DPP_last_bald,2.5±3.2,0.3±1.7,-3.2±1.2
DPP_last_sampled_max_prob,2.7±1.7,1.5±1.0,1.2±5.0
DPP_last_variance,2.4±2.6,0.7±1.5,-0.8±3.9
MC_last_bald,-5.2±2.1,-4.1±2.1,-3.4±2.3
MC_last_sampled_max_prob,-0.6±0.7,-0.0±0.1,1.7±1.7
MC_last_variance,-3.0±1.2,-1.9±1.1,-0.9±2.6
MC_all_bald,3.0±2.3,1.9±2.1,4.2±1.4
MC_all_sampled_max_prob,2.8±1.5,2.6±1.6,4.1±1.2
MC_all_variance,2.9±2.2,2.1±2.0,4.3±1.0


In [None]:
# NOTE(review): single-entry placeholder mapping; no other visible cell reads index_map.
index_map = {'DPP_last_bald' : ''}

In [9]:
# Raw string for the replacement: '\p' is not a valid escape sequence in a
# normal string literal, so the non-raw form triggers an invalid-escape warning.
print(eval_table.to_latex().replace('±', r'$\pm$'))

\begin{tabular}{llll}
\toprule
{} &     SST-2 &      CoLA &      MRPC \\
\midrule
baseline (max\_prob)       &  82.3$\pm$1.8 &  78.2$\pm$1.7 &  79.9$\pm$3.1 \\
DPP\_last\_bald             &   2.5$\pm$3.2 &   0.3$\pm$1.7 &  -3.2$\pm$1.2 \\
DPP\_last\_sampled\_max\_prob &   2.7$\pm$1.7 &   1.5$\pm$1.0 &   1.2$\pm$5.0 \\
DPP\_last\_variance         &   2.4$\pm$2.6 &   0.7$\pm$1.5 &  -0.8$\pm$3.9 \\
MC\_last\_bald              &  -5.2$\pm$2.1 &  -4.1$\pm$2.1 &  -3.4$\pm$2.3 \\
MC\_last\_sampled\_max\_prob  &  -0.6$\pm$0.7 &  -0.0$\pm$0.1 &   1.7$\pm$1.7 \\
MC\_last\_variance          &  -3.0$\pm$1.2 &  -1.9$\pm$1.1 &  -0.9$\pm$2.6 \\
MC\_all\_bald               &   3.0$\pm$2.3 &   1.9$\pm$2.1 &   4.2$\pm$1.4 \\
MC\_all\_sampled\_max\_prob   &   2.8$\pm$1.5 &   2.6$\pm$1.6 &   4.1$\pm$1.2 \\
MC\_all\_variance           &   2.9$\pm$2.2 &   2.1$\pm$2.0 &   4.3$\pm$1.0 \\
\bottomrule
\end{tabular}



In [None]:
# NOTE(review): this directory appears to hold a results/<model>/<run> layout
# (see the extract_result call in the next cell) rather than the
# <mc_type>/<frac>/... tree that build_eval_table walks - confirm before running.
dataset_paths = [
    ('../workdir/run_avg_results/2020-11-13/03-42-02/', 0.2, 'SST-2'),
#     ('../workdir/runs_results/CoLA/', 0., 'CoLA'),
#     ('../workdir/runs_results/MRPC/', 0., 'MRPC')
]

mc_types = ['DPP_last', 'MC_last', 'MC_all']
eval_table = build_eval_table(dataset_paths, mc_types=mc_types)
eval_table

In [10]:
# Same three scorers that aggregate_results builds internally, defined at
# notebook level so the cells below can pass them to aggregate_runs.
default_methods = {
        'bald': bald,
        'sampled_max_prob': sampled_max_prob,
        'variance': probability_variance
    }

# Score a single run directory (it must contain dev_inference.json).
extract_result('../workdir/run_avg_results/2020-11-13/03-42-02/results/101/101', methods=default_methods)

{'bald': 0.818877551020408,
 'sampled_max_prob': 0.8532003710575139,
 'variance': 0.8386479591836734,
 'max_prob': 0.8531496405380334}

In [126]:
def aggregate_runs(data_path, methods, verbose=True):
    """Average the per-run ROC-AUC results of every model found under ``data_path``.

    The expected layout is ``<data_path>/<model_seed>/<run_seed>/``. Every run
    directory is scored with ``extract_result``; the runs of one model are
    averaged column-wise into a single row.

    Args:
        data_path: directory (str or Path) holding one sub-directory per model.
        methods: mapping of method name to callable, forwarded to ``extract_result``.
        verbose: when true (the default), print each model's per-run table
            before it is averaged.

    Returns:
        DataFrame with one row per model sub-directory and one column per
        score. A model without any scored run contributes an all-missing row.

    Raises:
        FileNotFoundError: if ``data_path`` does not exist.
    """
    def _subdirs(parent):
        # Directory listings come back in arbitrary order; sorting makes the
        # row order reproducible. Plain files are not model or run
        # directories and are skipped.
        return sorted(child for child in Path(parent).iterdir() if child.is_dir())

    results = []

    for model_path in _subdirs(data_path):
        model_results = []
        for run_dir in _subdirs(model_path):
            try:
                model_results.append(extract_result(run_dir, methods=methods))
            except FileNotFoundError:
                # Run directory without dev_inference.json: deliberately ignored.
                continue

        # Build the per-model table once and reuse it for printing and averaging.
        model_frame = pd.DataFrame.from_dict(model_results, orient='columns')
        if verbose:
            print(model_frame)
        results.append(model_frame.mean(axis=0).to_dict())

    return pd.DataFrame.from_dict(results, orient='columns')

In [240]:
# Average every model over its runs, then summarise each scorer's ROC-AUC
# difference to the max_prob column (mean±std, times 100).
runs_dir = '../workdir/run_avg_results/2020-12-08/00-05-54//results/'
agg_res = aggregate_runs(runs_dir, methods=default_methods)
improvement = improvement_over_baseline(agg_res, baseline_col='max_prob')
print(improvement)
agg_res

       bald  sampled_max_prob  variance  max_prob
0  0.799062          0.841159  0.820638  0.851712
1  0.806660          0.821506  0.821623  0.851712
2  0.817917          0.853565  0.829995  0.851712
3  0.780113          0.847045  0.808630  0.851712
4  0.777486          0.819794  0.801900  0.851712
       bald  sampled_max_prob  variance  max_prob
0  0.808501          0.837654  0.813464  0.843407
1  0.807455          0.845043  0.846601  0.843407
2  0.854790          0.837565  0.867653  0.843407
3  0.843307          0.836742  0.866452  0.843407
4  0.779481          0.836586  0.804139  0.843407
       bald  sampled_max_prob  variance  max_prob
0  0.829362          0.815800  0.829642  0.840749
1  0.816188          0.831385  0.826520  0.840749
2  0.761081          0.844150  0.799935  0.840749
3  0.851641          0.861845  0.871295  0.840749
4  0.819955          0.810483  0.812830  0.840749
       bald  sampled_max_prob  variance  max_prob
0  0.786259          0.847466  0.809100  0.824695


Unnamed: 0,bald,sampled_max_prob,variance,max_prob
0,0.796248,0.836614,0.816557,0.851712
1,0.818707,0.838718,0.839662,0.843407
2,0.815645,0.832733,0.828044,0.840749
3,0.796446,0.825918,0.811684,0.824695
4,0.799952,0.824462,0.812566,0.821122


In [50]:
# Same evaluation as the previous cell, pointed at the 2020-11-17/20-27-35 results.
runs_dir = '../workdir/run_avg_results/2020-11-17/20-27-35/results'
agg_res = aggregate_runs(runs_dir, methods=default_methods)
improvement = improvement_over_baseline(agg_res, baseline_col='max_prob')
print(improvement)
agg_res

bald                   -40.9±7.5
sampled_max_prob        -2.2±3.7
variance               -12.1±4.9
baseline (max_prob)     83.6±1.3
dtype: object


Unnamed: 0,bald,sampled_max_prob,variance,max_prob
0,0.422186,0.873851,0.778987,0.851712
1,0.500835,0.833382,0.7536,0.843407
2,0.325347,0.83963,0.732386,0.840749
3,0.493481,0.755271,0.690708,0.824695
4,0.392354,0.770877,0.622702,0.821122


In [56]:
# Same evaluation again, pointed at the 2020-11-17/22-32-54 results.
runs_dir = '../workdir/run_avg_results/2020-11-17/22-32-54/results/'
agg_res = aggregate_runs(runs_dir, methods=default_methods)
improvement  = improvement_over_baseline(agg_res, baseline_col='max_prob')
print(improvement)
agg_res

bald                   -36.4±4.9
sampled_max_prob        -3.9±2.7
variance               -11.7±3.3
baseline (max_prob)     83.6±1.3
dtype: object


Unnamed: 0,bald,sampled_max_prob,variance,max_prob
0,0.432411,0.838508,0.738532,0.851712
1,0.544386,0.818538,0.676154,0.843407
2,0.487848,0.8189,0.724938,0.840749
3,0.480888,0.750238,0.74858,0.824695
4,0.41601,0.761029,0.708924,0.821122


In [104]:
# Same evaluation, pointed at the 2020-11-24/11-53-40 results; the previously
# used directory is kept commented out on the next line.
#runs_dir = '../workdir/run_avg_results/2020-11-17/23-23-18/results/'
runs_dir = '../workdir/run_avg_results/2020-11-24/11-53-40///results'
agg_res = aggregate_runs(runs_dir, methods=default_methods)
improvement  = improvement_over_baseline(agg_res, baseline_col='max_prob')
print(improvement)
agg_res

bald                   -2.0±4.2
sampled_max_prob       -0.6±1.8
variance               -1.4±2.4
baseline (max_prob)    83.6±1.3
dtype: object


Unnamed: 0,bald,sampled_max_prob,variance,max_prob
0,0.788602,0.828682,0.801079,0.851712
1,0.827484,0.851218,0.837365,0.843407
2,0.874158,0.839458,0.853342,0.840749
3,0.831598,0.797882,0.82044,0.824695
4,0.759322,0.832502,0.798144,0.821122


In [122]:
import yaml
from yaml import Loader as Loader


def get_model_type(model_path):
    """Return the second-to-last ``'/'``-separated component of ``model_path``.

    Raises:
        IndexError: if ``model_path`` contains no ``'/'``.
    """
    # Only the last two separators matter, so split from the right.
    trailing_components = model_path.rsplit('/', 2)
    return trailing_components[-2]


def collect_configs(dir_name):
    """Build a short configuration tag from the Hydra configs below ``dir_name``.

    Every ``<dir_name>/<model_seed>/<run_seed>/.hydra/config.yaml`` is read and
    reduced to ``'<dry_run_dataset>_<mask_name>_<max_frac>_<model type>'``.
    The first three values come from ``cfg['ue']['dropout']``; the model type
    is ``get_model_type(cfg['model']['model_name_or_path'])``. If a tag differs
    from the tag of the previously read config, a message is printed.

    Args:
        dir_name: directory (str or Path) holding one sub-directory per model.

    Returns:
        The tag of the last config read, or ``''`` when no config was read.
    """
    root = Path(dir_name)
    previous_tag = ''
    for model_seed in os.listdir(dir_name):
        model_dir = root / model_seed
        for run_seed in os.listdir(model_dir):
            config_path = model_dir / run_seed / '.hydra' / 'config.yaml'

            with open(config_path) as f:
                # NOTE(review): yaml.Loader is the full loader and can build
                # arbitrary Python objects; only use it on trusted config files.
                cfg = yaml.load(f, Loader=Loader)

            dropout_cfg = cfg['ue']['dropout']
            tag_parts = [
                dropout_cfg['dry_run_dataset'],
                dropout_cfg['mask_name'],
                dropout_cfg['max_frac'],
                get_model_type(cfg['model']['model_name_or_path']),
            ]
            current_tag = '_'.join(map(str, tag_parts))

            # Compared against the previous run's tag, not against the first one.
            if previous_tag and previous_tag != current_tag:
                print('Error, different cfg_strs:', previous_tag, current_tag)

            previous_tag = current_tag

    return previous_tag


# Evaluate every timestamped experiment directory recorded under this date.
results_dir = '../workdir/run_avg_results/2020-12-07/'

for time_name in os.listdir(results_dir):
    print(time_name)
    runs_dir = Path(results_dir) / time_name / 'results'
    # Entries without a 'results' sub-directory only get their name printed.
    if not runs_dir.is_dir():
        continue
        
    agg_res = aggregate_runs(runs_dir, methods=default_methods)
    
    # Print the configuration tag before the numbers so each block is labelled.
    # NOTE(review): collect_configs opens every run's .hydra/config.yaml and runs
    # before the 'Broken' check below, so a run without a config raises here.
    print(collect_configs(runs_dir))
    if agg_res.empty:
        print('Broken')
        continue
    
    improvement = improvement_over_baseline(agg_res, baseline_col='max_prob')
    print(improvement)
    print()

14-57-01
eval_ht_dpp_0.2_1
bald                   -4.0±3.5
sampled_max_prob       -0.5±1.1
variance               -2.7±2.4
baseline (max_prob)    80.8±0.8
dtype: object

16-15-31
train_ht_dpp_0.2_1
bald                   -3.5±1.7
sampled_max_prob       -2.1±1.7
variance               -2.8±1.6
baseline (max_prob)    80.8±0.8
dtype: object

15-13-05
eval_cov_dpp_0.30000000000000004_1
bald                   -3.9±2.1
sampled_max_prob       -0.1±0.2
variance               -1.7±1.4
baseline (max_prob)    80.8±0.8
dtype: object

15-25-06
eval_cov_dpp_0.4_1
bald                   -2.9±1.9
sampled_max_prob       -0.1±0.3
variance               -1.6±1.2
baseline (max_prob)    80.8±0.8
dtype: object

13-52-17
13-54-16
13-50-50
14-26-58
eval_cov_dpp_0.1_1
bald                   -2.3±1.7
sampled_max_prob       -0.2±0.8
variance               -1.2±1.4
baseline (max_prob)    80.8±0.8
dtype: object

13-50-47
13-54-18
15-56-54
eval_cov_dpp_0.6_1
bald                   -4.0±1.8
sampled_max_prob       -0

In [116]:
# Scratch arithmetic: 40 out of 876, scaled to per-thousand.
# NOTE(review): the origin of these counts is not shown in the notebook.
(40 / 876)*1000

45.662100456621005

In [117]:
# Scratch arithmetic: 2 out of 876, scaled to per-thousand.
(2 / 876)*1000

2.28310502283105

In [118]:
# Scratch arithmetic: 13 out of 876, scaled to per-thousand.
(13 / 876) * 1000

14.840182648401825

In [179]:
import torch
import torch.nn.functional as F

from transformers import set_seed

# Fix the seed before sampling the dropout mask.
set_seed(10)
inpt = torch.ones(100)
# With p=0.8 the surviving ones appear as 5.0 in the output, i.e. scaled by 1 / (1 - p).
F.dropout(inpt, p=0.8)

tensor([0., 0., 0., 0., 0., 0., 5., 0., 0., 0., 0., 5., 5., 5., 0., 0., 0., 0.,
        5., 0., 5., 0., 0., 0., 0., 5., 0., 0., 5., 0., 5., 0., 0., 0., 0., 0.,
        0., 5., 5., 0., 5., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 5.,
        0., 0., 0., 5., 5., 0., 0., 0., 0., 0., 0., 5., 0., 0., 0., 0., 0., 5.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 5., 5., 0., 0., 0., 0., 0., 0., 0.,
        5., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
# NOTE(review): absolute, machine-specific path; the other cells address
# results relative to ../workdir.
test_path = '/notebook/projects/active_learning/uncertainty-estimation/workdir/run_avg_results/2020-12-07/23-16-51/results/23/91/dev_inference.json'
import json

with open(test_path) as f:
    results = json.load(f)
    
# Show the first element of the stored 'sampled_probabilities' list.
results['sampled_probabilities'][0]

In [243]:
import torch

# Random 3x3 matrix; the next cell multiplies it by its inverse.
K = torch.rand(3, 3)
K

tensor([[0.4967, 0.4521, 0.0575],
        [0.0687, 0.0501, 0.0108],
        [0.0343, 0.1212, 0.0490]])

In [244]:
# K times its inverse: the output is the identity up to floating-point round-off.
torch.mm(K, torch.inverse(K))

tensor([[ 1.0000e+00, -3.4490e-06,  9.8187e-08],
        [ 2.2846e-08,  1.0000e+00,  1.4661e-08],
        [ 1.8385e-08, -6.0536e-07,  1.0000e+00]])