# Imports

In [1]:
#######################################
### -------- Load libraries ------- ###
# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizer
from transformers import TFAlbertModel,  AlbertConfig, AlbertTokenizer
from transformers import TFRobertaModel,  RobertaConfig, RobertaTokenizer
from transformers import TFDistilBertModel, BertTokenizer, DistilBertConfig
from transformers import TFXLMModel, XLMTokenizer, XLMConfig, TFSequenceSummary
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer, XLMRobertaConfig
# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from official.nlp import optimization  # to create AdamW optimizer
import official.nlp.modeling.layers as layers
import tensorflow_addons as tfa
# And pandas for data import + sklearn because you always need sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
import os
import time
from collections import defaultdict
import glob
# logging
from datetime import datetime

from tensorflow import keras

import numpy as np
import random

#uncertainty
from robustness_metrics.metrics import uncertainty

# This metric computes the percentage of correctly rejected examples, which is the percentage
# of incorrect predictions among all the abstained examples.
from metrics import AbstainPrecision
''' Different from `AbstainPrecision`, `AbstainRecall` computes the percentage of
  correctly abstained examples among all the incorrect predictions that **could
  have been abstained**. '''
from metrics import AbstainRecall

#custom
#custom function
from Refactoring.loading import load, load_combined_languages, load_artificial_ood, get_reduced_label_dict, get_train_dict
from Refactoring.model import load_model, build_classifier_model, build_classifier_model_last4
from Refactoring.model_train_param import build_classifier_model_training, build_classifier_model_last4_training #includes training paramter
from Refactoring.model import bert_optimizer, get_layers, dlr_optimizer
from Refactoring.evaluation import test_prediction, save_report, save_metrics

from Refactoring.uncertainty_functions import create_Vanilla_pred, mc_dropout_sampling, mc_predictions, save_single_model_predictions, create_deep_ensemble_predictions, compute_metrics,compute_metrics_zero_shot,get_class_ranking,abstain_accuracy_top_k

2022-01-07 14:40:25.556230: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


# Settings

In [2]:
#####################################
######### Necessary Settings ########

# Uncertainty technique under evaluation: "Deep Ensemble", "MC Dropout" or "Vanilla"
method = "Deep Ensemble"

# Name of the trained model (for MC Dropout & Vanilla)
model_path = "EN_last4_concat_cls_1_multi_False_Distilbert_0.0001_5_20220107-091042_random_state42"

# True -> evaluate the artificial ood setting; predictions then need an additional mapping
train_part2_ood = True

zero_shot = False


# artificial ood and zero-shot exclude each other (both False means in-domain)
assert train_part2_ood + zero_shot < 2

assert method in ("Vanilla", "Deep Ensemble", "MC Dropout")


#####################################
########## Seeds ####################

seed = 42  # 42 standard, 0, 21, 99, 365

# Seed every random number generator this notebook relies on so that a
# Restart & Run All reproduces the same numbers.
random.seed(seed)

np.random.seed(seed)

# TensorFlow keeps its own global seed, which the two calls above do not touch;
# without it the stochastic TensorFlow ops (e.g. dropout) differ from run to run.
tf.random.set_seed(seed)



#####################################
######### Necessary Settings ########

# training language: EN, DE, RO ("All languages" is only a choice with a standard bert_type)
language = "EN"

# test language: EN, DE, RO ("All languages" is only a choice with a standard bert_type)
test_language = "DE"

# "specific" vs "multi"
language_model_relation = "specific"

# Roberta, Distilbert, BERT, RobertaXLM
bert_type = "Distilbert"

# 5 is standard for bert models, 10 for roberta models
epochs = 5

# 5e-5 or 2e-5, 1e-4, 4e-4
learning_rate = 1e-4

# last, last4 --> automatically defaults to last2 for distilbert models
layer_strategy = 'last4'

# cls, mean, max
reduce_strategy = 'cls'

# mean, max, concat --> only relevant for the last4 layer strategy
last_layer_strategy = "concat"

# only takes effect with layer wise lr
decay_factor = 1

layer_wise_lr = False


# zero-shot requires the "multi" relation and a test language that differs from the training language
if zero_shot:
    assert language_model_relation == "multi"
    assert language != test_language

#####################################
#########  Default Settings  ########

# None of the values below was changed in the end; some of them are still
# used for logging/naming purposes.

size = "medium"  # "medium" was the only size used in the end

cased = False  # the recommended case of each model was always used, so do not change

class_weighting = False  # relic of the initial experiments

max_length = 40  # 40 was the only value used

batch_size = 32  # 32 was the only value used

val_size = 0.2  # 0.2 was the only value used

num_ensemble = 1  # with the Vanilla method this is only used for logging

# Placeholder for training the classification head first only. The training loop
# has a placeholder for it, but it should be adjusted before use
# (e.g. set different learning rates for the two training phases).
freeze = False

tf.get_logger().setLevel('ERROR')


In [3]:
# Disabled diagnostic kept as a bare string literal: nothing inside it is executed,
# the cell only echoes the text. The enclosed snippet would print the result of
# device_lib.list_local_devices().
'''
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())
'''

'\nfrom tensorflow.python.client import device_lib\nprint(device_lib.list_local_devices())\n'

# Loading

In [4]:
# Load the data splits plus the label dictionaries needed by the chosen setting.
if zero_shot:
    # zero-shot: splits are loaded for the test language, the training label
    # dictionary for the training language
    X_train, X_test, y_train, y_test, n_classes_test, label_dict_test = load(language=test_language, seed=seed)
    label_dict_train, n_classes_train = get_train_dict(language=language)
else:
    X_train, X_test, y_train, y_test, n_classes_test, label_dict_test = load(language=language, seed=seed)
    # the artificial ood setting additionally needs the reduced label dictionary
    if train_part2_ood:
        label_dict_train, n_classes_reduced = get_reduced_label_dict(language=language)


  X_train, X_test, y_train, y_test, n_classes_test, label_dict_test = load(language=language, seed=seed)
  label_dict_train, n_classes_reduced = get_reduced_label_dict(language=language)


In [5]:
# number of items in X_train as returned by the loading cell
len_train = len(X_train)

In [6]:
# Obtain tokenizer, transformer model and config for the configured model variant.
tokenizer, transformer_model, config = load_model(
    bert_type=bert_type,
    cased=cased,
    language=language,
    language_model_relation=language_model_relation,
    layer_strategy=layer_strategy,
)

output hidden


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.
2022-01-07 14:40:44.118421: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2022-01-07 14:40:44.121221: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:65:00.0 name: GeForce GTX 1080 Ti computeCapability: 6.1
coreClock: 1.582GHz coreCount: 28 deviceMemorySize: 10.92GiB deviceMemoryBandwidth: 451.17GiB/s
2022-01-07 14:40:44.121246: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2022-01-07 14:40:44.124758: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcub

PreTrainedTokenizer(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
<transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertModel object at 0x7f070b52bd00>


In [7]:
#######################################
### ----- Evaluate the model ------ ###
# Ready test data

# one-hot encode the test labels over the n_classes_test classes
y_test_categorical = to_categorical(y_test, num_classes=n_classes_test)

# tokenize the test texts (truncated at max_length) into TensorFlow tensors
test_x = tokenizer(
    text=list(X_test),
    max_length=max_length,
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_token_type_ids=False,
    return_tensors='tf',
    verbose=True,
)

In [8]:
# Build the classifier for the single-model methods (Vanilla and MC Dropout).
# For any other method this cell does nothing.
if method in ("Vanilla", "MC Dropout"):
    # The evaluation setting decides how many classes the classifier is built with.
    if train_part2_ood:
        setting_name, n_classes_model = "Out-of-Domain", n_classes_reduced
    elif zero_shot:
        setting_name, n_classes_model = "Zero-Shot", n_classes_train
    else:
        setting_name, n_classes_model = "In-Domain", n_classes_test
    print("Model for " + method + " & " + setting_name)

    # arguments shared by every builder
    builder_kwargs = dict(
        seed=None,
        tokenizer=tokenizer,
        transformer_model=transformer_model,
        max_length=max_length,
        bert_type=bert_type,
        reduce_strategy=reduce_strategy,
        config=config,
        n_classes=n_classes_model,
    )

    if method == "MC Dropout":
        # the *_training builders additionally receive training=True
        build_last, build_last4 = build_classifier_model_training, build_classifier_model_last4_training
        builder_kwargs["training"] = True
    else:
        build_last, build_last4 = build_classifier_model, build_classifier_model_last4

    if layer_strategy == 'last':
        model = build_last(**builder_kwargs)
    elif layer_strategy == 'last4':
        model = build_last4(last_layer_strategy=last_layer_strategy, **builder_kwargs)
    else:
        # Fail loudly instead of calling summary() on an unbound name or on a
        # stale `model` left over from an earlier run of this cell.
        raise ValueError("layer_strategy must be 'last' or 'last4', got " + repr(layer_strategy))
    model.summary()

# Vanilla

In [9]:
if method == "Vanilla":
    # With a single forward pass the same probability array is handed to the
    # metric helpers as "samples", "mean" and "variance".
    run_description = dict(
        language=language,
        layer_strategy=layer_strategy,
        last_layer_strategy=last_layer_strategy,
        reduce_strategy=reduce_strategy,
        decay_factor=decay_factor,
        language_model_relation=language_model_relation,
        cased=cased,
        bert_type=bert_type,
        learning_rate=learning_rate,
        seed=seed,
        model_path=model_path,
    )
    prediction_kwargs = dict(model_path=model_path, test_x=test_x, model=model, ood=train_part2_ood, zero_shot=zero_shot)

    if train_part2_ood:
        print("mapping predictions --> ood")
        predictions_probs = create_Vanilla_pred(label_dict_train=label_dict_train, label_dict_test=label_dict_test, **prediction_kwargs)
        compute_metrics(
            method=method,
            y_test=y_test,
            y_test_categorical=y_test_categorical,
            dropout_probs_samples=predictions_probs,
            dropout_probs_mean=predictions_probs,
            dropout_probs_var=predictions_probs,
            num_ensemble=num_ensemble,
            **run_description,
        )
    elif zero_shot:
        print("mapping predictions --> zero_shot")
        df_dict = {}
        predictions_probs = create_Vanilla_pred(label_dict_train=label_dict_train, label_dict_test=label_dict_test, **prediction_kwargs)
        for k in [1, 3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
            df = compute_metrics_zero_shot(
                method=method,
                k=k,
                y_test=y_test,
                y_test_categorical=y_test_categorical,
                dropout_probs_samples=predictions_probs,
                dropout_probs_mean=predictions_probs,
                dropout_probs_var=predictions_probs,
                num_ensemble=1,
                **run_description,
            )
            # one csv per k; the file name encodes the complete run configuration
            name_head = "_".join([
                "combined", str(k), model_path, str(language), str(layer_strategy),
                str(last_layer_strategy), str(reduce_strategy), str(decay_factor),
                str(language_model_relation), str(cased), str(bert_type), str(learning_rate),
            ])
            name_tail = "_random_state" + str(seed) + "_technique_" + str(method) + "_ensembles_" + str(num_ensemble) + ".csv"
            df_dict[name_head + datetime.now().strftime("%Y%m%d-%H%M%S") + name_tail] = df
        # write the collected frames only after every k has been processed
        for csv_name, frame in df_dict.items():
            frame.to_csv(csv_name)
    else:
        print("NOT mapping predictions --> in-domain")
        predictions_probs = create_Vanilla_pred(**prediction_kwargs)
        compute_metrics(
            method=method,
            y_test=y_test,
            y_test_categorical=y_test_categorical,
            dropout_probs_samples=predictions_probs,
            dropout_probs_mean=predictions_probs,
            dropout_probs_var=predictions_probs,
            num_ensemble=num_ensemble,
            **run_description,
        )
    

# MC Dropout

In [10]:
if method == "MC Dropout":
    # Values handed to mc_predictions as `num_ensemble`. The loops below use a
    # dedicated variable (`n_passes`) so that the `num_ensemble` configured in the
    # settings cell is not overwritten as a side effect of running this cell.
    mc_ensemble_sizes = [3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    top_k_values = [1, 3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    # keyword arguments identical for every prediction / metric call of this cell
    mc_prediction_kwargs = dict(model_path=model_path, test_x=test_x, model=model, ood=train_part2_ood, zero_shot=zero_shot)
    mc_run_description = dict(
        language=language,
        layer_strategy=layer_strategy,
        last_layer_strategy=last_layer_strategy,
        reduce_strategy=reduce_strategy,
        decay_factor=decay_factor,
        language_model_relation=language_model_relation,
        cased=cased,
        bert_type=bert_type,
        learning_rate=learning_rate,
        seed=seed,
        model_path=model_path,
    )

    if train_part2_ood:
        print("mapping predictions --> ood")
        for n_passes in mc_ensemble_sizes:
            print(n_passes)
            dropout_probs_samples, dropout_probs_mean, dropout_probs_var = mc_predictions(num_ensemble=n_passes, label_dict_train=label_dict_train, label_dict_test=label_dict_test, **mc_prediction_kwargs)

            compute_metrics(
                method=method,
                y_test=y_test,
                y_test_categorical=y_test_categorical,
                dropout_probs_samples=dropout_probs_samples,
                dropout_probs_mean=dropout_probs_mean,
                dropout_probs_var=dropout_probs_var,
                num_ensemble=n_passes,
                **mc_run_description,
            )
    elif zero_shot:
        print("mapping predictions --> zero-shot")
        df_dict = {}
        for n_passes in mc_ensemble_sizes:
            dropout_probs_samples, dropout_probs_mean, dropout_probs_var = mc_predictions(num_ensemble=n_passes, label_dict_train=label_dict_train, label_dict_test=label_dict_test, **mc_prediction_kwargs)
            for k in top_k_values:
                df = compute_metrics_zero_shot(
                    method=method,
                    k=k,
                    y_test=y_test,
                    y_test_categorical=y_test_categorical,
                    dropout_probs_samples=dropout_probs_samples,
                    dropout_probs_mean=dropout_probs_mean,
                    dropout_probs_var=dropout_probs_var,
                    num_ensemble=n_passes,
                    **mc_run_description,
                )
                # one csv per (n_passes, k); the file name encodes the complete run configuration
                name_head = "_".join([
                    "combined", str(k), model_path, str(language), str(layer_strategy),
                    str(last_layer_strategy), str(reduce_strategy), str(decay_factor),
                    str(language_model_relation), str(cased), str(bert_type), str(learning_rate),
                ])
                name_tail = "_random_state" + str(seed) + "_technique_" + str(method) + "_ensembles_" + str(n_passes) + ".csv"
                df_dict[name_head + datetime.now().strftime("%Y%m%d-%H%M%S") + name_tail] = df
        # write the collected frames only after all combinations have been processed
        for csv_name, frame in df_dict.items():
            frame.to_csv(csv_name)
    else:
        print("NOT mapping predictions --> in-domain")
        for n_passes in mc_ensemble_sizes:
            print(n_passes)
            dropout_probs_samples, dropout_probs_mean, dropout_probs_var = mc_predictions(num_ensemble=n_passes, **mc_prediction_kwargs)

            compute_metrics(
                method=method,
                y_test=y_test,
                y_test_categorical=y_test_categorical,
                dropout_probs_samples=dropout_probs_samples,
                dropout_probs_mean=dropout_probs_mean,
                dropout_probs_var=dropout_probs_var,
                num_ensemble=n_passes,
                **mc_run_description,
            )

# Deep Ensemble

## First execute the following cell to obtain deep ensemble predictions

In [None]:
if method == "Deep Ensemble":
    # Every setting issues the same call; only the announcement and the number
    # of classes passed as n_classes differ.
    if train_part2_ood:
        print("Model for Deep Ensemble & OOD")
        n_classes_model = n_classes_reduced
    elif zero_shot:
        print("Model for Deep Ensemble & Zero-Shot")
        n_classes_model = n_classes_train
    else:
        print("Model for Deep Ensemble & In-Domain")
        n_classes_model = n_classes_test

    save_single_model_predictions(
        test_x=test_x,
        tokenizer=tokenizer,
        transformer_model=transformer_model,
        max_length=max_length,
        bert_type=bert_type,
        reduce_strategy=reduce_strategy,
        layer_strategy=layer_strategy,
        last_layer_strategy=last_layer_strategy,
        config=config,
        language=language,
        language_model_relation=language_model_relation,
        cased=cased,
        learning_rate=learning_rate,
        seed=seed,
        decay_factor=decay_factor,
        num_ensemble=num_ensemble,
        n_classes=n_classes_model,
    )


Model for Deep Ensemble & OOD
EN_last4_concat_cls_1_specific_False_Distilbert_0.0001_5_20220104-171641_random_state42
EN_last4_concat_cls_1_specific_False_Distilbert_0.0001_5_20220104-185410_random_state42


## Then after saving the predictions, load them and compute the uncertainty metrics

In [None]:
# .npy file that holds the previously saved single-model predictions
load_path = "EN_last4_concat_cls_1_multi_False_Distilbert_0.0001_20220105-100906_random_state42_technique_For Deep Ensemble_ensembles_1.npy"
if method == "Deep Ensemble":
    # Values handed to create_deep_ensemble_predictions as `num_ensemble`. The loops
    # use a dedicated variable (`n_members`) so that the `num_ensemble` configured in
    # the settings cell -- which the prediction-saving cell reads -- is not
    # overwritten as a side effect of running this cell.
    ensemble_sizes = [3, 4, 5, 6, 7, 8, 9, 10]
    top_k_values = [1, 3, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

    # keyword arguments identical for every prediction / metric call of this cell
    ensemble_prediction_kwargs = dict(load_path=load_path, ood=train_part2_ood, zero_shot=zero_shot)
    ensemble_run_description = dict(
        language=language,
        layer_strategy=layer_strategy,
        last_layer_strategy=last_layer_strategy,
        reduce_strategy=reduce_strategy,
        decay_factor=decay_factor,
        language_model_relation=language_model_relation,
        cased=cased,
        bert_type=bert_type,
        learning_rate=learning_rate,
        seed=seed,
        model_path=model_path,
    )

    if train_part2_ood:
        print("mapping predictions --> ood")
        for n_members in ensemble_sizes:
            print(n_members)
            model_prob_samples, model_probs_mean, model_probs_var = create_deep_ensemble_predictions(num_ensemble=n_members, label_dict_train=label_dict_train, label_dict_test=label_dict_test, **ensemble_prediction_kwargs)
            compute_metrics(
                method=method,
                y_test=y_test,
                y_test_categorical=y_test_categorical,
                dropout_probs_samples=model_prob_samples,
                dropout_probs_mean=model_probs_mean,
                dropout_probs_var=model_probs_var,
                num_ensemble=n_members,
                **ensemble_run_description,
            )
    elif zero_shot:
        print("mapping predictions --> zero-shot")
        df_dict = {}
        for n_members in ensemble_sizes:
            model_prob_samples, model_probs_mean, model_probs_var = create_deep_ensemble_predictions(num_ensemble=n_members, label_dict_train=label_dict_train, label_dict_test=label_dict_test, **ensemble_prediction_kwargs)
            for k in top_k_values:
                df = compute_metrics_zero_shot(
                    method=method,
                    k=k,
                    y_test=y_test,
                    y_test_categorical=y_test_categorical,
                    dropout_probs_samples=model_prob_samples,
                    dropout_probs_mean=model_probs_mean,
                    dropout_probs_var=model_probs_var,
                    num_ensemble=n_members,
                    **ensemble_run_description,
                )
                # one csv per (n_members, k); the file name encodes the complete run configuration
                name_head = "_".join([
                    "combined", str(k), model_path, str(language), str(layer_strategy),
                    str(last_layer_strategy), str(reduce_strategy), str(decay_factor),
                    str(language_model_relation), str(cased), str(bert_type), str(learning_rate),
                ])
                name_tail = "_random_state" + str(seed) + "_technique_" + str(method) + "_ensembles_" + str(n_members) + ".csv"
                df_dict[name_head + datetime.now().strftime("%Y%m%d-%H%M%S") + name_tail] = df
        # write the collected frames only after all combinations have been processed
        for csv_name, frame in df_dict.items():
            frame.to_csv(csv_name)
    else:
        print("NOT mapping predictions --> in-domain")
        for n_members in ensemble_sizes:
            print(n_members)
            model_prob_samples, model_probs_mean, model_probs_var = create_deep_ensemble_predictions(num_ensemble=n_members, **ensemble_prediction_kwargs)

            compute_metrics(
                method=method,
                y_test=y_test,
                y_test_categorical=y_test_categorical,
                dropout_probs_samples=model_prob_samples,
                dropout_probs_mean=model_probs_mean,
                dropout_probs_var=model_probs_var,
                num_ensemble=n_members,
                **ensemble_run_description,
            )

mapping predictions --> zero-shot
k: 1


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


k: 3
k: 5
k: 10
k: 20
k: 30
k: 40
k: 50


# In case of OOM errors with MC Dropout,  the following may help

Restart the kernel after, say, 70 dropout passes, and then run the larger numbers of dropout passes one at a time instead of in a loop.

Save the predictions of >70 dropout passes, similar to the procedure in the deep ensemble case, and load them again after restarting the kernel.

In [None]:
# Disabled snippet kept as a bare string literal: nothing inside it is executed.
# The enclosed code collects num_ensemble results of mc_dropout_sampling(test_x),
# applies a softmax to each, stores the stacked samples with np.save and reads
# them back with np.load.
'''
# Monte Carlo dropout inference.
dropout_logit_samples = [mc_dropout_sampling(test_x) for _ in range(num_ensemble)]
dropout_prob_samples = [tf.nn.softmax(dropout_logits, axis=-1) for dropout_logits in dropout_logit_samples]
dropout_prob_samples = tf.stack([dropout_prob_samples])[0]

np.save(str(num_ensemble) +" dropout ensembles samples", dropout_prob_samples)


dropout_prob_samples = np.load(str(num_ensemble) + " dropout ensembles samples.npy")
'''

Split the variance computation (if the variance computation is the bottleneck):

In [None]:
# Disabled snippet kept as a bare string literal: nothing inside it is executed.
# The enclosed code computes the variance over axis 0 separately for four slices
# of the third axis (0-60, 60-120, 120-180, 180-end) and joins the four partial
# results with np.concatenate along axis 1.
'''
dropout_probs_var_1 = tf.math.reduce_variance(dropout_prob_samples[:,:,:60], axis=0)
dropout_probs_var_2 = tf.math.reduce_variance(dropout_prob_samples[:,:,60:120], axis=0)
dropout_probs_var_3 = tf.math.reduce_variance(dropout_prob_samples[:,:,120:180], axis=0)
dropout_probs_var_4 = tf.math.reduce_variance(dropout_prob_samples[:,:,180:], axis=0)


dropout_probs_var = np.concatenate((dropout_probs_var_1,dropout_probs_var_2,dropout_probs_var_3,dropout_probs_var_4), axis=1)
'''
