## Initialization

In [None]:
# Print the nvidia-smi report to check which GPU model (if any) is attached to this runtime.
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive; WORKING_DIR (set in the next cell) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Project root on the mounted Drive. It is also exported as an environment
# variable so that the value is visible to shell commands as well.
WORKING_DIR = "/content/drive/MyDrive/Research/CogEval-1/SNLI"
os.environ.update(WORKING_DIR=WORKING_DIR)

In [None]:
# Change into the project directory so the relative ./checkpoints, ./data, ./human_data and ./output paths used below resolve.
%cd $WORKING_DIR

In [None]:
!pip install allennlp allennlp-models

### Check models

In [None]:
!mkdir $WORKING_DIR/output

In [None]:
!mkdir $WORKING_DIR/output/RoBERTa

In [None]:
# Check the RoBERTa archive from the command line: run `allennlp predict` over
# ./data/snli_1.0_train_lalor.jsonl (read via --use-dataset-reader) and write the
# predictions to ./output/RoBERTa/snli_roberta.txt.
!allennlp predict ./checkpoints/RoBERTa/model.tar.gz ./data/snli_1.0_train_lalor.jsonl  --use-dataset-reader --output-file ./output/RoBERTa/snli_roberta.txt

In [None]:
!mkdir $WORKING_DIR/output/ESIM

In [None]:
# Check the ESIM archive from the command line: run `allennlp predict` over
# ./data/snli_1.0_train_lalor.jsonl (read via --use-dataset-reader) and write the
# predictions to ./output/ESIM/snli_esim.txt.
!allennlp predict ./checkpoints/ESIM/model.tar.gz ./data/snli_1.0_train_lalor.jsonl  --use-dataset-reader --output-file ./output/ESIM/snli_esim.txt

### Run inference

#### RoBERTa

In [None]:
# RoBERTa settings: model archive, result CSV paths, and the keys under which
# the predictor output holds the logits and the probabilities.
# NOTE: the ESIM cell assigns the same variables, so whichever of the two
# configuration cells ran last decides which model the rest of the notebook uses.
model_path = "./checkpoints/RoBERTa/model.tar.gz"
output_path_noncalibrated, output_path_calibrated = (
    "./output/RoBERTa/lalor_snli_no_calibration.csv",
    "./output/RoBERTa/lalor_snli_calibrated_mc_dropout_50.csv",
)
logits_key_name, prob_key_name = "logits", "probs"

#### ESIM

In [None]:
# ESIM settings: model archive, result CSV paths, and the keys under which
# the predictor output holds the logits and the probabilities.
# NOTE: the RoBERTa cell assigns the same variables, so whichever of the two
# configuration cells ran last decides which model the rest of the notebook uses.
model_path = "./checkpoints/ESIM/model.tar.gz"
output_path_noncalibrated, output_path_calibrated = (
    "./output/ESIM/lalor_snli_no_calibration.csv",
    "./output/ESIM/lalor_snli_calibrated_mc_dropout_50.csv",
)
logits_key_name, prob_key_name = "label_logits", "label_probs"

#### Load the model

In [None]:
# try running this cell several times in case of errors
from allennlp_models.pair_classification.predictors.textual_entailment import TextualEntailmentPredictor
from allennlp.models import archival

# Load the archive selected by `model_path` and wrap it in a textual-entailment
# predictor; `predictor` is the object every inference cell below calls.
archive = archival.load_archive(model_path)
predictor = TextualEntailmentPredictor.from_archive(archive=archive, predictor_name='textual_entailment')

In [None]:
# Disabled alternative to the cell above: build the predictor from the
# "pair-classification-roberta-snli" pretrained model id instead of a local archive.
# from allennlp_models.pretrained import load_predictor
# predictor = load_predictor("pair-classification-roberta-snli")

In [None]:
# Smoke test: push a single premise/hypothesis pair through the loaded predictor
# and print the raw output dictionary.
premise, hypothesis = "It's a cat.", "It's Monday."
preds = predictor.predict(premise, hypothesis)

print(preds)

In [None]:
import pandas as pd

# Read ./human_data/snli_human_4gs.csv; pandas' defaults already assume a
# comma separator and a header in the first row.
df = pd.read_csv('./human_data/snli_human_4gs.csv')

df

#### Running without calibration

In [None]:
from scipy.stats import entropy

# Per-row probabilities of class indices 0, 1 and 2 (in dataframe row order).
label_1, label_2, label_3 = [], [], []

# Both dictionaries are keyed by snli_id.
sample_entropy = {}
predictions = {}

# Number of rows whose gold label equals the predicted label.
correct = 0

for ind, row in df.iterrows():

    premise, hypothesis = row['sentence_1'], row['sentence_2']
    preds = predictor.predict(premise, hypothesis)

    probs = preds[prob_key_name]
    label_1.append(probs[0])
    label_2.append(probs[1])
    label_3.append(probs[2])

    guid = row['snli_id']

    # Entropy (in bits) of the predicted class distribution.
    sample_entropy[guid] = entropy(probs, base=2)
    predictions[guid] = preds['label']

    correct += int(row['label'] == preds['label'])

# Accuracy without calibration.
print(correct / len(df))

In [None]:
# Attach the uncalibrated class probabilities; the lists were filled in dataframe row order.
df['label_1'] = label_1
df['label_2'] = label_2
df['label_3'] = label_3

# `predictions` and `sample_entropy` are dictionaries keyed by snli_id, so look
# the values up by id instead of assigning list(dict.values()) positionally:
# a repeated snli_id would leave fewer dictionary entries than rows and the
# positional assignment would fail with a length mismatch.
df['pred'] = df['snli_id'].map(predictions)
df['softmax_entropy'] = df['snli_id'].map(sample_entropy)

df

In [None]:
# Save the uncalibrated results (header row included by default, no index column).
df.to_csv(output_path_noncalibrated, index=False)

## MC Dropout

In [None]:
# switch to train mode to enable drop out layers
# (MC dropout relies on this: the repeated predict() calls in the next cell are
# then expected to give different outputs for the same input)
# NOTE(review): train() presumably also switches any other train/eval-dependent
# layers of the model; confirm this is acceptable for the loaded checkpoint.
predictor._model.train()

In [None]:
from scipy.special import softmax

# number of MC dropout iterations
iterations = 50

# snli_id -> running element-wise sum of the softmax distributions over all passes
sm_sum = dict()

for counter in range(iterations):

    print('Iteration: ', counter)

    # accuracy of this single stochastic pass
    correct = 0

    for ind, row in df.iterrows():

        premise = row['sentence_1']
        hypothesis = row['sentence_2']
        preds = predictor.predict(premise, hypothesis)

        # softmax() returns a NumPy array, one entry per class
        scores = softmax(preds[logits_key_name])

        guid = row['snli_id']

        if guid in sm_sum:
            # element-wise accumulation; does not hard-code the number of classes
            sm_sum[guid] += scores
        else:
            # copy so the accumulator never aliases the array of the current pass
            sm_sum[guid] = scores.copy()

        if row['label'] == preds['label']:
            correct += 1

    print(correct / len(df))

In [None]:
# calculate means: divide every accumulated softmax sum by the number of passes
# NOTE: the result is stored back into sm_sum, so running this cell a second
# time divides by `iterations` again.
sm_sum = {guid: total / iterations for guid, total in sm_sum.items()}

In [None]:
# calculate entropies (in bits) of the averaged class distributions
from scipy.stats import entropy

sample_entropy = {guid: entropy(sm_sum[guid], base=2) for guid in sorted(sm_sum)}

In [None]:
# get the predictions: index of the largest averaged probability per sample
import numpy as np

predictions = {guid: np.argmax(sm_sum[guid]) for guid in sorted(sm_sum)}

In [None]:
# Map class indices (positions in the logits / probability vectors) to label strings.
# The uncalibrated predictions use the label strings produced by the model itself,
# so take the mapping from the model's own label vocabulary; a hard-coded order
# can silently disagree with the loaded checkpoint.
# NOTE(review): assumes the model keeps its labels in the "labels" vocabulary
# namespace; the original hard-coded order is kept as a fallback if it is empty.
_default_label_map = {
    0: "entailment",
    1: "contradiction",
    2: "neutral"
}
_vocab_label_map = predictor._model.vocab.get_index_to_token_vocabulary("labels")
label_map = dict(_vocab_label_map) if _vocab_label_map else _default_label_map

In [None]:
# Collect the calibrated values in dataframe row order, looked up by snli_id.
# NOTE: `entropy` and `preds` are rebound to plain lists here; earlier cells use
# these names for scipy's entropy function and for the predictor output.
sample_ids = list(df['snli_id'])

prob_0 = [sm_sum[sample_id][0] for sample_id in sample_ids]
prob_1 = [sm_sum[sample_id][1] for sample_id in sample_ids]
prob_2 = [sm_sum[sample_id][2] for sample_id in sample_ids]
entropy = [sample_entropy[sample_id] for sample_id in sample_ids]
preds = [label_map[predictions[sample_id]] for sample_id in sample_ids]

In [None]:
# Write the calibrated probabilities, predicted labels and entropies into the frame.
calibrated_columns = {
    'prob_0': prob_0,
    'prob_1': prob_1,
    'prob_2': prob_2,
    "pred": preds,
    "softmax_entropy": entropy,
}
for column_name, column_values in calibrated_columns.items():
    df[column_name] = column_values
df

In [None]:
# wrong predictions: rows whose gold label differs from the calibrated prediction
df.loc[df['label'] != df['pred']]

In [None]:
# calibrated accuracy: one minus the share of mismatching rows
1 - len(df[df['label'] != df['pred']]) / len(df)

In [None]:
# some formatting: mark each probability column as uncalibrated or calibrated
df = df.rename(columns={
    "label_1": "prob_0_no_cal",
    "label_2": "prob_1_no_cal",
    "label_3": "prob_2_no_cal",
    "prob_0": "prob_0_with_cal",
    "prob_1": "prob_1_with_cal",
    "prob_2": "prob_2_with_cal",
})
# Remove 'softmax_entropy' and re-add its values as the last column under the new name.
df['entropy_with_cal'] = df.pop('softmax_entropy')

In [None]:
# Save the calibrated results (header row included by default, no index column).
df.to_csv(output_path_calibrated, index=False)