## Initialization

In [None]:
# Check which GPU model (if any) is attached to this runtime.
# The leading `!` hands the rest of the line to the system shell.
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive so that the project files stored under
# /content/drive/MyDrive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Root folder of the SNLI experiment on the mounted Drive.
WORKING_DIR = "/content/drive/MyDrive/Research/CogEval-1/SNLI"
# Also export the path as an environment variable so that it is visible to
# child processes (e.g. the `!` shell commands below), not only to Python.
os.environ['WORKING_DIR'] = WORKING_DIR

In [None]:
# Make WORKING_DIR the current directory: the relative "./checkpoints",
# "./data" and "./output" paths used in later cells are resolved against it.
%cd $WORKING_DIR

In [None]:
!pip install allennlp allennlp-models

### Check models

In [None]:
!mkdir $WORKING_DIR/output

In [None]:
!mkdir $WORKING_DIR/output/RoBERTa

In [None]:
# Check the RoBERTa archive: run the AllenNLP command-line predictor over the
# SNLI jsonl file and write the raw predictions to
# ./output/RoBERTa/snli_roberta.txt (paths are relative to WORKING_DIR).
!allennlp predict ./checkpoints/RoBERTa/model.tar.gz ./data/snli_1.0_train_lalor.jsonl  --use-dataset-reader --output-file ./output/RoBERTa/snli_roberta.txt

In [None]:
!mkdir $WORKING_DIR/output/ESIM

In [None]:
# Check the ESIM archive: run the AllenNLP command-line predictor over the
# SNLI jsonl file and write the raw predictions to
# ./output/ESIM/snli_esim.txt (paths are relative to WORKING_DIR).
!allennlp predict ./checkpoints/ESIM/model.tar.gz ./data/snli_1.0_train_lalor.jsonl  --use-dataset-reader --output-file ./output/ESIM/snli_esim.txt

### Run inference

#### RoBERTa

In [1]:
# RoBERTa configuration. Run either this cell or the ESIM cell, not both:
# they assign the same variables, so whichever ran last wins.

# Model archive loaded by the "Load the model" cell.
model_path = "./checkpoints/RoBERTa-deneme/model.tar.gz"
# Destination CSVs for the single-pass run and for the MC-dropout run.
output_path_noncalibrated = "./output_deneme/RoBERTa-deneme/lalor_snli_no_calibration.csv"
output_path_calibrated = "./output_deneme/RoBERTa-deneme/lalor_snli_calibrated_mc_dropout_50.csv"
# Keys of the prediction dict that hold the raw logits and the class
# probabilities for this model (they appear in the sample prediction printed
# by the test run further down).
logits_key_name = "logits"
prob_key_name = "probs"

#### ESIM

In [None]:
# ESIM configuration. Assigns the same variables as the RoBERTa cell, so run
# only one of the two; whichever ran last wins.

# NOTE(review): this assignment is dead -- it is overwritten on the next line.
model_path = "./checkpoints/ESIM/model.tar.gz"
# NOTE(review): absolute, user-specific path that will not exist on other
# machines. Decide which of the two archives is intended and drop the other.
model_path = "/kuacc/users/mugekural/workfolder/dev/git/cogeval/model/language-inference/model_output_esim_deneme/model.tar.gz"
# Destination CSVs for the single-pass run and for the MC-dropout run.
output_path_noncalibrated = "./output2/ESIM/lalor_snli_no_calibration.csv"
output_path_calibrated = "./output2/ESIM/lalor_snli_calibrated_mc_dropout_50.csv"
# Keys used to read logits / probabilities out of the prediction dict.
# Presumably these are the names the ESIM model emits -- no ESIM output is
# shown in this notebook, so verify against an actual prediction.
logits_key_name = "label_logits"
prob_key_name = "label_probs"

In [None]:
# Debugging aid: print the current working directory, i.e. the folder that
# the relative paths in the configuration cells are resolved against.
!pwd

#### Load the model

In [2]:
# try running this cell several times in case of errors
from allennlp_models.pair_classification.predictors.textual_entailment import TextualEntailmentPredictor
from allennlp.models import archival

# Load the archive selected by `model_path` (set in one of the configuration
# cells) and wrap it in a textual-entailment predictor. `predictor` is the
# object every later cell calls `.predict(premise, hypothesis)` on.
archive = archival.load_archive(model_path)
predictor = TextualEntailmentPredictor.from_archive(archive=archive, predictor_name='textual_entailment')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# A test model run
# Smoke test on one hand-written sentence pair. Printing the raw prediction
# dict shows which keys the loaded model returns; later cells index it with
# `logits_key_name` / `prob_key_name`, so those must be among the keys.

premise = "It's a cat."
hypothesis = "It's Monday."
preds = predictor.predict(premise, hypothesis)

print(preds)

{'logits': [-3.872497797012329, 0.5222140550613403, 2.680448293685913], 'probs': [0.0012766036670655012, 0.1034320741891861, 0.8952913880348206], 'token_ids': [0, 243, 18, 10, 4758, 4, 2, 2, 243, 18, 302, 4, 2], 'label': 'neutral', 'tokens': ['<s>', 'It', "'s", 'Ġa', 'Ġcat', '.', '</s>', '</s>', 'It', "'s", 'ĠMonday', '.', '</s>']}


In [4]:
import pandas as pd

# Load the human-annotated SNLI samples (comma-separated, first row = header).
# The path is relative to the current working directory.
human_data_csv = '../../data/human/SNLI-lalor/snli_human_4gs.csv'
df = pd.read_csv(human_data_csv, header=0, sep=',')

#### Running without calibration

In [5]:
from scipy.stats import entropy

# One prediction per sample with the model as loaded (no MC-dropout averaging).
#
# The per-sample results are stored in dicts keyed by snli_id and are later
# copied back into df by position, so duplicate ids would silently drop or
# misalign rows. Fail fast before starting the (slow) prediction loop.
assert df['snli_id'].is_unique, "snli_id values must be unique: results are keyed by them"

# Probability of class index 0 / 1 / 2 for every row, in row order.
label_1 = []
label_2 = []
label_3 = []

# snli_id -> entropy (in bits) of the predicted distribution
sample_entropy = dict()
# snli_id -> predicted label string
predictions = dict()
# snli_id -> highest class probability
confs = dict()

correct = 0

for ind, row in df.iterrows():

    premise = row['sentence_1']
    hypothesis = row['sentence_2']
    preds = predictor.predict(premise, hypothesis)

    # Look the probability vector up once instead of on every use.
    probs = preds[prob_key_name]

    label_1.append(probs[0])
    label_2.append(probs[1])
    label_3.append(probs[2])

    guid = row['snli_id']

    sample_entropy[guid] = entropy(probs, base=2)
    predictions[guid] = preds['label']
    confs[guid] = max(probs)

    if row['label'] == preds['label']:
        correct += 1

# Accuracy against the gold label; NaN instead of ZeroDivisionError when df is empty.
print(correct / len(df) if len(df) else float('nan'))

0.8333333333333334


In [6]:
# Attach the single-pass results to df.

# label_1..3 were appended once per row, in row order, so they line up with df.
df['label_1'] = label_1
df['label_2'] = label_2
df['label_3'] = label_3

# The remaining results live in dicts keyed by snli_id. Look each row up by
# its own id rather than relying on the dicts' insertion order happening to
# match the row order of df.
df['pred'] = df['snli_id'].map(predictions)
df['softmax_entropy'] = df['snli_id'].map(sample_entropy)
df['confidence'] = df['snli_id'].map(confs)

df

Unnamed: 0,sample_id,snli_id,sentence_1,sentence_2,label,item_difficulty,average_accuracy,flesch_score_textstat,mean_grade_level_textstat,number_of_words,number_of_characters,label_1,label_2,label_3,pred,softmax_entropy,confidence
0,0,1947351225.jpg#0r1c,A little boy is opening gifts surrounded by a ...,The boy is being punished,contradiction,-1.759822,0.839139,78.75,6.0,19,101,0.002024,0.942937,0.055039,contradiction,0.328293,0.942937
1,1,3626964430.jpg#0r1e,"People playing cricket in the park, pine trees...","People are playing sports in the park, near th...",entailment,-2.179087,0.886270,94.15,5.0,22,118,0.925892,0.000916,0.073191,entailment,0.388188,0.925892
2,2,4576144189.jpg#3r1e,Some people hanging out on a large backyard deck.,people hanging out on deck,entailment,-3.137178,0.951844,81.29,3.0,14,76,0.983114,0.000377,0.016509,entailment,0.126183,0.983114
3,3,507370108.jpg#3r1n,A group of dancers are performing.,The audience is silent.,neutral,-1.982105,0.865779,83.32,3.0,10,58,0.002088,0.252782,0.745130,neutral,0.836381,0.745130
4,4,3361210233.jpg#0r1n,A large brown and white dog is carrying a stic...,A puppy is playing fetch with a stick.,neutral,-0.280872,0.565574,93.14,4.0,24,118,0.005251,0.005115,0.989635,neutral,0.093568,0.989635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,85,3381788544.jpg#0r1c,"A group of asian women in sports attire, and o...",Men are playing with a dog,contradiction,-0.622317,0.641393,84.17,5.0,25,122,0.000241,0.999244,0.000515,contradiction,0.009608,0.999244
86,86,3070485870.jpg#3r1n,A snowboarder is jumping over a snow slope.,A girl jumps her green snowboard over a slope.,neutral,-0.885338,0.695697,96.69,6.0,17,90,0.002433,0.005654,0.991913,neutral,0.074961,0.991913
87,87,4788967880.jpg#0r1n,a group of people on a dock lowering into the ...,The group was getting ready to go fishing on t...,neutral,-1.689649,0.829918,73.68,7.0,29,147,0.004945,0.019603,0.975452,neutral,0.184060,0.975452
88,88,4831683216.jpg#0r1c,A young girl in a bathing suit drinking a beve...,The girl is drinking milk from a sippy cup.,contradiction,0.523880,0.381148,85.18,6.0,23,113,0.002306,0.962326,0.035368,contradiction,0.244040,0.962326


In [8]:
# Save the single-pass results.
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists first (the `or "."` covers a bare file name).
import os

os.makedirs(os.path.dirname(output_path_noncalibrated) or ".", exist_ok=True)
df.to_csv(output_path_noncalibrated, index=False, header=True)

## MC Dropout

In [9]:
# switch to train mode to enable drop out layers
# With dropout active, repeated predictions for the same input differ; the
# following cells average the softmax outputs over many such passes.
# `_model` is a private attribute of the predictor.
# NOTE(review): the cells shown in this notebook never switch the model back
# with .eval() and set no random seed, so re-running the earlier single-pass
# cell afterwards, or re-running this section, gives different numbers.
predictor._model.train()

BasicClassifier(
  (_text_field_embedder): BasicTextFieldEmbedder(
    (token_embedder_tokens): PretrainedTransformerEmbedder(
      (transformer_model): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 1024, padding_idx=1)
          (position_embeddings): Embedding(514, 1024, padding_idx=1)
          (token_type_embeddings): Embedding(1, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0): RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): Linear(in_features=1024, out_features=1024, bias=True)
                  (key): Linear(in_features=1024, out_features=1024, bias=True)
                  (value): Linear(in_features=1024, out_features=1024, bias=True)
                  (dropout): D

In [10]:
from scipy.special import softmax

# number of MC dropout iterations
iterations = 50

# The softmax vectors are accumulated per snli_id; a duplicated id would be
# added more than once per iteration and its mean would no longer sum to 1.
assert df['snli_id'].is_unique, "snli_id values must be unique: results are keyed by them"

# snli_id -> running sum (numpy array) of the softmax vectors of all iterations
sm_sum = dict()

for counter in range(iterations):

    print('Iteration: ', counter)

    # Accuracy of this single stochastic pass (printed for monitoring only).
    correct = 0

    for ind, row in df.iterrows():

        premise = row['sentence_1']
        hypothesis = row['sentence_2']
        preds = predictor.predict(premise, hypothesis)

        # Class probabilities recomputed from the raw logits of this pass.
        scores = softmax(preds[logits_key_name])

        guid = row['snli_id']

        if guid in sm_sum:
            # Element-wise, in-place add: works for any number of classes.
            sm_sum[guid] += scores
        else:
            sm_sum[guid] = scores.copy()

        if row['label'] == preds['label']:
            correct += 1

    # NaN instead of ZeroDivisionError when df is empty.
    print(correct / len(df) if len(df) else float('nan'))

Iteration:  0
0.8666666666666667
Iteration:  1
0.8444444444444444
Iteration:  2
0.8666666666666667
Iteration:  3
0.8666666666666667
Iteration:  4
0.8666666666666667
Iteration:  5
0.8444444444444444
Iteration:  6
0.8444444444444444
Iteration:  7
0.8333333333333334
Iteration:  8
0.8444444444444444
Iteration:  9
0.8666666666666667
Iteration:  10
0.8666666666666667
Iteration:  11
0.8333333333333334
Iteration:  12
0.8444444444444444
Iteration:  13
0.8888888888888888
Iteration:  14
0.8333333333333334
Iteration:  15
0.8444444444444444
Iteration:  16
0.8333333333333334
Iteration:  17
0.8777777777777778
Iteration:  18
0.8555555555555555
Iteration:  19
0.8555555555555555
Iteration:  20
0.8777777777777778
Iteration:  21
0.8666666666666667
Iteration:  22
0.8333333333333334
Iteration:  23
0.8555555555555555
Iteration:  24
0.8666666666666667
Iteration:  25
0.8222222222222222
Iteration:  26
0.8555555555555555
Iteration:  27
0.8666666666666667
Iteration:  28
0.8555555555555555
Iteration:  29
0.8777777

In [11]:
# Turn the accumulated softmax sums into per-sample means.
# NOTE: sm_sum is overwritten with the means, so running this cell a second
# time without re-running the MC-dropout loop divides by `iterations` again.
sm_sum = {guid: total / iterations for guid, total in sm_sum.items()}

In [12]:
# Calculate the entropy (in bits) and the confidence (highest mean
# probability) of every averaged distribution, visiting the ids in sorted order.
from scipy.stats import entropy

sample_entropy = {guid: entropy(sm_sum[guid], base=2) for guid in sorted(sm_sum)}
confs = {guid: max(sm_sum[guid]) for guid in sorted(sm_sum)}

In [13]:
# Predicted class index per sample: the position of the largest mean
# probability. Indices are turned into label strings via label_map later.
import numpy as np

predictions = {guid: np.argmax(sm_sum[guid]) for guid in sorted(sm_sum)}

In [14]:
# Class index -> label string, used to translate the argmax of the averaged
# probabilities into a label.
# The RoBERTa outputs shown earlier in this notebook agree with this order
# (largest probability at index 0 / 1 / 2 goes with entailment /
# contradiction / neutral).
# NOTE(review): the order is hard-coded; it presumably has to match the label
# vocabulary of whichever archive is loaded -- verify before using it with ESIM.
label_map = {
    0: "entailment",
    1: "contradiction",
    2: "neutral"
}

In [15]:
# Re-order the MC-dropout results (dicts keyed by snli_id) into the row order
# of df, one list per output column.
# NOTE: `entropy` and `preds` rebind names used earlier for the scipy function
# and for the predictor output; the next cell reads them as plain lists.
prob_0 = [sm_sum[sample_id][0] for sample_id in df['snli_id']]
prob_1 = [sm_sum[sample_id][1] for sample_id in df['snli_id']]
prob_2 = [sm_sum[sample_id][2] for sample_id in df['snli_id']]
entropy = [sample_entropy[sample_id] for sample_id in df['snli_id']]
confidences = [confs[sample_id] for sample_id in df['snli_id']]
preds = [label_map[predictions[sample_id]] for sample_id in df['snli_id']]

In [16]:
# Write the MC-dropout results into df, one column per list built in the
# previous cell. "softmax_entropy" and "confidence" already hold the
# single-pass values at this point and are overwritten with the MC-dropout ones.
calibrated_columns = {
    'prob_0': prob_0,
    'prob_1': prob_1,
    'prob_2': prob_2,
    'pred_label': preds,
    'softmax_entropy': entropy,
    'confidence': confidences,
}
for column_name, column_values in calibrated_columns.items():
    df[column_name] = column_values
df

Unnamed: 0,sample_id,snli_id,sentence_1,sentence_2,label,item_difficulty,average_accuracy,flesch_score_textstat,mean_grade_level_textstat,number_of_words,...,label_1,label_2,label_3,pred,softmax_entropy,confidence,prob_0,prob_1,prob_2,pred_label
0,0,1947351225.jpg#0r1c,A little boy is opening gifts surrounded by a ...,The boy is being punished,contradiction,-1.759822,0.839139,78.75,6.0,19,...,0.002024,0.942937,0.055039,contradiction,0.718332,0.816097,0.004367,0.816097,0.179535,contradiction
1,1,3626964430.jpg#0r1e,"People playing cricket in the park, pine trees...","People are playing sports in the park, near th...",entailment,-2.179087,0.886270,94.15,5.0,22,...,0.925892,0.000916,0.073191,entailment,0.482420,0.900107,0.900107,0.001933,0.097960,entailment
2,2,4576144189.jpg#3r1e,Some people hanging out on a large backyard deck.,people hanging out on deck,entailment,-3.137178,0.951844,81.29,3.0,14,...,0.983114,0.000377,0.016509,entailment,0.166541,0.976117,0.976117,0.000558,0.023325,entailment
3,3,507370108.jpg#3r1n,A group of dancers are performing.,The audience is silent.,neutral,-1.982105,0.865779,83.32,3.0,10,...,0.002088,0.252782,0.745130,neutral,0.976751,0.636806,0.003952,0.359242,0.636806,neutral
4,4,3361210233.jpg#0r1n,A large brown and white dog is carrying a stic...,A puppy is playing fetch with a stick.,neutral,-0.280872,0.565574,93.14,4.0,24,...,0.005251,0.005115,0.989635,neutral,0.180662,0.976972,0.009543,0.013485,0.976972,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,85,3381788544.jpg#0r1c,"A group of asian women in sports attire, and o...",Men are playing with a dog,contradiction,-0.622317,0.641393,84.17,5.0,25,...,0.000241,0.999244,0.000515,contradiction,0.019154,0.998328,0.000394,0.998328,0.001278,contradiction
86,86,3070485870.jpg#3r1n,A snowboarder is jumping over a snow slope.,A girl jumps her green snowboard over a slope.,neutral,-0.885338,0.695697,96.69,6.0,17,...,0.002433,0.005654,0.991913,neutral,0.140995,0.982814,0.005607,0.011579,0.982814,neutral
87,87,4788967880.jpg#0r1n,a group of people on a dock lowering into the ...,The group was getting ready to go fishing on t...,neutral,-1.689649,0.829918,73.68,7.0,29,...,0.004945,0.019603,0.975452,neutral,0.211173,0.971666,0.008698,0.019636,0.971666,neutral
88,88,4831683216.jpg#0r1c,A young girl in a bathing suit drinking a beve...,The girl is drinking milk from a sippy cup.,contradiction,0.523880,0.381148,85.18,6.0,23,...,0.002306,0.962326,0.035368,contradiction,0.576731,0.875655,0.006076,0.875655,0.118270,contradiction


In [17]:
# Wrong predictions of the MC-dropout (calibrated) run.
# `pred_label` holds the MC-dropout predictions; `pred` holds the earlier
# single-pass predictions and would show the non-calibrated errors instead.
df[df.label != df.pred_label]

Unnamed: 0,sample_id,snli_id,sentence_1,sentence_2,label,item_difficulty,average_accuracy,flesch_score_textstat,mean_grade_level_textstat,number_of_words,...,label_1,label_2,label_3,pred,softmax_entropy,confidence,prob_0,prob_1,prob_2,pred_label
5,5,3512127856.jpg#2r1n,A little girl in pink stands in a yellow hallway.,The girl is standing outside her apartment.,neutral,-1.408479,0.788934,79.77,4.0,17,...,0.035175,0.870912,0.093913,contradiction,1.228618,0.576918,0.058689,0.576918,0.364393,contradiction
6,6,6775386430.jpg#1r1n,A young boy in a sweatshirt is doodling on a p...,The class pictures are on display.,contradiction,0.962349,0.289959,87.21,4.0,19,...,0.015454,0.194058,0.790489,neutral,0.890843,0.770697,0.02503,0.204273,0.770697,neutral
12,12,4900546628.jpg#2r1n,"Two men in front of a projector screen, one at...",Three men about to present their best idea yet...,neutral,-1.198531,0.754098,92.12,6.0,26,...,0.002554,0.837403,0.160043,contradiction,1.003144,0.601122,0.004084,0.601122,0.394795,contradiction
19,19,2496713113.jpg#3r1c,Man sweeping trash outside a large statue.,A man is on vacation.,contradiction,0.038431,0.491803,82.31,3.0,12,...,0.001839,0.331834,0.666327,neutral,0.927673,0.680817,0.002952,0.316231,0.680817,neutral
23,23,956164675.jpg#2r1e,A single runner is watched by onlookers in a r...,A man is running in a race.,entailment,-0.523092,0.619877,88.23,6.0,17,...,0.392709,0.007192,0.600099,neutral,1.073769,0.516449,0.472767,0.010784,0.516449,neutral
37,37,31648340.jpg#1r1n,A wielder works on wielding a beam into place ...,The wielder is working on a building.,neutral,-0.001275,0.501025,77.74,5.0,21,...,0.820799,0.003215,0.175986,entailment,0.842885,0.758579,0.758579,0.006957,0.234463,entailment
44,44,168728234.jpg#2r1n,"A man on stilts in a purple, yellow and white ...",A man is performing on stilts.,entailment,-1.117773,0.739754,88.23,4.0,17,...,0.434712,0.002141,0.563147,neutral,1.015589,0.573338,0.422887,0.003774,0.573338,neutral
46,46,218854747.jpg#3r1n,Two friends walk down the street with a stuffe...,Two friends are walking on the sidewalk.,entailment,0.622663,0.359631,96.69,5.0,17,...,0.055111,0.091466,0.853423,neutral,1.09653,0.729182,0.095845,0.174972,0.729182,neutral
51,51,4520820052.jpg#1r1c,A woman in her early fifties walks by herself ...,back in the fifties a woman could walk by hers...,contradiction,1.271968,0.233607,72.66,6.0,31,...,0.956099,0.002303,0.041598,entailment,0.330654,0.943601,0.943601,0.003186,0.053214,entailment
60,60,4758348295.jpg#2r1e,This man is walking past a wall that has art o...,A man walks past a wall mural.,entailment,-0.366622,0.585041,104.13,0.0,19,...,0.189946,0.001053,0.809001,neutral,0.850167,0.735139,0.262974,0.001887,0.735139,neutral


In [18]:
# Calibrated accuracy: share of rows whose MC-dropout prediction
# (`pred_label`) equals the gold label. Comparing against `pred` would just
# reproduce the single-pass accuracy computed before calibration.
(df['label'] == df['pred_label']).mean()

0.8333333333333334

In [19]:
# Give the probability columns explicit names: "*_no_cal" for the single-pass
# run and "*_with_cal" for the MC-dropout run.
calibration_column_names = {
    "label_1": "prob_0_no_cal",
    "label_2": "prob_1_no_cal",
    "label_3": "prob_2_no_cal",
    "prob_0": "prob_0_with_cal",
    "prob_1": "prob_1_with_cal",
    "prob_2": "prob_2_with_cal",
}
df = df.rename(columns=calibration_column_names)
# Move the (MC-dropout) entropy to a new last column called "entropy_with_cal".
df['entropy_with_cal'] = df.pop('softmax_entropy')

In [20]:
# Save the MC-dropout (calibrated) results.
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists first (the `or "."` covers a bare file name).
import os

os.makedirs(os.path.dirname(output_path_calibrated) or ".", exist_ok=True)
df.to_csv(output_path_calibrated, index=False, header=True)