In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/MyDrive/CogEval/CogEval/Models

/content/drive/MyDrive/CogEval/CogEval/Models


In [3]:
models = ['tf-idf', 'lstm', 'roberta', 'davinci']
models.sort()

models

['davinci', 'lstm', 'roberta', 'tf-idf']

In [4]:
import pandas as pd

model_probs = dict()
model_predictions = dict()

for model in models:
    
    df = pd.read_csv(f"{model}.csv")
    model_probs[model] = df[['e', 'n', 'c']].values.tolist()   
    model_predictions[model] = df['pred_label'].tolist()

df = pd.read_csv("lstm.csv")
gold_labels = df['label'].tolist()

In [5]:
num_samples = len(model_probs['lstm'])
model_probs['random'] = [[0.33, 0.33, 0.33]] * num_samples

In [6]:
import random

random.seed(42)

rand_pred = []

for i in range(num_samples):
    cur_pred = random.randint(0,2)
    if cur_pred == 0:
        rand_pred.append('entailment')
    elif cur_pred == 1:
        rand_pred.append('neutral')
    else:
        rand_pred.append('contradictions')
        
model_predictions['random'] = rand_pred

In [7]:
models.append('random')

In [8]:
dist_labels = ['KL', "JSD", 'MSE']
subsets = ['ALL', 'AGREE', 'DISAGREE']

In [9]:
import numpy as np
from tensorflow import keras

from scipy.spatial import distance
from sklearn.metrics import mean_squared_error

def calc_dist(p, q):
    
    kl = keras.losses.KLDivergence()    
    kl = np.mean(kl(p, q).numpy())    
    jsd = np.mean([distance.jensenshannon(p[i], q[i]) for i in range(len(p))])    
    mse = mean_squared_error(p, q)
    
    return {
        "KL": kl, 
        "JSD": jsd, 
        "MSE": mse
    }

In [10]:
from itertools import compress

results = {
    "ALL": dict(),
    "AGREE": dict(),
    "DISAGREE": dict()
}

model_1 = []
model_2 = []

agree_count = []
disagree_count = []

for i in range(len(models)):
    for j in range(len(models)):
        if i != j:
            print(models[i], models[j])
            
            model_1.append(models[i])
            model_2.append(models[j])
            
            dist = calc_dist(model_probs[models[i]], model_probs[models[j]])
            results['ALL'][f"{models[i]}/{models[j]}"] = dist
            
            agreement_mask = [pi==pj for pi, pj in zip(model_predictions[models[i]], model_predictions[models[j]])]            
            probs_model_1 = list(compress(model_probs[models[i]], agreement_mask))
            probs_model_2 = list(compress(model_probs[models[j]], agreement_mask))
            
            agree_count.append(round(len(probs_model_1)/num_samples, 2))
            
            dist = calc_dist(probs_model_1, probs_model_2)           
            results['AGREE'][f"{models[i]}/{models[j]}"] = dist            
            
            disagreement_mask = np.logical_not(agreement_mask)
            probs_model_1 = list(compress(model_probs[models[i]], disagreement_mask))
            probs_model_2 = list(compress(model_probs[models[j]], disagreement_mask))
            
            disagree_count.append(round(len(probs_model_1)/num_samples, 2))
            
            dist = calc_dist(probs_model_1, probs_model_2)           
            results['DISAGREE'][f"{models[i]}/{models[j]}"] = dist

davinci lstm
davinci roberta
davinci tf-idf
davinci random
lstm davinci
lstm roberta
lstm tf-idf
lstm random
roberta davinci
roberta lstm
roberta tf-idf
roberta random
tf-idf davinci
tf-idf lstm
tf-idf roberta
tf-idf random
random davinci
random lstm
random roberta
random tf-idf


In [11]:
result_df = pd.DataFrame()

result_df['model_1'] = model_1
result_df['model_2'] = model_2

result_df    

Unnamed: 0,model_1,model_2
0,davinci,lstm
1,davinci,roberta
2,davinci,tf-idf
3,davinci,random
4,lstm,davinci
5,lstm,roberta
6,lstm,tf-idf
7,lstm,random
8,roberta,davinci
9,roberta,lstm


In [12]:
for subset in subsets:
    for metric in dist_labels:
        col_data = []
        for pair in results[subset].keys():
            col_data.append(results[subset][pair][metric])
            
        if subset == 'AGREE':
            result_df['AGREE_PCT'] = agree_count
        elif subset == 'DISAGREE':
            result_df['DISAGREE_PCT'] = disagree_count
            
        result_df[f"{subset}_{metric}"] = col_data       


result_df

Unnamed: 0,model_1,model_2,ALL_KL,ALL_JSD,ALL_MSE,AGREE_PCT,AGREE_KL,AGREE_JSD,AGREE_MSE,DISAGREE_PCT,DISAGREE_KL,DISAGREE_JSD,DISAGREE_MSE
0,davinci,lstm,0.98961,0.515907,0.202279,0.39,0.596225,0.424871,0.108339,0.61,1.239945,0.57384,0.262058
1,davinci,roberta,1.799331,0.384203,0.225927,0.59,0.242997,0.156198,0.020084,0.41,4.028675,0.710803,0.520784
2,davinci,tf-idf,1.073438,0.486596,0.209484,0.46,0.446671,0.351016,0.078732,0.54,1.597876,0.600041,0.318889
3,davinci,random,0.970052,0.522268,0.194216,0.33,1.020904,0.537572,0.204613,0.67,0.944626,0.514616,0.189017
4,lstm,davinci,4.273191,0.515907,0.202279,0.39,3.08919,0.424871,0.108339,0.61,5.026647,0.57384,0.262058
5,lstm,roberta,2.415047,0.479181,0.181738,0.44,1.640905,0.386428,0.092484,0.56,3.03436,0.553384,0.253141
6,lstm,tf-idf,0.482856,0.273797,0.081883,0.23,0.22619,0.180833,0.030735,0.77,0.560972,0.30209,0.09745
7,lstm,random,0.0876,0.131543,0.018355,0.23,0.076888,0.121172,0.015747,0.77,0.09086,0.134699,0.019149
8,roberta,davinci,2.641902,0.384203,0.225927,0.59,0.447398,0.156198,0.020084,0.41,5.78538,0.710803,0.520784
9,roberta,lstm,0.880154,0.479181,0.181738,0.44,0.508859,0.386428,0.092484,0.56,1.177191,0.553384,0.253141


In [13]:
result_df.to_csv("SNLI_model_dist.csv", header=True, index=False, sep=',')

In [18]:
from itertools import compress

labels = ["contradiction", "neutral", "entailment"]

results_per_label = {
   label: dict() for label in labels
}

model_1 = []
model_2 = []

for i in range(len(models)):
    for j in range(len(models)):
        if i != j:
            print(models[i], models[j])
            
            model_1.append(models[i])
            model_2.append(models[j])

            for label in labels:           
            
              label_mask = [gold_labels[i] == label for i in range(len(gold_labels))]            
              probs_model_1 = list(compress(model_probs[models[i]], label_mask))
              probs_model_2 = list(compress(model_probs[models[j]], label_mask))

              dist = calc_dist(model_probs[models[i]], model_probs[models[j]])
              results_per_label[label][f"{models[i]}/{models[j]}"] = dist          
              
           

davinci lstm
davinci roberta
davinci tf-idf
davinci random
lstm davinci
lstm roberta
lstm tf-idf
lstm random
roberta davinci
roberta lstm
roberta tf-idf
roberta random
tf-idf davinci
tf-idf lstm
tf-idf roberta
tf-idf random
random davinci
random lstm
random roberta
random tf-idf


In [19]:
results_per_label['contradiction'].keys()

dict_keys(['davinci/lstm', 'davinci/roberta', 'davinci/tf-idf', 'davinci/random', 'lstm/davinci', 'lstm/roberta', 'lstm/tf-idf', 'lstm/random', 'roberta/davinci', 'roberta/lstm', 'roberta/tf-idf', 'roberta/random', 'tf-idf/davinci', 'tf-idf/lstm', 'tf-idf/roberta', 'tf-idf/random', 'random/davinci', 'random/lstm', 'random/roberta', 'random/tf-idf'])

In [20]:
result_per_label_df = pd.DataFrame()

result_per_label_df['model_1'] = model_1
result_per_label_df['model_2'] = model_2

result_per_label_df    

Unnamed: 0,model_1,model_2
0,davinci,lstm
1,davinci,roberta
2,davinci,tf-idf
3,davinci,random
4,lstm,davinci
5,lstm,roberta
6,lstm,tf-idf
7,lstm,random
8,roberta,davinci
9,roberta,lstm


In [21]:
for label in labels:
    for metric in dist_labels:
        col_data = []
        for pair in results_per_label[label].keys():
            col_data.append(results_per_label[label][pair][metric])

        result_per_label_df[f"{label}_{metric}"] = col_data

result_per_label_df

Unnamed: 0,model_1,model_2,contradiction_KL,contradiction_JSD,contradiction_MSE,neutral_KL,neutral_JSD,neutral_MSE,entailment_KL,entailment_JSD,entailment_MSE
0,davinci,lstm,0.98961,0.515907,0.202279,0.98961,0.515907,0.202279,0.98961,0.515907,0.202279
1,davinci,roberta,1.799331,0.384203,0.225927,1.799331,0.384203,0.225927,1.799331,0.384203,0.225927
2,davinci,tf-idf,1.073438,0.486596,0.209484,1.073438,0.486596,0.209484,1.073438,0.486596,0.209484
3,davinci,random,0.970052,0.522268,0.194216,0.970052,0.522268,0.194216,0.970052,0.522268,0.194216
4,lstm,davinci,4.273191,0.515907,0.202279,4.273191,0.515907,0.202279,4.273191,0.515907,0.202279
5,lstm,roberta,2.415047,0.479181,0.181738,2.415047,0.479181,0.181738,2.415047,0.479181,0.181738
6,lstm,tf-idf,0.482856,0.273797,0.081883,0.482856,0.273797,0.081883,0.482856,0.273797,0.081883
7,lstm,random,0.0876,0.131543,0.018355,0.0876,0.131543,0.018355,0.0876,0.131543,0.018355
8,roberta,davinci,2.641902,0.384203,0.225927,2.641902,0.384203,0.225927,2.641902,0.384203,0.225927
9,roberta,lstm,0.880154,0.479181,0.181738,0.880154,0.479181,0.181738,0.880154,0.479181,0.181738


In [23]:
result_per_label_df.to_csv("SNLI_model_dist_per_label.csv", header=True, index=False, sep=',')

In [28]:
from itertools import compress
from sklearn.metrics import jaccard_score

scoring_modes = ['micro', 'macro', 'weighted']

results_jaccard = {
    mode: dict() for mode in scoring_modes
}


model_1 = []
model_2 = []

for i in range(len(models)):
    for j in range(len(models)):
        if i != j:
            print(models[i], models[j])
            
            model_1.append(models[i])
            model_2.append(models[j])

            dist = jaccard_score(model_predictions[models[i]], model_predictions[models[j]], average='micro')
            results_jaccard['micro'][f"{models[i]}/{models[j]}"] = dist          

            dist = jaccard_score(model_predictions[models[i]], model_predictions[models[j]], average='macro')
            results_jaccard['macro'][f"{models[i]}/{models[j]}"] = dist          

            dist = jaccard_score(model_predictions[models[i]], model_predictions[models[j]], average='weighted')
            results_jaccard['weighted'][f"{models[i]}/{models[j]}"] = dist          

results_jaccard           

davinci lstm
davinci roberta
davinci tf-idf
davinci random
lstm davinci
lstm roberta
lstm tf-idf
lstm random
roberta davinci
roberta lstm
roberta tf-idf
roberta random
tf-idf davinci
tf-idf lstm
tf-idf roberta
tf-idf random
random davinci
random lstm
random roberta
random tf-idf


{'micro': {'davinci/lstm': 0.2413793103448276,
  'davinci/roberta': 0.41732283464566927,
  'davinci/tf-idf': 0.2949640287769784,
  'davinci/random': 0.2,
  'lstm/davinci': 0.2413793103448276,
  'lstm/roberta': 0.2857142857142857,
  'lstm/tf-idf': 0.1320754716981132,
  'lstm/random': 0.1320754716981132,
  'roberta/davinci': 0.41732283464566927,
  'roberta/lstm': 0.2857142857142857,
  'roberta/tf-idf': 0.40625,
  'roberta/random': 0.125,
  'tf-idf/davinci': 0.2949640287769784,
  'tf-idf/lstm': 0.1320754716981132,
  'tf-idf/roberta': 0.40625,
  'tf-idf/random': 0.1464968152866242,
  'random/davinci': 0.2,
  'random/lstm': 0.1320754716981132,
  'random/roberta': 0.125,
  'random/tf-idf': 0.1464968152866242},
 'macro': {'davinci/lstm': 0.20345816260394808,
  'davinci/roberta': 0.4026976953578169,
  'davinci/tf-idf': 0.2667989417989418,
  'davinci/random': 0.13125,
  'lstm/davinci': 0.20345816260394808,
  'lstm/roberta': 0.28993252361673416,
  'lstm/tf-idf': 0.1327185244587009,
  'lstm/rando

In [29]:
jaccard_df = pd.DataFrame()

jaccard_df['model_1'] = model_1
jaccard_df['model_2'] = model_2

jaccard_df    

Unnamed: 0,model_1,model_2
0,davinci,lstm
1,davinci,roberta
2,davinci,tf-idf
3,davinci,random
4,lstm,davinci
5,lstm,roberta
6,lstm,tf-idf
7,lstm,random
8,roberta,davinci
9,roberta,lstm


In [31]:
for mode in scoring_modes:
  jaccard_df[mode] = list(results_jaccard[mode].values())

jaccard_df

Unnamed: 0,model_1,model_2,micro,macro,weighted
0,davinci,lstm,0.241379,0.203458,0.294601
1,davinci,roberta,0.417323,0.402698,0.445845
2,davinci,tf-idf,0.294964,0.266799,0.327593
3,davinci,random,0.2,0.13125,0.255
4,lstm,davinci,0.241379,0.203458,0.20977
5,lstm,roberta,0.285714,0.289933,0.283359
6,lstm,tf-idf,0.132075,0.132719,0.132019
7,lstm,random,0.132075,0.09855,0.146879
8,roberta,davinci,0.417323,0.402698,0.406161
9,roberta,lstm,0.285714,0.289933,0.292091


In [32]:
jaccard_df.to_csv("SNLI_model_jaccard_similarity.csv", header=True, index=False, sep=',')