In [1]:
models = ['tf-idf', 'lstm', 'roberta', 'davinci']
models.sort()

models

['davinci', 'lstm', 'roberta', 'tf-idf']

In [2]:
import pandas as pd

model_probs = dict()
model_predictions = dict()

for model in models:
    
    df = pd.read_csv(f"{model}.csv")
    model_probs[model] = df[['e', 'n', 'c']].values.tolist()   
    model_predictions[model] = df['pred_label'].tolist()

In [3]:
num_samples = len(model_probs['lstm'])
model_probs['random'] = [[0.33, 0.33, 0.33]] * num_samples

In [4]:
import random

random.seed(42)

rand_pred = []

for i in range(num_samples):
    cur_pred = random.randint(0,2)
    if cur_pred == 0:
        rand_pred.append('entailment')
    elif cur_pred == 1:
        rand_pred.append('neutral')
    else:
        rand_pred.append('contradictions')
        
model_predictions['random'] = rand_pred

In [5]:
models.append('random')

In [6]:
dist_labels = ['KL', "JSD", 'MSE']
subsets = ['ALL', 'AGREE', 'DISAGREE']

In [7]:
import numpy as np
from tensorflow import keras

from scipy.spatial import distance
from sklearn.metrics import mean_squared_error

def calc_dist(p, q):
    
    kl = keras.losses.KLDivergence()    
    kl = np.mean(kl(p, q).numpy())    
    jsd = np.mean([distance.jensenshannon(p[i], q[i]) for i in range(len(p))])    
    mse = mean_squared_error(p, q)
    
    return {
        "KL": kl, 
        "JSD": jsd, 
        "MSE": mse
    }

In [8]:
from itertools import compress

results = {
    "ALL": dict(),
    "AGREE": dict(),
    "DISAGREE": dict()
}

model_1 = []
model_2 = []

agree_count = []
disagree_count = []

for i in range(len(models)):
    for j in range(len(models)):
        if i != j:
            print(models[i], models[j])
            
            model_1.append(models[i])
            model_2.append(models[j])
            
            dist = calc_dist(model_probs[models[i]], model_probs[models[j]])
            results['ALL'][f"{models[i]}/{models[j]}"] = dist
            
            agreement_mask = [pi==pj for pi, pj in zip(model_predictions[models[i]], model_predictions[models[j]])]            
            probs_model_1 = list(compress(model_probs[models[i]], agreement_mask))
            probs_model_2 = list(compress(model_probs[models[j]], agreement_mask))
            
            agree_count.append(round(len(probs_model_1)/num_samples, 2))
            
            dist = calc_dist(probs_model_1, probs_model_2)           
            results['AGREE'][f"{models[i]}/{models[j]}"] = dist            
            
            disagreement_mask = np.logical_not(agreement_mask)
            probs_model_1 = list(compress(model_probs[models[i]], disagreement_mask))
            probs_model_2 = list(compress(model_probs[models[j]], disagreement_mask))
            
            disagree_count.append(round(len(probs_model_1)/num_samples, 2))
            
            dist = calc_dist(probs_model_1, probs_model_2)           
            results['DISAGREE'][f"{models[i]}/{models[j]}"] = dist

davinci lstm
davinci roberta
davinci tf-idf
davinci random
lstm davinci
lstm roberta
lstm tf-idf
lstm random
roberta davinci
roberta lstm
roberta tf-idf
roberta random
tf-idf davinci
tf-idf lstm
tf-idf roberta
tf-idf random
random davinci
random lstm
random roberta
random tf-idf


2023-01-03 13:16:47.258749: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2023-01-03 13:16:47.258775: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: tilek-PC
2023-01-03 13:16:47.258779: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: tilek-PC
2023-01-03 13:16:47.258874: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 515.86.1
2023-01-03 13:16:47.258888: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.86.1
2023-01-03 13:16:47.258891: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 515.86.1
2023-01-03 13:16:47.259700: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical op

In [9]:
result_df = pd.DataFrame()

result_df['model_1'] = model_1
result_df['model_2'] = model_2

result_df    

Unnamed: 0,model_1,model_2
0,davinci,lstm
1,davinci,roberta
2,davinci,tf-idf
3,davinci,random
4,lstm,davinci
5,lstm,roberta
6,lstm,tf-idf
7,lstm,random
8,roberta,davinci
9,roberta,lstm


In [10]:
for subset in subsets:
    for metric in dist_labels:
        col_data = []
        for pair in results[subset].keys():
            col_data.append(results[subset][pair][metric])
            
        if subset == 'AGREE':
            result_df['AGREE_PCT'] = agree_count
        elif subset == 'DISAGREE':
            result_df['DISAGREE_PCT'] = disagree_count
            
        result_df[f"{subset}_{metric}"] = col_data       


result_df

Unnamed: 0,model_1,model_2,ALL_KL,ALL_JSD,ALL_MSE,AGREE_PCT,AGREE_KL,AGREE_JSD,AGREE_MSE,DISAGREE_PCT,DISAGREE_KL,DISAGREE_JSD,DISAGREE_MSE
0,davinci,lstm,0.98961,0.515907,0.202279,0.39,0.596225,0.424871,0.108339,0.61,1.239945,0.57384,0.262058
1,davinci,roberta,1.799331,0.384203,0.225927,0.59,0.242997,0.156198,0.020084,0.41,4.028675,0.710803,0.520784
2,davinci,tf-idf,1.073438,0.486596,0.209484,0.46,0.446671,0.351016,0.078732,0.54,1.597876,0.600041,0.318889
3,davinci,random,0.970052,0.522268,0.194216,0.33,1.020904,0.537572,0.204613,0.67,0.944626,0.514616,0.189017
4,lstm,davinci,4.273191,0.515907,0.202279,0.39,3.08919,0.424871,0.108339,0.61,5.026647,0.57384,0.262058
5,lstm,roberta,2.415047,0.479181,0.181738,0.44,1.640905,0.386428,0.092484,0.56,3.03436,0.553384,0.253141
6,lstm,tf-idf,0.482856,0.273797,0.081883,0.23,0.22619,0.180833,0.030735,0.77,0.560972,0.30209,0.09745
7,lstm,random,0.0876,0.131543,0.018355,0.23,0.076888,0.121172,0.015747,0.77,0.09086,0.134699,0.019149
8,roberta,davinci,2.641902,0.384203,0.225927,0.59,0.447398,0.156198,0.020084,0.41,5.78538,0.710803,0.520784
9,roberta,lstm,0.880154,0.479181,0.181738,0.44,0.508859,0.386428,0.092484,0.56,1.177191,0.553384,0.253141


In [11]:
result_df.to_csv("SNLI_model_dist.csv", header=True, index=False, sep=',')