In [1]:
from IPython.display import clear_output
from functions import *
import numpy as np
import pandas as pd
from transformers import *
import tensorflow as tf
import pickle
import scipy as sc
import math as mt
# from scipy import cluster as clst
# from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.decomposition import PCA

## Import data

In [15]:
# Load the STS benchmark split to evaluate on (tab-separated file).
# NOTE: the frame keeps the name `df_dev` even though the test split is loaded,
# because the later cells refer to it by that name.
# Alternative split: 'sts-dev.csv'
try:
    # pandas >= 1.3: malformed rows are skipped and reported as warnings,
    # which is what `error_bad_lines=False` (with warnings on) used to do.
    df_dev = pd.read_csv('sts-test.csv', delimiter='\t', on_bad_lines='warn')
except TypeError:
    # Older pandas does not know `on_bad_lines`; fall back to the legacy keyword.
    df_dev = pd.read_csv('sts-test.csv', delimiter='\t', error_bad_lines=False)
# Hide the parser warnings emitted for the skipped rows.
clear_output()

In [17]:
# Check for errors
# df_dev[pd.isna(df_dev["sentence2"])]

## Loading models

In [3]:
# BERT
casing = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)

# Start from the checkpoint's own configuration rather than a default-constructed
# BertConfig, so the architecture fields always match the weights loaded below
# (a bare BertConfig() only matches "bert-base-uncased" by coincidence of defaults).
# The keyword arguments override the corresponding values of the loaded config.
config = BertConfig.from_pretrained(
    casing,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
    output_hidden_states=True,  # expose the hidden states of every layer
)

model = TFBertModel.from_pretrained(casing, config=config)
# The encoder is only used as a frozen feature extractor.
model.trainable = False

# Hide download / initialisation logs.
clear_output()

## Get sentence and word representations

In [19]:
# Get sentence representations BERT
# Alternative: reload representations that were saved earlier, e.g.
#   sentences = np.load("bert-stsb-dev.npy", allow_pickle=True).tolist()
# The width is read from the loaded model instead of hard-coding 768
# (assumes the fourth argument is the embedding width -- TODO confirm in functions.py).
sentences = get_representations(df_dev, tokenizer, model, model.config.hidden_size)
# Word-level representations derived from the sentence structures.
words = getWords(sentences)
clear_output()

In [20]:
# HELPER: persist the computed `sentences` to "bert-stsb-test.npy" so they can be
# reloaded later. The data is stored as an object-dtype array, so reading it back
# needs `np.load(..., allow_pickle=True)`.
np.save("bert-stsb-test.npy", np.asarray(sentences, dtype=object))

## Get representations via the baseline, the global method, and the local (isotropy-enhancing) method

In [21]:
# Hyperparameters of the two isotropy-enhancing methods.
# Global method: how many main principal components are dropped.
n_pc_global = 15
# Local method: how many clusters are used ...
n_cluster = 27
# ... and how many main principal components are dropped.
n_pc = 12

In [22]:
# Value passed as the last argument to every helper below; presumably the
# embedding width of the representations -- TODO confirm in functions.py.
embedding_dim = 768


def _pool_and_score(word_reps):
    """Pool word representations into sentence representations and score them.

    Returns the pair ``(sentence_representations, scores)`` obtained from
    ``mean_pooling(word_reps, sentences)`` followed by ``similarity``.
    """
    pooled = mean_pooling(word_reps, sentences)
    return pooled, similarity(pooled, embedding_dim)


# BASELINE: the untouched word representations.
baseline_sentence_rep, baseline_score = _pool_and_score(words)

# GLOBAL METHOD
# `words` is converted separately for each method on purpose: the helpers get
# independent arrays, exactly as in the original calls.
global_representations = global_method(np.asarray(words), n_pc_global, embedding_dim)
global_sentence_rep, global_score = _pool_and_score(global_representations)

# LOCAL METHOD: cluster-based (isotropised) word representations, then the
# sentence representations built from their mean.
isotropic_representations = cluster_based(np.asarray(words), n_cluster, n_pc, embedding_dim)
sentence_rep, local_score = _pool_and_score(isotropic_representations)



In [23]:
# Performance: correlation of each method's scores with the gold column `corr`.
for label, scores in (
    ("Spearman Correlation baseline: ", baseline_score),
    ("Spearman Correlation global: ", global_score),
    ("Spearman Correlation local: ", local_score),
):
    print(label, sper_corrcoef(df_dev['corr'], scores))

# Isotropy of each representation space.
# NOTE(review): the baseline entry passes the pooled sentence representations,
# while the other two pass the word-level outputs of global_method /
# cluster_based -- confirm that this comparison is intended.
for label, representations in (
    ("Isotropy baseline: ", baseline_sentence_rep),
    ("Isotropy global: ", global_representations),
    ("Isotropy local: ", isotropic_representations),
):
    print(label, isotropy(representations))

Spearman Correlation baseline:  48.11715446651223
Spearman Correlation global:  64.38035591129334
Spearman Correlation local:  68.58722123795884
Isotropy baseline:  0.00011038668
Isotropy global:  0.5206524086147497
Isotropy local:  0.7618754996633805
