In [1]:
from IPython.display import clear_output
from functions import *
import numpy as np
import pandas as pd
from transformers import *
import tensorflow as tf
import pickle
import scipy as sc
import math as mt
import random

### Load model & data

In [None]:
# Load the STS-B dev split (tab-separated values).
# `on_bad_lines='warn'` skips malformed rows and emits a warning for each one,
# which is what `error_bad_lines=False` did; that flag was deprecated in
# pandas 1.3 and removed in pandas 2.0, so it can no longer be passed.
dfd = pd.read_csv('sts-dev.csv', delimiter='\t', on_bad_lines='warn')
# Keep only the rows that actually have a second sentence.
dfd = dfd[dfd["sentence2"].notna()]

In [3]:
# Display the DataFrame as the cell's output (bare last expression).
dfd

Unnamed: 0,type,subtype,year,num,corr,sentence1,sentence2
0,main-captions,MSRvid,2012test,0,5.00,A man with a hard hat is dancing.,A man wearing a hard hat is dancing.
1,main-captions,MSRvid,2012test,2,4.75,A young child is riding a horse.,A child is riding a horse.
2,main-captions,MSRvid,2012test,3,5.00,A man is feeding a mouse to a snake.,The man is feeding a mouse to the snake.
3,main-captions,MSRvid,2012test,7,2.40,A woman is playing the guitar.,A man is playing guitar.
4,main-captions,MSRvid,2012test,8,2.75,A woman is playing the flute.,A man is playing a flute.
...,...,...,...,...,...,...,...
1439,main-news,headlines,2015,1417,4.60,World's oldest man dies at 116,Japan: World's Oldest Man Ever Dies Aged 116
1440,main-news,headlines,2015,1426,3.40,Pakistanis vote in landmark election,Pakistan holds landmark election as 29 killed ...
1441,main-news,headlines,2015,1447,0.40,Algeria president gets therapy after stroke: s...,Bulgarian president tries to break election st...
1442,main-news,headlines,2015,1471,0.00,Yemen: Fighting in north kills over 120,Janet Yellen: Five things you might not know


In [4]:
# BERT
# Checkpoint name shared by the tokenizer, the config and the model weights.
casing = "bert-base-uncased" 
tokenizer = BertTokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)

# Build the config from the checkpoint itself (as the RoBERTa cell does) rather
# than from library defaults, so it always matches the weights being loaded.
# BertConfig's dropout options are `hidden_dropout_prob` and
# `attention_probs_dropout_prob`; the former `dropout=` / `attention_dropout=`
# keywords are not BertConfig options and were stored without ever being used.
# NOTE(review): dropout only matters if the model is called in training mode --
# confirm that get_representations runs it in inference mode.
config = BertConfig.from_pretrained(
    casing,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
    output_hidden_states=True,  # expose every layer's hidden states
)

model = TFBertModel.from_pretrained(casing, config = config)
model.trainable = False  # feature extraction only; weights stay frozen
emb_len = 768  # presumably the model's hidden size -- passed on to the helpers
clear_output()  # drop the loading logs from the cell output

# BERT
n_cluster = 27 # Number of clusters to use
n_pc = 12 # Number of main principal components to drop for local method
n_pc_global = 15 # Number of main principal components to drop for global method

In [24]:
# GPT-2
# Checkpoint name shared by the tokenizer, the config and the model weights.
casing = "gpt2" 
tokenizer = GPT2Tokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)
# Load the configuration that belongs to the checkpoint (as the RoBERTa cell
# does) instead of relying on GPT2Config() defaults, so the cell keeps working
# if `casing` is pointed at a different GPT-2 checkpoint.
config = GPT2Config.from_pretrained(casing, output_hidden_states=True)

model = TFGPT2Model.from_pretrained(casing, config=config)
model.trainable = False  # feature extraction only; weights stay frozen

emb_len = 768  # presumably the model's hidden size -- passed on to the helpers
clear_output()  # drop the loading logs from the cell output

# GPT2
n_cluster = 10  # number of clusters to use
n_pc = 30  # principal components to drop for the local method
n_pc_global = 30  # principal components to drop for the global method

In [17]:
# RoBERTa
casing = "roberta-base"

# Checkpoint configuration, with every layer's hidden states exposed.
config = RobertaConfig.from_pretrained(casing, output_hidden_states=True)
tokenizer = RobertaTokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)

# Pretrained encoder, frozen so it is only used for feature extraction.
model = TFRobertaModel.from_pretrained(casing, config=config)
model.trainable = False

emb_len = 768  # presumably the model's hidden size -- passed on to the helpers
clear_output()  # drop the loading logs from the cell output

# RoBERTa
# Clusters to use, then principal components to drop (local / global method).
n_cluster, n_pc, n_pc_global = 27, 12, 25

### Get representations

In [25]:
# Compute representations for the dev data with the currently selected
# tokenizer/model. `get_representations` and `getWords` are not defined in this
# notebook -- presumably they come from the star-imported `functions` module.
# NOTE(review): the counter printed below the cell suggests one progress line
# per processed row; consider a progress bar instead -- confirm in the helper.
sentences = get_representations(dfd, tokenizer, model, emb_len)
# Derive the `words` collection used by the isotropy cells from `sentences`.
words = getWords(sentences)

1441
1442
1443


In [26]:
# Save partial results
# The file is named after the active checkpoint (`casing`) so that embeddings
# from the BERT or RoBERTa cell are not written to a file labelled "gpt2".
# With casing == "gpt2" this is still "gpt2-stsb-dev.npy".
# The array has dtype=object, so reading it back needs
# np.load(..., allow_pickle=True).
np.save(f"{casing}-stsb-dev.npy", np.asarray(sentences, dtype=object))

### Calculate isotropies

In [29]:
# Baseline: isotropy of the unmodified `words` representations, converted to a
# float64 array before scoring. `isotropy` is not defined in this notebook --
# presumably it comes from the star-imported `functions` module.
isotropy(np.asarray(words,dtype=np.float64))

1.2712387193789697e-126

In [30]:
# Isotropy after cluster_and_zero_mean, printed for several cluster counts k.
ks = [1, 3, 6, 9, 20]
for k in ks:
    impr_words = cluster_and_zero_mean(words, k, emb_len)
    # Flatten to 2-D using the first and third axis lengths; the reshape only
    # succeeds when the remaining axis has length 1.
    n_rows, n_dims = impr_words.shape[0], impr_words.shape[2]
    impr_words = impr_words.reshape(n_rows, n_dims)
    print(isotropy(np.asarray(impr_words, dtype=np.float64)))

3.623403960360555e-220
1.2120785667158076e-73
3.369456262598478e-61
7.059176146923158e-54
8.420955618933535e-101
