In [1]:
from IPython.display import clear_output
from functions import *
import numpy as np
import pandas as pd
from transformers import *
import tensorflow as tf
import pickle
import scipy as sc
import math as mt
import random

### Load model & Data

In [3]:
# Load the tab-separated STS dev split; malformed rows are skipped instead of raising.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines="skip"` — switch once the pinned pandas version allows it.
dfd = pd.read_csv('sts-dev.csv', sep='\t', error_bad_lines=False)
# Keep only the rows that actually have a second sentence to pair with the first.
dfd = dfd[dfd["sentence2"].notna()]
# Wipe whatever the parser printed while skipping bad rows.
clear_output()

In [4]:
# BERT
casing = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)

# Start from the checkpoint's own configuration (as the RoBERTa cell does) so the
# architecture always matches the weights loaded below.
# The former `dropout=0.2, attention_dropout=0.2` keyword arguments were dropped:
# BertConfig calls these settings `hidden_dropout_prob` and
# `attention_probs_dropout_prob`, so the old names never reached the model, and
# the model is only used frozen for feature extraction in this notebook.
config = BertConfig.from_pretrained(casing)
config.output_hidden_states = True  # return the hidden states of every layer

model = TFBertModel.from_pretrained(casing, config=config)
model.trainable = False
# Read the embedding width from the config instead of hard-coding it
# (768 for bert-base-uncased).
emb_len = config.hidden_size
clear_output()

# BERT
n_cluster = 27 # Number of clusters to use
n_pc = 12 # Number of main principal components to drop for local method
n_pc_global = 15 # Number of main principal components to drop for global method

In [68]:
# GPT-2
casing = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)
# Load the configuration that belongs to the checkpoint (as the RoBERTa cell does)
# rather than the library defaults, so changing `casing` to another GPT-2 size
# keeps the architecture and the weights in agreement.
config = GPT2Config.from_pretrained(casing)
config.output_hidden_states = True  # return the hidden states of every layer

model = TFGPT2Model.from_pretrained(casing, config=config)
model.trainable = False

# Read the embedding width from the config instead of hard-coding it
# (768 for the "gpt2" checkpoint).
emb_len = config.n_embd
clear_output()

# GPT2
n_cluster = 10  # Number of clusters to use
n_pc = 30  # Number of main principal components to drop for local method
n_pc_global = 30  # Number of main principal components to drop for global method

In [71]:
# RoBERTa
casing = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(
    casing,
    do_lower_case=True,
    add_special_tokens=True,
)

# Checkpoint configuration, with per-layer hidden states switched on at load time.
config = RobertaConfig.from_pretrained(casing, output_hidden_states=True)

# Frozen encoder: it is only used to extract representations.
model = TFRobertaModel.from_pretrained(casing, config=config)
model.trainable = False
emb_len = 768
clear_output()

# RoBERTa
n_cluster = 27  # Number of clusters to use
n_pc = 12  # Number of main principal components to drop for local method
n_pc_global = 25  # Number of main principal components to drop for global method

### Get contextual word representations (CWRs) at different layers of the model

In [72]:
def _hidden_states_without_special_tokens(text, tokenizer, model, emb_length):
    """Encode one sentence and return its token representations at every layer.

    The result is an array of shape (n_layers, n_tokens, emb_length) from which
    the tokenizer's leading/trailing boundary tokens (e.g. CLS/SEP) are removed.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=True)
    inputs = np.asarray(token_ids, dtype='int32').reshape((1, -1))

    # Index 2 of the model output holds the hidden states of *all* layers
    # (not just the last one); stack them and drop the batch axis.
    hidden_states = np.asarray(model(inputs)[2])
    # The layer count comes from the output itself instead of a hard-coded 13,
    # so models of a different depth are reshaped correctly.
    hidden_states = hidden_states.reshape(
        (hidden_states.shape[0], -1, emb_length))

    # Only drop the first/last position when it really is a special token.
    # Tokenizers that add no boundary markers (GPT-2's default behaviour) would
    # otherwise lose their first and last real tokens.
    keep = np.ones(len(token_ids), dtype=bool)
    special_ids = getattr(tokenizer, "all_special_ids", None)
    if len(token_ids) > 0:
        if special_ids is None:
            # Tokenizer does not expose its special ids: fall back to the
            # historical behaviour of always removing both ends.
            keep[0] = False
            keep[-1] = False
        else:
            special_ids = set(special_ids)
            if token_ids[0] in special_ids:
                keep[0] = False
            if token_ids[-1] in special_ids:
                keep[-1] = False

    return hidden_states[:, keep, :]


def get_representations_all_layers(data_, tokenizer, model, emb_length):
    """Collect per-layer token representations for every sentence pair.

    Parameters
    ----------
    data_ : table with 'sentence1' and 'sentence2' columns, indexable via `.iloc`.
    tokenizer : object whose `encode(text, add_special_tokens=True)` returns token ids.
    model : callable whose output at index 2 is the sequence of hidden states
        of all layers for a (1, n_tokens) batch of ids.
    emb_length : width of one token embedding.

    Returns
    -------
    list of arrays, two per row of `data_` (sentence1 first, then sentence2),
    each of shape (n_layers, n_tokens, emb_length) with boundary special
    tokens removed.
    """
    sentences = []
    for i in range(len(data_)):
        print(i)
        for column in ('sentence1', 'sentence2'):
            sentences.append(
                _hidden_states_without_special_tokens(
                    data_[column].iloc[i], tokenizer, model, emb_length))

        # Periodically wipe the printed counters so the cell output stays short.
        if i % 10 == 0:
            clear_output()

    return sentences

In [73]:
def getWordsAtLayer(sentences, layer):
    """Flatten the sentence axis: list every token representation found at `layer`.

    Each element of `sentences` is indexed as sentence[layer][token]; the token
    count of a sentence is read from its layer 0.
    """
    return [
        sentence[layer][token_idx]
        for sentence in sentences
        for token_idx in range(len(sentence[0]))
    ]

In [74]:
# Run every sentence pair of the dev set through the currently loaded model and
# collect the hidden states of all layers (two entries per row).
reps = get_representations_all_layers(
    data_=dfd,
    tokenizer=tokenizer,
    model=model,
    emb_length=emb_len,
)

1441
1442
1443


In [75]:
# Compute isotropy at each layer
for lay in range(13):
    wordsatLayer = getWordsAtLayer(reps, lay)
    print(isotropy(np.asarray(wordsatLayer, dtype=np.float64)))

0.008965204001531902
2.4606133484093245e-07
8.581332820159886e-10
4.210619442620398e-09
5.377646965360977e-12
4.853519784312229e-10
3.133070468963168e-10
1.317334163571853e-10
1.414390580381122e-10
1.3529625036818678e-10
6.511797473015736e-11
1.4053177125577774e-10
2.6920787864719667e-06
