In [47]:
from IPython.display import clear_output
from functions import *
import numpy as np
import pandas as pd
from transformers import *
from transformers.tokenization_utils import TextInputPair
from sklearn.neural_network import MLPClassifier
from copy import deepcopy
import tensorflow as tf
import pickle
import scipy as sc
import math as mt
from joblib import dump, load
from datasets import load_dataset

### Load BERT & Data

In [2]:
# BERT: tokenizer + frozen encoder used only as a feature extractor.
casing = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)

# BertConfig spells its dropout arguments `hidden_dropout_prob` and
# `attention_probs_dropout_prob`. The previous `dropout` / `attention_dropout`
# keywords are not BERT config fields, so the 0.2 setting was silently ignored.
config = BertConfig(hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2)
config.output_hidden_states = False # if true outputs all layers

model = TFBertModel.from_pretrained(casing, config = config)
model.trainable = False  # weights stay frozen
emb_len = config.hidden_size  # embedding width; 768 for the default BertConfig
clear_output()  # hide the loading messages printed above

# Hyperparameters for the local (cluster_based) and global (global_method) steps:
#   n_cluster   - number of clusters to use
#   n_pc        - number of main principal components to drop for the local method
#   n_pc_global - number of main principal components to drop for the global method
n_cluster, n_pc, n_pc_global = 27, 12, 15

In [32]:
# Row boundaries of the train / validation / test parts inside the concatenated
# frame. The dataset is loaded here so this cell no longer depends on `ds`
# being created by a later cell (which raised NameError on a fresh
# "Restart & Run All"). load_dataset reuses the local cache, so loading it
# again in the following cell is cheap.
ds = load_dataset('glue', 'mrpc')
train_stop_ix = ds["train"].num_rows
dev_stop_ix = train_stop_ix + ds["validation"].num_rows

In [33]:
## Read data and stack the three MRPC splits into one frame.
# The order train -> validation -> test matters: train_stop_ix / dev_stop_ix
# are used later to cut the rows positionally in exactly this order.
ds = load_dataset('glue', 'mrpc')
# ignore_index=True gives the stacked frame a clean 0..N-1 index; without it
# each split keeps its own 0-based labels and the index contains duplicates.
df = pd.concat(
    [ds[split].to_pandas() for split in ("train", "validation", "test")],
    ignore_index=True,
)

Reusing dataset glue (C:\Users\Beni\.cache\huggingface\datasets\glue\mrpc\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 120.00it/s]


### Encode (tokenize) all pairs of sentences

In [36]:
# Pair up the two sentences of every row, then tokenize all pairs in one call.
# Every encoded pair is padded to max_length=64.
tuple_list = [
    (first, second)
    for first, second in zip(df["sentence1"], df["sentence2"])
]
encodings = tokenizer.batch_encode_plus(
    tuple_list,
    max_length=64,
    pad_to_max_length=True,
)

In [37]:
# Load the cached BERT embeddings. If the cache file is missing (fresh
# checkout), rebuild it with the computation that used to be commented out and
# save it, so the notebook also runs end-to-end without a pre-existing .npy.
try:
    embarray = np.load("mrpc-bert-embs.npy")
except FileNotFoundError:
    # One (tokens x emb_len) slot per sentence pair, filled in by get_model_features.
    embarray = np.zeros((len(df), len(encodings["input_ids"][0]), emb_len), dtype=np.float32)
    # NOTE(review): the second argument (15) is presumably the batch size - confirm in functions.py
    embarray = get_model_features(df, 15, encodings, model, embarray)
    np.save("mrpc-bert-embs.npy", embarray)

In [None]:
# embarray = get_model_features(df, 15, encodings, model, embarray)

In [40]:
# np.save("mrpc-bert-embs.npy",embarray)

## Get baseline, local & global representations

In [41]:
# Build `words` from the embedding array; it is the shared input of both the
# global and the local method below.
# NOTE(review): getWords is not defined in this notebook (presumably from the
# project-local `functions` module); its exact output format is not visible here.
words = getWords(embarray)

In [42]:
# BASELINE
# One flat row per sentence pair: keep the first axis and collapse everything
# after it. This replaces the hard-coded 768*64 so the cell keeps working when
# max_length or the encoder's hidden size changes.
baseline_sentence_rep = embarray.reshape((embarray.shape[0], -1))

In [43]:
# GLOBAL METHOD
# global_method receives all word vectors, the number of main principal
# components to drop (n_pc_global) and the embedding width (emb_len).
# NOTE(review): global_method / flatten_pooling are project helpers not shown
# here; flatten_pooling presumably maps the processed word vectors back to one
# flat row per sentence using embarray's layout - confirm.
global_representations = global_method(np.asarray(words), n_pc_global, emb_len)
global_sentence_rep = flatten_pooling(global_representations, embarray)

In [71]:
# LOCAL METHOD
# cluster_based receives all word vectors, the number of clusters (n_cluster),
# the number of main principal components to drop for the local method (n_pc)
# and the embedding width (emb_len).
# NOTE(review): cluster_based / flatten_pooling are project helpers not shown
# here; sentence_rep is presumably one flat row per sentence - confirm.
isotropic_representations = cluster_based(np.asarray(words), n_cluster, n_pc, emb_len)
sentence_rep = flatten_pooling(isotropic_representations, embarray)

## Train & Test

In [45]:
# Labels for every row of the stacked frame, cut at the same row boundaries as
# the feature matrices (train | validation | test). All MRPC splits, including
# the test split, come with labels, so the test part can be scored directly.
Y = np.asarray(df["label"].tolist())
Y_tr, Y_dev, Y_te = (
    Y[:train_stop_ix],
    Y[train_stop_ix:dev_stop_ix],
    Y[dev_stop_ix:],
)

### Baseline

In [49]:
# Baseline features, cut row-wise into train | validation | test.
# X_tr / X_dev / X_te are reused (overwritten) by the Global and Local sections,
# so re-run this cell before re-training or re-scoring the baseline classifier.
reps_base = np.asarray(baseline_sentence_rep)
X_tr, X_dev, X_te = np.split(reps_base, [train_stop_ix, dev_stop_ix])

In [58]:
# Fit on the baseline train split and evaluate on the validation split.
# NOTE(review): get_best_classifier is a project helper not shown here. The
# first argument (10) matches the ten "epoch" lines it prints; it presumably
# returns the best-scoring classifier and that score - confirm.
clf1, score1 = get_best_classifier(10,X_tr,Y_tr,X_dev,Y_dev)

epoch 1, score: 0.7009803921568627
epoch 2, score: 0.7377450980392157
epoch 3, score: 0.7205882352941176
epoch 4, score: 0.7524509803921569
epoch 5, score: 0.7524509803921569
epoch 6, score: 0.7524509803921569
epoch 7, score: 0.7475490196078431
epoch 8, score: 0.7573529411764706
epoch 9, score: 0.7524509803921569
epoch 10, score: 0.7401960784313726


In [59]:
# Test-split score of the baseline classifier. X_te must still hold the
# baseline features here: the Global and Local sections overwrite X_te.
clf1.score(X_te,Y_te)

0.6968115942028985

### Global

In [60]:
# Global-method features, cut row-wise into train | validation | test.
# This overwrites the X_tr / X_dev / X_te left behind by the Baseline section.
reps_global = np.asarray(global_sentence_rep)
X_tr, X_dev, X_te = np.split(reps_global, [train_stop_ix, dev_stop_ix])

In [61]:
# Fit on the global-method train split and evaluate on the validation split.
# NOTE(review): get_best_classifier is a project helper not shown here. The
# first argument (10) matches the ten "epoch" lines it prints; it presumably
# returns the best-scoring classifier and that score - confirm.
clf2, score2 = get_best_classifier(10,X_tr,Y_tr,X_dev,Y_dev)

epoch 1, score: 0.7132352941176471
epoch 2, score: 0.7254901960784313
epoch 3, score: 0.7254901960784313
epoch 4, score: 0.7132352941176471
epoch 5, score: 0.7083333333333334
epoch 6, score: 0.7156862745098039
epoch 7, score: 0.7132352941176471
epoch 8, score: 0.7156862745098039
epoch 9, score: 0.7156862745098039
epoch 10, score: 0.7181372549019608


In [62]:
# Test-split score of the global-method classifier. X_te must still hold the
# global-method features here: the Local section overwrites X_te.
clf2.score(X_te,Y_te)

0.7020289855072464

### Local

In [72]:
# Local-method (cluster-based) features, cut row-wise into train | validation | test.
# This overwrites the X_tr / X_dev / X_te left behind by the Global section.
reps_local = np.asarray(sentence_rep)
X_tr, X_dev, X_te = np.split(reps_local, [train_stop_ix, dev_stop_ix])

In [75]:
# Fit on the local-method train split and evaluate on the validation split.
# NOTE(review): get_best_classifier is a project helper not shown here. The
# first argument (10) matches the ten "epoch" lines it prints; it presumably
# returns the best-scoring classifier and that score - confirm.
clf3, score3 = get_best_classifier(10,X_tr,Y_tr,X_dev,Y_dev)

epoch 1, score: 0.6764705882352942
epoch 2, score: 0.7205882352941176
epoch 3, score: 0.7377450980392157
epoch 4, score: 0.7058823529411765
epoch 5, score: 0.6936274509803921
epoch 6, score: 0.6985294117647058
epoch 7, score: 0.6936274509803921
epoch 8, score: 0.6985294117647058
epoch 9, score: 0.7009803921568627
epoch 10, score: 0.7034313725490197


In [76]:
# Test-split score of the local-method classifier. X_te must still hold the
# local-method features assigned in the Local section above.
clf3.score(X_te,Y_te)

0.7008695652173913