In [1]:
from IPython.display import clear_output
from functions import *
import numpy as np
import pandas as pd
from transformers import *
from transformers.tokenization_utils import TextInputPair
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
import pickle
import scipy as sc
import math as mt
from joblib import dump, load
from datasets import load_dataset

### Load BERT & Data

In [2]:
# BERT: pretrained encoder, frozen and used as a feature extractor.
casing = "bert-base-uncased"
# Special tokens are added when encoding (the default there), so only the
# casing option is passed to the tokenizer constructor.
tokenizer = BertTokenizer.from_pretrained(casing, do_lower_case=True)

# Load the checkpoint's own configuration rather than a default-constructed
# BertConfig, so the architecture is guaranteed to match the loaded weights.
# NOTE(review): the former `dropout=0.2, attention_dropout=0.2` kwargs are not
# BERT config fields (BERT reads hidden_dropout_prob and
# attention_probs_dropout_prob), so they never had any effect. They are dropped
# here to keep the effective behaviour unchanged; set the BERT field names
# explicitly if non-default dropout is actually wanted.
config = BertConfig.from_pretrained(
    casing,
    output_hidden_states=False,  # True would return the output of every layer
)

model = TFBertModel.from_pretrained(casing, config=config)
model.trainable = False  # weights are never updated in this notebook
emb_len = config.hidden_size  # 768 for bert-base
clear_output()

# Isotropy-enhancement hyperparameters
n_cluster = 27  # Number of clusters to use
n_pc = 12  # Number of main principal components to drop for local method
n_pc_global = 15  # Number of main principal components to drop for global method

In [3]:
# Load SST-2 from GLUE and stack a fixed-seed 7,000-row sample of the training
# split on top of the complete validation and test splits (in that order).
ds = load_dataset('glue', 'sst2')
df_tr = ds["train"].to_pandas().sample(n=7000, random_state=123)
# ignore_index=True gives the stacked frame a clean 0..N-1 index. Without it the
# validation and test rows both keep labels starting at 0, so labels repeat,
# while every later cell addresses rows by position.
df = pd.concat(
    [df_tr, ds["validation"].to_pandas(), ds["test"].to_pandas()],
    ignore_index=True,
)

Reusing dataset glue (C:\Users\Beni\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 49.99it/s]


In [4]:
# Positional boundaries inside the stacked frame: rows [0, train_stop_ix) are
# the training sample, [train_stop_ix, dev_stop_ix) the validation split, and
# everything from dev_stop_ix onwards is the test split.
train_stop_ix = df_tr.shape[0]
dev_stop_ix = train_stop_ix + len(ds["validation"])

### Encode (tokenize) all sentences

In [5]:
# SST-2 is a single-sentence task, so this is a flat list of sentence strings
# (the variable name is kept for compatibility even though it holds no tuples).
tuple_list = df["sentence"].to_list()
# Pad and truncate every sentence to exactly 64 token ids. Padding and
# truncation are requested explicitly instead of through the deprecated
# `pad_to_max_length` flag and the implicit truncation triggered by max_length.
encodings = tokenizer.batch_encode_plus(
    tuple_list,
    max_length=64,
    padding="max_length",
    truncation=True,
)

In [None]:
# Reuse the cached BERT embeddings when the file exists; otherwise compute them
# once and cache them, so the notebook also runs from a fresh checkout.
try:
    embarray = np.load("sst2-bert-embs.npy")
except FileNotFoundError:
    # Pre-allocated (sentences, tokens per sentence, 768) float32 buffer that
    # the helper fills in.
    embarray = np.zeros((len(df), len(encodings["input_ids"][0]), 768), dtype=np.float32)
    # NOTE(review): 15 is presumably the batch size expected by
    # get_model_features -- confirm against the helper's definition.
    embarray = get_model_features(df, 15, encodings, model, embarray)
    np.save("sst2-bert-embs.npy", embarray)

Run the commented-out cells below if no saved embeddings are available.

In [7]:
# embarray = np.zeros((len(df), len(encodings["input_ids"][0]), 768), dtype=np.float32)
# embarray = get_model_features(df, 15, encodings, model, embarray)

In [18]:
# np.save("sst2-bert-embs.npy",embarray)

## Get baseline, local & global representations

In [8]:
# Derive the word-level inputs for the global/local methods from the sentence
# embeddings.
# NOTE(review): getWords is not defined in this notebook (presumably provided by
# `from functions import *`); its exact output structure should be confirmed there.
words = getWords(embarray)

In [9]:
# BASELINE
# Flatten each sentence's (tokens, hidden) matrix into a single feature vector.
# The row width is taken from the array itself instead of the hard-coded 768*64,
# so it stays correct if the sequence length or the encoder size changes.
baseline_sentence_rep = embarray.reshape(-1, embarray.shape[1] * embarray.shape[2])

In [10]:
# GLOBAL METHOD
# Load the cached sentence representations when available; otherwise build them
# from the word vectors and cache the result for later runs.
try:
    global_sentence_rep = np.load("global_sent_sst2.npy")
except FileNotFoundError:
    global_representations = global_method(np.asarray(words), n_pc_global, emb_len)
    global_sentence_rep = flatten_pooling(global_representations, embarray)
    np.save("global_sent_sst2.npy", global_sentence_rep)

In [40]:
# LOCAL METHOD
# Cluster-based processing of the word vectors, followed by the same
# flatten-pooling step that the global method uses.
isotropic_representations = cluster_based(
    np.asarray(words),
    n_cluster,
    n_pc,
    emb_len,
)
sentence_rep = flatten_pooling(isotropic_representations, embarray)

## Train & Test

In [41]:
# Quick local train/dev/test split for sanity checks only; the reported scores
# come from official GLUE/SuperGLUE test submissions.
Y = np.array(df["label"].tolist())
# Cut the label vector at the two positional boundaries in one step.
Y_tr, Y_dev, Y_te = np.split(Y, [train_stop_ix, dev_stop_ix])

### Baseline

In [50]:
# Baseline features, partitioned into train / dev / test at the two positional
# boundaries train_stop_ix and dev_stop_ix.
reps_base = np.asarray(baseline_sentence_rep)
X_tr, X_dev, X_te = np.split(reps_base, [train_stop_ix, dev_stop_ix])

In [51]:
# Fit the baseline classifier on the training rows and score it on the dev rows.
# NOTE(review): get_best_classifier is defined outside this notebook; the
# leading 10 presumably sets the number of scored rounds -- confirm.
clf1, score1 = get_best_classifier(10, X_tr, Y_tr, X_dev, Y_dev)

epoch 1, score: 0.7901376146788991
epoch 2, score: 0.8325688073394495
epoch 3, score: 0.8256880733944955
epoch 4, score: 0.8532110091743119
epoch 5, score: 0.841743119266055
epoch 6, score: 0.8405963302752294
epoch 7, score: 0.823394495412844
epoch 8, score: 0.8394495412844036
epoch 9, score: 0.8256880733944955
epoch 10, score: 0.8268348623853211


### Global

In [44]:
# Global-method features, partitioned into train / dev / test at the two
# positional boundaries train_stop_ix and dev_stop_ix.
reps_global = np.asarray(global_sentence_rep)
X_tr, X_dev, X_te = np.split(reps_global, [train_stop_ix, dev_stop_ix])

In [45]:
# Fit the global-method classifier on the training rows and score it on the dev rows.
# NOTE(review): get_best_classifier is defined outside this notebook; the
# leading 10 presumably sets the number of scored rounds -- confirm.
clf2, score2 = get_best_classifier(10, X_tr, Y_tr, X_dev, Y_dev)

epoch 1, score: 0.7775229357798165
epoch 2, score: 0.8130733944954128
epoch 3, score: 0.8107798165137615
epoch 4, score: 0.8130733944954128
epoch 5, score: 0.8061926605504587
epoch 6, score: 0.801605504587156
epoch 7, score: 0.8096330275229358
epoch 8, score: 0.8061926605504587
epoch 9, score: 0.8061926605504587
epoch 10, score: 0.8119266055045872


### Local

In [46]:
# Local-method features, partitioned into train / dev / test at the two
# positional boundaries train_stop_ix and dev_stop_ix.
reps_local = np.asarray(sentence_rep)
X_tr, X_dev, X_te = np.split(reps_local, [train_stop_ix, dev_stop_ix])

In [47]:
# Fit the local-method classifier on the training rows and score it on the dev rows.
# NOTE(review): get_best_classifier is defined outside this notebook; the
# leading 10 presumably sets the number of scored rounds -- confirm.
clf3, score3 = get_best_classifier(10, X_tr, Y_tr, X_dev, Y_dev)

epoch 1, score: 0.7786697247706422
epoch 2, score: 0.801605504587156
epoch 3, score: 0.8027522935779816
epoch 4, score: 0.7958715596330275
epoch 5, score: 0.7970183486238532
epoch 6, score: 0.7970183486238532
epoch 7, score: 0.7970183486238532
epoch 8, score: 0.7993119266055045
epoch 9, score: 0.7993119266055045
epoch 10, score: 0.7981651376146789


## Get test set predictions

In [35]:
# Persist the three fitted classifiers (baseline, global, local) with joblib so
# the test predictions can be regenerated without retraining.
dump(clf1, "clf1_sst2.joblib")
dump(clf2, "clf2_sst2.joblib")
dump(clf3, "clf3_sst2.joblib")
# To reuse previously saved classifiers instead of retraining, uncomment the
# lines below. joblib files are pickle-based, so only load files you trust.
# clf1 = load("clf1_sst2.joblib")
# clf2 = load("clf2_sst2.joblib")
# clf3 = load("clf3_sst2.joblib")

['clf3_sst2.joblib']

In [21]:
# Separate frame holding only the GLUE test split; the submission files are
# built from its `idx` column further down.
dft = ds["test"].to_pandas()

In [52]:
# Predict labels for the test rows (everything from dev_stop_ix onwards),
# pairing each classifier with the representation it was trained on.
preds_baseline, preds_global, preds_local = (
    clf.predict(reps[dev_stop_ix:])
    for clf, reps in (
        (clf1, baseline_sentence_rep),
        (clf2, global_sentence_rep),
        (clf3, sentence_rep),
    )
)

In [53]:
# Write one tab-separated submission file per method, each with an `index`
# column (copied from the dataset's `idx`) and a `prediction` column.
dft["index"] = dft["idx"]
# Same write order as before (local, global, baseline), so `dft` ends up
# holding the baseline predictions.
for preds, out_path in (
    (preds_local, "../results/SST2_local4.tsv"),
    (preds_global, "../results/SST2_global4.tsv"),
    (preds_baseline, "../results/SST2_baseline1.tsv"),
):
    dft["prediction"] = preds
    dft[["index", "prediction"]].to_csv(out_path, index=False, sep="\t")