In [1]:
from IPython.display import clear_output
from functions import *
import numpy as np
import pandas as pd
from transformers import *
from transformers.tokenization_utils import TextInputPair
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
import pickle
import scipy as sc
import math as mt
from joblib import dump, load
from datasets import load_dataset

In [2]:
# BERT
casing = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)

# BertConfig names its dropout knobs `hidden_dropout_prob` and
# `attention_probs_dropout_prob`. The DistilBERT-style names `dropout` /
# `attention_dropout` are accepted as extra keyword arguments but are not
# read by the BERT layers, so the requested 0.2 would be silently ignored.
config = BertConfig(hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2)
config.output_hidden_states = False  # if true outputs all layers

model = TFBertModel.from_pretrained(casing, config=config)
model.trainable = False  # weights stay frozen; the model is only used to produce embeddings
emb_len = 768  # embedding width; same value as the last axis of the embedding buffer allocated later
clear_output()  # hide the download/loading messages printed above

# Hyperparameters used with BERT for the representation methods below
n_cluster = 27 # Number of clusters to use
n_pc = 12 # Number of main principal components to drop for local method
n_pc_global = 15 # Number of main principal components to drop for global method

In [9]:
# Load GLUE SST-2 and build one frame: a fixed-seed 7000-row sample of the
# training split, followed by the full validation split and the full test split.
ds = load_dataset("glue", "sst2")
df_tr = ds["train"].to_pandas().sample(n=7000, random_state=123)
eval_frames = [ds[split].to_pandas() for split in ("validation", "test")]
df = pd.concat([df_tr, *eval_frames])

Reusing dataset glue (C:\Users\Beni\.cache\huggingface\datasets\glue\sst2\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 749.92it/s]


In [10]:
# Row offsets into the concatenated frame: rows [0, train_stop_ix) are the
# training sample, rows [train_stop_ix, dev_stop_ix) are validation, and
# everything from dev_stop_ix onward is the test split.
n_train = len(df_tr)
n_dev = ds["validation"].num_rows
train_stop_ix = n_train
dev_stop_ix = n_train + n_dev

### Encode (tokenize) all sentences

In [15]:
# Plain Python list of the `sentence` column, in the frame's row order.
tuple_list = df["sentence"].tolist()
# Tokenize everything in one call, padding each encoding out to 64 tokens.
encodings = tokenizer.batch_encode_plus(
    tuple_list,
    max_length=64,
    pad_to_max_length=True,
)

In [16]:
# Zero-filled float32 buffer with one (tokens x 768) slot per row of `df`;
# the token count is taken from the length of the first encoded sentence.
seq_len = len(encodings["input_ids"][0])
embarray = np.zeros(shape=(len(df), seq_len, 768), dtype=np.float32)

In [17]:
# Reuse the cached embeddings when the file is on disk; otherwise compute them
# with the model and write the cache, so the notebook still runs from a fresh
# checkout where "sst2-bert-embs.npy" does not exist yet.
try:
    embarray = np.load("sst2-bert-embs.npy")
except FileNotFoundError:
    # The second argument (15) looks like a batch size: the progress log of an
    # earlier run advances in steps of 15 -- confirm against `functions`.
    embarray = get_model_features(df, 15, encodings, model, embarray)
    np.save("sst2-bert-embs.npy", embarray)

9615
9630
9645
9660
9675
9690
9705


In [18]:
# One-off cache write for the embedding array; kept commented out.
# Uncomment to (re)write "sst2-bert-embs.npy" from the current `embarray`.
# np.save("sst2-bert-embs.npy",embarray)

## Get baseline, local & global representations

In [19]:
# Build `words` from the embedding array with the project helper `getWords`
# (presumably one vector per token across all sentences -- TODO confirm).
words = getWords(embarray)

In [20]:
# BASELINE
# One row per sentence: the untouched embeddings flattened to 768*64 values.
flat_width = 768 * 64
baseline_sentence_rep = np.reshape(embarray, (-1, flat_width))

In [21]:
# GLOBAL METHOD
# Reuse the cached sentence representations when the file is on disk;
# otherwise run the global method and write the cache, so the notebook still
# runs from a fresh checkout where "global_sent_sst2.npy" does not exist yet.
try:
    global_sentence_rep = np.load("global_sent_sst2.npy")
except FileNotFoundError:
    global_representations = global_method(np.asarray(words), n_pc_global, emb_len)
    global_sentence_rep = flatten_pooling(global_representations, embarray)
    np.save("global_sent_sst2.npy", global_sentence_rep)

In [22]:
# One-off cache write for the global-method sentence representations; kept
# commented out. Uncomment to (re)write "global_sent_sst2.npy".
# np.save("global_sent_sst2.npy",global_sentence_rep)

In [23]:
# LOCAL METHOD
# Run the cluster-based helper on the word vectors with the configured number
# of clusters and dropped components, then pool the result per sentence.
word_matrix = np.asarray(words)
isotropic_representations = cluster_based(word_matrix, n_cluster, n_pc, emb_len)
sentence_rep = flatten_pooling(isotropic_representations, embarray)

## Train & Test

In [24]:
# Labels for every row of `df`, cut at the same offsets as the features:
# train | dev | test.
label_list = df["label"].to_list()
Y = np.asarray(label_list)
Y_tr, Y_dev, Y_te = np.split(Y, [train_stop_ix, dev_stop_ix])

### Baseline

In [25]:
# Baseline features, cut along the row axis into train | dev | test.
reps_base = np.asarray(baseline_sentence_rep)
X_tr, X_dev, X_te = np.split(reps_base, [train_stop_ix, dev_stop_ix])

In [26]:
# Fit on the baseline train features and score on the dev split through the
# project helper; the first argument (10) matches the ten "epoch" lines it logs.
# Presumably returns the best-scoring classifier and its dev score -- TODO confirm.
clf1, score1 = get_best_classifier(10,X_tr,Y_tr,X_dev,Y_dev)

epoch 1, score: 0.8119266055045872
epoch 2, score: 0.8130733944954128
epoch 3, score: 0.8348623853211009
epoch 4, score: 0.8314220183486238
epoch 5, score: 0.8405963302752294
epoch 6, score: 0.8245412844036697
epoch 7, score: 0.8371559633027523
epoch 8, score: 0.8360091743119266
epoch 9, score: 0.8291284403669725
epoch 10, score: 0.8371559633027523


### Global

In [27]:
# Global-method features, cut along the row axis into train | dev | test.
# Note that this rebinds X_tr / X_dev / X_te.
reps_global = np.asarray(global_sentence_rep)
X_tr, X_dev, X_te = np.split(reps_global, [train_stop_ix, dev_stop_ix])

In [28]:
# Same helper call as for the baseline, now on the global-method features
# currently bound to X_tr / X_dev.
# Presumably returns the best-scoring classifier and its dev score -- TODO confirm.
clf2, score2 = get_best_classifier(10,X_tr,Y_tr,X_dev,Y_dev)

epoch 1, score: 0.7855504587155964
epoch 2, score: 0.819954128440367
epoch 3, score: 0.819954128440367
epoch 4, score: 0.8165137614678899
epoch 5, score: 0.8153669724770642
epoch 6, score: 0.8130733944954128
epoch 7, score: 0.8188073394495413
epoch 8, score: 0.8211009174311926
epoch 9, score: 0.823394495412844
epoch 10, score: 0.8256880733944955


### Local

In [29]:
# Local-method features, cut along the row axis into train | dev | test.
# Note that this rebinds X_tr / X_dev / X_te.
reps_local = np.asarray(sentence_rep)
X_tr, X_dev, X_te = np.split(reps_local, [train_stop_ix, dev_stop_ix])

In [34]:
# Same helper call as for the baseline, now on the local-method features
# currently bound to X_tr / X_dev.
# Presumably returns the best-scoring classifier and its dev score -- TODO confirm.
clf3, score3 = get_best_classifier(10,X_tr,Y_tr,X_dev,Y_dev)

epoch 1, score: 0.7844036697247706
epoch 2, score: 0.8084862385321101
epoch 3, score: 0.7889908256880734
epoch 4, score: 0.8004587155963303
epoch 5, score: 0.8038990825688074
epoch 6, score: 0.8038990825688074
epoch 7, score: 0.7993119266055045
epoch 8, score: 0.801605504587156
epoch 9, score: 0.8038990825688074
epoch 10, score: 0.8038990825688074


## Get test set predictions

In [35]:
# Persist the three fitted classifiers (baseline, global, local) with joblib.
dump(clf1, "clf1_sst2.joblib")
dump(clf2, "clf2_sst2.joblib")
dump(clf3, "clf3_sst2.joblib")
# To reuse saved classifiers instead of retraining, load them back below.
# NOTE: joblib files are pickle-based; only load files you trust.
# clf1 = load("clf1_sst2.joblib")
# clf2 = load("clf2_sst2.joblib")
# clf3 = load("clf3_sst2.joblib")

['clf3_sst2.joblib']

In [36]:
# Test split as its own DataFrame; its `idx` column is used when writing the
# prediction files.
dft = ds["test"].to_pandas()

In [37]:
# Predict on the test rows (everything from dev_stop_ix onward), pairing each
# classifier with the representation it was trained on.
test_rows = slice(dev_stop_ix, None)
preds_baseline = clf1.predict(baseline_sentence_rep[test_rows])
preds_global = clf2.predict(global_sentence_rep[test_rows])
preds_local = clf3.predict(sentence_rep[test_rows])

In [None]:
# Write one tab-separated file per method with columns `index` and `prediction`.
dft["index"] = dft["idx"]
for preds, out_path in (
    (preds_local, "SST2_local.tsv"),
    (preds_global, "SST2_global.tsv"),
    (preds_baseline, "SST2_baseline.tsv"),
):
    # The `prediction` column is overwritten on each pass, so after the loop
    # it holds the baseline predictions.
    dft["prediction"] = preds
    dft[["index", "prediction"]].to_csv(out_path, index=False, sep="\t")