In [1]:
from IPython.display import clear_output
from functions import *
import numpy as np
import pandas as pd
from transformers import *
from transformers.tokenization_utils import TextInputPair
from sklearn.neural_network import MLPClassifier
from copy import deepcopy
import tensorflow as tf
import pickle
import scipy as sc
import math as mt
from joblib import dump, load
from datasets import load_dataset

In [2]:
# BERT: pretrained tokenizer plus a frozen encoder used purely as a feature extractor.
casing = "bert-base-uncased" 
tokenizer = BertTokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)

# BertConfig calls its dropout rates `hidden_dropout_prob` and
# `attention_probs_dropout_prob`. The names `dropout` / `attention_dropout`
# belong to DistilBertConfig and are not BertConfig parameters, so the 0.2 rate
# was previously never applied. Dropout only matters if the model is called in
# training mode.
config = BertConfig(hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2)
config.output_hidden_states = False # if true outputs all layers

model = TFBertModel.from_pretrained(casing, config = config)
model.trainable = False  # weights stay fixed; only the downstream classifier is trained
emb_len = config.hidden_size  # embedding width (768 for bert-base)
clear_output()  # hide the model-loading log output

# Hyperparameters of the isotropy-enhancement methods
n_cluster = 27 # Number of clusters to use
n_pc = 12 # Number of main principal components to drop for local method
n_pc_global = 15 # Number of main principal components to drop for global method

In [5]:
# Fetch (or reuse the locally cached) GLUE CoLA dataset, then stack its three
# splits in train -> validation -> test order into one frame.
ds = load_dataset('glue', 'cola')
df = pd.concat([ds[split].to_pandas() for split in ("train", "validation", "test")])

Reusing dataset glue (C:\Users\Beni\.cache\huggingface\datasets\glue\cola\1.0.0\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 3/3 [00:00<00:00, 999.12it/s]


In [7]:
# Row offsets inside `df` at which the train and validation splits end;
# everything from `dev_stop_ix` onwards belongs to the test split.
train_stop_ix = ds["train"].num_rows
dev_stop_ix = train_stop_ix + ds["validation"].num_rows

### Encode (tokenize) all sentences

In [10]:
# Tokenize every sentence, padding each encoding out to 64 token positions.
tuple_list = df["sentence"].tolist()
encodings = tokenizer.batch_encode_plus(
    tuple_list,
    max_length=64,
    pad_to_max_length=True,
)

In [12]:
# Token-level BERT embeddings for every sentence in `df`.
# The cached array is loaded when available. If the cache file is missing
# (e.g. a fresh checkout), the embeddings are recomputed with the model and
# written back so later runs can reuse them.
try:
    embarray = np.load("cola-bert-embs.npy")
except FileNotFoundError:
    # Pre-allocate (n_sentences, n_tokens, emb_len) and let the helper fill it
    # in batches of 15 sentences.
    embarray = np.zeros((len(df), len(encodings["input_ids"][0]), emb_len), dtype=np.float32)
    embarray = get_model_features(df, 15, encodings, model, embarray)
    np.save("cola-bert-embs.npy", embarray)

In [13]:
# embarray = get_model_features(df, 15, encodings, model, embarray)

10515
10530
10545
10560
10575
10590
10605
10620
10635
10650
10665


In [14]:
# np.save("cola-bert-embs.npy",embarray)

## Get baseline, local & global representations

In [15]:
# `getWords` is not defined in this notebook (presumably it comes from the
# star-imported `functions` module — TODO confirm what it extracts). Its result
# feeds both the global and the cluster-based (local) methods below.
words = getWords(embarray)

In [16]:
# BASELINE
# One row per sentence: flatten all trailing axes of the embedding array.
# The row width is derived from the array itself rather than hard-coding
# 768*64, so it stays correct if the model width or the padded sequence
# length changes.
baseline_sentence_rep = embarray.reshape((-1, int(np.prod(embarray.shape[1:]))))

In [17]:
# GLOBAL METHOD
# Load the cached global sentence representations; if the cache file is
# missing, recompute them and store the result for later runs.
# NOTE: the cache is not invalidated when `n_pc_global` changes — delete
# "global_sent_cola.npy" to force a recompute.
try:
    global_sentence_rep = np.load("global_sent_cola.npy")
except FileNotFoundError:
    global_representations = global_method(np.asarray(words), n_pc_global, emb_len)
    global_sentence_rep = flatten_pooling(global_representations, embarray)
    np.save("global_sent_cola.npy", global_sentence_rep)

In [18]:
# np.save("global_sent_cola.npy",global_sentence_rep)

In [19]:
# LOCAL METHOD
# Cluster-based variant, configured with `n_cluster` clusters and `n_pc`
# principal components to drop (see the hyperparameter comments above).
# `cluster_based` and `flatten_pooling` are defined outside this notebook;
# presumably the former returns per-word representations that the latter
# pools back into one vector per sentence — TODO confirm.
isotropic_representations = cluster_based(np.asarray(words), n_cluster, n_pc, emb_len)
sentence_rep = flatten_pooling(isotropic_representations, embarray)

## Train & Test

In [21]:
# Labels for all rows of `df`, cut at the split boundaries into
# train / dev / test parts.
Y = np.asarray(df["label"].to_list())
Y_tr, Y_dev, Y_te = np.split(Y, [train_stop_ix, dev_stop_ix])

### Baseline

In [41]:
# Baseline features, cut at the split boundaries into train / dev / test parts.
reps_base = np.asarray(baseline_sentence_rep)
X_tr, X_dev, X_te = np.split(reps_base, [train_stop_ix, dev_stop_ix])

In [51]:
# The baseline may never predict class 0, in which case the Matthews
# correlation is undefined and the scorer emits warnings (score reported as 0.0).
clf1, score1 = get_best_classifier(
    10,
    X_tr,
    Y_tr,
    X_dev,
    Y_dev,
    scorer="matthew",
)

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


epoch 1, score: 0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


epoch 2, score: 0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


epoch 3, score: 0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


epoch 4, score: 0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


epoch 5, score: 0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


epoch 6, score: 0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


epoch 7, score: 0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


epoch 8, score: 0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


epoch 9, score: 0.0
epoch 10, score: 0.0


  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


### Global

In [54]:
reps_global = np.asarray(global_sentence_rep)
X_tr = reps_global[:train_stop_ix]
X_dev = reps_global[train_stop_ix:dev_stop_ix]
X_te = reps_global[dev_stop_ix:]

In [55]:
# Select the classifier for the global representations, scored on the dev
# split with the Matthews correlation.
clf2, score2 = get_best_classifier(
    10,
    X_tr,
    Y_tr,
    X_dev,
    Y_dev,
    scorer="matthew",
)

epoch 1, score: 0.37013254584947247
epoch 2, score: 0.4304667313379628
epoch 3, score: 0.4659816741871816
epoch 4, score: 0.4297676508136882
epoch 5, score: 0.47273610719301584
epoch 6, score: 0.4411903546350633
epoch 7, score: 0.4147456011736498
epoch 8, score: 0.4287204670167934
epoch 9, score: 0.4735050808072697
epoch 10, score: 0.44204253274357935


### Local

In [56]:
reps_local = np.asarray(sentence_rep)
X_tr = reps_local[:train_stop_ix]
X_dev = reps_local[train_stop_ix:dev_stop_ix]
X_te = reps_local[dev_stop_ix:]

In [57]:
# Select the classifier for the local (cluster-based) representations, scored
# on the dev split with the Matthews correlation.
clf3, score3 = get_best_classifier(
    10,
    X_tr,
    Y_tr,
    X_dev,
    Y_dev,
    scorer="matthew",
)

epoch 1, score: 0.34822354985655257
epoch 2, score: 0.42798502066185135
epoch 3, score: 0.39651592378586736
epoch 4, score: 0.42294874068517124
epoch 5, score: 0.4186898423718698
epoch 6, score: 0.41932980960936644
epoch 7, score: 0.4172194430405653
epoch 8, score: 0.4346077035289075
epoch 9, score: 0.43153554161096935
epoch 10, score: 0.4221833158823561


## Get test set predictions

In [29]:
# Persist the three selected classifiers (baseline, global, local) with joblib
# so the predictions below can be regenerated without retraining.
dump(clf1, "clf1_cola.joblib")
dump(clf2, "clf2_cola.joblib")
dump(clf3, "clf3_cola.joblib")
# To reuse previously saved classifiers instead, uncomment:
# clf1 = load("clf1_cola.joblib")
# clf2 = load("clf2_cola.joblib")
# clf3 = load("clf3_cola.joblib")

['clf3_cola.joblib']

In [31]:
# Test split as its own frame; the prediction columns are attached to it below.
dft = ds["test"].to_pandas()

In [32]:
# Predict on the test rows (everything from `dev_stop_ix` onwards), pairing
# each classifier with the representation it was trained on.
preds_baseline = clf1.predict(baseline_sentence_rep[dev_stop_ix:])
preds_global = clf2.predict(global_sentence_rep[dev_stop_ix:])
preds_local = clf3.predict(sentence_rep[dev_stop_ix:])

In [35]:
# Write one tab-separated file per method with the columns "index" and
# "prediction". Files are written in local -> global -> baseline order, so
# `dft["prediction"]` ends up holding the baseline predictions.
dft["index"] = dft["idx"]
for _tsv_path, _preds in (
    ("CoLA_local.tsv", preds_local),
    ("CoLA_global.tsv", preds_global),
    ("CoLA_baseline.tsv", preds_baseline),
):
    dft["prediction"] = _preds
    dft[["index","prediction"]].to_csv(_tsv_path, index=False, sep="\t")