In [1]:
from IPython.display import clear_output
from functions import *
import numpy as np
import pandas as pd
from transformers import *
from transformers.tokenization_utils import TextInputPair
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
import pickle
import scipy as sc
import math as mt
from joblib import dump, load

### Load BERT & Data

In [2]:
# BERT tokenizer and encoder. The encoder is frozen and used only to extract
# features; nothing in this notebook fine-tunes it.
casing = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)

# BertConfig spells its dropout rates `hidden_dropout_prob` and
# `attention_probs_dropout_prob`. The previous `dropout` / `attention_dropout`
# keywords are the DistilBERT names: BertConfig accepts them as extra
# attributes but the BERT layers never read them, so the 0.2 rates were
# silently ignored. Dropout is only applied in training-mode forward passes.
config = BertConfig(hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2)
config.output_hidden_states = False  # if true outputs all layers

model = TFBertModel.from_pretrained(casing, config=config)
model.trainable = False  # keep the pretrained weights fixed
emb_len = 768  # width of one token embedding (last axis of the embedding arrays)
clear_output()  # drop the model-loading messages from the cell output

# Isotropy-enhancement hyper-parameters (cluster-based "local" and "global" methods)
n_cluster = 27  # Number of clusters to use
n_pc = 12  # Number of main principal components to drop for local method
n_pc_global = 15  # Number of main principal components to drop for global method

In [3]:
# Load the BoolQ training split; the file is JSON Lines (one record per line).
df = pd.read_json(path_or_buf='../data/BoolQ/train.jsonl', lines=True)

### Encode (tokenize) all pairs of sentences

In [4]:
# One (question, passage) tuple per training example, question first.
tuple_list = list(zip(df["question"], df["passage"]))
# Tokenize each pair to a fixed length of 64 tokens, padding shorter inputs.
encodings = tokenizer.batch_encode_plus(
    tuple_list,
    max_length=64,
    pad_to_max_length=True,
)

In [5]:
# The training embeddings are read from a cached .npy file instead of being
# recomputed. The commented line below allocates the empty buffer needed when
# the embeddings have to be regenerated from the model.
# embarray = np.zeros((len(df), len(encodings["input_ids"][0]), 768), dtype=np.float32)
embarray = np.load("boolq-bert-embs.npy")

In [None]:
# embarray = get_model_features(df, 10, encodings, model, embarray)

In [7]:
# np.save("boolq-bert-embs.npy",embarray)

## Get baseline, local & global representations

In [6]:
# getWords is not defined in this notebook (presumably it comes from the
# star-imported `functions` module). It appears to gather the individual word
# vectors out of the embedding array — TODO confirm against its definition.
words = getWords(embarray)

In [7]:
# BASELINE: the unmodified embeddings, flattened to rows of 768*64 values.
baseline_sentence_rep = np.reshape(embarray, (-1, 768 * 64))

In [8]:
# GLOBAL METHOD
# The global-method sentence representations for the training set are loaded
# from disk. The commented lines show how that file was produced and saved.
# global_representations = global_method(np.asarray(words), n_pc_global, emb_len)
# global_sentence_rep = flatten_pooling(global_representations, embarray)
# np.save("global_sent_boolq.npy",global_sentence_rep)
global_sentence_rep = np.load("global_sent_boolq.npy")

In [12]:
# LOCAL METHOD
# cluster_based receives all word vectors, the number of clusters (n_cluster),
# the number of principal components to drop (n_pc) and the embedding width.
isotropic_representations = cluster_based(np.asarray(words), n_cluster, n_pc, emb_len)
# Turn the processed word vectors back into one representation per example.
# NOTE(review): embarray is presumably passed only so flatten_pooling can
# recover the per-example layout — confirm against its definition.
sentence_rep = flatten_pooling(isotropic_representations, embarray)

## Train & Test

In [14]:
# Training labels as a NumPy array. No hold-out is carved off here: the full
# slice keeps every label for training, because reported scores are based on
# GLUE/SuperGLUE official test submissions. The commented line is the hold-out
# half of a simple split, for quick local testing only.
Y = np.asarray(list(df["label"]))
Y_tr = Y[:]
#Y_te = Y[2000:]

### Baseline

In [15]:
# Baseline features for training. X_tr is rebound in the Global and Local
# sections further down, so this cell must run immediately before clf1 is fitted.
avg_reps_base = np.asarray(baseline_sentence_rep)
X_tr = avg_reps_base[:]

In [16]:
# Baseline classifier: a single hidden layer of 100 ReLU units trained with Adam.
# random_state fixes the weight initialisation and the mini-batch shuffling, so
# re-running the notebook reproduces the same fitted model.
clf1 = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation="relu",
    solver="adam",
    learning_rate_init=5e-3,
    random_state=42,
)
clf1.fit(X_tr, Y_tr)

MLPClassifier(learning_rate_init=0.005)

### Global

In [17]:
# Global-method features for training. This rebinds X_tr (previously the
# baseline features), so run this cell immediately before fitting clf2.
avg_reps_global = np.asarray(global_sentence_rep)
X_tr = avg_reps_global[:]
# Hold-out half of a quick local split (disabled; all rows are used for training).
#X_te = avg_reps_global[2000:]

In [18]:
# Global-method classifier: same architecture and optimiser settings as clf1.
# random_state fixes the weight initialisation and the mini-batch shuffling, so
# re-running the notebook reproduces the same fitted model.
clf2 = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation="relu",
    solver="adam",
    learning_rate_init=5e-3,
    random_state=42,
)
clf2.fit(X_tr, Y_tr)
#clf2.score(X_te,Y_te)

MLPClassifier(learning_rate_init=0.005)

### Local

In [19]:
# Local-method features for training. This rebinds X_tr (previously the
# global-method features), so run this cell immediately before fitting clf3.
avg_reps_local = np.asarray(sentence_rep)
X_tr = avg_reps_local[:]
# Hold-out half of a quick local split (disabled; all rows are used for training).
#X_te = avg_reps_local[2000:]

In [20]:
# Local-method classifier: same architecture and optimiser settings as clf1/clf2.
# random_state fixes the weight initialisation and the mini-batch shuffling, so
# re-running the notebook reproduces the same fitted model.
clf3 = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation="relu",
    solver="adam",
    learning_rate_init=5e-3,
    random_state=42,
)
clf3.fit(X_tr, Y_tr)
#clf3.score(X_te,Y_te)

MLPClassifier(learning_rate_init=0.005)

## Get test set predictions

In [21]:
# Persist the three fitted classifiers (clf1 baseline, clf2 global, clf3 local)
# with joblib so the prediction cells can be re-run without retraining.
dump(clf1, "clf1_boolq.joblib")
dump(clf2, "clf2_boolq.joblib")
dump(clf3, "clf3_boolq.joblib")
# Uncomment to restore previously saved classifiers instead of retraining.
# joblib files are pickle-based, so only load files you produced yourself.
# clf1 = load("clf1_boolq.joblib")
# clf2 = load("clf2_boolq.joblib")
# clf3 = load("clf3_boolq.joblib")


['clf3_boolq.joblib']

In [22]:
# Load the BoolQ test split; the file is JSON Lines (one record per line).
dft = pd.read_json(path_or_buf='../data/BoolQ/test.jsonl', lines=True)

In [25]:
# Build (question, passage) pairs in the SAME order used to encode the training
# set. This cell previously zipped (passage, question), which swaps the two BERT
# segments and changes what survives truncation to 64 tokens, so the classifiers
# were applied to inputs unlike anything they were trained on.
# Test embeddings cached from the old ordering must be regenerated.
tuple_list = list(zip(dft["question"].to_list(), dft["passage"].to_list()))
encodings = tokenizer.batch_encode_plus(tuple_list, max_length=64, pad_to_max_length=True)

In [26]:
# Allocate a float32 buffer of shape (examples, tokens per encoding, 768) and let
# get_model_features fill it from the frozen BERT model.
# NOTE(review): the second argument (15) looks like a batch size, since the
# progress output advances in steps of 15 — confirm against get_model_features.
embarraytest = np.zeros((len(dft), len(encodings["input_ids"][0]), 768), dtype=np.float32)
embarraytest = get_model_features(dft, 15, encodings, model, embarraytest)

3015
3030
3045
3060
3075
3090
3105
3120
3135
3150
3165
3180
3195
3210
3225
3240
3255


In [27]:
# Cache the test-set embeddings on disk so they need not be recomputed.
np.save("boolq-bert-embs-test.npy",embarraytest)

In [28]:
# Rebinds `words` to the test-set vectors; the training-set value is replaced.
# getWords is not defined in this notebook (presumably from the star-imported
# `functions` module) — TODO confirm what it extracts.
words = getWords(embarraytest)

In [29]:
# BASELINE: the unmodified test embeddings, flattened to rows of 768*64 values.
baseline_sentence_rep = np.reshape(embarraytest, (-1, 768 * 64))

# GLOBAL METHOD: process the test word vectors, then pool per example.
global_representations = global_method(
    np.asarray(words),
    n_pc_global,
    emb_len,
)
global_sentence_rep = flatten_pooling(global_representations, embarraytest)

# LOCAL METHOD: cluster-based processing of the test word vectors.
isotropic_representations = cluster_based(
    np.asarray(words),
    n_cluster,
    n_pc,
    emb_len,
)
# Sentence representations are built from the mean of the (isotropised) word
# representations.
sentence_rep = flatten_pooling(isotropic_representations, embarraytest)

In [30]:
# Predict test labels with each classifier on its matching representation:
# clf1 on the baseline features, clf2 on the global-method features and
# clf3 on the local-method features.
preds_baseline = clf1.predict(baseline_sentence_rep)
preds_global = clf2.predict(global_sentence_rep)
preds_local = clf3.predict(sentence_rep)

In [33]:
# Write one prediction file per method as JSON Lines records of {idx, label}.
# The three copy-pasted stanzas are folded into a single loop; the order
# (local, global, baseline) is kept, so dft["label"] still ends up holding the
# baseline predictions exactly as before.
for _preds, _out_path in (
    (preds_local, "BoolQ_local.jsonl"),
    (preds_global, "BoolQ_global.jsonl"),
    (preds_baseline, "BoolQ_baseline.jsonl"),
):
    dft["label"] = _preds
    # Boolean predictions are written as the lowercase strings "false" / "true".
    dft["label"] = dft["label"].replace([False,True],["false","true"])
    dft[["idx","label"]].to_json(_out_path, lines=True, orient="records")