In [1]:
from IPython.display import clear_output
from functions import *
import numpy as np
import pandas as pd
from transformers import *
from transformers.tokenization_utils import TextInputPair
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
import pickle
import scipy as sc
import math as mt
from joblib import dump, load

### Load BERT & Data

In [2]:
# BERT
casing = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(casing, do_lower_case=True, add_special_tokens=True)

# BertConfig names its dropout rates `hidden_dropout_prob` and
# `attention_probs_dropout_prob`. The previous `dropout` / `attention_dropout`
# keywords are DistilBERT's names and are not read by the BERT layers, so the
# requested 0.2 rate never took effect.
config = BertConfig(hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2)
config.output_hidden_states = False # if true outputs all layers

# The encoder is only used as a frozen feature extractor.
model = TFBertModel.from_pretrained(casing, config = config)
model.trainable = False
emb_len = config.hidden_size  # width of one token embedding (768 for this config)
clear_output()

# Hyperparameters of the global / local (cluster-based) methods
n_cluster = 27 # Number of clusters to use
n_pc = 12 # Number of main principal components to drop for local method
n_pc_global = 15 # Number of main principal components to drop for global method

In [3]:
## Load the three WiC splits (JSON-lines files)
df_tr = pd.read_json('../data/WiC/train.jsonl', lines=True)
df_dev = pd.read_json('../data/WiC/val.jsonl', lines=True)
# The test frame gets a constant placeholder `label` column.
df_te = pd.read_json('../data/WiC/test.jsonl', lines=True).assign(label=False)

# Positional boundaries: rows [0, train_stop_ix) come from the train frame and
# rows [train_stop_ix, dev_stop_ix) from the dev frame once the frames are stacked.
train_stop_ix = len(df_tr)
dev_stop_ix = train_stop_ix + len(df_dev)

In [4]:
# Stack train, dev and test into one frame, in that order. The call does not
# pass ignore_index, so each frame keeps its own row labels; later cells
# separate the splits positionally with train_stop_ix / dev_stop_ix.
df = pd.concat([df_tr, df_dev, df_te])

### Encode (tokenize) all pairs of sentences

In [5]:
# Pair the two sentences of every row, then tokenize all pairs in one batch,
# padded to a fixed length of 64 tokens.
first_sentences = df["sentence1"].to_list()
second_sentences = df["sentence2"].to_list()
tuple_list = list(zip(first_sentences, second_sentences))
encodings = tokenizer.batch_encode_plus(
    tuple_list,
    max_length=64,
    pad_to_max_length=True,
)

In [None]:
# Load the cached BERT embeddings. When the cache file is missing (e.g. on a
# fresh checkout) compute them with the frozen model and save them, so the
# notebook runs top to bottom without manually un-commenting cells.
try:
    embarray = np.load("wic-bert-embs.npy")
except FileNotFoundError:
    # One (tokens x 768) float32 matrix per sentence pair, filled by the helper.
    embarray = np.zeros((len(df), len(encodings["input_ids"][0]), 768), dtype=np.float32)
    embarray = get_model_features(df, 10, encodings, model, embarray)
    np.save("wic-bert-embs.npy", embarray)

Run the commented-out cells below if no saved embeddings are available.

In [7]:
#embarray = np.zeros((len(df), len(encodings["input_ids"][0]), 768), dtype=np.float32)

In [14]:
#embarray = get_model_features(df, 10, encodings, model, embarray)

7410
7420
7430
7440
7450
7460
7470


In [15]:
#np.save("wic-bert-embs.npy",embarray)

## Get baseline, local & global representations

In [8]:
# Build `words` from the embedding array with the project helper `getWords`;
# the result is converted with np.asarray and passed to the isotropy methods below.
# NOTE(review): what exactly the helper keeps (e.g. whether padding positions
# are dropped) is not visible here — confirm in functions.py.
words = getWords(embarray)

In [9]:
# BASELINE
# Flatten every example's embedding matrix into a single feature row. The row
# width is derived from the array itself instead of the hard-coded 768*64, so a
# different padding length or hidden size cannot silently regroup rows across
# examples. Presumably embarray is (examples, tokens, hidden) — verify against
# the cell that creates it.
features_per_example = int(np.prod(embarray.shape[1:]))
baseline_sentence_rep = embarray.reshape(embarray.shape[0], features_per_example)

In [10]:
# GLOBAL METHOD
# Reuse the saved sentence representations when present; otherwise compute
# them from the word array and save them for subsequent runs.
try:
    global_sentence_rep = np.load("global_sent_wic.npy")
except FileNotFoundError:
    global_representations = global_method(np.asarray(words), n_pc_global, emb_len)
    global_sentence_rep = flatten_pooling(global_representations, embarray)
    np.save("global_sent_wic.npy", global_sentence_rep)

In [11]:
# LOCAL METHOD
# `cluster_based` receives the word array together with the configured cluster
# count, number of principal components and embedding width; its output is then
# turned into per-example features by `flatten_pooling`.
word_matrix = np.asarray(words)
isotropic_representations = cluster_based(word_matrix, n_cluster, n_pc, emb_len)
sentence_rep = flatten_pooling(isotropic_representations, embarray)

## Train & Test

In [12]:
# Simple train/test split - only for quick testing purposes. Reported scores are based on GLUE/SuperGLUE official test submissions.
# Labels of the stacked frame, cut at the train and dev boundaries into
# train / dev / test pieces.
Y = np.asarray(df["label"].to_list())
Y_tr, Y_dev, Y_te = np.split(Y, [train_stop_ix, dev_stop_ix])

### Baseline

In [50]:
# Baseline features, cut row-wise at the same boundaries as the labels.
reps_base = np.asarray(baseline_sentence_rep)
X_tr, X_dev, X_te = np.split(reps_base, [train_stop_ix, dev_stop_ix])

In [51]:
# Fit on the train rows and score on the dev rows; the helper returns a
# classifier and a score. The first argument (10) matches the ten scored
# rounds printed in this cell's output.
# NOTE(review): presumably the best-scoring classifier is the one returned — confirm in functions.py.
clf1, score1 = get_best_classifier(10,X_tr,Y_tr,X_dev,Y_dev)

epoch 1, score: 0.5407523510971787
epoch 2, score: 0.5532915360501567
epoch 3, score: 0.5877742946708464
epoch 4, score: 0.6050156739811913
epoch 5, score: 0.622257053291536
epoch 6, score: 0.6394984326018809
epoch 7, score: 0.6332288401253918
epoch 8, score: 0.6379310344827587
epoch 9, score: 0.6159874608150471
epoch 10, score: 0.6394984326018809


### Global

In [52]:
# Global-method features, cut row-wise at the same boundaries as the labels.
reps_global = np.asarray(global_sentence_rep)
X_tr, X_dev, X_te = np.split(reps_global, [train_stop_ix, dev_stop_ix])

In [53]:
# Same training call as for the baseline, now on the global-method features
# currently held in X_tr / X_dev.
# NOTE(review): presumably the best-scoring classifier is the one returned — confirm in functions.py.
clf2, score2 = get_best_classifier(10,X_tr,Y_tr,X_dev,Y_dev)

epoch 1, score: 0.609717868338558
epoch 2, score: 0.64576802507837
epoch 3, score: 0.6473354231974922
epoch 4, score: 0.6285266457680251
epoch 5, score: 0.6379310344827587
epoch 6, score: 0.6426332288401254
epoch 7, score: 0.6536050156739812
epoch 8, score: 0.6410658307210031
epoch 9, score: 0.6410658307210031
epoch 10, score: 0.6489028213166145


### Local

In [54]:
# LOCAL METHOD
# Recomputes `isotropic_representations` and `sentence_rep` with exactly the
# same calls as the earlier local-method cell, overwriting both variables.
# NOTE(review): if `cluster_based` is not deterministic, this run (not the
# earlier one) defines the features used for clf3 and the test predictions —
# confirm whether the duplicate recomputation is intended.
isotropic_representations = cluster_based(np.asarray(words), n_cluster, n_pc, emb_len)
sentence_rep = flatten_pooling(isotropic_representations, embarray)

In [55]:
# Local-method features, cut row-wise at the same boundaries as the labels.
reps_local = np.asarray(sentence_rep)
X_tr, X_dev, X_te = np.split(reps_local, [train_stop_ix, dev_stop_ix])

In [56]:
# Same training call as for the baseline, now on the local-method features
# currently held in X_tr / X_dev.
# NOTE(review): presumably the best-scoring classifier is the one returned — confirm in functions.py.
clf3, score3 = get_best_classifier(10,X_tr,Y_tr,X_dev,Y_dev)

epoch 1, score: 0.5924764890282131
epoch 2, score: 0.6175548589341693
epoch 3, score: 0.6128526645768025
epoch 4, score: 0.6285266457680251
epoch 5, score: 0.6050156739811913
epoch 6, score: 0.6112852664576802
epoch 7, score: 0.6112852664576802
epoch 8, score: 0.6112852664576802
epoch 9, score: 0.6050156739811913
epoch 10, score: 0.609717868338558


## Get test set predictions

In [30]:
# Save the three classifiers (clf1: baseline, clf2: global, clf3: local) with joblib.
# The last `dump` call is the cell's final expression, so its return value is
# what the cell displays.
# NOTE(review): this cell's execution count is lower than that of the training
# cells above — confirm the files on disk hold the classifiers whose scores are shown.
dump(clf1, "clf1_wic.joblib")
dump(clf2, "clf2_wic.joblib")
dump(clf3, "clf3_wic.joblib")
# Un-comment to restore previously saved classifiers instead of retraining.
# clf1 = load("clf1_wic.joblib")
# clf2 = load("clf2_wic.joblib")
# clf3 = load("clf3_wic.joblib")

['clf3_wic.joblib']

In [19]:
# Read the test split again into its own frame; its `label` column is filled
# with predictions below before the submission files are written.
dft = pd.read_json('../data/WiC/test.jsonl', lines=True)

In [57]:
# Predict the test rows (everything from the dev boundary onward) with each
# classifier on its matching feature set.
test_rows = slice(dev_stop_ix, None)
preds_baseline = clf1.predict(baseline_sentence_rep[test_rows])
preds_global = clf2.predict(global_sentence_rep[test_rows])
preds_local = clf3.predict(sentence_rep[test_rows])

In [58]:
# Write one submission file per method: the predictions go into the `label`
# column, booleans are mapped to the strings "false"/"true", and the
# (idx, label) pairs are saved as JSON lines. Order: local, global, baseline.
submissions = (
    (preds_local, "../results/WiC_local5.jsonl"),
    (preds_global, "../results/WiC_global5.jsonl"),
    (preds_baseline, "../results/WiC_baseline5.jsonl"),
)
for preds, out_path in submissions:
    dft["label"] = preds
    dft["label"] = dft["label"].replace([False,True],["false","true"])
    dft[["idx","label"]].to_json(out_path, lines=True, orient="records")