# Testing BERTPipeline

In [21]:
import sys
sys.path.append('../ml/BERT')
import masking
import BERT
from Vectorisation import Vectorisation
from Config import Config

import pickle
import tensorflow as tf
import numpy as np
# Load the pickled project dataset.
# NOTE(review): pickle.load can execute arbitrary code; only use with trusted files.
with open("../../data/ml4science_data.pkl", "rb") as dataset_file:
    data_dict = pickle.load(dataset_file)

# Shared configuration (EPOCH overridden to 100) and the vectoriser built from it.
config = Config(EPOCH=100)
vec = Vectorisation(config=config)

In [22]:
# Load the pickled decoded sequences.
# NOTE(review): pickle.load can execute arbitrary code; only use with trusted files.
with open('../../data/sequences.pkl', 'rb') as sequence_file:
    decoded_sequences = pickle.load(sequence_file)

# Preview the first three entries of the first sequence, then the number of sequences.
first_sequence = decoded_sequences[0]
for entry_idx in range(3):
    print(first_sequence[entry_idx])
print(len(decoded_sequences))

[0, 0, 0, 1, 0, 0, 0, 0, 0, 7.889]
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 75.141, 0.0]
[0.0, 0.0, 0.0, 1.0, 0, 0, 0, 0, 0, 3.25]
254


### Vectorising sequences

In [23]:
# Compute the separators for the decoded sequences, then encode the sequences
# with them (presumably into fixed-length token-id rows -- confirm in Vectorisation).
seps = vec.sep_from_seq(decoded_sequences)
encoded_sequences = vec.encode(decoded_sequences, seps)

# Show the shape of the encoded array and its first row, one per line.
print(encoded_sequences.shape, encoded_sequences[0], sep="\n")

(254, 128)
[26  2 26 21 21 26 23 25 21 23  2 21 26 21 21 26 21 25  3  8  3  3  3  3
  3  3  3  3  7  8  4  2  8  3 16 20 21 26 21 23 21  3  8  7  8  3  5  8
  4  8  2  8  3  8 10  9 10  9  9 15 20 11  9 14 13 11  9 14 13 14 10 11
  9  9 13 14 13 14 11  9 10 14 13 10 11  9 14  9 14 15 20 15 15 17 20  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]


### Masking sequences, split into train and test
`x` -> masked sequences (model input)  
`y` -> unmasked sequences (prediction targets)

In [59]:
# Mask the encoded sequences: x is the masked input, y the unmasked target.
# sample_weights presumably marks the positions that count towards the loss --
# confirm against masking.mask_input_and_labels.
# NOTE(review): if the masking is random, seed the RNG beforehand so the split
# and the reported accuracies are reproducible -- confirm which RNG it uses.
x_masked_encoded, y_masked_encoded, sample_weights = masking.mask_input_and_labels(encoded_sequences, config.TOKEN_DICT)

# Number of leading sequences used for training; the remainder is the test set.
N_TRAIN = 200
# Fail loudly instead of silently producing an empty train or test set.
assert 0 < N_TRAIN < len(x_masked_encoded), "N_TRAIN must leave at least one train and one test sequence"

x_tr, y_tr, w_tr = x_masked_encoded[:N_TRAIN], y_masked_encoded[:N_TRAIN], sample_weights[:N_TRAIN]
x_te, y_te, w_te = x_masked_encoded[N_TRAIN:], y_masked_encoded[N_TRAIN:], sample_weights[N_TRAIN:]

print('full set:', x_masked_encoded.shape, y_masked_encoded.shape, sample_weights.shape)
print('train set:', x_tr.shape, y_tr.shape, w_tr.shape)
print('test set:', x_te.shape, y_te.shape, w_te.shape)

# this is the dataset that will be used for training
mlm_ds = tf.data.Dataset.from_tensor_slices((x_tr, y_tr, w_tr))

full set: (254, 128) (254, 128) (254, 128)
train set: (200, 128) (200, 128) (200, 128)
test set: (54, 128) (54, 128) (54, 128)


In [25]:
# NOTE(review): this import sits mid-notebook; consider moving it to the top
# import cell (after the sys.path.append that makes the module importable).
from BERTPipeline import BERTPipeline

# Build the pipeline from the shared config and train it on the masked
# training dataset (x_tr, y_tr, w_tr) assembled in the previous cell.
bert = BERTPipeline(config)
bert.train(mlm_ds)

Model: "masked_bert_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_4 (InputLayer)        [(None, 128)]                0         []                            
                                                                                                  
 word_embedding (Embedding)  (None, 128, 64)              1728      ['input_4[0][0]']             
                                                                                                  
 tf.__operators__.add_9 (TF  (None, 128, 64)              0         ['word_embedding[0][0]']      
 OpLambda)                                                                                        
                                                                                                  
 encoder_0/multiheadattenti  (None, 128, 64)              16640     ['tf.__operato

### Predicting on the masked test set

In [32]:
# Predict on the masked test inputs. The "(54, 128)" shown below this cell is
# not printed here, so predict() presumably prints the shape itself -- confirm.
pred = bert.predict(x_te)

(54, 128)


In [60]:
# Compare prediction and target for one arbitrarily chosen test sequence.
sample_idx = 19
predicted_tokens = pred[sample_idx]
expected_tokens = y_te[sample_idx]

print(pred.shape)
print('pred:', predicted_tokens)
print('test:', expected_tokens)
# 1 where the predicted token equals the target token, 0 elsewhere.
print((predicted_tokens == expected_tokens) * 1)

(54, 128)
pred: [26 25 21 21 21 26 26 21 21 26 25 26 22 22 26 23 26 25 21 11 10 22 22 22
 26 26 26 22 22 22 22 20 20 20 22 22 22 22 22 22 26 26 26 10 10 10 10 10
 10 22 21 10 10 10 10 10 10 22  4 22 22 22 17  4  4  8  8 10 26 26 10 10
 10 10  8 19 10 10 10 10 19 19 19 23 10 10 10 10 26 20 22  8 10 10 10 10
 10 22 26 22 10 10 10 10 10 10 22 22 22 22 22 26 26 26 22 22 22 26 10 10
 10 22 22 21 22 22 10 10]
test: [25 26  2 21 21  2 26 21 21 26 25 22 22 22 26 23 26 25  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]
[0 0 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [48]:
# Position-wise agreement between predictions and targets.
is_correct = (pred == y_te)
# Positions whose target is not 0 (0 is treated as the padding id here).
not_padding = (y_te != 0)

# Raw accuracy, computed two ways: mean of per-sequence fractions, and the
# global fraction. All rows have the same length, so the two values coincide.
per_sequence_accuracy = np.sum(is_correct, axis=1) / (y_te.shape[1])
print("Accuracy brut: ", np.mean(per_sequence_accuracy))
print("Accuracy brut: ", np.sum(is_correct) / np.size(y_te))

# Accuracy restricted to non-padding target positions.
print("Accuracy without taking in acount padding: ", np.sum(is_correct * not_padding) / np.sum(not_padding))

Accuracy brut:  0.4195601851851852
Accuracy brut:  0.4195601851851852
Accuracy without taking in acount padding:  0.7827260458839406


In [72]:
# Accuracy restricted to the positions that were replaced by [MASK] in the input.
where_equal = (pred == y_te)
where_masked = (x_te == config.TOKEN_DICT['[MASK]'])
# NOTE(review): if masking.mask_input_and_labels also selects positions that it
# leaves unchanged or replaces with random tokens, w_te may be the more precise
# selector of "predicted" positions -- confirm against the masking module.

# Per-sequence counts of masked positions and of correctly predicted masked positions.
masked_per_seq = np.sum(where_masked, axis=1)
correct_masked_per_seq = np.sum(where_equal * where_masked, axis=1)

# A sequence without any [MASK] token would give 0/0 = NaN and poison the mean,
# so average only over sequences that contain at least one masked position.
has_mask = masked_per_seq > 0
if np.any(has_mask):
    masked_accuracy = np.mean(correct_masked_per_seq[has_mask] / masked_per_seq[has_mask])
else:
    masked_accuracy = float("nan")

print("Accuracy on masked tokens: ", masked_accuracy)
print(correct_masked_per_seq.shape)
print(masked_per_seq.shape)

Accuracy on masked tokens:  0.8260644243771552
(54,)
(54,)
