# Testing BertPipeline

In [27]:
import sys
sys.path.append('../ml/BERT')
import masking
import BERT
from Vectorisation import Vectorisation
from Config import Config

import pickle
import tensorflow as tf
import numpy as np
with open("../../data/ml4science_data.pkl", "rb") as fp:
    data_dict = pickle.load(fp)

config = Config(EPOCH=100)
vec = Vectorisation(config=config)

In [28]:
with open('../../data/sequences.pkl', 'rb') as f:
    decoded_sequences = pickle.load(f)

for i in range(3): print(decoded_sequences[0][i])
print(len(decoded_sequences))

[0, 0, 0, 1, 0, 0, 0, 0, 0, 7.889]
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 75.141, 0.0]
[0.0, 0.0, 0.0, 1.0, 0, 0, 0, 0, 0, 3.25]
254


### Vectorising sequences

In [29]:
seps = vec.sep_from_seq(decoded_sequences)
encoded_sequences = vec.encode(decoded_sequences, seps)

print(encoded_sequences.shape)
print(encoded_sequences[0])

(254, 512)
[26  2 26 21 21 26 23 25 21 23  2 21 26 21 21 26 21 25  3  8  3  3  3  3
  3  3  3  3  7  8  4  2  8  3 16 20 21 26 21 23 21  3  8  7  8  3  5  8
  4  8  2  8  3  8 10  9 10  9  9 15 20 11  9 14 13 11  9 14 13 14 10 11
  9  9 13 14 13 14 11  9 10 14 13 10 11  9 14  9 14 15 20 15 15 17 20  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0 

### Masking sequences, split into train and test
x -> masked  
y -> unmasked

In [30]:
x_masked_encoded, y_masked_encoded, sample_weights = masking.mask_input_and_labels(encoded_sequences, config.TOKEN_DICT)
x_tr, y_tr, w_tr = x_masked_encoded[:200], y_masked_encoded[:200], sample_weights[:200]
x_te, y_te, w_te = x_masked_encoded[200:], y_masked_encoded[200:], sample_weights[200:]

print('full set:', x_masked_encoded.shape, y_masked_encoded.shape, sample_weights.shape)
print('train set:', x_tr.shape, y_tr.shape, w_tr.shape)
print('test set:', x_te.shape, y_te.shape, w_te.shape)

# this is the dataset that will be used for training
mlm_ds = tf.data.Dataset.from_tensor_slices((x_tr, y_tr, w_tr))

full set: (254, 512) (254, 512) (254, 512)
train set: (200, 512) (200, 512) (200, 512)
test set: (54, 512) (54, 512) (54, 512)


In [31]:
from BERTPipeline import BERTPipeline

bert = BERTPipeline(config)
bert.train(mlm_ds)

### Predicting on the masked test set

In [32]:
pred1 = bert.predict(x_te, only_masked=True)
pred2 = bert.predict(x_te, only_masked=True)



In [38]:
print(pred1.shape)
print('pred:', pred1[0][:50])
print('mask:', x_te[0][:50])
print('test:', y_te[0][:50])
print((pred1[0] == y_te[0])[:50]*1)

(54, 512)
pred: [26  2 21 21 26 23 21 21 23 26 21 26 22  2 21  2 26 21 25 26 21 26 21 23
 21 26 21 21 21 21 26 21 21 22 21 22 15 20 16 20 16 15 20 17 15 20 15 20
 17 15]
mask: [26  2 21 21 26 23 21 21 23 26  1 26 22  2 21  2 26 21 25 26 21 26 21 23
 21 26 21 21 21 21 26 21 21 22 21 22  1 20 16 20 16 15 20 17 15 20 15 20
 17 15]
test: [26  2 21 21 26 23 21 21 23 26 23 26 22  2 21  2 26 21 25 26 21 26 21 23
 21 26 21 21 21 21 26 21 21 22 21 22 15 20 16 20 16 15 20 17 15 20 15 20
 17 15]
[1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [46]:
print(pred2.shape)
print('pred:', pred2[0][:50])
print('mask:', x_te[0][:50])
print('test:', y_te[0][:50])
print((pred2[0] == y_te[0])[:50]*1)

assert ((pred1 == pred2).all())

(54, 512)
pred: [26  2 21 21 26 23 21 21 23 26 21 26 22  2 21  2 26 21 25 26 21 26 21 23
 21 26 21 21 21 21 26 21 21 22 21 22 15 20 16 20 16 15 20 17 15 20 15 20
 17 15]
mask: [26  2 21 21 26 23 21 21 23 26  1 26 22  2 21  2 26 21 25 26 21 26 21 23
 21 26 21 21 21 21 26 21 21 22 21 22  1 20 16 20 16 15 20 17 15 20 15 20
 17 15]
test: [26  2 21 21 26 23 21 21 23 26 23 26 22  2 21  2 26 21 25 26 21 26 21 23
 21 26 21 21 21 21 26 21 21 22 21 22 15 20 16 20 16 15 20 17 15 20 15 20
 17 15]
[1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [41]:
where_equal = (pred1 == y_te)
where_masked = (x_te == config.TOKEN_DICT['[MASK]'])

print("Accuracy brut (per student mean): ", np.mean(np.sum(pred1 == y_te, axis=1) / (y_te.shape[1])))
print("Accuracy brut (per student mean): ", np.sum(pred1 == y_te) / np.size(y_te))
print("Accuracy without padding (per student mean): ", np.mean(np.sum((pred1 == y_te) * (y_te != 0), axis=1) / np.sum(y_te != 0, axis=1)))
# print("Accuracy on masked tokens (per student mean): ", np.mean(np.sum(where_equal * where_masked, axis=1) / np.sum(where_masked, axis=1)))

Accuracy brut (per student mean):  0.9844473379629629
Accuracy brut (per student mean):  0.9844473379629629
Accuracy without padding (per student mean):  0.9184737124264202


In [42]:
print("Accuracy brut (per total): ", np.sum(pred1 == y_te) / np.size(y_te))
print("Accuracy without padding (per total): ", np.sum((pred1 == y_te) * (y_te != 0)) / np.sum(y_te != 0))
print("Accuracy on masked tokens: ", np.sum(where_equal * where_masked) / np.sum(where_masked))

Accuracy brut (per total):  0.9844473379629629
Accuracy without padding (per total):  0.9166828134082542
Accuracy on masked tokens:  0.41903171953255425


In [43]:
print(np.sum(where_equal[19]))
print(where_masked[19])
print("Accuracy on masked tokens: ", np.sum(where_equal * where_masked) / np.sum(where_masked))

512
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False Fals