# Testing BertPipeline

In [1]:
import sys
sys.path.append('../ml/BERT')
import masking
import BERT
from Vectorisation import Vectorisation
from Config import Config

import pickle
import tensorflow as tf
import numpy as np
with open("../../data/ml4science_data.pkl", "rb") as fp:
    data_dict = pickle.load(fp)

config = Config(EPOCH=100)
vec = Vectorisation(config=config)




In [2]:
with open('../../data/sequences.pkl', 'rb') as f:
    decoded_sequences = pickle.load(f)

for i in range(3): print(decoded_sequences[0][i])
print(len(decoded_sequences))

[0, 0, 0, 1, 0, 0, 0, 0, 0, 7.889]
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 75.141, 0.0]
[0.0, 0.0, 0.0, 1.0, 0, 0, 0, 0, 0, 3.25]
254


### Vectorising sequences

In [3]:
seps = vec.sep_from_seq(decoded_sequences)
encoded_sequences = vec.encode(decoded_sequences, seps)

print(encoded_sequences.shape)
print(encoded_sequences[0])

(254, 512)
[26  2 26 21 21 26 23 25 21 23  2 21 26 21 21 26 21 25  3  8  3  3  3  3
  3  3  3  3  7  8  4  2  8  3 16 20 21 26 21 23 21  3  8  7  8  3  5  8
  4  8  2  8  3  8 10  9 10  9  9 15 20 11  9 14 13 11  9 14 13 14 10 11
  9  9 13 14 13 14 11  9 10 14 13 10 11  9 14  9 14 15 20 15 15 17 20  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0 

### Masking sequences, split into train and test
x -> masked  
y -> unmasked

In [4]:
x_masked_encoded, y_masked_encoded, sample_weights = masking.mask_input_and_labels(encoded_sequences, config.TOKEN_DICT)
x_tr, y_tr, w_tr = x_masked_encoded[:200], y_masked_encoded[:200], sample_weights[:200]
x_te, y_te, w_te = x_masked_encoded[200:], y_masked_encoded[200:], sample_weights[200:]

print('full set:', x_masked_encoded.shape, y_masked_encoded.shape, sample_weights.shape)
print('train set:', x_tr.shape, y_tr.shape, w_tr.shape)
print('test set:', x_te.shape, y_te.shape, w_te.shape)

# this is the dataset that will be used for training
mlm_ds = tf.data.Dataset.from_tensor_slices((x_tr, y_tr, w_tr))

full set: (254, 512) (254, 512) (254, 512)
train set: (200, 512) (200, 512) (200, 512)
test set: (54, 512) (54, 512) (54, 512)


In [5]:
from BERTPipeline import BERTPipeline

bert = BERTPipeline(config)
bert.train(mlm_ds)




### Predicting on the masked test set

In [13]:
pred = bert.predict(x_te, only_masked=True)



In [17]:
print(pred.shape)
print('pred:', pred[19][:50])
print('mask:', x_te[19][:50])
print('test:', y_te[19][:50])
print((pred[19] == y_te[19])[:50]*1)

(54, 512)
pred: [26 26  2 21 21  2 26 21 21 21 25 22 22 11 26 23 26 25  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]
mask: [ 1 26  2  1 21  2 26 21 21  1 25 22 22 11 26 23 26 25  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]
test: [25 26  2 21 21  2 26 21 21 26 25 22 22 22 26 23 26 25  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]
[0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [8]:
where_equal = (pred == y_te)
where_masked = (x_te == config.TOKEN_DICT['[MASK]'])

print("Accuracy brut (per student mean): ", np.mean(np.sum(pred == y_te, axis=1) / (y_te.shape[1])))
print("Accuracy brut (per student mean): ", np.sum(pred == y_te) / np.size(y_te))
print("Accuracy without padding (per student mean): ", np.mean(np.sum((pred == y_te) * (y_te != 0), axis=1) / np.sum(y_te != 0, axis=1)))
print("Accuracy on masked tokens (per student mean): ", np.mean(np.sum(where_equal * where_masked, axis=1) / np.sum(where_masked, axis=1)))

Accuracy brut (per student mean):  0.9853877314814815
Accuracy brut (per student mean):  0.9853877314814815
Accuracy without padding (per student mean):  0.91818394282683
Accuracy on masked tokens (per student mean):  nan


  print("Accuracy on masked tokens (per student mean): ", np.mean(np.sum(where_equal * where_masked, axis=1) / np.sum(where_masked, axis=1)))


In [21]:
print("Accuracy brut (per total): ", np.sum(pred == y_te) / np.size(y_te))
print("Accuracy without padding (per total): ", np.sum((pred == y_te) * (y_te != 0)) / np.sum(y_te != 0))
print("Accuracy on masked tokens: ", np.sum(where_equal * where_masked) / np.sum(where_masked))

Accuracy brut (per total):  0.9853877314814815
Accuracy without padding (per total):  0.9217205967835691
Accuracy on masked tokens:  0.3333333333333333


In [26]:
print(np.sum(where_equal[19]))
print(where_masked[19])
print("Accuracy on masked tokens: ", np.sum(where_equal * where_masked) / np.sum(where_masked))

509
[ True False False  True False False False False False  True False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False Fals