# BERT's playground 
Hello there! Welcome on BERT's playground. You may play with BERT here and see what he can do but always make sure he feels respected and admired.

## Setups


In [34]:
import pickle
import tensorflow as tf
import masking
import BERT
import numpy as np

from tensorflow import keras
from Vectorisation import Vectorisation
from Config import Config
from MaskedTextGenerator import MaskedTextGenerator

with open("./ml4science_data.pkl", "rb") as fp:
    data_dict = pickle.load(fp)

config = Config()
vec = Vectorisation(config=config)

In [35]:
# Prepare data for masked language model
encoded = vec.encode_dict(data_dict)
x_masked_encoded, y_masked_encoded, sample_weights = masking.mask_input_and_labels(encoded, config.TOKEN_DICT)
print(x_masked_encoded.shape, y_masked_encoded.shape, sample_weights.shape)

mlm_ds = tf.data.Dataset.from_tensor_slices((x_masked_encoded, y_masked_encoded, sample_weights))
mlm_ds = mlm_ds.shuffle(1000).batch(config.BATCH_SIZE)

print(mlm_ds)

(254, 128) (254, 128) (254, 128)
<_BatchDataset element_spec=(TensorSpec(shape=(None, 128), dtype=tf.int32, name=None), TensorSpec(shape=(None, 128), dtype=tf.int32, name=None), TensorSpec(shape=(None, 128), dtype=tf.float64, name=None))>


In [36]:
sample_tokens = x_masked_encoded[0:1]
print(y_masked_encoded[0])

[26  2 26 21 21 26 23 25 21 23  2 21 26 21 21 26 21 25  3  8  3  3  3  3
  3  3  3  3  7  8  4  2  8  3 16 20 21 26 21 23 21  3  8  7  8  3  5  8
  4  8  2  8  3  8 10  9 10  9  9 15 20 11  9 14 13 11  9 14 13 14 10 11
  9  9 13 14 13 14 11  9 10 14 13 10 11  9 14  9 14 15 20 15 15 17 20  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]


In [37]:
print(x_masked_encoded[0])

[26  2 26 21 21 26  9 25 21 23  2 21 26 21 21 26 21  1  3  8  3  1  3  3
  3  3  3  3  7  8  1  2  8  3 16 20 21 26 21 23 21  3  8  7  8  3  5  8
  4  1  2  8  3  1 10  9 10  9  9 15  1 11  9  1  1 11  9 14 13 14 10  1
  9  9 13 14 13 14 11  9 10 14 13 10 11  1 14  9  1 15 20 15 15 17 20  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0]


In [38]:
generator_callback = MaskedTextGenerator(sample_tokens, config.TOKEN_DICT['[MASK]'])

bert_masked_model = BERT.create_masked_language_bert_model(config)
bert_masked_model.summary()

Model: "masked_bert_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_10 (InputLayer)       [(None, 128)]                0         []                            
                                                                                                  
 word_embedding (Embedding)  (None, 128, 64)              1728      ['input_10[0][0]']            
                                                                                                  
 tf.__operators__.add_9 (TF  (None, 128, 64)              0         ['word_embedding[0][0]']      
 OpLambda)                                                                                        
                                                                                                  
 encoder_0/multiheadattenti  (None, 128, 64)              16640     ['tf.__operato

In [64]:
# A callback in Keras is a function that is called at certain points during the training process. -> Here is called after each epoch during the training
# Here we use to see the "performance" at each epoch while predicting on a "test set" aka the sample_tokens
bert_masked_model.fit(mlm_ds, epochs=20, callbacks=[generator_callback]) 
bert_masked_model.save("bert_models/bert_mlm.keras")

Epoch 1/20
 1/16 [>.............................] - ETA: 0s - loss: 1.8673

prediction shape: (1, 128, 27)
mask pred shape: (11, 27)
best results shape: (5, 3)

masked nb: 	 1                2                3                4                5                
predictions: 	[26 21 23]       [26  9 21]       [10 14  9]       [ 9 14 10]       [ 9 20 10]       
probabilities: 	[0.32 0.28 0.16] [0.22 0.2  0.17] [0.2  0.2  0.15] [0.18 0.15 0.14] [0.21 0.12 0.12] 

Epoch 2/20
prediction shape: (1, 128, 27)
mask pred shape: (11, 27)
best results shape: (5, 3)

masked nb: 	 1                2                3                4                5                
predictions: 	[21 26 23]       [ 9 14 21]       [14 15 11]       [14  9 20]       [ 9 20 15]       
probabilities: 	[0.26 0.22 0.13] [0.25 0.18 0.14] [0.23 0.15 0.13] [0.16 0.14 0.12] [0.17 0.16 0.13] 

Epoch 3/20
prediction shape: (1, 128, 27)
mask pred shape: (11, 27)
best results shape: (5, 3)

masked nb: 	 1                2                3                4                5                
predictions: 	[21 26

In [65]:
predictions = bert_masked_model.predict(x_masked_encoded[0:1])

predictions_max = np.argmax(predictions, axis=2)
print("Predictions:\n",predictions_max)

print("Original:\n", x_masked_encoded[0:1])

Predictions:
 [[26 26 26 21 21 26 21 25 21 23 21 21 26 21 21 26 21 21 21 21 21  9  3  3
   3  3  3  3 15  8 15 15  8 20 16 20 15 15 15 15  3  3  3  3  8  3  8  8
   8  8 14  8  3  8 10  9 10  9  9  9 14 11  9  9  9 14  9 14 10 14 10  9
   9  9 10 14 10 14 14  9 10 14 13 10 11  9 14  9 14 14 20 14 14 11 20 14
  14 14  9  9  9  9  9 20 20 20 20 20 20 20 20 20 15 15 15 15 15 15 15  3
   9  9  9  9  9  9 14 14]]
Original:
 [[26  2 26 21 21 26  9 25 21 23  2 21 26 21 21 26 21  1  3  8  3  1  3  3
   3  3  3  3  7  8  1  2  8  3 16 20 21 26 21 23 21  3  8  7  8  3  5  8
   4  1  2  8  3  1 10  9 10  9  9 15  1 11  9  1  1 11  9 14 13 14 10  1
   9  9 13 14 13 14 11  9 10 14 13 10 11  1 14  9  1 15 20 15 15 17 20  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0]]


In [68]:
print(predictions_max == y_masked_encoded[0:1])
# print accuracy
print("Accuracy brut: ", np.sum(predictions_max == y_masked_encoded[0:1]) / len(y_masked_encoded[0:1]))
# print accuracy without padding
print("Accuracy without taking in acount padding: ", np.sum((predictions_max == y_masked_encoded[0:1]) * (y_masked_encoded[0:1] != 0)) / np.sum(y_masked_encoded[0:1] != 0))

[[ True False  True  True  True  True False  True  True  True False  True
   True  True  True  True  True False False False False False  True  True
   True  True  True  True False  True False False  True False  True  True
  False False False False False  True False False  True  True False  True
  False  True False  True  True  True  True  True  True  True  True False
  False  True  True False False False  True  True False  True  True False
   True  True False  True False  True False  True  True  True  True  True
   True  True  True  True  True False  True False False False  True False
  False False False False False False False False False False False False
  False False False False False False False False False False False False
  False False False False False False False False]]
Accuracy brut:  59.0
Accuracy without taking in acount padding:  0.6210526315789474


In [42]:
# This how we can load a Keras model
"""# Load OUR pretrained bert model
mlm_model = keras.models.load_model(
    "bert_mlm_imdb.h5", custom_objects={"MaskedLanguageModel": MaskedLanguageModel}
)"""

# Kinda failed attempt to create a end to end model (we don't really need it) -> But it's a good example of how to create a model with a custom loss function and reshape the output
"""def get_end_to_end(model):
    inputs = keras.Input(shape=(None,))
    outputs = model(inputs)
    reshaped_outputs = keras.layers.Lambda(lambda x: keras.backend.argmax(x, axis=-1))(outputs)
    end_to_end_model = keras.Model(inputs, reshaped_outputs, name="end_to_end_model")
    optimizer = keras.optimizers.Adam(learning_rate=config.bert.LR)
    end_to_end_model.compile(
        optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )
    return end_to_end_model

end_to_end_classification_model = get_end_to_end(bert_masked_model)

# Build dataset for end to end model input (will be used at the end)
test_raw_classifier_ds = tf.data.Dataset.from_tensor_slices((x_masked_encoded, y_masked_encoded)).batch(config.BATCH_SIZE)

end_to_end_classification_model.evaluate(test_raw_classifier_ds)"""