# Imports

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from dataclasses import dataclass
import pandas as pd
import numpy as np
import glob
import re
from pprint import pprint
import pickle
import json
from tqdm import tqdm

# Config

In [2]:
@dataclass
class Config:
    MAX_LEN = 45
    BATCH_SIZE = 10000
    LR = 0.001
    VOCAB_SIZE = 29
    EMBED_DIM = 128
    NUM_HEAD = 8  # used in bert model
    FF_DIM = 128  # used in bert model
    NUM_LAYERS = 1


config = Config()

# Load and preprocess Data

In [6]:
## Load Data
data_location = "../../data/words_250000_train.txt"
with open(data_location,'r') as f:
    word_list = f.read().splitlines()
word_list[:2], len(word_list)

(['aaa', 'aaaaaa'], 227300)

In [7]:
def preprocess_word_list(word_list, min_word_len = 2):
    ## Strip Lowercase Remove < min_word_len  length
    word_list = list(set(word_list))
    lower_word_list = []
    for word in word_list:
        if(len(word) >= min_word_len):
            lower_word_list.append(word.strip().lower())
    lower_word_list[:2], len(lower_word_list)

    ## Get max_train_word_length
    # max_train_word_length = max([len(word) for word in lower_word_list])

    # ## Prepare subword list
    # subword_set = set()
    # for cur_word_length in tqdm(range(min_word_len,  max_train_word_length+1)):
    #     for word in lower_word_list:
    #         if(len(word) >= cur_word_length):
    #             for i in range(len(word) - cur_word_length + 1):
    #                 subword_set.add(word[i: (i+cur_word_length)])
    # subword_list = list(set(list(subword_set)))

    ## Prepare sentence list
    sentence_list = []
    for word in lower_word_list:
        sentence = " ".join(list(word))
        sentence_list.append(sentence)
    return sentence_list

In [8]:
sentence_list = preprocess_word_list(word_list.copy())
sentence_list[:2], len(sentence_list)

(['f o g l o g g e d', 's m o o d g i n g'], 227283)

# Data Preparation for MLM

## Get vectorize layer

In [9]:
vectorize_layer = TextVectorization(
    max_tokens=config.VOCAB_SIZE,
    standardize=None,
    output_mode="int",
    output_sequence_length=config.MAX_LEN
)
vectorize_layer.adapt(sentence_list)
vocab = vectorize_layer.get_vocabulary()
vocab = vocab[2 : config.VOCAB_SIZE - 1] + ["_"]
vectorize_layer.set_vocabulary(vocab)

2023-10-22 18:34:00.806328: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
len(vectorize_layer.get_vocabulary())

29

In [11]:
vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 'e',
 'i',
 'a',
 'n',
 'o',
 'r',
 's',
 't',
 'l',
 'c',
 'u',
 'd',
 'p',
 'm',
 'h',
 'g',
 'y',
 'b',
 'f',
 'v',
 'k',
 'w',
 'z',
 'x',
 'q',
 'j',
 '_']

In [12]:
# Get mask token id for masked language model
mask_token_id = vectorize_layer(["_"]).numpy()[0][0]
mask_token_id

28

## Encod Sentences

In [13]:
def encode(texts):
    encoded_texts = vectorize_layer(texts)
    return encoded_texts.numpy()

In [14]:
# Prepare data for masked language model
encoded_sentence_array = encode(sentence_list)

In [15]:
print(encoded_sentence_array.shape)
encoded_sentence_array

(227283, 45)


array([[20,  6, 17, ...,  0,  0,  0],
       [ 8, 15,  6, ...,  0,  0,  0],
       [13,  7,  4, ...,  0,  0,  0],
       ...,
       [ 7,  2,  8, ...,  0,  0,  0],
       [ 8,  2,  7, ...,  0,  0,  0],
       [17, 12, 10, ...,  0,  0,  0]])

In [16]:
for c in sentence_list[0].split(" "):
    c_id = vectorize_layer([c]).numpy()[0][0]
    print(f"{c} - {c_id}", end=" | ")

f - 20 | o - 6 | g - 17 | l - 10 | o - 6 | g - 17 | g - 17 | e - 2 | d - 13 | 

In [17]:
print(len(encoded_sentence_array[0]))
encoded_sentence_array[0]

45


array([20,  6, 17, 10,  6, 17, 17,  2, 13,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [18]:
inp_mask = np.random.rand(*encoded_sentence_array.shape) < 0.50
inp_mask

array([[ True,  True, False, ..., False,  True, False],
       [ True, False, False, ...,  True,  True,  True],
       [False,  True, False, ..., False, False, False],
       ...,
       [ True,  True,  True, ..., False,  True,  True],
       [ True, False, False, ..., False, False, False],
       [False,  True, False, ...,  True,  True, False]])

## Get masked inputs and labels

In [24]:
import random
random.uniform(0, 1)

0.5593131015077699

In [42]:
def get_masked_input_and_labels(encoded_texts):
    mask_probability = random.uniform(0, 1)
    inp_mask = np.random.rand(*encoded_texts.shape) < mask_probability
    # Do not mask special tokens
    inp_mask[encoded_texts <= 2] = False
    # Set targets to -1 by default, it means ignore
    labels = -1 * np.ones(encoded_texts.shape, dtype=int)
    # Set labels for masked tokens
    labels[inp_mask] = encoded_texts[inp_mask]

    # Prepare input
    encoded_texts_masked = np.copy(encoded_texts)
    # Set input to [mask] for inp_mask
    encoded_texts_masked[inp_mask] = mask_token_id  # mask token is the last in the dict

    # Prepare sample_weights to pass to .fit() method
    sample_weights = np.ones(labels.shape)
    sample_weights[labels == -1] = 0

    # y_labels would be same as encoded_texts i.e input tokens
    y_labels = np.copy(encoded_texts)

    return encoded_texts_masked, y_labels, sample_weights

In [43]:
for i in range(5):
    if(i == 0):
        x_masked_train, y_masked_labels, sample_weights = get_masked_input_and_labels(encoded_sentence_array)
    else:
        x_masked_train_, y_masked_labels_, sample_weights_ = get_masked_input_and_labels(encoded_sentence_array)
        x_masked_train = np.concatenate((x_masked_train, x_masked_train_), axis=0)
        y_masked_labels = np.concatenate((y_masked_labels, y_masked_labels_), axis=0)
        sample_weights = np.concatenate((sample_weights, sample_weights_), axis=0)

In [44]:
print(x_masked_train.shape, y_masked_labels.shape, sample_weights.shape)

(1136415, 45) (1136415, 45) (1136415, 45)


In [45]:
mlm_ds = tf.data.Dataset.from_tensor_slices((x_masked_train, y_masked_labels, sample_weights))
mlm_ds = mlm_ds.shuffle(1136415, reshuffle_each_iteration=True).batch(config.BATCH_SIZE)

In [46]:
len(mlm_ds)

114

# Create BERT model (Pretraining Model) for masked language modeling

In [47]:
def bert_module(query, key, value, i):
    # Multi headed self-attention
    attention_output = layers.MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
        name="encoder_{}/multiheadattention".format(i),
    )(query, key, value)
    attention_output = layers.Dropout(0.1, name="encoder_{}/att_dropout".format(i))(
        attention_output
    )
    attention_output = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}/att_layernormalization".format(i)
    )(query + attention_output)

    # Feed-forward layer
    ffn = keras.Sequential(
        [
            layers.Dense(config.FF_DIM, activation="relu"),
            layers.Dense(config.EMBED_DIM),
        ],
        name="encoder_{}/ffn".format(i),
    )
    ffn_output = ffn(attention_output)
    ffn_output = layers.Dropout(0.1, name="encoder_{}/ffn_dropout".format(i))(
        ffn_output
    )
    sequence_output = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}/ffn_layernormalization".format(i)
    )(attention_output + ffn_output)
    return sequence_output


def get_pos_encoding_matrix(max_len, d_emb):
    pos_enc = np.array(
        [
            [pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)]
            if pos != 0
            else np.zeros(d_emb)
            for pos in range(max_len)
        ]
    )
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i+1
    return pos_enc


loss_fn = keras.losses.SparseCategoricalCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE
)
loss_tracker = tf.keras.metrics.Mean(name="loss")


class MaskedLanguageModel(tf.keras.Model):
    def train_step(self, inputs):
        if len(inputs) == 3:
            features, labels, sample_weight = inputs
        else:
            features, labels = inputs
            sample_weight = None

        with tf.GradientTape() as tape:
            predictions = self(features, training=True)
            loss = loss_fn(labels, predictions, sample_weight=sample_weight)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)

        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Compute our own metrics
        loss_tracker.update_state(loss, sample_weight=sample_weight)

        # Return a dict mapping metric names to current value
        return {"loss": loss_tracker.result()}

    @property
    def metrics(self):
        # We list our `Metric` objects here so that `reset_states()` can be
        # called automatically at the start of each epoch
        # or at the start of `evaluate()`.
        # If you don't implement this property, you have to call
        # `reset_states()` yourself at the time of your choosing.
        return [loss_tracker]


def create_masked_language_bert_model():
    inputs = layers.Input((config.MAX_LEN,), dtype=tf.int64)

    word_embeddings = layers.Embedding(
        config.VOCAB_SIZE, config.EMBED_DIM, name="word_embedding"
    )(inputs)
    position_embeddings = layers.Embedding(
        input_dim=config.MAX_LEN,
        output_dim=config.EMBED_DIM,
        weights=[get_pos_encoding_matrix(config.MAX_LEN, config.EMBED_DIM)],
        name="position_embedding",
    )(tf.range(start=0, limit=config.MAX_LEN, delta=1))
    embeddings = word_embeddings + position_embeddings

    encoder_output = embeddings
    for i in range(config.NUM_LAYERS):
        encoder_output = bert_module(encoder_output, encoder_output, encoder_output, i)

    mlm_output = layers.Dense(config.VOCAB_SIZE, name="mlm_cls", activation="softmax")(
        encoder_output
    )
    mlm_model = MaskedLanguageModel(inputs, mlm_output, name="masked_bert_model")

    optimizer = keras.optimizers.Adam(learning_rate=config.LR)
    mlm_model.compile(optimizer=optimizer)
    return mlm_model


id2token = dict(enumerate(vectorize_layer.get_vocabulary()))
token2id = {y: x for x, y in id2token.items()}

In [48]:
class MaskedTextGenerator(keras.callbacks.Callback):
    def __init__(self, sample_tokens, top_k=2):
        self.sample_tokens = sample_tokens
        self.k = top_k

    def decode(self, tokens):
        return " ".join([id2token[t] for t in tokens if t != 0])

    def convert_ids_to_tokens(self, id):
        return id2token[id]

    def on_epoch_end(self, epoch, logs=None):
        prediction = self.model.predict(self.sample_tokens)

        masked_index = np.where(self.sample_tokens == mask_token_id)
        masked_index = masked_index[1]
        mask_prediction = prediction[0][masked_index]

        top_indices = mask_prediction[0].argsort()[-self.k :][::-1]
        values = mask_prediction[0][top_indices]

        for i in range(len(top_indices)):
            p = top_indices[i]
            v = values[i]
            tokens = np.copy(sample_tokens[0])
            tokens[masked_index[0]] = p
            result = {
                "input_text": self.decode(sample_tokens[0].numpy()),
                "prediction": self.decode(tokens),
                "probability": v,
                "predicted mask token": self.convert_ids_to_tokens(p),
            }
            pprint(result)


sample_tokens = vectorize_layer(["c _ t"])
generator_callback = MaskedTextGenerator(sample_tokens.numpy())

In [49]:
generator_callback

<__main__.MaskedTextGenerator at 0x7fa9ff427a50>

In [50]:
bert_masked_model = create_masked_language_bert_model()
bert_masked_model.summary()

Model: "masked_bert_model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 45)]         0                                            
__________________________________________________________________________________________________
word_embedding (Embedding)      (None, 45, 128)      3712        input_1[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.add (TFOpLambd (None, 45, 128)      0           word_embedding[0][0]             
__________________________________________________________________________________________________
encoder_0/multiheadattention (M (None, 45, 128)      66048       tf.__operators__.add[0][0]       
                                                                 tf.__operators__.

In [51]:
bert_masked_model.fit(
    mlm_ds, 
    epochs=10, 
    callbacks=[generator_callback]
)

Epoch 1/10


2023-10-22 19:01:08.859937: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


{'input_text': 'c _ t',
 'predicted mask token': 'o',
 'prediction': 'c o t',
 'probability': 0.23519625}
{'input_text': 'c _ t',
 'predicted mask token': 'a',
 'prediction': 'c a t',
 'probability': 0.18642396}
Epoch 2/10
{'input_text': 'c _ t',
 'predicted mask token': 'o',
 'prediction': 'c o t',
 'probability': 0.37396398}
{'input_text': 'c _ t',
 'predicted mask token': 'a',
 'prediction': 'c a t',
 'probability': 0.21297932}
Epoch 3/10
{'input_text': 'c _ t',
 'predicted mask token': 'o',
 'prediction': 'c o t',
 'probability': 0.32207373}
{'input_text': 'c _ t',
 'predicted mask token': 'a',
 'prediction': 'c a t',
 'probability': 0.3042035}
Epoch 4/10
{'input_text': 'c _ t',
 'predicted mask token': 'a',
 'prediction': 'c a t',
 'probability': 0.37888426}
{'input_text': 'c _ t',
 'predicted mask token': 'o',
 'prediction': 'c o t',
 'probability': 0.28567722}
Epoch 5/10
{'input_text': 'c _ t',
 'predicted mask token': 'a',
 'prediction': 'c a t',
 'probability': 0.34287637}
{'i

<tensorflow.python.keras.callbacks.History at 0x7fa956e28a10>

# predict function

In [52]:
guessed_letters = []

In [53]:
def predict_letter(model, id2token, token2id, word = "b _ y", special_tokens = ["", "[UNK]", "_"]):
    clean_word = word.strip().lower()
    
    encoded_word = [token2id[c] for c in clean_word.split(" ")]
    len_word = len(encoded_word)
    encoded_word = np.array(encoded_word)
    encoded_word = np.pad(encoded_word, (0, 45-len_word))
    encoded_word = encoded_word.reshape(1, 45)
    
    model_output = model.predict(encoded_word)[0]
    
    blank_index_list = [c_index for c_index, c in enumerate(clean_word.split(" ")) if(c == "_")]
    model_output = model_output[blank_index_list]
    
    model_output = np.max(model_output, axis = 0)
    model_output = np.argsort(model_output)[::-1]
    for id in model_output:
        token = id2token[id]
        if(token not in (guessed_letters + special_tokens)):
            guessed_letters.append(token)
            return token

In [57]:
word = "b _ y"
predict_letter(bert_masked_model, id2token, token2id, word)

'o'

In [58]:
guessed_letters

['r', 'l', 'a', 'o']

# Save required elements

In [59]:
bert_masked_model.save("bert_mlm.h5", save_format="tf")

In [60]:
pickle.dump(id2token, open("id2token.pickle", "wb"))

In [61]:
pickle.dump(token2id, open("token2id.pickle", "wb"))