In [1]:
import tensorflow as tf
from tensorflow import keras
from keras import layers, models
import pandas as pd
import numpy as np
import os
import re
from pprint import pprint

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
# set up configuration
from dataclasses import dataclass
@dataclass
class Config:
    """Hyper-parameters shared by the preprocessing and model cells.

    Every attribute carries a type annotation so that ``@dataclass`` treats it
    as a real field. Without annotations the decorator generates an empty
    ``__init__`` and the values cannot be overridden per instance
    (``Config(MAX_LEN=64)`` would raise ``TypeError``).
    """

    MAX_LEN: int = 256  # output_sequence_length of the vectorizer / model input width
    BATCH_SIZE: int = 32
    LR: float = 0.001  # learning rate handed to the Adam optimizer
    VOCAB_SIZE: int = 30000
    EMBED_DIM: int = 128
    NUM_HEAD: int = 8  # used in bert model
    FF_DIM: int = 128  # used in bert model
    NUM_LAYERS: int = 1


config = Config()

## Load the data

In [3]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

curl: /opt/conda/lib/libcurl.so.4: no version information available (required by curl)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  61.2M      0  0:00:01  0:00:01 --:--:-- 61.1M


In [4]:
import glob

def get_text_list(files):
    """Read every line of every file in ``files`` into one flat list.

    Args:
        files: Iterable of paths to text files.

    Returns:
        A list of strings, one entry per line read (line endings are kept),
        in the order the files were given. Empty input yields an empty list.
    """
    text_list = []
    for file in files:
        # Decode explicitly as UTF-8 instead of the platform default so the
        # result does not depend on the locale of the machine running the cell.
        with open(file, encoding="utf-8") as f:
            text_list.extend(f)
    return text_list

In [5]:
def get_data_from_text(folder_name, base_dir='/kaggle/working/aclImdb/'):
    """Build a DataFrame of reviews and labels from one dataset split.

    Args:
        folder_name: Sub-folder of ``base_dir`` to read, e.g. 'train' or 'test'.
        base_dir: Root folder of the extracted archive. Defaults to the
            location this notebook originally hard-coded.

    Returns:
        A DataFrame with a 'review' column (text) and a 'sentiment' column.
        Rows read from ``pos/`` come first and are labelled 0, rows read
        from ``neg/`` follow and are labelled 1.
    """
    # glob makes no ordering promise; sorting keeps the row order reproducible
    # across runs and machines.
    pos_files = sorted(glob.glob(os.path.join(base_dir, folder_name, 'pos', '*.txt')))
    neg_files = sorted(glob.glob(os.path.join(base_dir, folder_name, 'neg', '*.txt')))

    print(len(pos_files))
    pos_texts = get_text_list(pos_files)
    neg_texts = get_text_list(neg_files)

    # NOTE(review): positive reviews get label 0 and negative ones label 1 --
    # confirm this is the intended encoding before training a classifier.
    df = pd.DataFrame({'review': pos_texts + neg_texts,
                       'sentiment': [0] * len(pos_texts) + [1] * len(neg_texts)})

    return df

In [6]:
train_df = get_data_from_text('train')
test_df = get_data_from_text('test')

12500
12500


In [7]:
train_df.tail()

Unnamed: 0,review,sentiment
24995,This is a fair little show about the paranorma...,1
24996,"Since I'd seen the other three, I figured I mi...",1
24997,When robot hordes start attacking major cities...,1
24998,Too bad neither the animals or Eddie Murphy ha...,1
24999,I can't believe that so much talent can be was...,1


## Preprocessing

In [8]:
def custom_standardization(input_data):
    """Lower-case the text, turn "<br />" into a space and drop punctuation.

    Args:
        input_data: String tensor (or anything ``tf.strings.lower`` accepts).

    Returns:
        The tensor produced by the final ``tf.strings.regex_replace`` call.
    """
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    # The backslash is spelled "\\" so the literal no longer contains the
    # invalid escape sequence "\^"; the resulting string value is unchanged.
    # Square brackets are not in the set, so a token like "[mask]" survives.
    return tf.strings.regex_replace(
        stripped_html, "[%s]" % re.escape("!#$%&'()*+,-./:;<=>?@\\^_`{|}~"), ""
    )

In [9]:
def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens = ["[MASK]"]):
    """Build a TextVectorization layer whose vocabulary ends with the special tokens.

    Args:
        texts: Corpus handed to ``adapt`` to learn the vocabulary.
        vocab_size: ``max_tokens`` of the layer; also the size budget used
            when the special tokens are appended.
        max_seq: ``output_sequence_length`` of the layer.
        special_tokens: Tokens appended at the end of the vocabulary. They are
            lower-cased before insertion because ``custom_standardization``
            lower-cases all input text. The default list is never mutated.

    Returns:
        The adapted layer with its vocabulary replaced.
    """
    vectorize_layer = layers.TextVectorization(max_tokens = vocab_size,
                                              output_mode='int',
                                              standardize=custom_standardization,
                                              output_sequence_length = max_seq)
    vectorize_layer.adapt(texts)

    # Honour the caller's special tokens instead of a hard-coded '[mask]'.
    specials = [token.lower() for token in special_tokens]

    # The first two vocabulary entries are skipped, as before: they are the
    # layer's reserved padding / OOV entries (see the TextVectorization docs),
    # which set_vocabulary() re-creates on its own.
    vocab = vectorize_layer.get_vocabulary()
    # Remove corpus tokens that collide with a special token so the final
    # vocabulary never contains the same term twice.
    learned = [token for token in vocab[2:] if token not in specials]
    vocab = learned[: vocab_size - 2 - len(specials)] + specials

    vectorize_layer.set_vocabulary(vocab)
    return vectorize_layer

In [10]:
# get all the data for the vocabulary

# ignore_index gives the combined frame a fresh 0..N-1 index instead of
# repeating each split's own row labels.
all_data = pd.concat([train_df, test_df], ignore_index=True)

In [11]:
vectorize_layer = get_vectorize_layer(all_data['review'].values.tolist(), 
                   config.VOCAB_SIZE, 
                   config.MAX_LEN)

In [12]:
mask_token_id = vectorize_layer(['[mask]']).numpy()[0][0]
mask_token_id

29999

In [13]:
def encode(texts):
    """Run ``texts`` through the module-level ``vectorize_layer`` and return a NumPy array."""
    return vectorize_layer(texts).numpy()

In [14]:
# Token-id arrays for both splits, produced by the shared encode() helper.
X_train, X_test = (encode(split['review'].values) for split in (train_df, test_df))

# Matching sentiment labels, taken straight from the frames.
y_train, y_test = (split['sentiment'].values for split in (train_df, test_df))

In [15]:
# Shuffled, batched tf.data pipelines for the two labelled splits.
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(1000).batch(config.BATCH_SIZE)
# Bound to test_ds (not test_df) so the test DataFrame is not overwritten and
# the earlier cells that read test_df can still be re-run.
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).shuffle(1000).batch(config.BATCH_SIZE)

In [16]:
# for masked language modelling

def get_masked_inputs_and_labels(encoded_texts, mask_id=None, mask_rate=0.15, seed=None):
    """Create masked-language-model inputs, targets and per-token weights.

    A fraction ``mask_rate`` of the positions whose token id is greater than 2
    is selected. Of the selected positions, 90% are overwritten with
    ``mask_id``; one ninth of those (10% of the selection) are then
    overwritten again with a random id drawn from ``[3, mask_id)``. The
    remaining 10% of the selection keep their original token.

    Args:
        encoded_texts: Integer NumPy array of token ids.
        mask_id: Id written at masked positions. ``None`` (default) falls back
            to the module-level ``mask_token_id``, as the function did before
            this parameter existed.
        mask_rate: Probability that an eligible position is selected.
        seed: Optional seed. When given, a private ``RandomState`` is used so
            the result is reproducible; when ``None`` the global NumPy random
            state is used with the same sequence of draws as before.

    Returns:
        ``(encoded_texts_masked, y_labels, sample_weights)``: the corrupted
        copy of the input, an untouched copy of the input, and a float array
        holding 1.0 at selected positions and 0.0 everywhere else.
    """
    if mask_id is None:
        mask_id = mask_token_id  # id of '[mask]' computed in an earlier cell
    # np.random (module) and RandomState expose the same rand / randint calls.
    rng = np.random if seed is None else np.random.RandomState(seed)

    encoded_texts = np.asarray(encoded_texts)
    shape = encoded_texts.shape

    # Select candidate positions; ids 0-2 are never selected.
    # NOTE(review): ids 0 and 1 are presumably the padding / OOV entries, which
    # would make id 2 an ordinary word that is excluded too -- confirm.
    inp_mask = rng.rand(*shape) < mask_rate
    inp_mask[encoded_texts <= 2] = False

    # Prepare input: 90% of the selection becomes the mask token ...
    encoded_texts_masked = np.copy(encoded_texts)
    inp_mask_2mask = inp_mask & (rng.rand(*shape) < 0.90)
    encoded_texts_masked[inp_mask_2mask] = mask_id

    # ... and 1/9 of those positions are replaced by a random token instead.
    inp_mask_2random = inp_mask_2mask & (rng.rand(*shape) < 1 / 9)
    encoded_texts_masked[inp_mask_2random] = rng.randint(
        3, mask_id, inp_mask_2random.sum()
    )

    # Weights for .fit(): only selected positions contribute to the loss.
    # Selected positions always hold ids > 2, so this equals the previous
    # "labels == -1" bookkeeping without the intermediate labels array.
    sample_weights = inp_mask.astype(np.float64)

    # y_labels would be same as encoded_texts i.e input tokens
    y_labels = np.copy(encoded_texts)

    return encoded_texts_masked, y_labels, sample_weights

In [17]:
X_masked_train, y_masked_labels, sampled_weights = get_masked_inputs_and_labels(encode(all_data['review'].values))

In [18]:
# Bundle masked inputs, targets and weights, then shuffle and batch in one chain.
mlm_ds = (
    tf.data.Dataset
    .from_tensor_slices((X_masked_train, y_masked_labels, sampled_weights))
    .shuffle(1000)
    .batch(config.BATCH_SIZE)
)

## ML Model

### Masked Language Modelling Pretraining objective

In [19]:
def bert_module(query, key, value, i):
    """One encoder block: multi-head attention followed by a feed-forward net.

    Args:
        query, key, value: Tensors passed to ``MultiHeadAttention`` in that order.
        i: Index of the block. Accepted for the caller's benefit; the body
            does not use it.

    Returns:
        The output of the second layer normalisation.
    """
    # Attention sub-layer, with dropout and a residual connection to `query`.
    mha = layers.MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
    )
    attended = layers.Dropout(0.1)(mha(query, key, value))
    attended = layers.LayerNormalization(epsilon=1e-6)(query + attended)

    # Feed-forward sub-layer, with dropout and a residual connection.
    feed_forward = models.Sequential([
        layers.Dense(config.FF_DIM, activation='relu'),
        layers.Dense(config.EMBED_DIM),
    ])
    projected = layers.Dropout(0.1)(feed_forward(attended))

    return layers.LayerNormalization(epsilon=1e-6)(attended + projected)

In [20]:
def positional_encoding(max_len, d_emb):
    """Return the ``(max_len, d_emb)`` sinusoidal position table.

    For position ``pos >= 1`` and column ``j`` the angle is
    ``pos / 10000 ** (2 * (j // 2) / d_emb)``; even columns store its sine and
    odd columns its cosine. Row 0 is left as all zeros (the cosine is not
    applied to it), exactly as in the loop-based version this replaces.

    Args:
        max_len: Number of positions (rows). ``0`` yields an empty table.
        d_emb: Embedding width (columns).

    Returns:
        A float64 NumPy array of shape ``(max_len, d_emb)``.
    """
    # Build the whole angle matrix with broadcasting instead of nested loops.
    positions = np.arange(max_len, dtype=np.float64)[:, np.newaxis]
    exponents = 2 * (np.arange(d_emb) // 2) / d_emb
    pos_enc = positions / np.power(10000.0, exponents)

    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i+1

    return pos_enc

In [21]:
# The loss is created with Reduction.NONE, i.e. left unreduced;
# MaskedLanguageModel.train_step passes it, together with the sample weights,
# to the running-mean tracker defined on the next line.
loss_fn = keras.losses.SparseCategoricalCrossentropy(reduction = tf.keras.losses.Reduction.NONE)
# Module-level metric shared by train_step and the model's `metrics` property.
loss_tracker = tf.keras.metrics.Mean(name='loss')

In [22]:
class MaskedLanguageModel(tf.keras.Model):
    """Keras model with a hand-written training step for the masked-token objective."""

    def train_step(self, inputs):
        """Run one optimisation step on a batch.

        ``inputs`` is either ``(features, labels, sample_weight)`` or
        ``(features, labels)``; in the second case no sample weight is used.
        Returns a dict with the current value of the module-level loss tracker.
        """
        if len(inputs) != 3:
            features, labels = inputs
            sample_weight = None
        else:
            features, labels, sample_weight = inputs

        # Forward pass recorded on the tape so gradients can be taken below.
        with tf.GradientTape() as tape:
            outputs = self(features, training=True)
            batch_loss = loss_fn(labels, outputs, sample_weight=sample_weight)

        weights = self.trainable_variables
        grads = tape.gradient(batch_loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))

        loss_tracker.update_state(batch_loss, sample_weight=sample_weight)
        return {'loss': loss_tracker.result()}

    @property
    def metrics(self):
        """Metrics reported by this model: only the module-level loss tracker."""
        return [loss_tracker]
    

In [23]:
def create_masked_language_bert_model():
    """Assemble and compile the masked-language model described by ``config``.

    The network sums a token embedding and a position embedding (initialised
    from ``positional_encoding``), applies ``config.NUM_LAYERS`` calls of
    ``bert_module`` and ends with a softmax over ``config.VOCAB_SIZE`` ids.

    Returns:
        A ``MaskedLanguageModel`` compiled with an Adam optimizer at
        ``config.LR``.
    """
    token_ids = layers.Input((config.MAX_LEN,), dtype=tf.int64)

    # Token embeddings.
    token_embedding = layers.Embedding(config.VOCAB_SIZE, config.EMBED_DIM)
    word_embeddings = token_embedding(token_ids)

    # Position embeddings, looked up for positions 0 .. MAX_LEN - 1.
    position_table = positional_encoding(config.MAX_LEN, config.EMBED_DIM)
    position_embedding = layers.Embedding(
        input_dim=config.MAX_LEN,
        output_dim=config.EMBED_DIM,
        weights=[position_table],
    )
    position_ids = tf.range(start=0, limit=config.MAX_LEN, delta=1)
    hidden = word_embeddings + position_embedding(position_ids)

    # Stack of encoder blocks; each block attends over its own input.
    for layer_idx in range(config.NUM_LAYERS):
        hidden = bert_module(hidden, hidden, hidden, layer_idx)

    token_probs = layers.Dense(config.VOCAB_SIZE, activation='softmax')(hidden)
    mlm_model = MaskedLanguageModel(token_ids, token_probs)
    mlm_model.compile(optimizer=keras.optimizers.Adam(learning_rate=config.LR))

    return mlm_model

In [24]:
model = create_masked_language_bert_model()
model.summary()

Model: "masked_language_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 256, 128)     3840000     ['input_1[0][0]']                
                                                                                                  
 tf.__operators__.add (TFOpLamb  (None, 256, 128)    0           ['embedding[0][0]']              
 da)                                                                                              
                                                                                                  
 multi_head_attention (MultiHea  (None, 256, 128)    66048       ['tf.__operat

In [25]:
# Lookup tables between integer ids and vocabulary strings.
id2token = {idx: token for idx, token in enumerate(vectorize_layer.get_vocabulary())}
token2id = {token: idx for idx, token in id2token.items()}

In [26]:
class MaskedTextGenerator(keras.callbacks.Callback):
    """Print the model's top-k guesses for a masked token after every epoch.

    Only the first row of ``sample_tokens`` and the first mask position found
    are reported.

    Args:
        sample_tokens: 2-D array-like of token ids (rows are samples) that
            contains ``mask_token_id`` at the position to be predicted.
        top_k: Number of candidate tokens to print.
    """

    def __init__(self, sample_tokens, top_k=5):
        super().__init__()
        # Normalise to a NumPy array so the callback relies only on its own
        # state, whatever array-like the caller passed in.
        self.sample_tokens = np.asarray(sample_tokens)
        self.k = top_k

    def decode(self, tokens):
        """Join the vocabulary strings of ``tokens``, skipping id 0."""
        return " ".join([id2token[t] for t in tokens if t != 0])

    def convert_ids_to_tokens(self, id):
        """Return the vocabulary string for a single token id."""
        return id2token[id]

    def on_epoch_end(self, epoch, logs=None):
        """Predict the masked position of the stored sample and pretty-print the top-k."""
        prediction = self.model.predict(self.sample_tokens)

        # Column indices of every mask token in the sample batch.
        masked_index = np.where(self.sample_tokens == mask_token_id)[1]
        if masked_index.size == 0:
            # No mask token in the sample: nothing to report.
            return
        mask_prediction = prediction[0][masked_index]

        top_indices = mask_prediction[0].argsort()[-self.k :][::-1]
        values = mask_prediction[0][top_indices]

        # Use the callback's own sample (not a notebook global) throughout.
        input_text = self.decode(self.sample_tokens[0])
        for p, v in zip(top_indices, values):
            tokens = np.copy(self.sample_tokens[0])
            tokens[masked_index[0]] = p
            result = {
                "input_text": input_text,
                "prediction": self.decode(tokens),
                "probability": v,
                "predicted mask token": self.convert_ids_to_tokens(p),
            }
            pprint(result)

In [27]:
sample_tokens = vectorize_layer(["I have watched this [mask] and it was awesome"])
generator_callback = MaskedTextGenerator(sample_tokens.numpy())

In [None]:
model.fit(mlm_ds, epochs=5, callbacks=[generator_callback])

Epoch 1/5
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'this',
 'prediction': 'i have watched this this and it was awesome',
 'probability': 0.0996622}
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'i',
 'prediction': 'i have watched this i and it was awesome',
 'probability': 0.047355846}
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'movie',
 'prediction': 'i have watched this movie and it was awesome',
 'probability': 0.040854603}
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'a',
 'prediction': 'i have watched this a and it was awesome',
 'probability': 0.035512988}
{'input_text': 'i have watched this [mask] and it was awesome',
 'predicted mask token': 'is',
 'prediction': 'i have watched this is and it was awesome',
 'probability': 0.024780596}
Epoch 2/5
{'input_text': 'i have watched this [mask] and it was aw

In [None]:
model.save("bert_mlm_imdb.h5")

## References

1. [End-to-End masked language modelling](https://keras.io/examples/nlp/masked_language_modeling/)