In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [32]:
# Fetch the data, same as in book
shakespeare_url = "https://homl.info/shakespeare"  # shortcut URL
filepath = tf.keras.utils.get_file("shakespeare.txt", shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read() # shakespeare_text is now a string

Downloading data from https://homl.info/shakespeare


In [33]:
# Split on characters, keep punctuation as well as upper- and lowercase letters
text_vec_layer = tf.keras.layers.TextVectorization(
  split="character", standardize=None) # also keep upper case etc.
# shakespeare_text is a string and adapt expects a dataset or list
text_vec_layer.adapt([shakespeare_text])
encoded = text_vec_layer([shakespeare_text])[0]

In [34]:
print(text_vec_layer.get_vocabulary())
print(len(text_vec_layer.get_vocabulary()))

['', '[UNK]', ' ', 'e', 't', 'o', 'a', 'h', 's', 'r', 'n', 'i', '\n', 'l', 'd', 'u', 'm', 'y', ',', 'w', 'f', 'c', 'g', 'I', 'b', 'p', ':', '.', 'A', 'v', 'k', 'T', "'", 'E', 'O', 'N', 'R', 'S', 'L', 'C', ';', 'W', 'U', 'H', 'M', 'B', '?', 'G', '!', 'D', '-', 'F', 'Y', 'P', 'K', 'V', 'j', 'q', 'x', 'z', 'J', 'Q', 'Z', 'X', '3', '&', '$']
67


In [35]:
encoded -= 2  # drop tokens 0 (pad) and 1 (unknown), which we will not use
              # use broadcasting to subtract 2 from all values
n_tokens = text_vec_layer.vocabulary_size() - 2  # number of distinct chars = 65
dataset_size = len(encoded)  # total number of chars = 1,115,394

In [27]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    ds = tf.data.Dataset.from_tensor_slices(sequence)
    ds = ds.window(length + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
    if shuffle:
        ds = ds.shuffle(100_000, seed=seed)
    ds = ds.batch(batch_size)
    return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [36]:
SEQ_LENGTH = 100

In [37]:
# Create training, validation and test data set. Same as in book.
tf.random.set_seed(42)
train_set = to_dataset(encoded[:1_000_000], length=SEQ_LENGTH, shuffle=True,
                       seed=42)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=SEQ_LENGTH)
test_set = to_dataset(encoded[1_060_000:], length=SEQ_LENGTH)

In [38]:
for x, y in train_set.take(1):
  print(x.shape, y.shape)

(32, 100) (32, 100)


In [39]:
class BertEncoderBlock(tf.keras.layers.Layer):
  def __init__(self, num_heads, embed_size, **kwargs):
    super().__init__(**kwargs)

    self.num_heads = num_heads
    self.embed_size = embed_size
    self.multi_head_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_size//num_heads)
    self.layer_norm = tf.keras.layers.LayerNormalization()
    self.dense1 = tf.keras.layers.Dense(units=4*embed_size, activation="relu")
    self.dense2 = tf.keras.layers.Dense(units=embed_size)

  def call(self, inputs):
    # Multi-Head (Self)-Attention Block
    skip = inputs
    inputs = self.multi_head_attention(
        query=inputs,
        value=inputs,
        use_causal_mask=False
    )
    inputs = self.layer_norm(
        tf.keras.layers.Add()([inputs, skip])
    )

    # Feedforward block
    skip = inputs
    inputs = self.dense1(inputs)
    inputs = self.dense2(inputs)
    inputs = self.layer_norm(
        tf.keras.layers.Add()([inputs, skip])
    )

    return inputs

In [40]:
class BERTModel(tf.keras.Model):

  def __init__(self, n_tokens, embed_size, num_blocks, num_heads, max_seq_length, **kwargs):

        super().__init__(**kwargs)

        self.num_heads = num_heads
        self.max_seq_length = max_seq_length

        # Layers
        self.embed_layer = tf.keras.layers.Embedding(
            input_dim=n_tokens,
            output_dim=embed_size,
            name='embedding')
        self.pos_embed_layer = tf.keras.layers.Embedding(
            input_dim=max_seq_length,
            output_dim=embed_size,
            name='positional_embedding')
        self.encoder_blocks = [BertEncoderBlock(
            num_heads=num_heads,
            embed_size=embed_size,
            name='BertBlock' + str(i)) for i in range(num_blocks)]
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)


  def call(self, inputs):

        embeddings = self.embed_layer(inputs)
        pos_embeddings = self.pos_embed_layer(tf.range(self.max_seq_length))


        embeddings = embeddings + pos_embeddings # Rely on broadcasting

        for encoder_block in self.encoder_blocks:
            embeddings = encoder_block(embeddings)

        output = self.layer_norm(embeddings)

        return output

In [41]:
n_tokens = 65
embed_size=32
num_blocks=2
num_heads=4

model = BERTModel(n_tokens=n_tokens, embed_size=embed_size, num_blocks=num_blocks, num_heads=num_heads, max_seq_length=SEQ_LENGTH)

In [42]:
for x, y in train_set.take(1):
  print(x.shape)
  print(model(x).shape)

(32, 100)
(32, 100, 32)


In [43]:
model.summary()

Model: "bert_model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  2080      
                                                                 
 positional_embedding (Embe  multiple                  3200      
 dding)                                                          
                                                                 
 BertBlock0 (BertEncoderBlo  multiple                  12640     
 ck)                                                             
                                                                 
 BertBlock1 (BertEncoderBlo  multiple                  12640     
 ck)                                                             
                                                                 
 layer_normalization_23 (La  multiple                  64        
 yerNormalization)                                    

In [44]:
model.compile(
    optimizer='nadam',
    loss='sparse_categorical_crossentropy',
    metrics=["accuracy"]
)

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)
model.fit(train_set, validation_data=valid_set, epochs=20, callbacks=[early_stopping])