In [1]:
import tensorflow as tf

In [15]:
class GPTDecoderBlock(tf.keras.layers.Layer):
  """One GPT-style decoder block.

  The block applies causally masked multi-head self-attention followed by a
  position-wise feed-forward network. Each sub-layer is wrapped in a residual
  connection and followed by its own layer normalization.

  Args:
    num_heads: Number of attention heads.
    embed_size: Size of the last dimension of the inputs and outputs. Each
      attention head uses `embed_size // num_heads` dimensions.
    **kwargs: Forwarded to `tf.keras.layers.Layer`.
  """

  def __init__(self, num_heads, embed_size, **kwargs):
    super().__init__(**kwargs)

    self.num_heads = num_heads
    self.embed_size = embed_size
    self.multi_head_attention = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_size // num_heads,
    )
    # Each sub-layer gets its own normalization so the two do not share
    # gamma/beta weights.
    self.attention_layer_norm = tf.keras.layers.LayerNormalization()
    self.feedforward_layer_norm = tf.keras.layers.LayerNormalization()
    self.dense1 = tf.keras.layers.Dense(units=4 * embed_size, activation="relu")
    self.dense2 = tf.keras.layers.Dense(units=embed_size)

  def call(self, inputs):
    """Runs the block on `inputs` and returns a tensor of the same shape.

    Args:
      inputs: Tensor whose last dimension is `embed_size`; assumed to be
        shaped (batch, sequence, embed_size).
    """
    # Masked multi-head self-attention sub-layer with residual connection.
    attention_output = self.multi_head_attention(
        query=inputs,
        value=inputs,
        use_causal_mask=True,
    )
    hidden = self.attention_layer_norm(inputs + attention_output)

    # Feed-forward sub-layer with residual connection.
    feedforward_output = self.dense2(self.dense1(hidden))
    return self.feedforward_layer_norm(hidden + feedforward_output)

  def get_config(self):
    """Returns the constructor arguments so the layer can be re-created."""
    config = super().get_config()
    config.update({
        "num_heads": self.num_heads,
        "embed_size": self.embed_size,
    })
    return config

In [16]:
# Smoke test: push an all-zero batch of shape (2, 10, 64) through a fresh
# block and display the result, which has the same shape as the input.
X = tf.constant(0.0, shape=(2, 10, 64))
block = GPTDecoderBlock(embed_size=64, num_heads=4)
block(X)

<tf.Tensor: shape=(2, 10, 64), dtype=float32, numpy=
array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float32)>

In [17]:
# Download the corpus (get_file returns the local path) and read it into
# memory as a single string.
url = "https://homl.info/shakespeare"
filepath = tf.keras.utils.get_file("shakespeare.txt", url)
# Pin the encoding so decoding does not depend on the platform's locale.
with open(filepath, encoding="utf-8") as f:
  text = f.read()

Downloading data from https://homl.info/shakespeare


In [18]:
# Character-level tokenizer: lowercase the text, split it into single
# characters, and learn the vocabulary from the corpus.
text_vec_layer = tf.keras.layers.TextVectorization(
    standardize="lower",
    split="character",
)
text_vec_layer.adapt([text])

In [20]:
# The layer is called on a batch of strings, so wrap the corpus in a
# one-element list and keep the single resulting row of token ids.
corpus_batch = [text]
encoded = text_vec_layer(corpus_batch)[0]
encoded

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [22]:
# Number of entries in the learned vocabulary, special tokens included.
text_vec_layer.vocabulary_size()

41

In [26]:
# TextVectorization reserves ids 0 (padding) and 1 (unknown); shift the ids
# down by 2 so the tokens actually used start at 0.
# The ids are recomputed from `text` rather than shifted in place, so
# re-running this cell cannot subtract the offset a second time.
encoded = text_vec_layer([text])[0] - 2
n_tokens = text_vec_layer.vocabulary_size() - 2
dataset_size = len(encoded)

In [27]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32,
               shuffle_buffer_size=100_000):
  """Turns a token sequence into batches of (input, target) windows.

  Every window holds `length + 1` consecutive tokens and windows start one
  token apart. The input is the first `length` tokens of a window and the
  target is the same window shifted one step ahead.

  Args:
    sequence: 1-D sequence of token ids.
    length: Number of tokens in each input (and target) window.
    shuffle: Whether to shuffle the windows before batching.
    seed: Random seed passed to the shuffle step.
    batch_size: Number of windows per batch.
    shuffle_buffer_size: Size of the shuffle buffer; only used when
      `shuffle` is true.

  Returns:
    A `tf.data.Dataset` yielding `(inputs, targets)` pairs, each with
    `length` tokens per row.
  """
  ds = tf.data.Dataset.from_tensor_slices(sequence)
  # window() yields nested datasets; batching each one by its full size
  # flattens it into a single tensor of length + 1 tokens.
  ds = ds.window(length + 1, shift=1, drop_remainder=True)
  ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))
  if shuffle:
    ds = ds.shuffle(shuffle_buffer_size, seed=seed)
  ds = ds.batch(batch_size)
  return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [39]:
SEQ_LENGTH = 100
tf.random.set_seed(42)

# Contiguous split of the token stream: the first 1,000,000 tokens train the
# model, the next 60,000 validate it, and the remainder is the test set.
# Only the training windows are shuffled.
train_set = to_dataset(
    encoded[:1_000_000],
    length=SEQ_LENGTH,
    shuffle=True,
    seed=42,
)
valid_set = to_dataset(encoded[1_000_000:1_060_000], length=SEQ_LENGTH)
test_set = to_dataset(encoded[1_060_000:], length=SEQ_LENGTH)

In [36]:
class GPTModel(tf.keras.Model):
    """Decoder-only language model built from a stack of GPTDecoderBlock layers.

    Token embeddings are summed with learned positional embeddings, passed
    through `num_blocks` decoder blocks, and projected to a softmax
    distribution over the `n_tokens` vocabulary entries at every position.

    Args:
        n_tokens: Vocabulary size (number of distinct token ids).
        embed_size: Dimension of the token and positional embeddings.
        num_blocks: Number of stacked decoder blocks.
        num_heads: Number of attention heads in each decoder block.
        max_seq_length: Number of positions in the positional embedding
            table; inputs must not be longer than this.
        **kwargs: Forwarded to `tf.keras.Model`.
    """

    def __init__(self, n_tokens, embed_size, num_blocks, num_heads, max_seq_length, **kwargs):

        super().__init__(**kwargs)

        self.num_heads = num_heads
        self.max_seq_length = max_seq_length

        # Layers
        self.embed_layer = tf.keras.layers.Embedding(
            input_dim=n_tokens,
            output_dim=embed_size,
            name='embedding')
        self.pos_embed_layer = tf.keras.layers.Embedding(
            input_dim=max_seq_length,
            output_dim=embed_size,
            name='positional_embedding')
        self.decoder_blocks = [GPTDecoderBlock(
            num_heads=num_heads,
            embed_size=embed_size,
            name='GPTBlock' + str(i)) for i in range(num_blocks)]
        self.dense_layer = tf.keras.layers.Dense(
            units=n_tokens,
            activation='softmax',
            name='output')

    def call(self, inputs):
        """Returns next-token probabilities for every position of `inputs`.

        Args:
            inputs: Integer token ids; assumed to be shaped
                (batch, sequence) with sequence <= max_seq_length.
        """
        # Look up positions for the sequence length actually received, so
        # inputs shorter than max_seq_length still line up with the token
        # embeddings instead of failing at the addition below.
        seq_length = tf.shape(inputs)[1]

        embeddings = self.embed_layer(inputs)
        pos_embeddings = self.pos_embed_layer(tf.range(seq_length))

        # (batch, seq, embed) + (seq, embed): broadcast over the batch axis.
        embeddings = embeddings + pos_embeddings

        for decoder_block in self.decoder_blocks:
            embeddings = decoder_block(embeddings)

        return self.dense_layer(embeddings)

In [33]:
# Model hyperparameters.
NUM_BLOCKS = 2   # number of stacked decoder blocks
NUM_HEADS = 4    # attention heads per block
EMBED_SIZE = 32  # embedding width

In [37]:
# Instantiate the language model from the hyperparameters defined above.
model = GPTModel(
    n_tokens=n_tokens,
    embed_size=EMBED_SIZE,
    num_heads=NUM_HEADS,
    num_blocks=NUM_BLOCKS,
    max_seq_length=SEQ_LENGTH,
)

In [41]:
# Print the per-layer parameter table.
# NOTE(review): rows reported as "0 (unused)" presumably belong to layers that
# have not created their weights yet -- confirm by calling the model on a
# batch before summarizing.
model.summary()

Model: "gpt_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  1248      
                                                                 
 positional_embedding (Embe  multiple                  0 (unused)
 dding)                                                          
                                                                 
 GPTBlock0 (GPTDecoderBlock  multiple                  0 (unused)
 )                                                               
                                                                 
 GPTBlock1 (GPTDecoderBlock  multiple                  0 (unused)
 )                                                               
                                                                 
 output (Dense)              multiple                  0 (unused)
                                                       

In [42]:
# Targets are integer token ids and the model outputs softmax probabilities,
# hence the sparse categorical cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='nadam',
    metrics=["accuracy"],
)

In [None]:
# Stop once the validation loss has failed to improve for 2 consecutive
# epochs, and roll the model back to its best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=2,
    monitor='val_loss',
)
model.fit(
    train_set,
    validation_data=valid_set,
    epochs=20,
    callbacks=[early_stopping],
)