1. **Import the necessary libraries and modules:**
   - TensorFlow and Keras
   - NumPy
   - IMDB dataset and sequence padding utility
   - Layers, Model, the Adam optimizer, and the LearningRateScheduler callback from Keras

2. **Load the IMDB dataset:**
   - Set the maximum number of words to keep (`max_features`)
   - Set the maximum length of sequences (`maxlen`)
   - Load and preprocess the data (padding sequences)

3. **Implement the PositionalEmbedding class:**
   - Initialize with the maximum sequence length and embedding dimension
   - Calculate the positional embeddings using sine and cosine functions
   - Add the positional embeddings to the input tokens in the `call` method

4. **Implement the TokenAndPositionEmbedding class:**
   - Initialize with the maximum sequence length, vocabulary size, and embedding dimension
   - Use an Embedding layer for token embeddings and the custom PositionalEmbedding layer for position embeddings
   - Add the token and position embeddings in the `call` method

5. **Implement the MultiHeadAttention class:**
   - Initialize with the model dimension and the number of heads
   - Create weight matrices for query, key, and value projections
   - Implement the `split_heads` method to reshape the input tensors
   - Implement the `call` method to compute the scaled dot-product attention and output

6. **Implement the TransformerEncoderLayer class:**
   - Initialize with the model dimension, number of heads, feed-forward hidden layer dimension, and dropout rate
   - Use the custom MultiHeadAttention layer and a feed-forward neural network for the self-attention and position-wise feed-forward operations
   - Implement layer normalization and dropout layers
   - Implement the `call` method to compute the output of the encoder layer

7. **Build the Transformer model:**
   - Set the embedding dimension, number of heads, feed-forward hidden layer dimension, and number of layers
   - Create the input layer
   - Use the custom TokenAndPositionEmbedding layer for input embeddings
   - Add a padding mask for the input sequences
   - Stack the TransformerEncoderLayer layers
   - Add the output layers (GlobalAveragePooling1D, Dense, and Dropout)

8. **Compile and train the model:**
   - Define a learning rate scheduler
   - Create a callback for the scheduler
   - Instantiate an optimizer with the initial learning rate
   - Compile the model with the optimizer, binary_crossentropy loss, and accuracy metric
   - Train the model using the training data, batch size, epochs, validation split, and callback


In [1]:
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler

# 2. **Load the IMDB dataset:**


In [2]:
# Dataset limits shared by the loader, the padding step and the model input.
max_features = 20000  # Vocabulary cap handed to `imdb.load_data(num_words=...)`
maxlen = 200  # Every review is padded/truncated to this many tokens

# Fetch the train/test splits of the IMDB review dataset.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Bring both feature sets to a common length of `maxlen` in one pass.
x_train, x_test = (
    pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)


# 3. **Implement the PositionalEmbedding class:**

- Initialize with the maximum sequence length and embedding dimension
- Calculate the positional embeddings using sine and cosine functions
- Add the positional embeddings to the input tokens in the `call` method

In [3]:
class PositionalEmbedding(layers.Layer):
    """Add fixed sine/cosine position encodings to a sequence of embeddings.

    The encoding table is computed once in the constructor and stored as a
    constant float32 tensor; it is not a trainable weight.
    """

    def __init__(self, max_len, embed_dim):
        super().__init__()
        # Table of shape (1, max_len, embed_dim), sliced per call.
        self.positional_embedding = self.build_positional_embedding(max_len, embed_dim)

    def build_positional_embedding(self, max_len, embed_dim):
        """Return the (1, max_len, embed_dim) float32 encoding table.

        Even columns hold the sine of the position angle and odd columns
        hold the cosine; columns 2i and 2i+1 share the same frequency.
        """
        positions = np.arange(max_len)[:, np.newaxis]
        # Columns 2i and 2i+1 map to the same exponent 2i / embed_dim.
        pair_index = np.arange(embed_dim) // 2
        exponents = (2 * pair_index) / np.float32(embed_dim)
        inverse_freq = 1 / np.power(10000, exponents)
        angles = positions * inverse_freq[np.newaxis, :]

        table = np.empty_like(angles)
        table[:, 0::2] = np.sin(angles[:, 0::2])
        table[:, 1::2] = np.cos(angles[:, 1::2])

        # Leading axis of size 1 lets the table broadcast over the batch.
        return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

    def call(self, inputs):
        """Return `inputs` plus the encodings for its first `seq_len` positions."""
        # The sequence axis is the second-to-last axis of `inputs`.
        num_positions = tf.shape(inputs)[-2]
        encodings = self.positional_embedding[:, :num_positions, :]
        return inputs + encodings



# 4. **Implement the TokenAndPositionEmbedding class:**
   - Initialize with the maximum sequence length, vocabulary size, and embedding dimension
   - Use an Embedding layer for token embeddings and the custom PositionalEmbedding layer for position embeddings
   - Add the token and position embeddings in the `call` method




In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Embed token ids and add sinusoidal position encodings to the result.

    Args:
        maxlen: Number of positions covered by the position-encoding table.
        vocab_size: Size of the token vocabulary (input dimension of the
            embedding lookup).
        embed_dim: Size of each embedding vector.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        # Learned lookup table: token id -> embed_dim vector.
        self.token_emb = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
        )
        # Fixed encodings added on top of the token vectors.
        self.pos_emb = PositionalEmbedding(max_len=maxlen, embed_dim=embed_dim)

    def call(self, x):
        """Return position-encoded embeddings for the token ids in `x`."""
        token_vectors = self.token_emb(x)
        return self.pos_emb(token_vectors)


# 5. **Implement the MultiHeadAttention class:**
   - Initialize with the model dimension and the number of heads
   - Create weight matrices for query, key, and value projections
   - Implement the `split_heads` method to reshape the input tensors
   - Implement the `call` method to compute the scaled dot-product attention and output



In [4]:
# Multi-head scaled dot-product attention as a custom Keras layer
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected to `d_model` features,
    split into `num_heads` heads of `d_model // num_heads` features,
    attended per head, then merged and passed through an output projection.

    Args:
        d_model: Feature size of the projections and of the layer output.
            Must be divisible by `num_heads`.
        num_heads: Number of attention heads (positive).

    Raises:
        ValueError: If `num_heads` is not positive or does not divide
            `d_model`.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()

        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )

        self.num_heads = num_heads
        self.d_model = d_model
        # Feature size handled by each individual head.
        self.depth = d_model // num_heads

        # Linear projections for queries, keys and values.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Final projection applied to the concatenated heads.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape `x` to (batch_size, num_heads, seq_len, depth).

        The last axis of `x` (size `d_model`) is split into
        `num_heads * depth`, then the head axis is moved in front of the
        sequence axis.
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        """Compute attention of `q` over `k`/`v`.

        Args:
            v: Value tensor; expected as (batch, seq_len_k, features).
            k: Key tensor; expected as (batch, seq_len_k, features).
            q: Query tensor; expected as (batch, seq_len_q, features).
            mask: Optional float tensor broadcastable to
                (batch, num_heads, seq_len_q, seq_len_k). Entries equal to 1
                mark key positions to hide; they receive a large negative
                logit before the softmax. `None` disables masking.

        Returns:
            A tuple `(output, attention_weights)` where `output` has shape
            (batch, seq_len_q, d_model) and `attention_weights` has shape
            (batch, num_heads, seq_len_q, seq_len_k).
        """
        batch_size = tf.shape(q)[0]

        # Project, then split into heads: (batch, num_heads, seq_len, depth).
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        # Scaled dot-product scores: (batch, num_heads, seq_len_q, seq_len_k).
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        if mask is not None:
            # Masked positions get ~-1e9 so softmax assigns them ~0 weight.
            scaled_attention_logits += (mask * -1e9)

        # Normalise over the key axis and take the weighted sum of values.
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)

        # Merge heads back: (batch, seq_len_q, num_heads, depth) -> (batch, seq_len_q, d_model).
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(output, (batch_size, -1, self.d_model))

        output = self.dense(concat_attention)

        return output, attention_weights


# 6. **Implement the TransformerEncoderLayer class:**
   - Initialize with the model dimension, number of heads, feed-forward hidden layer dimension, and dropout rate
   - Use the custom MultiHeadAttention layer and a feed-forward neural network for the self-attention and position-wise feed-forward operations
   - Implement layer normalization and dropout layers
   - Implement the `call` method to compute the output of the encoder layer



In [5]:
class TransformerEncoderLayer(tf.keras.layers.Layer):
    """One Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-block is followed by dropout, a residual connection and layer
    normalization.

    Args:
        d_model: Feature size of the layer input and output.
        num_heads: Number of attention heads.
        dff: Hidden size of the position-wise feed-forward network.
        rate: Dropout rate applied after each sub-block.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        # Position-wise feed-forward network: d_model -> dff -> d_model.
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model),
        ])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training=None, mask=None):
        """Apply the encoder block to `x`.

        Args:
            x: Input tensor; expected as (batch_size, input_seq_len, d_model).
            training: Forwarded to the dropout layers. Defaults to `None` so
                that Keras can supply the mode itself (dropout on in `fit`,
                off in `evaluate`/`predict`) instead of the caller having to
                hard-code it.
            mask: Optional attention mask forwarded to the attention layer;
                `None` disables masking.

        Returns:
            Tensor with the same shape as `x`.
        """
        # Self-attention: the same tensor serves as value, key and query.
        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2


# 7. **Build the Transformer model:**
   - Set the embedding dimension, number of heads, feed-forward hidden layer dimension, and number of layers
   - Create the input layer
   - Use the custom TokenAndPositionEmbedding layer for input embeddings
   - Add a padding mask for the input sequences
   - Stack the TransformerEncoderLayer layers
   - Add the output layers (GlobalAveragePooling1D, Dense, and Dropout)



In [6]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size of the feed-forward network in each encoder layer
num_layers = 1  # Number of stacked encoder layers

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, max_features, embed_dim)
x = embedding_layer(inputs)

# 1.0 where the token id is 0 (the padding value), 0.0 elsewhere. The two
# inserted axes give shape (batch, 1, 1, maxlen), which broadcasts over the
# head and query axes of the attention logits.
padding_mask = tf.cast(tf.math.equal(inputs, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]
for _ in range(num_layers):
    # Pass the padding mask so padded positions are not attended to.
    # `training` is deliberately not hard-coded: Keras supplies it, so the
    # encoder's dropout is active in `fit` but disabled during validation,
    # `evaluate` and `predict`.
    x = TransformerEncoderLayer(embed_dim, num_heads, ff_dim)(x, mask=padding_mask)

# Classification head: pool over the sequence axis, then a small MLP.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(32, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = Model(inputs=inputs, outputs=outputs)


# 8. **Compile and train the model:**
   - Define a learning rate scheduler
   - Create a callback for the scheduler
   - Instantiate an optimizer with the initial learning rate
   - Compile the model with the optimizer, binary_crossentropy loss, and accuracy metric
   - Train the model using the training data, batch size, epochs, validation split, and callback

In [7]:
# Learning rate schedule: hold for the first epochs, then decay
def scheduler(epoch, lr):
    """Return the learning rate to use for `epoch`.

    The rate `lr` is returned unchanged while `epoch < 5`; from epoch 5
    onward it is multiplied by 0.9 on each call.
    """
    return lr if epoch < 5 else lr * 0.9

# Hook the schedule into training through a Keras callback.
callback = LearningRateScheduler(scheduler)
optimizer = Adam(learning_rate=0.001)

model.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# NOTE: `scheduler` only lowers the rate once `epoch >= 5`. Keras passes
# 0-based epoch indices, so with epochs=5 (indices 0-4) the rate stays at
# its initial value for the whole run.
history = model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.2,
    callbacks=[callback],
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
