In [56]:
import tensorflow as tf
from  tensorflow import keras
import numpy as np 
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense


 

In [57]:
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=5000)


In [58]:
input_train

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 2, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 2, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 2, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 2, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 2, 19, 178, 32]),
       list([1, 194, 1153, 194, 2, 78, 228, 5, 6, 1463, 4369,

In [59]:
max_length = 200  # Define max length for padding
input_train = pad_sequences(input_train[:4000], maxlen=max_length, padding='post')
input_test = pad_sequences(input_test[:1000], maxlen=max_length, padding='post')

y_train = np.array(y_train[:4000])
y_test = np.array(y_test[:1000])

BUFFER_SIZE = 4000
BATCH_SIZE = 64
# converting the dataset to batches for efficient training 
train_data = tf.data.Dataset.from_tensor_slices((input_train, y_train)).batch(BATCH_SIZE)
test_data = tf.data.Dataset.from_tensor_slices((input_test, y_test)).batch(BATCH_SIZE)

Multi head Attention layer class

In [60]:
# Multi-Head Attention
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % self.num_heads == 0
        self.depth = d_model // self.num_heads
        
        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        
        self.Z_dense = Dense(d_model)  # Final transformation to project output, representing Z (context vector)
    
    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def call(self, v, k, q, mask=None):
        batch_size = tf.shape(q)[0]
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)
        
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
        
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)
        
        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(output, (batch_size, -1, self.d_model))
        
        return self.Z_dense(concat_attention)  # Projecting the final attention output to match d_model


Lets Define a single Transformer Layer 

In [61]:
class TransformerEncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForwardNetwork(d_model, dff)  # The feed-forward network provides additional transformations after attention.
        
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)
    
    def call(self, inputs, mask=None):
        attn_output = self.mha(inputs, inputs, inputs, mask=mask)  # Explicitly pass mask
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        
        ffn_output = self.ffn(out1)  # Applies transformation and non-linearity
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)


In [62]:
class TransformerEncoder(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, max_pos_encoding, rate=0.1):
        super().__init__()
        self.embedding = keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_embedding = keras.layers.Embedding(max_pos_encoding, d_model)
        self.enc_layers = [TransformerEncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = keras.layers.Dropout(rate)
        self.global_avg_pool = keras.layers.GlobalAveragePooling1D()
        self.dense_out = Dense(1, activation='sigmoid')
    
    def call(self, inputs, mask=None):
        seq_length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=seq_length, delta=1)
        x = self.embedding(inputs) + self.pos_embedding(positions)
        x = self.dropout(x)
        
        for enc_layer in self.enc_layers:
            x = enc_layer(x, mask=mask)  # Ensure mask is explicitly passed
        
        x = self.global_avg_pool(x)
        return self.dense_out(x)


In [63]:
# Model Configuration
num_layers = 2
d_model = 128
num_heads = 8
dff = 512
input_vocab_size = 5000  # Fixed vocab size for IMDB dataset
max_pos_encoding = 200

encoder = TransformerEncoder(num_layers, d_model, num_heads, dff, input_vocab_size, max_pos_encoding)


In [64]:
# Compile and Train Model
encoder.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
encoder.fit(train_data, epochs=3, validation_data=test_data)


Epoch 1/3
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 1s/step - accuracy: 0.4978 - loss: 0.9650 - val_accuracy: 0.4750 - val_loss: 0.6909
Epoch 2/3
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 866ms/step - accuracy: 0.6182 - loss: 0.6344 - val_accuracy: 0.8110 - val_loss: 0.3973
Epoch 3/3
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 825ms/step - accuracy: 0.8648 - loss: 0.3043 - val_accuracy: 0.8040 - val_loss: 0.4790


<keras.src.callbacks.history.History at 0x18010d32830>

In [65]:
loss, accuracy = encoder.evaluate(test_data)
print(f'Test Accuracy: {accuracy * 100:.2f}%')


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 310ms/step - accuracy: 0.8046 - loss: 0.4692
Test Accuracy: 80.40%
