# Transformers Implementation in Python

#### Transformers
Transformers are deep learning architectures designed for handling sequential data without relying on recurrence, the mechanism at the core of RNNs. Instead, Transformers use a mechanism called self-attention to process all tokens in the sequence simultaneously, capturing dependencies between tokens regardless of their distance in the sequence. Transformers have become the foundation of many NLP tasks and models, including BERT and GPT.

In [5]:
# import necessary libraries
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

# Load the IMDB reviews with the vocabulary capped via `num_words=max_features`,
# then bring every review to a fixed length of `max_len` token ids.
max_features = 10000
max_len = 200

train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split

# pad_sequences is applied with its default padding/truncation settings.
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)

# define a transformer block
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Size of the last axis of the input. It is used as the
            per-head key size of the attention layer and as the output size
            of the feed-forward network, so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward network.
        rate: Dropout rate applied to the output of each sub-layer.
        **kwargs: Standard ``Layer`` keyword arguments (e.g. ``name``,
            ``dtype``), forwarded to the base class.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can reproduce them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Whether the call is made in training mode. It is passed
                on to the attention and dropout layers; leave it as ``None``
                to let Keras supply the mode of the surrounding call.

        Returns:
            The layer-normalized output of the second residual connection.
        """
        # Self-attention: the same tensor is passed as both query and value.
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

# define the model: embedding layer, one transformer block, and a small
# classification head with a sigmoid output
embed_dim = 32
num_heads = 2
ff_dim = 32

inputs = layers.Input(shape=(max_len,))
# The sequence length is already fixed by the Input shape, so the deprecated
# `input_length` argument is not passed to the embedding.
embedding_layer = layers.Embedding(max_features, output_dim=embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
# Do not pin `training` here: a hard-coded training=True is recorded in the
# functional graph and keeps the block's dropout active during
# evaluate()/predict(). Left unset, Keras supplies the right mode itself.
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train for 3 epochs in batches of 64, using a 0.2 validation split.
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=3,
    validation_split=0.2,
)

# Score the trained model on the test split and report both values.
test_metrics = model.evaluate(x_test, y_test)
loss, accuracy = test_metrics
print(f"Loss: {loss}, Accuracy: {accuracy}")




Epoch 1/3
313/313 ━━━━━━━━━━━━━━━━━━━━ 16s 34ms/step - accuracy: 0.6862 - loss: 0.5631 - val_accuracy: 0.8718 - val_loss: 0.3061
Epoch 2/3
313/313 ━━━━━━━━━━━━━━━━━━━━ 11s 34ms/step - accuracy: 0.9142 - loss: 0.2235 - val_accuracy: 0.8786 - val_loss: 0.2849
Epoch 3/3
313/313 ━━━━━━━━━━━━━━━━━━━━ 11s 36ms/step - accuracy: 0.9433 - loss: 0.1596 - val_accuracy: 0.8718 - val_loss: 0.3173
782/782 ━━━━━━━━━━━━━━━━━━━━ 7s 8ms/step - accuracy: 0.8658 - loss: 0.3318
Loss: 0.3348863422870636, Accuracy: 0.8640000224113464
