In [1]:
import tensorflow as tf
from keras.models import *
from keras.layers import *
from keras.datasets import imdb
from keras.utils import pad_sequences




### Define the Transformer Block

In [37]:
class TransformerBlock(Layer):
    """Transformer encoder block: multi-head self-attention plus a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual (skip)
    connection and layer normalization.

    Args:
        embed_dim: Size of the last axis of the block's input and output. The
            residual additions require the input's last axis to match it.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward network.
        rate: Dropout rate applied to the output of each sub-layer.
        **kwargs: Standard Keras ``Layer`` keyword arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        # Forward name/dtype/etc. to the base Layer so the block can be named
        # and re-created from its config.
        super().__init__(**kwargs)

        # Constructor arguments are kept so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # Multi-head attention: this is where the attention scores are computed.
        # NOTE: key_dim is the size of EACH head, so every head projects to
        # embed_dim (not embed_dim // num_heads).
        self.att = MultiHeadAttention(num_heads, key_dim=embed_dim)

        # Position-wise feed-forward network: expand to ff_dim with ReLU, then
        # project back to embed_dim so the residual addition lines up.
        self.ffn = Sequential(
            [Dense(ff_dim, activation='relu'), Dense(embed_dim)]
        )

        # One layer normalization per sub-layer.
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)

        # One dropout per sub-layer.
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``
                (presumably ``(batch, seq_len, embed_dim)`` - confirm against caller).
            training: Forwarded to the dropout layers. Defaults to ``None`` so
                the block can also be called without it; Keras then decides
                the mode itself.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the sequence is used as both query and value.
        attn_output = self.att(inputs, inputs)

        # Dropout on the attention output.
        attn_output = self.dropout1(attn_output, training=training)

        # Residual connection around the attention sub-layer, then normalize.
        out1 = self.layernorm1(inputs + attn_output)

        # Feed-forward network on the normalized output.
        ffn_output = self.ffn(out1)

        # Dropout on the feed-forward output.
        ffn_output = self.dropout2(ffn_output, training=training)

        # Residual connection around the feed-forward sub-layer, then normalize.
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super().get_config()
        config.update(
            {
                'embed_dim': self.embed_dim,
                'num_heads': self.num_heads,
                'ff_dim': self.ff_dim,
                'rate': self.rate,
            }
        )
        return config

In [38]:
class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Maximum length of the input sequences. The position table has
            exactly ``maxlen`` rows, so longer sequences are not supported.
        vocab_size: Number of distinct token ids (rows of the token table).
        embed_dim: Size of each embedding vector.
        **kwargs: Standard Keras ``Layer`` keyword arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        # Forward name/dtype/etc. to the base Layer so the layer can be named
        # and re-created from its config.
        super().__init__(**kwargs)

        # Constructor arguments are kept so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        # Maps every token id in the input to a dense vector of size embed_dim.
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)

        # Maps every position (0 .. maxlen - 1) to a dense vector of size embed_dim.
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position.

        Args:
            x: Tensor of token ids; its last axis is treated as the sequence axis.

        Returns:
            ``token_emb(x) + pos_emb(positions)``; the position embeddings are
            broadcast over the leading axes of the token embeddings.
        """
        # Actual length of the current input along the sequence axis.
        seq_len = tf.shape(x)[-1]

        # Positions 0 .. seq_len - 1.
        positions = tf.range(start=0, limit=seq_len, delta=1)

        # Look up the embedding of every position.
        positions = self.pos_emb(positions)

        # Look up the embedding of every token.
        x = self.token_emb(x)

        # Element-wise sum: the result carries both the word meaning and the
        # positional information.
        return x + positions

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super().get_config()
        config.update(
            {
                'maxlen': self.maxlen,
                'vocab_size': self.vocab_size,
                'embed_dim': self.embed_dim,
            }
        )
        return config


In [39]:
# Vocabulary is capped at the 20,000 most frequent words
vocab_size = 20_000
# Number of tokens kept per movie review
maxlen = 200

In [40]:
# Load the IMDB reviews as sequences of word ids, restricted to the `vocab_size` most frequent words.
# The second split returned by load_data is used as the validation set throughout.
train_split, val_split = imdb.load_data(num_words=vocab_size)
X_train, y_train = train_split
X_val, y_val = val_split

# Report how many reviews each split contains
for split, label in ((X_train, 'Training sequence'), (X_val, 'Validation sequence')):
    print(len(split), label)

25000 Training sequence
25000 Validation sequence


In [41]:
# Bring every review to exactly `maxlen` tokens.
# The Keras defaults are spelled out: short reviews are zero-padded at the front,
# long reviews are truncated at the front (i.e. their LAST `maxlen` tokens are kept).
X_train = pad_sequences(X_train, maxlen=maxlen, padding='pre', truncating='pre')
X_val = pad_sequences(X_val, maxlen=maxlen, padding='pre', truncating='pre')

In [42]:
# Sanity check: after padding, the training data is a 2-D array of (reviews, maxlen)
X_train.shape

(25000, 200)

In [43]:
# Inspect one padded review (row 4000) as its sequence of integer word ids
X_train[4000]

array([   89,     8,   511,  6339,    59,  2013,    41,   523,   147,
        1876,     5, 18733,   175,   347,    11,   618,     4,   172,
          96,  2329,     2,     9,   862,  4722,     8,    41,     5,
          27,   532,  2904,     9,  5750,     4,  9910,   136,  7900,
        9287,     5,     2,    19,  1456,   921,    42,  2475,  1488,
          68,  2456,   216,    17,     6,  2143,    48,    13,    69,
           6, 12928,    13,    62,    28,  2564,    12,     8,    98,
         634,   908,    10,    10,  2047,  3423,     9, 14790,    17,
           2,     6,    87,  1465,    48,    25,   377,    27,   478,
         157,    11,     2, 18497,    29,  2010,     4,  2915,     7,
        5712, 12710,    83,     6,  3207,     2,     7,   107,    42,
         289,   715,   257,     5,    95,  9727,     4, 13331,    11,
          17, 10846,     5, 13869,  1377,    17,   614,    11,    14,
         365,  1652,     2,     2,   373,    10,    10,     4,   167,
        6184,     2,

In [44]:
# Model hyperparameters:
#   embed_dim - embedding size for each token
#   num_heads - number of attention heads
#   ff_dim    - hidden layer size of the feed-forward network inside the transformer
embed_dim, num_heads, ff_dim = 32, 2, 32

# Input: one row of `maxlen` token ids per review
inputs = Input(shape=(maxlen,))

# Token + position embeddings, followed by a single Transformer block
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
X = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
X = transformer_block(X)

# Classification head: average over the sequence axis, then a small dense layer
# with dropout before and after it
for head_layer in (
    GlobalAveragePooling1D(),
    Dropout(0.1),
    Dense(20, activation='relu'),
    Dropout(0.1),
):
    X = head_layer(X)

# Two-way softmax over the classes
outputs = Dense(2, activation='softmax')(X)

model = Model(inputs=inputs, outputs=outputs)

In [45]:
# Print the layer-by-layer structure, output shapes and parameter counts of the model
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_10 (InputLayer)       [(None, 200)]             0         
                                                                 
 token_and_position_embeddi  (None, 200, 32)           646400    
 ng_9 (TokenAndPositionEmbe                                      
 dding)                                                          
                                                                 
 transformer_block_9 (Trans  (None, 200, 32)           10656     
 formerBlock)                                                    
                                                                 
 global_average_pooling1d (  (None, 32)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dropout_20 (Dropout)        (None, 32)                0     

In [46]:
# The model ends in a 2-way softmax and the labels are integer class ids,
# hence the sparse variant of categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)




In [47]:
# Train for 10 epochs in mini-batches of 32, evaluating on (X_val, y_val) after every epoch
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
