In [1]:
from tensorflow.keras.layers import TextVectorization

In [42]:
import os, random, shutil, pathlib
# Carve a validation set out of the training data by moving 20% of each
# class's files from train/<category> to val/<category>.
base_dir = pathlib.Path("/Users/divyeshkanagavel/Desktop/DeepLearning/aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
    category_val_dir = val_dir / category
    if category_val_dir.exists() and any(category_val_dir.iterdir()):
        # A previous run already made the split. Moving another 20% would
        # silently shrink the training set, so leave this class alone.
        continue
    os.makedirs(category_val_dir, exist_ok=True)
    # os.listdir returns entries in arbitrary order; sort first so the seeded
    # shuffle selects the same files on every machine and every run.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    # Slice from an explicit start index: files[-0:] would select *every*
    # file when num_val_samples rounds down to 0.
    val_files = files[len(files) - num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname, category_val_dir / fname)

In [43]:
from tensorflow import keras
# Build batched datasets from the three directory splits; each split
# directory holds one sub-folder per class.
batch_size = 32
train_ds = keras.utils.text_dataset_from_directory(train_dir, batch_size=batch_size)
val_ds = keras.utils.text_dataset_from_directory(val_dir, batch_size=batch_size)
# Derive the test location from base_dir (like train_dir / val_dir) instead of
# repeating the absolute path, so the dataset root is defined in one place.
test_dir = base_dir / "test"
test_ds = keras.utils.text_dataset_from_directory(test_dir, batch_size=batch_size)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [44]:
from tensorflow import keras
from tensorflow.keras import layers

In [45]:

# Limits for the integer encoding of the reviews.
max_length = 600    # every review is cut to this many tokens
max_tokens = 20000  # vocabulary size cap

text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

# The vocabulary is learned from the training texts only, so strip the labels
# before handing the dataset to adapt().
text_only_train_ds = train_ds.map(lambda text, _label: text)
text_vectorization.adapt(text_only_train_ds)

In [46]:
def _vectorize_batch(text, label):
    """Swap the raw text for its integer token sequence; the label passes through."""
    return text_vectorization(text), label


# Apply the same vectorization to every split.
int_train_ds = train_ds.map(_vectorize_batch, num_parallel_calls=4)
int_val_ds = val_ds.map(_vectorize_batch, num_parallel_calls=4)
int_test_ds = test_ds.map(_vectorize_batch, num_parallel_calls=4)

In [47]:
import tensorflow as tf

In [48]:

# Baseline: embedding -> bidirectional LSTM -> dropout -> sigmoid classifier.
inputs = keras.Input(shape=(None,), dtype="int64")
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=256,
    mask_zero=True,  # index 0 is treated as padding and masked out
)
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_20 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_24 (Embedding)    (None, None, 256)         5120000   
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                73984     
 onal)                                                           
                                                                 
 dropout_6 (Dropout)         (None, 64)                0         
                                                                 
 dense_40 (Dense)            (None, 1)                 65        
                                                                 
Total params: 5194049 (19.81 MB)
Trainable params: 5194049 (19.81 MB)
Non-trainable params: 0 (0.00 Byte)
___________________

Sanity-check the `fit` call by training for a couple of epochs.

In [49]:
# Short training run, validated on the held-out split after each epoch.
model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x577074310>

In [50]:
class TransformerEncoder(layers.Layer):
    """One Transformer encoder block: self-attention followed by a dense projection.

    Both sub-blocks are wrapped in a residual connection and a layer
    normalization.

    Args:
        embed_dim: Size of each token vector; also used as the attention
            ``key_dim`` and as the output width of the dense projection.
        dense_dim: Width of the hidden layer of the dense projection.
        num_heads: Number of attention heads.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Opt in to mask propagation: without this the padding mask consumed
        # in call() is dropped after this layer, so a second stacked encoder
        # (or any other mask-aware layer) would no longer see it.
        self.supports_masking = True
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(key_dim = embed_dim, num_heads = num_heads)
        self.dense_proj = keras.Sequential([layers.Dense(dense_dim,activation="relu"),
                                            layers.Dense(embed_dim),])
        self.layer_norm1 = layers.LayerNormalization()
        self.layer_norm2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Run self-attention and the dense projection over ``inputs``.

        Args:
            inputs: Sequence of token vectors to encode.
            mask: Optional padding mask for ``inputs``; when given it is used
                as the attention mask.

        Returns:
            The encoded sequence after the second layer normalization.
        """
        if mask is not None:
            # Insert an axis between batch and sequence so the mask can be
            # broadcast over the query positions of the attention scores.
            mask = mask[:,tf.newaxis,:]
        # Self-attention: the inputs act as both query and value.
        attention_output = self.attention(inputs, inputs, attention_mask = mask)
        proj_input = self.layer_norm1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layer_norm2(proj_input + proj_output)

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = super().get_config()
        config.update({
            "embed_dim" : self.embed_dim,
            "dense_dim" : self.dense_dim,
            "num_heads" : self.num_heads
        })
        return config



In [51]:
# Hyperparameters of the Transformer-encoder classifier.
vocab_size = 20000
embed_dim = 256
num_heads = 2
dense_dim = 32

# Token embedding -> one encoder block -> max over the sequence -> classifier.
inputs = keras.Input(shape=(None,), dtype="int64")
token_vectors = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(inputs)
encoded = TransformerEncoder(embed_dim, dense_dim, num_heads)(token_vectors)
pooled = layers.GlobalMaxPooling1D()(encoded)
x = layers.Dropout(0.5)(pooled)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)

In [52]:
# Checkpoint the model to disk, overwriting the file only when the monitored
# metric improves (save_best_only=True).
# `fit` documents `callbacks` as a list, so wrap the single callback in one.
callbacks = [
    keras.callbacks.ModelCheckpoint("transformer_encoder_classifier.h5", save_best_only=True)
]

In [53]:
# Binary classification: sigmoid output trained with binary cross-entropy.
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [54]:
# Train for three epochs with the callbacks configured above.
model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=3,
    callbacks=callbacks,
)

Epoch 1/3
Epoch 2/3


  saving_api.save_model(


Epoch 3/3


<keras.src.callbacks.History at 0x32dbc1f50>

In [55]:
# Reload the checkpoint written during training; the custom layer class must
# be supplied so Keras can rebuild it from the saved config.
model = keras.models.load_model(
    "transformer_encoder_classifier.h5",
    custom_objects={"TransformerEncoder": TransformerEncoder},
)
test_metrics = model.evaluate(int_test_ds)  # index 1 is read as the accuracy
print(f"Test acc: {test_metrics[1]:.3f}")

Test acc: 0.884


A test accuracy of 88.4% is observed, which is pretty cool: it is an improvement over the GRU model and emphasises the importance of attention.

In [61]:
# Show which GPU devices TensorFlow can see.
tf.config.list_physical_devices(device_type='GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

Something is missing here! Each token interacts with all the other tokens, which updates the representation of every token, and the dense network then learns task-specific features. But self-attention by itself has no mechanism for tracking word order, something that is inherent in architectures such as RNNs and LSTMs. The Transformer adds positional encoding on top of self-attention to truly become a sequence model.

Words are already embedded into vectors that capture the semantic relationships between them. To each vector we add a quantity that carries positional information as well. The simplest technique would be to add the word's position index in the sentence, but this would be a huge integer for long sequences, and neural networks work best with values in the range [-1, 1]. The original paper instead used a sinusoidal encoding, in which the word index is mapped to values in the range [-1, 1] through sine and cosine functions.
Alternatively, and this is what we are going to do, we let gradient descent learn the positional vectors that are added to the word embeddings -> this technique is called positional embedding.

In [63]:
class PositionEmbedding(layers.Layer):
    """Token embedding summed with a learned embedding of each token's position.

    Args:
        sequence_length: Number of distinct positions the position table holds.
        input_dim: Number of distinct token ids the token table holds.
        output_dim: Size of the vectors produced by both tables.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.token_embedding = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim
        )
        self.position_embedding = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )

    def call(self, inputs):
        """Return token vectors plus the vectors of positions 0..length-1."""
        token_vectors = self.token_embedding(inputs)
        # The last axis of `inputs` is the sequence axis; one id per position.
        num_positions = tf.shape(inputs)[-1]
        position_ids = tf.range(num_positions)
        position_vectors = self.position_embedding(position_ids)
        return token_vectors + position_vectors

    def compute_mask(self, inputs, mask=None):
        """Mark every entry whose token id is not 0 as valid."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the constructor arguments alongside the base layer config."""
        return {
            **super().get_config(),
            "input_dim": self.input_dim,
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
        }


In [65]:
# Hyperparameters of the position-aware Transformer classifier.
vocab_size = 20000
sequence_length = 600
embed_dim = 256
num_heads = 2
dense_dim = 32

inputs = keras.Input(shape=(None,), dtype="int64")
# PositionEmbedding.compute_mask supplies the padding mask for the encoder.
position_aware = PositionEmbedding(sequence_length, vocab_size, embed_dim)(inputs)
encoded = TransformerEncoder(embed_dim, dense_dim, num_heads)(position_aware)
pooled = layers.GlobalMaxPooling1D()(encoded)
x = layers.Dropout(0.5)(pooled)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs=inputs, outputs=outputs)




In [66]:
# Same training configuration as the previous models.
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)


In [67]:
# Write the model to disk only when it improves (save_best_only=True).
checkpoint = keras.callbacks.ModelCheckpoint(
    "full_transformer_encoder.h5", save_best_only=True
)
callbacks = [checkpoint]

In [68]:
# Train for three epochs with the checkpoint callback attached.
model.fit(
    int_train_ds,
    validation_data=int_val_ds,
    epochs=3,
    callbacks=callbacks,
)

Epoch 1/3
Epoch 2/3


  saving_api.save_model(


Epoch 3/3


<keras.src.callbacks.History at 0x728450810>

In [71]:
# Reload the checkpoint; both custom layer classes must be supplied so Keras
# can rebuild them from the saved config.
custom_layers = {
    "TransformerEncoder": TransformerEncoder,
    "PositionEmbedding": PositionEmbedding,
}
model = keras.models.load_model(
    "full_transformer_encoder.h5", custom_objects=custom_layers
)
test_metrics = model.evaluate(int_test_ds)  # index 1 is read as the accuracy
print(f"Test acc: {test_metrics[1]:.3f}")

Test acc: 0.877


The accuracy is more or less the same. The bag-of-words approach also gives good accuracy on this dataset and is faster as well.
There is a general rule of thumb observed by the Keras research group for NLP classification tasks -> if (number of samples) / (mean number of words per sample) < 1500, go for a bag-of-words model with a dense neural network; otherwise go for a sequence model such as a Transformer, GRU, etc.

# A more complex application: sequence-to-sequence models
1. Machine translation
2. Summarization
3. Text generation: convert a text prompt into a paragraph
4. Question answering
5. Chatbots

Machine translation with Transformers uses both the encoder and the decoder.
Encoder: the encoder model turns the source sequence into an intermediate representation.
Decoder: the decoder model predicts token i given the previous tokens (0 to i-1) and the intermediate representation from the encoder.