<a href="https://colab.research.google.com/github/2003Yash/complete-encoder/blob/main/Complete_Encoder_Code_for_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Dropout, LayerNormalization
import math

def scaled_dot_product(q, k, v, mask=None):
    """Compute scaled dot-product attention.

    Args:
        q: Query tensor; the size of its last axis is used as ``d_k``.
        k: Key tensor; multiplied with ``q`` after transposing its last
            two axes.
        v: Value tensor; combined using the attention weights.
        mask: Optional tensor that is *added* to the scaled scores before
            the softmax (so masked positions should hold large negative
            values). Must be broadcastable to the score tensor.

    Returns:
        A ``(values, attention)`` tuple: the attention-weighted values and
        the softmax-normalised attention weights.
    """
    scores = tf.matmul(q, k, transpose_b=True)
    # Scale in the scores' own dtype; a hard-coded float32 cast makes the
    # division fail for float16/bfloat16/float64 inputs.
    d_k = tf.cast(tf.shape(q)[-1], scores.dtype)
    scaled = scores / tf.sqrt(d_k)
    print(f"scaled.shape : {scaled.shape}")
    if mask is not None:
        print(f"-- ADDING MASK of shape {mask.shape} --")
        scaled += mask
    # Normalise over the key axis so each query's weights sum to 1.
    attention = tf.nn.softmax(scaled, axis=-1)
    values = tf.matmul(attention, v)
    return values, attention

class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head self-attention using one fused Q/K/V projection.

    The input is projected to ``3 * d_model`` features, split into
    ``num_heads`` heads of ``head_dim`` features each for q, k and v,
    attended per head, merged back to ``d_model`` features and passed
    through a final dense layer.

    Args:
        d_model: Feature size of the input and output (e.g. 512).
        num_heads: Number of attention heads (e.g. 8). Must divide
            ``d_model`` evenly.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.d_model = d_model  # e.g. 512
        self.num_heads = num_heads  # e.g. 8
        self.head_dim = d_model // num_heads  # e.g. 512 / 8 = 64
        self.qkv_layer = tf.keras.layers.Dense(3 * d_model)  # e.g. 1536
        self.linear_layer = tf.keras.layers.Dense(d_model)  # e.g. 512

    def call(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Input tensor, reshaped here as (batch, sequence, d_model).
            mask: Optional additive mask forwarded to
                ``scaled_dot_product``.

        Returns:
            Tensor produced by the output dense layer, with ``d_model``
            features per position.
        """
        # Index the dynamic shape instead of unpacking it: iterating a
        # tensor is not allowed when the layer is traced in graph mode.
        input_shape = tf.shape(x)
        batch_size = input_shape[0]
        max_sequence_length = input_shape[1]
        print(f"x.shape: {x.shape}")
        qkv = self.qkv_layer(x)  # batch x seq x 3*d_model
        print(f"qkv.shape: {qkv.shape}")
        qkv = tf.reshape(
            qkv,
            (batch_size, max_sequence_length, self.num_heads, 3 * self.head_dim),
        )  # batch x seq x heads x 3*head_dim
        print(f"qkv.shape after reshape: {qkv.shape}")
        qkv = tf.transpose(qkv, perm=[0, 2, 1, 3])  # batch x heads x seq x 3*head_dim
        print(f"qkv.shape after transpose: {qkv.shape}")
        # Split the last axis into equal thirds: q, k and v of head_dim each.
        q, k, v = tf.split(qkv, 3, axis=-1)
        print(f"q shape: {q.shape}, k shape: {k.shape}, v shape: {v.shape}")
        values, attention = scaled_dot_product(q, k, v, mask)
        print(f"values.shape: {values.shape}, attention.shape: {attention.shape}")
        # Bring the sequence axis back in front of the head axis BEFORE
        # merging heads. Reshaping (batch, heads, seq, head_dim) directly to
        # (batch, seq, d_model) would interleave features of different
        # positions and heads.
        values = tf.transpose(values, perm=[0, 2, 1, 3])  # batch x seq x heads x head_dim
        values = tf.reshape(
            values,
            (batch_size, max_sequence_length, self.num_heads * self.head_dim),
        )  # batch x seq x d_model
        print(f"values.shape after reshape: {values.shape}")
        out = self.linear_layer(values)
        print(f"out.shape: {out.shape}")
        return out

class LayerNormalization(tf.keras.layers.Layer):
    """Layer normalisation over the trailing feature axes.

    NOTE: this class has the same name as the ``LayerNormalization``
    imported from ``tensorflow.keras.layers`` at the top of the file and
    replaces it from this point on.

    Args:
        parameters_shape: Shape of the learnable scale/shift parameters,
            e.g. ``[d_model]``. Its length determines how many trailing
            axes of the input are normalised.
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, parameters_shape, eps=1e-5):
        super(LayerNormalization, self).__init__()
        self.parameters_shape = parameters_shape
        self.eps = eps
        # Learnable scale (initialised to 1) and shift (initialised to 0).
        self.gamma = self.add_weight(shape=parameters_shape, initializer='ones', trainable=True)
        self.beta = self.add_weight(shape=parameters_shape, initializer='zeros', trainable=True)

    def call(self, inputs):
        """Normalise ``inputs`` and apply the learnable scale and shift.

        Args:
            inputs: Tensor whose trailing axes match ``parameters_shape``.

        Returns:
            Tensor of the same shape as ``inputs``.
        """
        # Reduce over the LAST len(parameters_shape) axes (the feature
        # axes). Counting from the front would pick axis 0, i.e. normalise
        # across the batch instead of across each position's features.
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = tf.reduce_mean(inputs, axis=dims, keepdims=True)
        print(f"Mean ({mean.shape})")
        var = tf.reduce_mean(tf.square(inputs - mean), axis=dims, keepdims=True)
        std = tf.sqrt(var + self.eps)
        print(f"Standard Deviation  ({std.shape})")
        y = (inputs - mean) / std
        print(f"y: {y.shape}")
        out = self.gamma * y + self.beta
        print(f"self.gamma: {self.gamma.shape}, self.beta: {self.beta.shape}")
        print(f"out: {out.shape}")
        return out

class PositionwiseFeedForward(tf.keras.layers.Layer):
    """Two-layer feed-forward network: Dense -> ReLU -> Dropout -> Dense.

    Args:
        d_model: Number of units in the second (output) dense layer.
        hidden: Number of units in the first dense layer.
        drop_prob: Dropout rate applied after the activation.
    """

    def __init__(self, d_model, hidden, drop_prob=0.1):
        super().__init__()
        # Stateless pieces.
        self.relu = tf.keras.layers.ReLU()
        self.dropout = tf.keras.layers.Dropout(rate=drop_prob)
        # Expansion to `hidden` units, then projection back to `d_model`.
        self.linear1 = tf.keras.layers.Dense(units=hidden)
        self.linear2 = tf.keras.layers.Dense(units=d_model)

    def call(self, x, training=False):
        """Run the feed-forward network on ``x``.

        Args:
            x: Input tensor.
            training: Forwarded to the dropout layer.

        Returns:
            Output of the second dense layer.
        """
        expanded = self.linear1(x)
        print(f"x after first linear layer: {expanded.shape}")

        activated = self.relu(expanded)
        print(f"x after activation: {activated.shape}")

        dropped = self.dropout(activated, training=training)
        print(f"x after dropout: {dropped.shape}")

        projected = self.linear2(dropped)
        print(f"x after 2nd linear layer: {projected.shape}")
        return projected

class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer is followed by dropout, a residual connection and
    layer normalisation.

    Args:
        d_model: Feature size of the input and output.
        ffn_hidden: Hidden size of the feed-forward network.
        num_heads: Number of attention heads.
        drop_prob: Dropout rate used after both sub-layers and inside the
            feed-forward network.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = LayerNormalization(parameters_shape=[d_model])
        self.dropout1 = tf.keras.layers.Dropout(rate=drop_prob)
        # The attention output is fed through the position-wise FFN.
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm2 = LayerNormalization(parameters_shape=[d_model])
        self.dropout2 = tf.keras.layers.Dropout(rate=drop_prob)

    def call(self, x, training=False):
        """Apply the encoder block to ``x``.

        Args:
            x: Input tensor with ``d_model`` features per position.
            training: Forwarded to every dropout layer, including the one
                inside the feed-forward network.

        Returns:
            Tensor of the same shape as ``x``.
        """
        residual_x = x
        print("------- ATTENTION 1 ------")
        x = self.attention(x, mask=None)
        print("------- DROPOUT 1 ------")
        x = self.dropout1(x, training=training)
        print("------- ADD AND LAYER NORMALIZATION 1 ------")
        x = self.norm1(x + residual_x)
        residual_x = x
        # This step is the feed-forward network, not a second attention
        # pass, so the progress banner is labelled accordingly.
        print("------- FEED FORWARD ------")
        # Pass `training` explicitly so the FFN's internal dropout follows
        # the same mode as dropout1/dropout2.
        x = self.ffn(x, training=training)
        print("------- DROPOUT 2 ------")
        x = self.dropout2(x, training=training)
        print("------- ADD AND LAYER NORMALIZATION 2 ------")
        x = self.norm2(x + residual_x)
        return x

class Encoder(Layer):
    """A stack of ``num_layers`` EncoderLayer blocks applied in sequence.

    Args:
        d_model: Feature size passed to every EncoderLayer.
        ffn_hidden: Feed-forward hidden size passed to every EncoderLayer.
        num_heads: Number of attention heads passed to every EncoderLayer.
        drop_prob: Dropout rate passed to every EncoderLayer.
        num_layers: How many EncoderLayer blocks to create.
    """

    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers):
        super().__init__()
        stack = []
        for _ in range(num_layers):
            stack.append(EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob))
        self.layers = stack

    def call(self, x, training=False):
        """Feed ``x`` through every block in order and return the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, training=training)
        return hidden

# --- Model hyperparameters ---
d_model = 512      # size of each word embedding
num_heads = 8      # attention heads per encoder layer
ffn_hidden = 2048  # hidden units in the feed-forward network
num_layers = 5     # stacked encoder layers; raise for more complex data
drop_prob = 0.1    # dropout rate

# --- Demo input dimensions ---
batch_size = 30
max_sequence_length = 200

encoder = Encoder(
    d_model=d_model,
    ffn_hidden=ffn_hidden,
    num_heads=num_heads,
    drop_prob=drop_prob,
    num_layers=num_layers,
)

# Random stand-in for a batch of embeddings (positional encoding included).
x = tf.random.normal(shape=(batch_size, max_sequence_length, d_model))
out = encoder(x)

------- ATTENTION 1 ------
------- ATTENTION 1 ------
------- ATTENTION 1 ------
------- ATTENTION 1 ------
x.shape: (30, 200, 512)
qkv.shape: (30, 200, 1536)
qkv.shape after reshape: (30, 200, 8, 192)
qkv.shape after transpose: (30, 8, 200, 192)
q shape: (30, 8, 200, 64), k shape: (30, 8, 200, 64), v shape: (30, 8, 200, 64)
scaled.shape : (30, 8, 200, 200)
values.shape: (30, 8, 200, 64), attention.shape: (30, 8, 200, 200)
values.shape after reshape: (30, 200, 512)
out.shape: (30, 200, 512)
------- DROPOUT 1 ------
------- ADD AND LAYER NORMALIZATION 1 ------
Mean ((1, 200, 512))
Standard Deviation  ((1, 200, 512))
y: (30, 200, 512)
self.gamma: (512,), self.beta: (512,)
out: (30, 200, 512)
------- ATTENTION 2 ------
x after first linear layer: (30, 200, 2048)
x after activation: (30, 200, 2048)
x after dropout: (30, 200, 2048)
x after 2nd linear layer: (30, 200, 512)
x after first linear layer: (30, 200, 2048)
x after activation: (30, 200, 2048)
x after dropout: (30, 200, 2048)
x after