<a href="https://colab.research.google.com/github/2003Yash/multi-head-attention/blob/main/Multi_head_attention_for_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Multi-head attention mechanism: very similar to self-attention, where we combine the q, k, v values and use softmax to normalize the scores and obtain the context vectors. Here we use 8 attention heads, each working on its own slice of the q, k, v vectors, process a batch of 30 sequences simultaneously, and use the heads' outputs to produce word embeddings of the same dimension as the input but more context-aware.


In [None]:
import tensorflow as tf

class MultiheadAttention(tf.keras.layers.Layer):
    """Multi-head self-attention layer.

    The input is projected to queries, keys and values with one dense layer,
    split across ``num_heads`` heads, passed through scaled dot-product
    attention per head, and the concatenated head outputs are projected with
    a final dense layer of width ``d_model``.

    Args:
        input_dim: Size of the last axis of the input. Stored for reference
            only; the dense layers infer it when they are first called.
        d_model: Width of the attention output (and of q, k and v overall).
        num_heads: Number of attention heads; must divide ``d_model``.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, input_dim, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            # Fail early with a clear message instead of an obscure reshape
            # error inside call().
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )
        self.input_dim = input_dim
        self.d_model = d_model
        self.num_heads = num_heads
        # Per-head width, e.g. 512 // 8 = 64 (integer division).
        self.head_dim = d_model // num_heads
        # One projection produces q, k and v together: 3 * 512 = 1536 units.
        self.qkv_layer = tf.keras.layers.Dense(3 * d_model)
        self.linear_layer = tf.keras.layers.Dense(d_model)

    def call(self, x, mask=None):
        """Apply multi-head attention to ``x``.

        Args:
            x: Tensor of shape (batch, sequence_length, input_dim).
            mask: Optional tensor/array broadcastable to
                (batch, num_heads, sequence_length, sequence_length) holding
                1 where attention must be blocked and 0 where it is allowed.

        Returns:
            Tensor of shape (batch, sequence_length, d_model).
        """
        # Index the dynamic shape rather than unpacking it: iterating over a
        # tensor is not allowed for symbolic tensors in graph mode.
        input_shape = tf.shape(x)
        batch_size = input_shape[0]
        sequence_length = input_shape[1]
        print(f"x.shape: {x.shape}")

        qkv = self.qkv_layer(x)
        print(f"qkv.shape: {qkv.shape}")

        # Give every head its own slice holding its q, k and v (3 * head_dim).
        qkv = tf.reshape(qkv, (batch_size, sequence_length, self.num_heads, 3 * self.head_dim))
        print(f"qkv.shape: {qkv.shape}")

        # (batch, seq, heads, 3*head_dim) -> (batch, heads, seq, 3*head_dim) so
        # the last two axes are per-head matrices that matmul can batch over.
        qkv = tf.transpose(qkv, perm=[0, 2, 1, 3])
        print(f"qkv.shape: {qkv.shape}")

        # Split the last axis into the q, k and v tensors.
        q, k, v = tf.split(qkv, 3, axis=-1)
        print(f"q.shape: {q.shape}, k.shape: {k.shape}, v.shape: {v.shape}")

        values, attention = self.scaled_dot_product(q, k, v, mask)
        print(f"values.shape: {values.shape}, attention.shape: {attention.shape}")

        # Undo the head/sequence transpose BEFORE merging heads. Reshaping the
        # (batch, heads, seq, head_dim) tensor directly would interleave
        # different sequence positions instead of concatenating the head
        # outputs that belong to the same token.
        values = tf.transpose(values, perm=[0, 2, 1, 3])
        values = tf.reshape(values, (batch_size, sequence_length, self.num_heads * self.head_dim))
        print(f"values.shape: {values.shape}")

        out = self.linear_layer(values)
        print(f"out.shape: {out.shape}")
        return out

    # Mask convention: 1 marks a position that must NOT be attended to, 0 marks
    # an allowed position. For a causal (decoder) mask, block the strictly
    # upper triangle, e.g.
    #     mask1 = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    # and pass it as mask=mask1. The rest is the same as plain self-attention.
    def scaled_dot_product(self, q, k, v, mask):
        """Compute scaled dot-product attention.

        Args:
            q, k, v: Query, key and value tensors whose last two axes are
                (sequence_length, head_dim).
            mask: ``None`` or a tensor/array broadcastable to the score
                matrix, with 1 at blocked positions and 0 elsewhere.

        Returns:
            A ``(values, attention)`` tuple: the attention-weighted values and
            the softmax attention weights.
        """
        d_k = tf.cast(tf.shape(k)[-1], tf.float32)
        scores = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(d_k)
        if mask is not None:
            # Cast so NumPy float64 / integer / boolean masks are accepted;
            # the large negative value drives blocked weights to ~0 in softmax.
            scores += tf.cast(mask, scores.dtype) * -1e9
        attention = tf.nn.softmax(scores, axis=-1)
        values = tf.matmul(attention, v)
        return values, attention

In [None]:
# --- Layer hyperparameters ---------------------------------------------------
num_heads = 8    # number of attention heads, as in "Attention Is All You Need"
d_model = 512    # output embedding size produced by the attention unit per word
input_dim = 512  # input embedding size of each word fed to the attention unit

# --- Toy input ---------------------------------------------------------------
sequence_length = 4  # number of words in a sentence
batch_size = 30      # number of sentences processed together in one mini-batch
# Random values standing in for the positionally-encoded word embeddings that
# the encoder would feed into multi-head attention.
x = tf.random.normal(shape=(batch_size, sequence_length, input_dim))

# --- Run the layer -----------------------------------------------------------
model = MultiheadAttention(input_dim=input_dim, d_model=d_model, num_heads=num_heads)
out = model(x)  # forward pass: performs multi-head attention on x

# Even if the input size were 1024, the output would still have 512 dimensions,
# because the final dense layer has d_model (= 512) units.

x.shape: (30, 4, 512)
qkv.shape: (30, 4, 1536)
qkv.shape: (30, 4, 8, 192)
qkv.shape: (30, 8, 4, 192)
q.shape: (30, 8, 4, 64), k.shape: (30, 8, 4, 64), v.shape: (30, 8, 4, 64)
values.shape: (30, 8, 4, 64), attention.shape: (30, 8, 4, 4)
values.shape: (30, 4, 512)
out.shape: (30, 4, 512)


Take the input (512) -> create the q, k, v vectors (512 x 3 = 1536) by passing it through the learnable qkv neural-network layer -> divide them over the 8 heads (1536 / 8 = 192) -> transpose (the head axis and the sequence axis are swapped) so we can easily process all heads in parallel, i.e. the last 2 dimensions are per-head matrices -> split them into q, k, v separately -> (if this is the decoder, apply the mask here) -> compute the attention matrix and use it to get the values (new, more context-aware embedding values) -> bring the values back to the original shape by merging the outputs of all heads -> pass them through the learnable linear neural-network layer of d_model nodes -> the result has the same shape as the linear layer's output, and it is the output of the attention block.
