In [4]:
from math import sqrt

import torch
import torch.nn.functional as F
from torch import nn


def scaled_dot_transformation(query,key,value):
  """Compute scaled dot-product attention for batched 3-D tensors.

  Args:
    query: tensor of shape (batch, seq_len_q, dim_k).
    key: tensor of shape (batch, seq_len_k, dim_k).
    value: tensor of shape (batch, seq_len_k, dim_v).

  Returns:
    Tensor of shape (batch, seq_len_q, dim_v): for every query position, the
    attention-weighted average of the value vectors.
  """
  dim_k = key.size(-1)
  # True division: floor division (//) would round the scores and destroy the scaling.
  scores = torch.bmm(query,key.transpose(1,2))/sqrt(dim_k)
  # dim=-1 normalises over the key axis, so each query's weights sum to 1.
  weights = F.softmax(scores,dim=-1)

  return torch.bmm(weights,value)

*embed_dim:* This represents the number of dimensions in the input feature space. For example, if each word in a sentence is represented by a 300-dimensional embedding vector, then embed_dim would be 300.

*head_dim:* This is usually a fraction of the total embedding dimension (embed_dim). Specifically, the common choice is head_dim = embed_dim / num_heads, where num_heads is the number of attention heads. For example, with embed_dim = 768 and num_heads = 12, head_dim would be 64.

The input to an attention head is a tensor of shape [batch_size, seq_len, embed_dim], and the head produces a tensor of shape [batch_size, seq_len, head_dim].
For example, if you have 64 sequences (sentences) in a batch, each with 10 words, and each word is represented by a 300-dimensional embedding, the input tensor shape would be (64, 10, 300).

In [None]:
#Code for Single Attention Head

class AttentionHead(nn.Module): #nn.Module = Building Block of Model as base class in Pytorch
  """A single self-attention head.

  Projects the input into query, key and value spaces with three independent
  linear layers and combines them with `scaled_dot_transformation`.

  Args:
    embed_dim: size of the last dimension of the input tensor.
    head_dim: size of the last dimension of the query/key/value projections
      (and therefore of the output).
  """

  def __init__(self,embed_dim,head_dim):
    super().__init__()
    self.q = nn.Linear(embed_dim,head_dim) #Like Dense Layer in TensorFlow
    self.k = nn.Linear(embed_dim,head_dim)
    self.v = nn.Linear(embed_dim,head_dim)

  def forward(self,hidden_state):
    """Apply self-attention to `hidden_state`.

    Args:
      hidden_state: 3-D tensor whose last dimension is `embed_dim`
        (the attention function uses torch.bmm, which requires 3-D input).

    Returns:
      Tensor with the same leading dimensions and last dimension `head_dim`.
    """
    # Self-attention: query, key and value are all projections of the same input.
    attn_outputs = scaled_dot_transformation(self.q(hidden_state),
                                             self.k(hidden_state),self.v(hidden_state))
    return attn_outputs

Now that we have a single attention head, we can concatenate the outputs of several heads to build a multi-head attention layer.

In [None]:
class MultiHeadAttention(nn.Module):
  """Multi-head self-attention layer.

  Runs several `AttentionHead` modules on the same input, concatenates their
  outputs along the last dimension and mixes them with a final linear layer.

  Args:
    config: object exposing `hidden_size` (embedding dimension) and the number
      of heads as `num_attention_head`; when that attribute is absent,
      `num_attention_heads` is used instead.

  Raises:
    ValueError: if `hidden_size` is not divisible by the number of heads, since
      the concatenated head outputs would not match the output layer's input size.
  """

  def __init__(self,config):
    super().__init__()
    embed_dim = config.hidden_size
    # Keep the original attribute name first; fall back to the plural spelling.
    num_heads = getattr(config,"num_attention_head",None)
    if num_heads is None:
      num_heads = config.num_attention_heads
    if embed_dim % num_heads != 0:
      raise ValueError(
          f"hidden_size ({embed_dim}) must be divisible by the number of "
          f"attention heads ({num_heads})"
      )
    head_dim = embed_dim//num_heads
    # nn.ModuleList registers every head so its parameters are tracked by the module.
    self.heads = nn.ModuleList(
        [AttentionHead(embed_dim,head_dim) for _ in range(num_heads)]
    )
    self.output_linear = nn.Linear(embed_dim,embed_dim)

  def forward(self,hidden_state):
    """Apply every head to `hidden_state` and project the concatenated result.

    Args:
      hidden_state: tensor whose last dimension is `hidden_size`.

    Returns:
      Tensor with the same shape as `hidden_state`.
    """
    # num_heads outputs of width head_dim concatenate back to width embed_dim.
    x = torch.cat([h(hidden_state) for h in self.heads],dim = -1)
    x = self.output_linear(x)
    return x



- In the code above, we create as many attention heads as the configuration asks for.
- In *def forward(self, hidden_state)*, we concatenate the outputs of all attention heads along the last dimension (dim=-1)
and pass the concatenated tensor on for further processing.

- Notice that the concatenated output from the attention heads is also fed through a
final linear layer to produce an output tensor of shape [batch_size, seq_len, hidden_dim],
where hidden_dim is the `hidden_size` (embed_dim) taken from the config.