<a href="https://colab.research.google.com/github/AdityaJ9082/ML/blob/main/transformers_encoders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
# Hyperparameters for the Transformer encoder built in this notebook.
d_model = 512              # width of each token embedding
num_heads = 8              # attention heads per layer (d_model // num_heads = 64 per head)
max_sequence_length = 200  # tokens per sequence used in the shape comments below
drop_prob = 0.1            # dropout probability
ffn_hidden = 2048          # hidden width of the position-wise feed-forward network
num_layers = 5             # number of stacked encoder layers
batch_size = 30            # sequences per batch used in the shape comments below

# Example construction (argument order follows Encoder.__init__):
# encoder = Encoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers)

In [None]:
class Encoder(nn.Module):
  """Stack of `num_layers` encoder layers applied one after another.

  Args:
    d_model: Size of the last dimension of the input and output.
    ffn_hidden: Hidden width handed to each layer's feed-forward network.
    num_heads: Number of attention heads handed to each layer.
    drop_prob: Dropout probability handed to each layer.
    num_layers: How many EncoderLayer modules to stack.
  """

  def __init__(self, d_model, ffn_hidden, num_heads, drop_prob, num_layers):
    super().__init__()
    # Build the layers first, then wrap them so a single call runs them in order.
    stack = [
        EncoderLayer(d_model, ffn_hidden, num_heads, drop_prob)
        for _ in range(num_layers)
    ]
    self.layers = nn.Sequential(*stack)

  def forward(self, x):
    """Run `x` through every layer in order and return the result."""
    return self.layers(x)


In [None]:
from torch.nn.modules.activation import MultiheadAttention
class EncoderLayer(nn.Module):
  """Single encoder block: self-attention followed by a feed-forward network.

  Each sub-layer's output goes through dropout, is added to the sub-layer's
  input (residual connection) and is then layer-normalized.

  Args:
    d_model: Size of the last dimension of the input and output.
    ffn_hidden: Hidden width of the feed-forward sub-layer.
    num_heads: Number of attention heads.
    drop_prob: Dropout probability used after both sub-layers and inside the
      feed-forward network.
    num_layers: Unused. Kept (now optional) so that callers passing five
      arguments keep working while `Encoder`, which passes four, no longer
      raises a TypeError.
  """

  def __init__(self,d_model,ffn_hidden,num_heads,drop_prob,num_layers=None):
    super().__init__()
    self.attention=MultiHeadAttention(d_model=d_model,num_heads=num_heads)
    # Keyword must match LayerNormalization.__init__(parameters_shape, eps).
    self.norm1=LayerNormalization(parameters_shape=[d_model])
    self.drop1=nn.Dropout(p=drop_prob)
    # Keyword must match Positionwisefeedforward.__init__(d_model, ffn_hidden, drop_prob).
    self.ffn=Positionwisefeedforward(d_model=d_model,ffn_hidden=ffn_hidden,drop_prob=drop_prob)
    self.norm2=LayerNormalization(parameters_shape=[d_model])
    self.drop2=nn.Dropout(p=drop_prob)

  def forward(self,x):#30,200,512
    """Apply the attention and feed-forward sub-layers; output has the shape of `x`."""
    # Sub-layer 1: self-attention (no mask) -> dropout -> add & norm.
    residual_x=x
    x=self.attention(x,mask=None)
    x=self.drop1(x)
    x=self.norm1(x+residual_x)
    # Sub-layer 2: feed-forward -> dropout -> add & norm.
    residual_x=x
    x=self.ffn(x)
    x=self.drop2(x)
    x=self.norm2(x+residual_x)
    return x#30,200,512




In [None]:
import numpy as np
# # x=np.ones()
# x=np.ones(shape=(2,3,4))
# x
# x.shape#2,3,4#batch_size,msl,emb_dim
# x.T.shape#4,3,2
# x.swapaxes(-1,-2).shape#2,4,3----->swap the last two axes, as done for the key matrix (torch tensors: x.transpose(-1,-2))

In [None]:
def scaled_dot_product(q,k,v,mask=None):
  """Scaled dot-product attention.

  Args:
    q, k, v: Tensors whose last two dimensions are (sequence, head_dim);
      any leading dimensions (e.g. batch, heads) are batched over.
    mask: Optional tensor added to the scaled scores before the softmax
      (e.g. 0 for visible positions and -inf for hidden ones).

  Returns:
    A tuple `(values, attention)`: the attention-weighted sum of `v` and the
    softmax attention weights.
  """
  head_dim=q.shape[-1]
  # Dividing by sqrt(head_dim) keeps the scores in a moderate range so the
  # softmax does not saturate and gradient steps stay stable during training.
  scores=(q @ k.transpose(-2,-1))/np.sqrt(head_dim)
  if mask is not None:
    scores=scores+mask
  weights=F.softmax(scores,dim=-1)
  return torch.matmul(weights,v),weights

In [None]:
class MultiHeadAttention(nn.Module):
  """Multi-head self-attention over a (batch, sequence, d_model) input.

  Args:
    d_model: Size of the last dimension of the input and output.
    num_heads: Number of attention heads; must divide `d_model` evenly.

  Raises:
    ValueError: If `d_model` is not divisible by `num_heads`.
  """

  def __init__(self,d_model,num_heads):
    super().__init__()
    if d_model%num_heads!=0:
      # Otherwise the reshape in forward() fails later with an opaque error.
      raise ValueError(
          f"d_model ({d_model}) must be divisible by num_heads ({num_heads})")
    self.d_model=d_model#512
    self.qkv_layer=nn.Linear(d_model,3*d_model)
    self.num_heads=num_heads#8
    self.head_dims=d_model//num_heads#64
    self.linear_layer=nn.Linear(d_model,d_model)

  def forward(self,x,mask=None):
    """Return the attended representation of `x`, with the same shape as `x`.

    Args:
      x: Tensor of shape (batch, sequence, d_model).
      mask: Optional tensor added to the attention scores before the softmax;
        it must broadcast against (batch, num_heads, sequence, sequence).
    """
    batch_size,sequence_length,_=x.size()
    qkv=self.qkv_layer(x)#30,200,1536
    # Split the projection per head; each head holds its q, k and v side by side.
    qkv=qkv.reshape(batch_size,sequence_length,self.num_heads,3*self.head_dims)#30,200,8,192
    qkv=qkv.permute(0,2,1,3)#30,8,200,192
    q,k,v=qkv.chunk(3,dim=-1)#30,8,200,64 each
    values,_=scaled_dot_product(q,k,v,mask)#values=30,8,200,64
    # Move the head axis back next to head_dims BEFORE flattening. Reshaping
    # (batch, heads, seq, dim) directly to (batch, seq, heads*dim) would mix
    # values from different tokens and heads into the same output row.
    values=values.permute(0,2,1,3)#30,200,8,64
    values=values.reshape(batch_size,sequence_length,self.num_heads*self.head_dims)#30,200,512
    return self.linear_layer(values)



In [None]:
class LayerNormalization(nn.Module):
  """Layer normalization over the trailing dimensions of the input.

  Args:
    parameters_shape: Shape of the trailing dimensions to normalize over,
      e.g. `[512]`. A bare int is treated as a one-element shape.
    eps: Small constant added to the variance for numerical stability.

  Attributes set here: `gamma` (learnable scale, initialized to ones) and
  `beta` (learnable shift, initialized to zeros), both of `parameters_shape`.
  """

  def __init__(self,parameters_shape,eps=1e-5):
    super().__init__()
    if isinstance(parameters_shape,int):
      parameters_shape=[parameters_shape]
    self.parameters_shape=parameters_shape
    self.eps=eps
    # The original read `self.parameter_shapes`, an attribute that is never
    # set, so constructing the module raised AttributeError.
    self.gamma=nn.Parameter(torch.ones(self.parameters_shape))#[512]
    self.beta=nn.Parameter(torch.zeros(self.parameters_shape))#[512]

  def forward(self,inputs):#30,200,512
    """Normalize `inputs` to zero mean / unit variance over the trailing dims,
    then apply the learnable scale and shift. Output has the shape of `inputs`."""
    # Trailing axes to reduce over, e.g. (-1,) for a one-element shape.
    dims=tuple(range(-len(self.parameters_shape),0))
    mean=inputs.mean(dim=dims,keepdim=True)#30,200,1
    centered=inputs-mean
    var=(centered**2).mean(dim=dims,keepdim=True)#30,200,1 (biased variance)
    std=(var+self.eps).sqrt()
    y=centered/std#30,200,512
    return self.gamma*y+self.beta

In [None]:
class Positionwisefeedforward(nn.Module):
  """Two-layer feed-forward network applied to the last dimension of the input.

  Computes linear2(dropout(relu(linear1(x)))).

  Args:
    d_model: Size of the last dimension of the input and output.
    ffn_hidden: Width of the hidden layer.
    drop_prob: Dropout probability applied after the ReLU.
  """

  def __init__(self, d_model, ffn_hidden, drop_prob):
    super().__init__()
    self.linear1 = nn.Linear(d_model, ffn_hidden)  # expand: d_model -> ffn_hidden
    self.linear2 = nn.Linear(ffn_hidden, d_model)  # project back: ffn_hidden -> d_model
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(p=drop_prob)

  def forward(self, x):
    """Return the transformed tensor; only the last dimension is mixed."""
    hidden = self.relu(self.linear1(x))
    return self.linear2(self.dropout(hidden))




