<a href="https://colab.research.google.com/github/DatumLearning/Transformer-PyTorch/blob/main/Encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [None]:
class SelfAttention(nn.Module):
  """Single-head scaled dot-product self-attention.

  The input is projected to queries, keys and values by three separate
  ``nn.Linear(d_model, d_model)`` layers, and the result is
  ``softmax(Q @ K^T / sqrt(d_model)) @ V``.

  Args:
    d_model: Size of the last dimension of the input. It is also the width
      of the Q/K/V projections and the value used in the score scaling.
  """

  def __init__(self, d_model):
    super(SelfAttention, self).__init__()
    self.d_model = d_model
    self.query = nn.Linear(d_model, d_model)
    self.key = nn.Linear(d_model, d_model)
    self.value = nn.Linear(d_model, d_model)

  def forward(self, x, mask=None):
    """Apply self-attention to ``x``.

    Args:
      x: Tensor with at least two dimensions whose last dimension is
        ``d_model``. The second-to-last dimension indexes the positions
        that attend to each other.
      mask: Optional tensor broadcastable to the score matrix
        ``(..., positions, positions)``. Non-zero / ``True`` entries mark
        query-key pairs that may attend; zero / ``False`` entries are
        blocked. ``None`` (the default) disables masking.

    Returns:
      Tensor with the same shape as ``x``.

    Note:
      If every key is blocked for some query, the softmax over that row is
      taken over ``-inf`` only and the corresponding output is NaN.
    """
    Q = self.query(x)
    K = self.key(x)
    V = self.value(x)

    # Scale by sqrt(d_model) so the dot products do not grow with the width.
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_model)
    if mask is not None:
      # -inf scores become exactly zero weight after the softmax.
      scores = scores.masked_fill(~mask.to(torch.bool), float("-inf"))
    attention_weights = F.softmax(scores, dim=-1)
    return torch.matmul(attention_weights, V)

In [None]:
class FeedForward(nn.Module):
  """Two-layer MLP: ``Linear(d_model, ff_dim)`` -> ReLU -> ``Linear(ff_dim, d_model)``.

  Args:
    d_model: Size of the input's (and output's) last dimension.
    ff_dim: Width of the hidden layer between the two linear maps.
  """

  def __init__(self, d_model, ff_dim):
    super().__init__()
    self.l1 = nn.Linear(d_model, ff_dim)
    self.l2 = nn.Linear(ff_dim, d_model)

  def forward(self, x):
    """Expand to ``ff_dim``, apply ReLU, and project back to ``d_model``."""
    hidden = self.l1(x)
    activated = F.relu(hidden)
    return self.l2(activated)

In [None]:
class TransformerEncoderBlock(nn.Module):
  """One encoder layer: self-attention then feed-forward.

  Each sub-layer's output goes through dropout, is added back to its input
  (residual connection), and the sum is layer-normalised.

  Args:
    d_model: Feature size used by the attention, feed-forward and LayerNorm.
    ff_dim: Hidden width of the feed-forward sub-layer.
    dropout: Dropout probability; one ``nn.Dropout`` is shared by both
      sub-layers.
  """

  def __init__(self, d_model, ff_dim, dropout=0.1):
    super().__init__()
    self.self_attn = SelfAttention(d_model)
    self.ffn = FeedForward(d_model, ff_dim)
    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Run both sub-layers on ``x`` and return the normalised result."""
    # Sub-layer 1: attention -> dropout -> residual add -> LayerNorm.
    attended = self.norm1(x + self.dropout(self.self_attn(x)))
    # Sub-layer 2: feed-forward -> dropout -> residual add -> LayerNorm.
    return self.norm2(attended + self.dropout(self.ffn(attended)))

In [None]:
# Demo instance: model width 512, feed-forward hidden width 1024, default dropout.
TE = TransformerEncoderBlock(d_model=512, ff_dim=1024)

In [None]:
# Smoke test: push a random (16, 10, 512) tensor through the block; the cell
# output shows the shape comes back unchanged.
# Axes are presumably (batch, sequence length, d_model) - confirm with the author.
TE(torch.randn(16 , 10 , 512)).shape

torch.Size([16, 10, 512])