In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [None]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to a batch of embeddings.

    A table of shape ``(1, max_len, embed_dim)`` is computed once and stored
    as the non-trainable buffer ``pe``. Even feature columns hold
    ``sin(pos * w_i)`` and odd feature columns hold ``cos(pos * w_i)``, where
    ``w_i = 10000 ** (-2i / embed_dim)``.

    Args:
        embed_dim: Size of the feature (last) dimension of the inputs.
            Both even and odd values are supported.
        max_len: Number of positions precomputed in the table. Inputs whose
            dim 1 is longer than this cannot be broadcast against the table.
    """

    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
        angles = position * div_term  # (max_len, ceil(embed_dim / 2))
        pe[:, 0::2] = torch.sin(angles)
        # An odd embed_dim has one fewer odd column than even columns, so the
        # cosine half must drop the last frequency or the assignment fails
        # with a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        pe = pe.unsqueeze(0)  # leading axis of size 1 broadcasts over the batch
        # A buffer follows .to()/.cuda() and is saved in state_dict, but is
        # not returned by parameters() and therefore never optimised.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor indexed as ``(batch, seq_len, embed_dim)``; the table is
                sliced along dim 1 and broadcast over dim 0.

        Returns:
            Tensor with the same shape as ``x``.
        """
        print("\nüëâ Before Positional Encoding:\n", x[0][:5])
        x = x + self.pe[:, :x.size(1), :]
        print("\n‚úÖ After Positional Encoding:\n", x[0][:5])
        return x

In [None]:
# Scaled Dot-Product Attention
def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

    Args:
        query: Tensor of shape ``(..., len_q, d_k)``.
        key: Tensor of shape ``(..., len_k, d_k)``.
        value: Tensor of shape ``(..., len_k, d_v)``.
        mask: Optional tensor broadcastable to ``(..., len_q, len_k)``.
            Positions where ``mask == 0`` are excluded from attention.

    Returns:
        A tuple ``(output, attn)`` where ``output`` has shape
        ``(..., len_q, d_v)`` and ``attn`` holds the attention weights of
        shape ``(..., len_q, len_k)``; each row of ``attn`` sums to 1.
    """
    d_k = query.size(-1)
    # Scale by sqrt(d_k) so the logits do not grow with the head width.
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Fill with the most negative *finite* value of the score dtype.
        # A literal -1e9 is not representable in float16, and -inf would
        # turn fully masked rows into NaN after the softmax.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
    attn = F.softmax(scores, dim=-1)
    return torch.matmul(attn, value), attn

In [None]:
# Multi-Head Attention
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    The inputs are linearly projected, split into ``num_heads`` slices of
    width ``d_k = embed_dim // num_heads``, attended per head, concatenated
    and passed through a final linear layer.

    Args:
        embed_dim: Feature size of the inputs and of the output.
        num_heads: Number of heads; must divide ``embed_dim`` evenly.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.d_k = embed_dim // num_heads
        # Layers are created in the order q, k, v, out so that seeded
        # weight initialisation is deterministic.
        self.linear_q = nn.Linear(embed_dim, embed_dim)
        self.linear_k = nn.Linear(embed_dim, embed_dim)
        self.linear_v = nn.Linear(embed_dim, embed_dim)
        self.linear_out = nn.Linear(embed_dim, embed_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query, key, value: Tensors whose dim 0 is the batch and whose
                last dim is ``embed_dim``.
            mask: Optional mask handed unchanged to
                ``scaled_dot_product_attention``; it has to broadcast against
                scores of shape ``(batch, num_heads, len_q, len_k)``.

        Returns:
            Tensor of shape ``(batch, len_q, embed_dim)``.
        """
        batch_size = query.size(0)

        def split_heads(projected):
            # (batch, seq, embed_dim) -> (batch, num_heads, seq, d_k)
            per_head = projected.view(batch_size, -1, self.num_heads, self.d_k)
            return per_head.transpose(1, 2)

        q_heads = split_heads(self.linear_q(query))
        k_heads = split_heads(self.linear_k(key))
        v_heads = split_heads(self.linear_v(value))

        # The attention weights are returned as well but are not needed here.
        attn_output, _ = scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)
        print("\nüîç Attention Output (head 0):\n", attn_output[0, 0, :5])

        # Undo the head split: (batch, heads, seq, d_k) -> (batch, seq, embed_dim).
        merged = attn_output.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.num_heads * self.d_k)
        return self.linear_out(merged)

In [None]:
# Feedforward Layer
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied along the last dimension.

    Computes ``fc2(dropout(relu(fc1(x))))``.

    Args:
        embed_dim: Input and output feature size.
        ff_dim: Width of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, embed_dim, ff_dim, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, ff_dim)   # expand to the hidden width
        self.fc2 = nn.Linear(ff_dim, embed_dim)   # project back to embed_dim
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Transform ``x`` and return a tensor of the same shape.

        The first five rows of the first batch element are printed before
        and after the transformation.
        """
        print("\n‚öôÔ∏è Before FeedForward:\n", x[0][:5])
        hidden = torch.relu(self.fc1(x))
        hidden = self.dropout(hidden)
        x = self.fc2(hidden)
        print("\n‚úÖ After FeedForward:\n", x[0][:5])
        return x

In [None]:
# Transformer Encoder Layer
class TransformerEncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by
    dropout, a residual connection and layer normalisation (post-norm).

    Args:
        embed_dim: Feature size of the inputs and outputs.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability used on both residual branches and
            inside the feed-forward sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        # Sub-layers
        self.self_attn = MultiHeadAttention(embed_dim, num_heads)
        self.ff = PositionwiseFeedForward(embed_dim, ff_dim, dropout)
        # One normalisation per residual connection
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        # Shared dropout module applied to each sub-layer output
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, mask=None):
        """Run the block on ``src`` and return a tensor of the same shape.

        Args:
            src: Input used as query, key and value of the self-attention.
            mask: Optional attention mask forwarded to the attention module.
        """
        print("\nüß† === Transformer Encoder Layer ===")

        # Attention branch: residual add, then normalise.
        attended = self.dropout(self.self_attn(src, src, src, mask))
        after_attn = self.norm1(src + attended)

        # Feed-forward branch: residual add, then normalise.
        transformed = self.dropout(self.ff(after_attn))
        return self.norm2(after_attn + transformed)

In [None]:
# Full Transformer Encoder
class TransformerEncoder(nn.Module):
    """Token embedding, positional encoding, a stack of encoder layers and a
    linear projection to vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids; also the size of the
            output logits.
        embed_dim: Embedding / model feature size.
        num_heads: Attention heads per encoder layer.
        ff_dim: Hidden width of each layer's feed-forward network.
        num_layers: Number of stacked encoder layers.
        max_len: Number of positions precomputed by the positional encoding;
            input sequences must not be longer than this.
        dropout: Dropout probability passed to every encoder layer. Defaults
            to 0.1, the value the layers previously always used.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers, max_len=100, dropout=0.1):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoder = PositionalEncoding(embed_dim, max_len)
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(embed_dim, num_heads, ff_dim, dropout) for _ in range(num_layers)
        ])
        self.output = nn.Linear(embed_dim, vocab_size)

    def forward(self, src, mask=None):
        """Encode token ids and return per-position vocabulary logits.

        Args:
            src: Integer tensor of token ids indexed as ``(batch, seq_len)``.
            mask: Optional attention mask handed to every encoder layer.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``.
        """
        x = self.embed(src)
        print("\nüì¶ Embeddings:\n", x[0][:5])
        x = self.pos_encoder(x)
        for i, layer in enumerate(self.layers):
            print(f"\nüöÄ Running Encoder Layer {i+1}")
            x = layer(x, mask)
        out = self.output(x)
        print("\nüì§ Final Output (Logits):\n", out[0][:5])
        return out

In [None]:
# Initialize model and input
# Fix the RNG so the weights, the dummy tokens and the dropout masks are the
# same on every Restart & Run All.
torch.manual_seed(42)

# Small hyper-parameters keep the printed tensors readable.
vocab_size = 50
seq_len = 5
embed_dim = 16
num_heads = 4
ff_dim = 8
num_layers = 2

model = TransformerEncoder(vocab_size, embed_dim, num_heads, ff_dim, num_layers)
dummy_input = torch.randint(0, vocab_size, (1, seq_len))  # batch=1
print("\nüß™ Dummy Input Tokens:\n", dummy_input,"\n")

# Forward pass (each module prints its intermediate tensors)
output = model(dummy_input)


üß™ Dummy Input Tokens:
 tensor([[43, 49,  6, 31,  1]]) 


üì¶ Embeddings:
 tensor([[ 0.4547, -0.8397, -0.5690, -1.3350, -1.1348, -0.0112,  0.1043,  0.8466,
          0.4567, -1.4613, -1.2593,  0.4286, -0.8767,  0.0951, -0.4823, -1.1379],
        [-0.8229, -0.4861,  0.7673,  1.6116, -1.8941,  0.4194, -1.1209, -0.0390,
          0.6138, -1.0109, -0.0146, -0.4556, -0.0081,  0.9316, -0.7015,  0.2700],
        [-0.2373,  0.6944, -0.2046,  2.2474, -1.1990,  0.0562, -0.6037,  0.1470,
         -1.7067, -0.3040,  0.8566, -1.1573,  0.0219, -0.5648, -0.2980, -0.8879],
        [-0.2884, -0.5477,  1.5881, -0.5193, -0.1374, -0.5210,  0.9926,  2.0067,
          0.1858,  0.6579, -0.6036, -0.9532, -0.2104,  0.8340,  0.9779, -0.4513],
        [ 1.5774, -0.0252,  0.4255,  1.3154, -0.6366, -1.0235, -1.1520, -0.9622,
          0.5502, -0.7158,  1.4739,  1.2995, -0.8994, -0.8676,  0.6065,  1.1507]],
       grad_fn=<SliceBackward0>)

üëâ Before Positional Encoding:
 tensor([[ 0.4547, -0.8397, -0.5690, -