In [54]:
import torch
import torch.nn as nn
import math

In [55]:
class InputEmbeddings(nn.Module):
    """Token-id lookup table whose output is scaled by sqrt(d_model)."""

    def __init__(self, d_model:int, vocab_size:int) -> None:
        """Create the embedding table.

        Args:
            d_model: Width of each embedding vector.
            vocab_size: Number of rows in the embedding table.
        """
        super().__init__()
        self.d_model, self.vocab_size = d_model, vocab_size
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=d_model,
        )

    def forward(self,x):
        """Look up the embeddings for the ids in ``x`` and multiply them by sqrt(d_model)."""
        looked_up = self.embedding(x)
        scale = math.sqrt(self.d_model)
        return looked_up * scale

In [56]:
# Demo: run InputEmbeddings on a small random batch and inspect the result.
# These names are notebook-level globals; `d_model` is reused by a later cell.
d_model = 8
vocab_size = 20
batch_size, seq_len = 2, 5

layer = InputEmbeddings(d_model, vocab_size)

# Random token IDs in [0, vocab_size), shape (batch_size, seq_len).
# No seed is set, so the IDs (and the embedding weights) differ on every run.
tokens = torch.randint(0, vocab_size, (batch_size, seq_len))
print("Token IDs:\n", tokens)

# Embeddings as returned by the layer, i.e. already multiplied by sqrt(d_model)
out = layer(tokens)
print("\nEmbeddings (with scaling):\n", out)

# To see their size, print the L2 norm of each token vector (taken over the last dimension)
norms = out.norm(dim=-1)
print("\nNorm of each embedding vector (after scaling):\n", norms)

Token IDs:
 tensor([[ 1, 15, 16,  7, 14],
        [ 0, 19, 15,  3, 16]])

Embeddings (with scaling):
 tensor([[[ 0.6206, -4.8051,  3.7036, -4.6989,  4.9056,  1.8863, -2.9454,
          -3.9649],
         [ 6.2646, -0.3866, -2.8798,  0.5046,  3.6842,  3.0988, -2.4335,
           1.8151],
         [-0.2685,  3.1137,  3.7065, -0.8283,  1.1825, -3.2015, -2.3173,
          -2.6892],
         [ 1.5138,  5.5813, -0.5869, -0.0865,  0.7345,  1.7297,  3.3416,
          -3.1249],
         [-0.7383,  2.2570, -3.1315,  6.5919, -2.9574,  1.9825,  5.9677,
          -3.1789]],

        [[-0.4645, -2.7477, -2.9157,  1.8308, -4.1248, -0.7967, -2.9132,
          -1.9471],
         [-0.8472,  6.4950,  0.9349,  6.1512, -0.6388, -0.6811,  4.8512,
          -2.1354],
         [ 6.2646, -0.3866, -2.8798,  0.5046,  3.6842,  3.0988, -2.4335,
           1.8151],
         [ 2.0912,  3.5446, -1.2573,  2.3149,  0.0353,  2.7596, -0.0228,
          -0.1128],
         [-0.2685,  3.1137,  3.7065, -0.8283,  1.1825, -3.2

In [57]:
class PositionalEncodings(nn.Module):
    """Adds fixed sinusoidal positional encodings to a batch of embeddings, then applies dropout.

    The encoding table is computed once in ``__init__`` and stored as the
    non-trainable buffer ``pe`` of shape ``(1, seq_len, d_model)``.
    """

    def __init__(self, d_model:int, seq_len:int, dropout:float) -> None:
        """Precompute the encoding table.

        Args:
            d_model: Size of the last (feature) dimension of the inputs.
            seq_len: Number of positions to precompute; inputs passed to
                ``forward`` may be shorter than this but not longer.
            dropout: Probability handed to ``nn.Dropout``.
        """
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        # One frequency per even column index: ceil(d_model / 2) entries.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term # (seq_len, ceil(d_model / 2))

        # Even columns take the sine of every frequency.
        pe[:, 0::2] = torch.sin(angles)
        # Odd columns take the cosine. When d_model is odd there is one fewer
        # odd column than frequencies, so the last frequency is dropped here;
        # for even d_model the slice keeps everything.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        pe = pe.unsqueeze(0) # (1, seq_len, d_model) so it broadcasts over the batch

        # A buffer follows the module across devices and into state_dict,
        # but is not a parameter and is never updated by the optimizer.
        self.register_buffer('pe',pe)


    def forward(self,x):
        """Add the first ``x.shape[1]`` positional encodings to ``x`` and apply dropout.

        ``x`` is indexed as ``(batch, positions, d_model)``. If it has more
        positions than ``seq_len`` the addition fails with a broadcasting error.
        """
        # The buffer does not track gradients, so no explicit requires_grad_(False) is needed.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

In [58]:
# 1. Build the layer with a table of 10 positions for 8-dimensional embeddings.
#    The module is not switched to eval mode here, so dropout (p=0.1) is applied in the call below.
pos_enc = PositionalEncodings(d_model=8, seq_len=10, dropout=0.1)

# 2. Example: a batch of 2 sentences, each with 5 tokens, already embedded
#    (5 tokens is shorter than the 10 precomputed positions; only the first 5 are used)
batch_size = 2
sentence_len = 5
x = torch.randn(batch_size, sentence_len, 8)   # random stand-in for "word embeddings"

print("x shape (before PE):", x.shape)

# 3. Pass through positional encoding; the shape is expected to be unchanged
out = pos_enc(x)
print("out shape (after PE):", out.shape)

x shape (before PE): torch.Size([2, 5, 8])
out shape (after PE): torch.Size([2, 5, 8])


In [59]:
class LayerNormalization(nn.Module):
    """Normalizes the last dimension to zero mean / unit std, then applies a learned scalar scale and shift."""

    def __init__(self, eps:float=10**-6) ->None:
        """Register the scalar gain ``alpha`` (init 1) and shift ``bias`` (init 0).

        Args:
            eps: Small constant added to the standard deviation to avoid division by zero.
        """
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))   # multiplicative gain
        self.bias = nn.Parameter(torch.zeros(1))   # additive shift

    def forward(self,x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` with statistics taken over the last dimension."""
        centered = x - x.mean(dim=-1, keepdim=True)
        # torch's default std is used here, exactly as in the plain x.std(-1) call.
        spread = x.std(dim=-1, keepdim=True) + self.eps
        return self.alpha * centered / spread + self.bias

In [47]:
# Demo: normalize two rows that differ only by a factor of 10; both should map to the same output.
x = torch.tensor([
    [1.0, 2.0, 3.0, 4.0],   # token 1
    [10.0, 20.0, 30.0, 40.0] # token 2
])  # shape (2, 4)  → 2 tokens, each with 4 features

ln = LayerNormalization()
y = ln(x)

print("Input:\n", x)
print("\nOutput after LayerNorm:\n", y)

# Per-token statistics of the normalized output.
# NOTE(review): the recorded output of this cell shows per-token means of 1, which does not
# match a bias initialised to zeros in the class cell — it looks like it was produced by an
# earlier version of the class (this cell's execution count is lower). Re-run to confirm.
print("\nMeans per token after LN:", y.mean(-1))
print("Stds per token after LN :", y.std(-1))

Input:
 tensor([[ 1.,  2.,  3.,  4.],
        [10., 20., 30., 40.]])

Output after LayerNorm:
 tensor([[-0.1619,  0.6127,  1.3873,  2.1619],
        [-0.1619,  0.6127,  1.3873,  2.1619]], grad_fn=<AddBackward0>)

Means per token after LN: tensor([1., 1.], grad_fn=<MeanBackward1>)
Stds per token after LN : tensor([1.0000, 1.0000], grad_fn=<StdBackward0>)


In [63]:
class FeedForwardBlock(nn.Module):
    """Two linear layers with a ReLU and dropout in between: d_model -> d_ff -> d_model."""

    def __init__(self, d_model:int, d_ff:int, dropout:float):
        """Build the sub-layers.

        Args:
            d_model: Input and output feature size.
            d_ff: Hidden feature size between the two linear layers.
            dropout: Probability handed to ``nn.Dropout``.
        """
        super().__init__()
        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Apply linear_1 -> ReLU -> dropout -> linear_2 to ``x``."""
        return self.linear_2(self.dropout(self.relu(self.linear_1(x))))

In [64]:
# NOTE(review): this cell re-declares FeedForwardBlock, which is already defined in an
# earlier cell; whichever definition runs last is the one the notebook uses.
class FeedForwardBlock(nn.Module):
    """Position-wise feed-forward network: Linear, ReLU, Dropout, Linear."""

    def __init__(self, d_model:int, d_ff:int,dropout:float):
        """Create the expansion layer (d_model -> d_ff), the activation, the dropout and the projection back (d_ff -> d_model)."""
        super().__init__()

        # Expand to the hidden width.
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        # Project back to the model width.
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Run ``x`` through the four sub-layers in order and return the result."""
        hidden = self.linear_1(x)
        activated = self.relu(hidden)
        regularized = self.dropout(activated)
        projected = self.linear_2(regularized)
        return projected


In [53]:
# Build a small feed-forward block with d_ff=4 and dropout=0.3.
# `d_model` is not defined in this cell: it is the global set in the embedding demo cell (8).
ff = FeedForwardBlock(d_model,4,0.3)

In [66]:
# Print every learnable tensor of the block, in the order ff.parameters() yields them.
for p in ff.parameters():
    print(p)

Parameter containing:
tensor([[ 0.1754, -0.3231, -0.0633, -0.2627, -0.1509,  0.1274, -0.2511,  0.1314],
        [ 0.3001,  0.0232, -0.2356, -0.1267,  0.0772, -0.2695,  0.1756, -0.3210],
        [-0.3399, -0.3435, -0.0717,  0.2377, -0.3346,  0.2939, -0.1414,  0.1036],
        [ 0.0161, -0.3188,  0.2932,  0.1904,  0.3514,  0.1786, -0.2334,  0.2951]],
       requires_grad=True)
Parameter containing:
tensor([ 0.0190,  0.1677, -0.2835, -0.1017], requires_grad=True)
Parameter containing:
tensor([[-0.4909, -0.1947,  0.1079, -0.3926],
        [ 0.1594,  0.2684,  0.0697, -0.3345],
        [-0.3877, -0.1543,  0.2195,  0.4932],
        [ 0.2875, -0.0563,  0.1753, -0.4905],
        [-0.4271,  0.2333, -0.2832,  0.2405],
        [-0.3530, -0.2477, -0.4118,  0.2609],
        [-0.0509,  0.3848,  0.3094,  0.2767],
        [ 0.0161, -0.1546, -0.1087,  0.0665]], requires_grad=True)
Parameter containing:
tensor([ 0.2479, -0.3503,  0.4196, -0.0544, -0.4190, -0.2705,  0.4424,  0.4573],
       requires_grad=

In [67]:
# Shapes of the two linear layers' weights and biases.
# nn.Linear stores its weight as (out_features, in_features).
print(ff.linear_1.weight.shape)
print(ff.linear_1.bias.shape)
print(ff.linear_2.weight.shape)
print(ff.linear_2.bias.shape)

torch.Size([4, 8])
torch.Size([4])
torch.Size([8, 4])
torch.Size([8])


In [68]:
# Same four tensors as the parameter loop above, this time accessed by attribute name.
print(ff.linear_1.weight)
print(ff.linear_1.bias)
print(ff.linear_2.weight)
print(ff.linear_2.bias)

Parameter containing:
tensor([[ 0.1754, -0.3231, -0.0633, -0.2627, -0.1509,  0.1274, -0.2511,  0.1314],
        [ 0.3001,  0.0232, -0.2356, -0.1267,  0.0772, -0.2695,  0.1756, -0.3210],
        [-0.3399, -0.3435, -0.0717,  0.2377, -0.3346,  0.2939, -0.1414,  0.1036],
        [ 0.0161, -0.3188,  0.2932,  0.1904,  0.3514,  0.1786, -0.2334,  0.2951]],
       requires_grad=True)
Parameter containing:
tensor([ 0.0190,  0.1677, -0.2835, -0.1017], requires_grad=True)
Parameter containing:
tensor([[-0.4909, -0.1947,  0.1079, -0.3926],
        [ 0.1594,  0.2684,  0.0697, -0.3345],
        [-0.3877, -0.1543,  0.2195,  0.4932],
        [ 0.2875, -0.0563,  0.1753, -0.4905],
        [-0.4271,  0.2333, -0.2832,  0.2405],
        [-0.3530, -0.2477, -0.4118,  0.2609],
        [-0.0509,  0.3848,  0.3094,  0.2767],
        [ 0.0161, -0.1546, -0.1087,  0.0665]], requires_grad=True)
Parameter containing:
tensor([ 0.2479, -0.3503,  0.4196, -0.0544, -0.4190, -0.2705,  0.4424,  0.4573],
       requires_grad=

In [None]:
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected with ``w_q`` / ``w_k`` / ``w_v``, split into ``h``
    heads of width ``d_k = d_model // h``, attended per head, merged back to
    ``d_model`` features and passed through ``w_o``.

    After each ``forward`` call the post-softmax (and post-dropout) attention
    weights are kept on ``self.attention_score`` for inspection.
    """

    def __init__(self, d_model:int, h:int , dropout:float):
        """Create the four projections and the dropout layer.

        Args:
            d_model: Feature size of the inputs and the output.
            h: Number of attention heads; must divide ``d_model`` exactly.
            dropout: Probability handed to ``nn.Dropout`` (applied to the attention weights).

        Raises:
            AssertionError: If ``d_model`` is not divisible by ``h``.
        """
        super().__init__()

        self.d_model = d_model
        self.h = h

        # The feature dimension is split evenly across heads, so the remainder
        # (not the quotient) is what has to be zero.
        assert d_model % h == 0, 'd_model is not divisible by h'

        self.d_k = self.d_model // h
        self.w_q = nn.Linear(d_model,d_model)
        self.w_v = nn.Linear(d_model,d_model)
        self.w_k = nn.Linear(d_model,d_model)

        self.w_o = nn.Linear(d_model,d_model)

        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute softmax(query @ key^T / sqrt(d_k)) @ value.

        Args:
            query, key, value: Tensors whose last dimension is the per-head width;
                ``key`` is transposed over its last two dimensions.
            mask: Optional tensor broadcastable to the score matrix. Positions
                where ``mask == 0`` are excluded from attention. ``None`` disables masking.
            dropout: Optional dropout module applied to the attention weights.

        Returns:
            A tuple ``(output, attention_weights)``.
        """
        d_k=query.shape[-1]

        attention_score = (query @ key.transpose(-1,-2))/math.sqrt(d_k)

        if mask is not None:
            # masked_fill is out-of-place: the result has to be kept, otherwise
            # the mask has no effect. The large negative value becomes ~0 after softmax.
            attention_score = attention_score.masked_fill(mask == 0 , -1e9)
        attention_score = attention_score.softmax(dim=-1)

        if dropout is not None:
            attention_score = dropout(attention_score)

        return (attention_score @value) , attention_score

    def _split_heads(self, t):
        """Reshape (batch, length, d_model) into (batch, h, length, d_k) using ``t``'s own sizes."""
        return t.view(t.shape[0], t.shape[1], self.h, self.d_k).transpose(1, 2)

    def forward(self, q,k,v, mask):
        """Run multi-head attention.

        Args:
            q, k, v: Tensors indexed as (batch, length, d_model). ``k`` and ``v``
                may have a different length from ``q``.
            mask: Passed through to :meth:`attention`; may be ``None``.

        Returns:
            Tensor of shape (batch, q_length, d_model).
        """
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)

        # Each tensor is split using its own batch/length so that key/value
        # sequences of a different length than the query are handled correctly.
        query = self._split_heads(query)
        key = self._split_heads(key)
        value = self._split_heads(value)

        x, self.attention_score = self.attention(query, key, value, mask, self.dropout)

        # (batch, h, length, d_k) -> (batch, length, h * d_k); contiguous() is
        # required before view() because transpose() returns a strided view.
        x = x.transpose(1,2).contiguous().view(x.shape[0], -1, self.h * self.d_k)
        return self.w_o(x)


