In [None]:
import torch
import torch.nn as nn
import math


Collecting tokenizers
  Downloading tokenizers-0.22.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Downloading tokenizers-0.22.0-cp39-abi3-macosx_11_0_arm64.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m19.1 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.22.0


In [3]:
class InputEmbeddings(nn.Module):
    """Token-ID -> vector lookup whose result is multiplied by sqrt(d_model).

    Args:
        d_model: Width of each embedding vector.
        vocab_size: Number of rows in the embedding table.
    """

    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model, self.vocab_size = d_model, vocab_size
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

    def forward(self, x):
        """Look up the embeddings for the integer IDs in ``x`` and scale them."""
        looked_up = self.embedding(x)
        scale = math.sqrt(self.d_model)
        return looked_up * scale

In [4]:
# Toy sizes for the embedding demo.
d_model, vocab_size = 8, 20
batch_size, seq_len = 2, 5

layer = InputEmbeddings(d_model=d_model, vocab_size=vocab_size)

# Draw a (batch_size, seq_len) grid of random token IDs in [0, vocab_size).
tokens = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))
print("Token IDs:\n", tokens)

# Embeddings as returned by the layer (already multiplied by sqrt(d_model)).
out = layer(tokens)
print("\nEmbeddings (with scaling):\n", out)

# L2 norm of every token vector, to get a feel for their magnitude.
norms = out.norm(dim=-1)
print("\nNorm of each embedding vector (after scaling):\n", norms)

Token IDs:
 tensor([[18,  4,  0, 12, 13],
        [ 6, 19,  1, 13, 18]])

Embeddings (with scaling):
 tensor([[[ 3.4491,  0.2286, -1.9196, -1.1250, -1.5951, -4.0762, -2.5758,
           0.2742],
         [ 0.8067,  0.1462, -1.9222,  0.1486,  8.9591,  1.0481, -0.0172,
          -2.9542],
         [ 2.2098,  0.8158,  1.7288,  6.6945, -2.6517, -0.2257, -0.3454,
          -1.1235],
         [ 2.5378,  1.3919,  0.8215,  0.9555, -5.4037, -0.9901,  4.1008,
          -2.1832],
         [-4.5013, -1.0218, -4.2245, -1.9338, -3.1929,  1.2192,  5.0253,
          -2.1403]],

        [[ 4.1163, -2.1206,  1.5288,  1.4586, -4.3398, -2.7935,  2.9121,
           0.4308],
         [ 2.5680, -0.1862, -3.9832, -3.2019,  1.8529,  3.7834,  2.4422,
           3.2996],
         [ 2.1117,  2.2335,  2.9434, -1.8628,  0.5726, -0.6750,  0.8177,
          -1.1282],
         [-4.5013, -1.0218, -4.2245, -1.9338, -3.1929,  1.2192,  5.0253,
          -2.1403],
         [ 3.4491,  0.2286, -1.9196, -1.1250, -1.5951, -4.0

In [5]:
class PositionalEncodings(nn.Module):
    """Adds fixed sinusoidal position signals to its input, then applies dropout.

    A table ``pe`` of shape (1, seq_len, d_model) is precomputed once:
    even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``, with ``freq = 10000 ** (-2i / d_model)``.

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        seq_len: Number of positions to precompute.
        dropout: Dropout probability applied to the sum.
    """

    def __init__(self, d_model:int, seq_len:int, dropout:float) -> None:

        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1) # (seq_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) # (ceil(d_model / 2),)
        angles = position * div_term # (seq_len, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even column, so
        # trim the angle table to match; for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        pe = pe.unsqueeze(0) # (1, seq_len, d_model)

        # A buffer follows the module across devices and into state_dict
        # without being registered as a trainable parameter.
        self.register_buffer('pe', pe)


    def forward(self,x):
        """Add the first ``x.shape[1]`` rows of the table to ``x`` and apply dropout."""
        # ``pe`` is built from torch.zeros and registered as a buffer, so it
        # never requires grad; no explicit requires_grad_(False) is needed.
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)

In [6]:
# 1. Build the positional-encoding layer (feature width 8, up to 10 positions).
pos_enc = PositionalEncodings(8, 10, 0.1)

# 2. Example: a batch of 2 sentences, each with 5 tokens, already embedded
batch_size, sentence_len = 2, 5
x = torch.randn(batch_size, sentence_len, 8)   # random "word embeddings"

print("x shape (before PE):", x.shape)

# 3. Pass through positional encoding; the shape is expected to stay the same
out = pos_enc(x)
print("out shape (after PE):", out.shape)

x shape (before PE): torch.Size([2, 5, 8])
out shape (after PE): torch.Size([2, 5, 8])


In [7]:
class LayerNormalization(nn.Module):
    """Normalizes the last dimension to zero mean and unit (sample) std.

    The normalized values are multiplied by a single learnable scalar
    ``alpha`` (initialized to 1) and shifted by a single learnable scalar
    ``bias`` (initialized to 0).

    Args:
        eps: Small constant added to the std to avoid division by zero.
    """

    def __init__(self, eps: float = 10**-6) -> None:
        super().__init__()

        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))
        self.bias = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Return ``alpha * (x - mean) / (std + eps) + bias`` over the last dim."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True)
        return self.alpha * centered / (spread + self.eps) + self.bias

In [8]:
# Two tokens with 4 features each; the second row is the first one times 10.
x = torch.tensor(
    [
        [1.0, 2.0, 3.0, 4.0],      # token 1
        [10.0, 20.0, 30.0, 40.0],  # token 2
    ]
)  # shape (2, 4)

ln = LayerNormalization()
y = ln(x)

print("Input:\n", x)
print("\nOutput after LayerNorm:\n", y)

# Per-token statistics of the normalized output (taken over the feature axis).
print("\nMeans per token after LN:", y.mean(dim=-1))
print("Stds per token after LN :", y.std(dim=-1))

Input:
 tensor([[ 1.,  2.,  3.,  4.],
        [10., 20., 30., 40.]])

Output after LayerNorm:
 tensor([[-1.1619, -0.3873,  0.3873,  1.1619],
        [-1.1619, -0.3873,  0.3873,  1.1619]], grad_fn=<AddBackward0>)

Means per token after LN: tensor([2.9802e-08, 0.0000e+00], grad_fn=<MeanBackward1>)
Stds per token after LN : tensor([1.0000, 1.0000], grad_fn=<StdBackward0>)


In [9]:
class FeedForwardBlock(nn.Module):
    """Two-layer MLP: Linear(d_model -> d_ff), ReLU, Dropout, Linear(d_ff -> d_model).

    Args:
        d_model: Input and output feature width.
        d_ff: Hidden feature width.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float):
        super().__init__()

        self.linear_1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.linear_2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Expand to d_ff, apply ReLU and dropout, then project back to d_model."""
        hidden = self.dropout(self.relu(self.linear_1(x)))
        return self.linear_2(hidden)

In [10]:
# NOTE(review): this cell redefines FeedForwardBlock, which the previous cell
# already defines with the same layers; the later definition is the one in
# effect after a top-to-bottom run.
class FeedForwardBlock(nn.Module):
    """Feed-forward sub-layer: linear -> ReLU -> dropout -> linear.

    Args:
        d_model: Feature width of the input and of the output.
        d_ff: Feature width between the two linear layers.
        dropout: Probability used by the dropout layer.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float):
        super().__init__()

        self.linear_1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Run ``x`` through the four stages in order and return the result."""
        for stage in (self.linear_1, self.relu, self.dropout, self.linear_2):
            x = stage(x)
        return x


In [11]:
# Small feed-forward block for inspection: hidden width 4, dropout 0.3,
# reusing the d_model set in the embedding demo cell.
ff = FeedForwardBlock(d_model=d_model, d_ff=4, dropout=0.3)

In [12]:
# Dump every learnable tensor of the block, in registration order.
for param in ff.parameters():
    print(param)

Parameter containing:
tensor([[-0.1269,  0.3162,  0.1710, -0.1655,  0.1602,  0.3027,  0.3432,  0.1178],
        [-0.3214,  0.2370,  0.0436, -0.1921, -0.1559, -0.1383,  0.1379,  0.3234],
        [-0.3188, -0.0098, -0.1927,  0.2922, -0.3168,  0.0318, -0.0399, -0.1380],
        [ 0.2942,  0.1490,  0.1625,  0.1909,  0.0427, -0.3282, -0.3020, -0.3361]],
       requires_grad=True)
Parameter containing:
tensor([ 0.0643,  0.2582,  0.1838, -0.0065], requires_grad=True)
Parameter containing:
tensor([[ 0.2226,  0.0374,  0.3586, -0.4606],
        [ 0.2924, -0.0257, -0.4766, -0.2877],
        [-0.4734, -0.4928,  0.2585, -0.0011],
        [-0.1279, -0.0049,  0.4666,  0.3178],
        [ 0.4280,  0.3211,  0.4056, -0.3529],
        [-0.0875, -0.1814, -0.0928, -0.2676],
        [-0.0136, -0.0587, -0.1025,  0.3429],
        [-0.0009,  0.3429, -0.0039,  0.3891]], requires_grad=True)
Parameter containing:
tensor([-0.4030,  0.0007,  0.1408,  0.0383, -0.3972, -0.2768, -0.4139, -0.4143],
       requires_grad=

In [13]:
# Shapes of both linear layers: weight then bias, first layer then second.
print(
    ff.linear_1.weight.shape,
    ff.linear_1.bias.shape,
    ff.linear_2.weight.shape,
    ff.linear_2.bias.shape,
    sep="\n",
)

torch.Size([4, 8])
torch.Size([4])
torch.Size([8, 4])
torch.Size([8])


In [14]:
# Values of both linear layers: weight then bias, first layer then second.
print(
    ff.linear_1.weight,
    ff.linear_1.bias,
    ff.linear_2.weight,
    ff.linear_2.bias,
    sep="\n",
)

Parameter containing:
tensor([[-0.1269,  0.3162,  0.1710, -0.1655,  0.1602,  0.3027,  0.3432,  0.1178],
        [-0.3214,  0.2370,  0.0436, -0.1921, -0.1559, -0.1383,  0.1379,  0.3234],
        [-0.3188, -0.0098, -0.1927,  0.2922, -0.3168,  0.0318, -0.0399, -0.1380],
        [ 0.2942,  0.1490,  0.1625,  0.1909,  0.0427, -0.3282, -0.3020, -0.3361]],
       requires_grad=True)
Parameter containing:
tensor([ 0.0643,  0.2582,  0.1838, -0.0065], requires_grad=True)
Parameter containing:
tensor([[ 0.2226,  0.0374,  0.3586, -0.4606],
        [ 0.2924, -0.0257, -0.4766, -0.2877],
        [-0.4734, -0.4928,  0.2585, -0.0011],
        [-0.1279, -0.0049,  0.4666,  0.3178],
        [ 0.4280,  0.3211,  0.4056, -0.3529],
        [-0.0875, -0.1814, -0.0928, -0.2676],
        [-0.0136, -0.0587, -0.1025,  0.3429],
        [-0.0009,  0.3429, -0.0039,  0.3891]], requires_grad=True)
Parameter containing:
tensor([-0.4030,  0.0007,  0.1408,  0.0383, -0.3972, -0.2768, -0.4139, -0.4143],
       requires_grad=

In [15]:
class MultiHeadAttentionBlock(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are projected by ``w_q`` / ``w_k`` / ``w_v``, split into ``h``
    heads of width ``d_k = d_model // h``, attended per head, merged back to
    ``d_model`` features and projected by ``w_o``.

    Args:
        d_model: Feature width of the inputs and of the output.
        h: Number of heads; must divide ``d_model`` evenly.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``h``.
    """

    def __init__(self, d_model:int, h:int , dropout:float):
        super().__init__()

        self.d_model = d_model
        self.h = h

        # Divisibility is a remainder test. The previous `d_model // h == 0`
        # was only true when h > d_model, so every valid configuration failed.
        assert d_model % h == 0, 'd_model is not divisible by h'

        self.d_k = self.d_model // h
        self.w_q = nn.Linear(d_model,d_model)
        self.w_v = nn.Linear(d_model,d_model)
        self.w_k = nn.Linear(d_model,d_model)

        self.w_o = nn.Linear(d_model,d_model)

        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

        Args:
            query, key, value: Tensors whose last dimension is ``d_k`` and whose
                second-to-last dimension is the sequence axis.
            mask: Optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` receive (effectively) zero attention weight.
            dropout: Optional dropout module applied to the attention weights.

        Returns:
            Tuple ``(output, attention_weights)``.
        """
        d_k = query.shape[-1]

        attention_score = (query @ key.transpose(-1, -2)) / math.sqrt(d_k)

        if mask is not None:
            # masked_fill is out-of-place: its result must be kept, otherwise
            # the mask is silently ignored.
            attention_score = attention_score.masked_fill(mask == 0, -1e9)
        attention_score = attention_score.softmax(dim=-1)

        if dropout is not None:
            attention_score = dropout(attention_score)

        return (attention_score @ value), attention_score

    def _split_heads(self, t):
        """Reshape (batch, seq, d_model) into (batch, h, seq, d_k)."""
        return t.view(t.shape[0], t.shape[1], self.h, self.d_k).transpose(1, 2)

    def forward(self, q,k,v, mask):
        """Attend from ``q`` over ``k`` / ``v``.

        Inputs are assumed to be (batch, seq_len, d_model); ``k`` and ``v`` may
        have a different sequence length than ``q``. The attention weights of
        the last call are kept on ``self.attention_score``.

        Returns:
            Tensor with the batch and sequence dimensions of ``q`` and
            ``d_model`` features.
        """
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)

        # Each tensor is reshaped from its OWN dimensions; using query's
        # (already transposed) shape for key breaks whenever the key length
        # differs from the head count.
        query = self._split_heads(query)
        key = self._split_heads(key)
        value = self._split_heads(value)

        x, self.attention_score = self.attention(query, key, value, mask, self.dropout)

        # (batch, h, seq, d_k) -> (batch, seq, h * d_k)
        x = x.transpose(1,2).contiguous().view(x.shape[0], -1, self.h * self.d_k)
        return self.w_o(x)




In [17]:
class ResidualConnection(nn.Module):
    """Pre-norm residual wrapper: ``x + dropout(sublayer(norm(x)))``.

    Args:
        dropout: Dropout probability applied to the sublayer output.
    """

    def __init__(self, dropout:float):
        # nn.Module must be initialized before any submodule is assigned,
        # otherwise assigning `self.norm` raises.
        super().__init__()
        # Wrap the probability in a module; a bare float is not callable.
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        """Apply ``sublayer`` to the normalized input and add the result to ``x``.

        Args:
            x: Input tensor.
            sublayer: Callable taking the normalized tensor and returning a
                tensor that can be added to ``x``.
        """
        return x + self.dropout(sublayer(self.norm(x)))


In [18]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in its own residual wrapper.

    Args:
        self_attention_block: Attention module called as ``(q, k, v, mask)``.
        feed_forward_block: Module applied after the attention step.
        dropout: Dropout probability handed to both residual wrappers.
    """

    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout:float):
        super().__init__()

        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        """Run ``x`` through the attention and feed-forward sub-layers."""
        attn_residual, ff_residual = self.residual_connections

        def self_attend(normed):
            # Self-attention: the same tensor serves as query, key and value.
            return self.self_attention_block(normed, normed, normed, src_mask)

        attended = attn_residual(x, self_attend)
        return ff_residual(attended, self.feed_forward_block)


In [None]:
# NOTE(review): this cell was left half-typed (a SyntaxError on Run All). It is
# completed here to match the Encoder defined in a later cell; consider
# deleting one of the two definitions.
class Encoder(nn.Module):
    """Applies a sequence of layers and then a final LayerNormalization.

    Args:
        layers: Modules that are each called as ``layer(x, mask)``.
    """

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        """Feed ``x`` through every layer in order, then normalize the result."""
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [19]:
# torch is already imported in the notebook's first cell, so it is not
# re-imported here.

# Example input (batch=2, seq_len=4, d_model=8)
x = torch.randn(2, 4, 8)   # fake embeddings

# Example mask: None means no positions are masked
mask = None

print("Input shape :", x.shape)
print("Input sample:\n", x[0])  # first sentence in the batch

Input shape : torch.Size([2, 4, 8])
Input sample:
 tensor([[ 0.8573, -0.0609,  1.1974,  0.1148,  0.6154, -1.0930,  1.9597, -1.2311],
        [ 1.3614,  1.9327,  0.6628,  0.3425, -0.1720,  0.0820,  0.4570, -0.5169],
        [-0.6505,  1.4195,  0.9332,  1.0570,  0.4678,  0.7032, -1.0348, -0.5908],
        [ 1.6712, -0.5148,  1.1786,  1.0540, -1.8952, -1.2580, -0.1011,  0.3994]])


In [22]:
class Encoder(nn.Module):
    """Stack of layers followed by one final LayerNormalization.

    Args:
        layers: Modules that are each invoked as ``layer(x, mask)``.
    """

    def __init__(self, layers: nn.ModuleList):
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        """Thread ``x`` through each layer in order and normalize the final output."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)