In [12]:
# Builds a simple Transformer classifier from scratch and runs it on:
# “I love artificial intelligence”

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)  # for reproducibility

# ---------- Block 0: Create a tiny vocabulary ----------
PAD, UNK, CLS = "<pad>", "<unk>", "<cls>"
sentence = "I love artificial intelligence"

# Build a minimal vocabulary from this example
words = [CLS] + sentence.lower().split()
vocab = sorted(set([PAD, UNK] + words))   # unique words
stoi = {w: i for i, w in enumerate(vocab)}  # word → index
itos = {i: w for w, i in stoi.items()}      # index → word

# Model setup
MAX_LEN = 8       # maximum sentence length
D_MODEL = 64      # vector size per token
N_HEADS = 4       # number of attention heads
D_FF = 128        # size of feed-forward network
N_LAYERS = 2      # number of encoder layers
NUM_CLASSES = 2   # output classes (e.g., positive/negative)

# Convert a text sentence into token IDs
def encode(text, max_len=MAX_LEN):
    toks = [CLS] + text.lower().split()                 # add <cls> at start
    ids = [stoi.get(t, stoi[UNK]) for t in toks][:max_len]  # convert to IDs
    ids += [stoi[PAD]] * (max_len - len(ids))           # pad if too short
    return torch.tensor(ids)

In [13]:
# ---------- Block 1: Positional Encoding ----------
# Adds information about word order (since attention is order-agnostic)
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        # Create a sinusoidal pattern for each position
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        # Store as a constant buffer (not trainable)
        self.register_buffer("pe", pe)

    def forward(self, x):
        # x: [batch, time, dim]
        T = x.size(1)
        # Add positional encoding to token embeddings
        return x + self.pe[:T].unsqueeze(0)

In [19]:
# ---------- Block 2: Multi-Head Self-Attention ----------
class MHA(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0
        self.h = n_heads                     # number of heads
        self.dh = d_model // n_heads         # dimension per head
        self.qkv = nn.Linear(d_model, 3 * d_model)  # one layer for Q, K, V
        self.o = nn.Linear(d_model, d_model)        # output projection

    def forward(self, x, mask=None):
        # x shape: [batch, time, dim]
        B, T, D = x.shape
        # Project x into Q, K, V then split into multiple heads
        qkv = self.qkv(x)  # [B, T, 3*D]
        q, k, v = qkv.chunk(3, dim=-1)  # split last dimension into Q, K, V → each [B, T, D]

        # Split into heads
        q = q.view(B, T, self.h, self.dh).transpose(1, 2)  # [B, H, T, d_head]
        k = k.view(B, T, self.h, self.dh).transpose(1, 2)
        v = v.view(B, T, self.h, self.dh).transpose(1, 2)

        # Compute attention scores: QK^T / sqrt(d)
        scores = (q @ k.transpose(-2, -1)) / math.sqrt(self.dh)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float("-inf"))  # ignore pads

        # Softmax → attention weights
        w = torch.softmax(scores, dim=-1)

        # Multiply weights with V (weighted sum)
        out = w @ v  # shape: [B, h, T, dh]
        # Combine all heads back
        out = out.transpose(1, 2).contiguous().view(B, T, D)
        return self.o(out)  # final linear layer to mix heads


In [20]:
# ---------- Block 3: Encoder Layer ----------
# Each encoder layer = self-attention + feed-forward + residuals + normalization
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, p=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.mha = MHA(d_model, n_heads)
        self.ln2 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(p),
            nn.Linear(d_ff, d_model),
        )
        self.do = nn.Dropout(p)

    def forward(self, x, mask=None):
        # 1️⃣ Apply attention + residual connection
        x = x + self.do(self.mha(self.ln1(x), mask))
        # 2️⃣ Feed-forward network + residual connection
        x = x + self.do(self.ff(self.ln2(x)))
        return x


In [21]:
# ---------- Block 4: Stacked Encoder ----------
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff, max_len):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, d_model)  # token embeddings
        self.pe = PositionalEncoding(d_model, max_len)
        # stack multiple encoder layers
        self.layers = nn.ModuleList(
            [EncoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)]
        )
        self.ln = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        # Token embeddings scaled by sqrt(d_model)
        h = self.emb(x) * math.sqrt(self.emb.embedding_dim)
        # Add positional information
        h = self.pe(h)
        # Pass through N encoder layers
        for layer in self.layers:
            h = layer(h, mask)
        # Final normalization
        return self.ln(h)


In [22]:
# ---------- Block 5: Transformer Classifier ----------
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, d_ff, max_len, n_classes):
        super().__init__()
        self.enc = Encoder(vocab_size, d_model, n_layers, n_heads, d_ff, max_len)
        self.head = nn.Linear(d_model, n_classes)  # simple linear output layer

    def forward(self, x, mask=None):
        h = self.enc(x, mask)   # [batch, time, dim]
        cls = h[:, 0, :]        # take <cls> token’s embedding
        return self.head(cls)   # predict class probabilities

# ---------- Demo Run ----------
ids = encode(sentence)
x = ids.unsqueeze(0)  # batch dimension → [1, T]
mask = (x != stoi[PAD]).unsqueeze(1).unsqueeze(2)  # mask out <pad>

model = TransformerClassifier(
    vocab_size=len(vocab),
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    d_ff=D_FF,
    max_len=MAX_LEN,
    n_classes=NUM_CLASSES,
)

# Forward pass (no training yet)
with torch.no_grad():
    logits = model(x, mask)              # raw predictions
    probs = torch.softmax(logits, dim=-1)  # convert to probabilities

# ---------- Output ----------
print("Tokens:", [itos[i.item()] for i in ids])
print("Logits:", logits.squeeze().tolist())
print("Probabilities:", probs.squeeze().tolist())

Tokens: ['<cls>', 'i', 'love', 'artificial', 'intelligence', '<pad>', '<pad>', '<pad>']
Logits: [-0.014395445585250854, 0.282463014125824]
Probabilities: [0.42632564902305603, 0.5736743807792664]
