In [2]:
# Exercise 4.1: parameter counts of the feed-forward and attention modules

In [3]:
from importlib.metadata import version

# Record the installed torch release so the outputs below can be reproduced
print(f"torch version: {version('torch')}")

torch version: 2.6.0


In [1]:
import torch
import torch.nn as nn

def calculate_parameters(d_model, d_ff, num_heads):
    """
    Print and compare the weight counts of the attention and feed-forward modules.

    Only weight matrices are counted; bias terms are not included.

    Args:
        d_model: Embedding (model) dimension.
        d_ff: Hidden dimension of the feed-forward module.
        num_heads: Number of attention heads. Accepted so the call mirrors a
            model configuration, but the formulas below do not depend on it.

    Returns:
        None. The three counts and the verdict are printed to standard output.
    """
    # Multi-head attention parameters (Q, K, V projections + output projection)
    attention_params = 4 * d_model * d_model

    # Feed forward parameters (expansion layer + contraction layer)
    feed_forward_params = d_model * d_ff + d_ff * d_model

    print(f"Multi-head attention parameters: {attention_params:,}")
    print(f"Feed forward parameters: {feed_forward_params:,}")
    print(f"Difference (FF - Attention): {feed_forward_params - attention_params:,}")

    # Three-way comparison: a tie (d_ff == 2 * d_model) must not be reported
    # as "Attention has more parameters".
    if feed_forward_params > attention_params:
        print("Feed forward has more parameters")
    elif feed_forward_params < attention_params:
        print("Attention has more parameters")
    else:
        print("Feed forward and attention have the same number of parameters")

# GPT-2 small (124M): 768-wide embeddings, 3072-wide feed-forward layer, 12 heads
gpt2_small_dims = {"d_model": 768, "d_ff": 3072, "num_heads": 12}
calculate_parameters(**gpt2_small_dims)

Multi-head attention parameters: 2,359,296
Feed forward parameters: 4,718,592
Difference (FF - Attention): 2,359,296
Feed forward has more parameters


In [None]:
# Exercise 4.2: estimated parameter counts for the larger GPT-2 configurations

In [4]:
import torch
import torch.nn as nn

# GPT-2 family configurations. Every size shares the vocabulary, context
# window, dropout rate and QKV-bias setting of the 124M model; only the
# embedding width, head count and depth differ, so the larger variants are
# written as overrides of the smallest one (each is an independent dict).
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

GPT_CONFIG_MEDIUM = {**GPT_CONFIG_124M, "emb_dim": 1024, "n_heads": 16, "n_layers": 24}

GPT_CONFIG_LARGE = {**GPT_CONFIG_124M, "emb_dim": 1280, "n_heads": 20, "n_layers": 36}

GPT_CONFIG_XL = {**GPT_CONFIG_124M, "emb_dim": 1600, "n_heads": 25, "n_layers": 48}

def calculate_gpt_parameters(config, include_bias_and_norm=False):
    """
    Estimate the number of parameters of a GPT model from its configuration.

    By default only weight matrices are counted (embeddings, attention and
    feed-forward projections, output head), which slightly underestimates a
    real model. Pass ``include_bias_and_norm=True`` to also count bias vectors
    and LayerNorm scale/shift parameters.

    The output head is counted separately from the token embedding (no weight
    tying), and the feed-forward hidden size is fixed at ``4 * emb_dim``.

    Args:
        config: Mapping with the keys "vocab_size", "context_length",
            "emb_dim" and "n_layers". "qkv_bias" is read only when
            ``include_bias_and_norm`` is true and defaults to False.
        include_bias_and_norm: When true, additionally count, per block, the
            attention output-projection bias, the Q/K/V biases (only if
            ``config["qkv_bias"]``), both feed-forward biases and two
            LayerNorms, plus one final LayerNorm. The output head is assumed
            to have no bias.

    Returns:
        The parameter count as an int.
    """
    d_model = config["emb_dim"]
    vocab_size = config["vocab_size"]
    d_ff = 4 * d_model  # Standard feed-forward expansion factor

    # Token embedding table + learned positional embedding table
    emb_params = vocab_size * d_model + config["context_length"] * d_model

    # Per-block weights: Q, K, V and output projections + two feed-forward layers
    attention_params = 4 * d_model * d_model
    feed_forward_params = d_model * d_ff + d_ff * d_model
    block_params = attention_params + feed_forward_params

    final_norm_params = 0
    if include_bias_and_norm:
        qkv_bias_params = 3 * d_model if config.get("qkv_bias", False) else 0
        block_params += qkv_bias_params + d_model  # attention biases
        block_params += d_ff + d_model             # feed-forward biases
        block_params += 2 * (2 * d_model)          # two LayerNorms (scale + shift)
        final_norm_params = 2 * d_model            # LayerNorm before the output head

    total_block_params = config["n_layers"] * block_params

    # Output projection back to the vocabulary (bias-free)
    output_params = d_model * vocab_size

    return emb_params + total_block_params + final_norm_params + output_params

# Report the estimated size of each GPT-2 variant, smallest to largest
for label, cfg in (
    ("GPT-2 Small (124M)", GPT_CONFIG_124M),
    ("GPT-2 Medium", GPT_CONFIG_MEDIUM),
    ("GPT-2 Large", GPT_CONFIG_LARGE),
    ("GPT-2 XL", GPT_CONFIG_XL),
):
    print(f"{label}: {calculate_gpt_parameters(cfg):,} parameters")

GPT-2 Small (124M): 162,915,840 parameters
GPT-2 Medium: 405,964,800 parameters
GPT-2 Large: 837,757,440 parameters
GPT-2 XL: 1,637,020,800 parameters


In [5]:
# Exercise 4.3: separate dropout rates for embeddings, shortcut connections and attention

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GPTModel(nn.Module):
    """
    GPT-style model: token + positional embeddings, a stack of transformer
    blocks, a final LayerNorm and a bias-free projection to the vocabulary.

    Dropout is configured by three independent rates read from ``config``
    ("drop_embed", "drop_shortcut", "drop_attention"), each defaulting to 0.1
    when the key is absent.

    Args:
        config: Mapping with "vocab_size", "context_length", "emb_dim" and
            "n_layers", plus whatever the transformer blocks read from it.
            The mapping is copied; the caller's object is left unchanged.
    """

    def __init__(self, config):
        super().__init__()
        # Work on a shallow copy so resolving the dropout defaults below does
        # not write new keys into the caller's dict (which may be reused to
        # build other models).
        config = dict(config)

        # Separate dropout rates
        self.drop_embed = config.get("drop_embed", 0.1)
        self.drop_shortcut = config.get("drop_shortcut", 0.1)
        self.drop_attention = config.get("drop_attention", 0.1)

        # Make the resolved rates visible to the blocks built from this copy
        config.update({
            "drop_embed": self.drop_embed,
            "drop_shortcut": self.drop_shortcut,
            "drop_attention": self.drop_attention
        })

        # Model components
        self.embeddings = nn.Embedding(config["vocab_size"], config["emb_dim"])
        self.position_embeddings = nn.Embedding(config["context_length"], config["emb_dim"])
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(config) for _ in range(config["n_layers"])
        ])
        self.ln_f = nn.LayerNorm(config["emb_dim"])
        self.head = nn.Linear(config["emb_dim"], config["vocab_size"], bias=False)

    def forward(self, x, targets=None):
        """
        Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            x: 2-D tensor of token ids, unpacked as (batch, sequence length).
            targets: Optional tensor of class indices; it is flattened and
                compared against the flattened logits, so it is expected to
                hold one index per position of ``x``.

        Returns:
            The logits alone when ``targets`` is None, otherwise the tuple
            ``(logits, loss)``.
        """
        _, t = x.size()
        pos = torch.arange(0, t, dtype=torch.long, device=x.device)

        # Embeddings with their own dropout rate; the positional embeddings
        # broadcast over the batch dimension.
        tok_emb = self.embeddings(x)
        pos_emb = self.position_embeddings(pos)
        x = tok_emb + pos_emb
        x = F.dropout(x, p=self.drop_embed, training=self.training)

        # Transformer blocks
        for block in self.transformer_blocks:
            x = block(x)

        # Final normalization and a single projection shared by both return paths
        logits = self.head(self.ln_f(x))

        if targets is None:
            return logits

        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

class TransformerBlock(nn.Module):
    """
    One transformer layer: LayerNorm -> attention and LayerNorm -> feed-forward,
    each wrapped in a residual (shortcut) connection whose branch output goes
    through dropout with rate ``config["drop_shortcut"]`` (0.1 if absent).
    """

    def __init__(self, config):
        super().__init__()
        width = config["emb_dim"]
        self.ln1 = nn.LayerNorm(width)
        self.attn = MultiHeadAttention(config)
        self.ln2 = nn.LayerNorm(width)
        self.ff = FeedForward(config)
        self.drop_shortcut = config.get("drop_shortcut", 0.1)

    def _shortcut_dropout(self, branch_out):
        # Dropout applied to a sub-layer's output before it is added back
        return F.dropout(branch_out, p=self.drop_shortcut, training=self.training)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers with residual adds."""
        attn_out = self.attn(self.ln1(x))
        x = x + self._shortcut_dropout(attn_out)

        ff_out = self.ff(self.ln2(x))
        x = x + self._shortcut_dropout(ff_out)
        return x

class MultiHeadAttention(nn.Module):
    """
    Causal multi-head self-attention with a fused Q/K/V projection.

    Args:
        config: Mapping with "emb_dim", "n_heads" and "qkv_bias"; the optional
            "drop_attention" key sets the dropout rate applied to the
            attention weights (0.1 if absent).

    Raises:
        ValueError: If "emb_dim" is not divisible by "n_heads".
    """

    def __init__(self, config):
        super().__init__()
        self.emb_dim = config["emb_dim"]
        self.n_heads = config["n_heads"]
        # Fail at construction time rather than with an opaque reshape error
        # on the first forward pass.
        if self.emb_dim % self.n_heads != 0:
            raise ValueError(
                f"emb_dim ({self.emb_dim}) must be divisible by n_heads ({self.n_heads})"
            )
        self.head_dim = self.emb_dim // self.n_heads

        self.qkv = nn.Linear(self.emb_dim, 3 * self.emb_dim, bias=config["qkv_bias"])
        self.proj = nn.Linear(self.emb_dim, self.emb_dim)
        self.drop_attention = config.get("drop_attention", 0.1)
        self.dropout = nn.Dropout(self.drop_attention)

    def forward(self, x):
        """
        Args:
            x: Tensor unpacked as (batch, sequence length, emb_dim).

        Returns:
            Tensor of the same shape as ``x``.
        """
        B, T, C = x.shape

        # One projection produces Q, K and V; each is reshaped to
        # (B, n_heads, T, head_dim).
        q, k, v = (
            part.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
            for part in self.qkv(x).split(self.emb_dim, dim=2)
        )

        # Scaled dot-product scores; a plain Python float scale avoids
        # building a CPU tensor on every call.
        scores = (q @ k.transpose(-2, -1)) * (self.head_dim ** -0.5)

        # Causal mask: position i may only attend to positions <= i. Without
        # it an autoregressive model sees the tokens it is trained to predict.
        future = torch.triu(
            torch.ones(T, T, dtype=torch.bool, device=x.device), diagonal=1
        )
        scores = scores.masked_fill(future, float("-inf"))

        weights = F.softmax(scores, dim=-1)
        weights = self.dropout(weights)

        # Merge the heads back into a single emb_dim-wide vector per position
        y = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.proj(y)

class FeedForward(nn.Module):
    """
    Position-wise feed-forward network: a linear layer that widens the input
    to four times ``config["emb_dim"]``, a GELU, and a linear layer that
    narrows it back to ``emb_dim``.
    """

    def __init__(self, config):
        super().__init__()
        width = config["emb_dim"]
        hidden = 4 * width

        expand = nn.Linear(width, hidden)
        contract = nn.Linear(hidden, width)
        self.net = nn.Sequential(expand, nn.GELU(), contract)

    def forward(self, x):
        """Run ``x`` through the expand -> GELU -> contract stack."""
        out = self.net(x)
        return out

# Example: a GPT-2-small-sized model with each dropout rate given explicitly
config = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_embed=0.1,
    drop_shortcut=0.1,
    drop_attention=0.1,
    qkv_bias=False,
)

model = GPTModel(config)
print("Model with separate dropout rates created successfully!")

Model with separate dropout rates created successfully!
