In [1]:
# Exercise 4 Answers - Chapter 4 GPT Implementation Homework

import sys
import os

# Add the "01_main-chapter-code" folder to the import search path. The folder
# is resolved against the current working directory (not this file's own
# location), so the script has to be launched from the directory containing it.
current_dir = os.getcwd() 
target_dir = os.path.join(current_dir, "01_main-chapter-code")
sys.path.append(target_dir)

from gpt import GPTModel, TransformerBlock

# ----------------------
# Exercise 4.1: Count Parameters
# ----------------------

def count_parameters(cfg):
    """Print parameter counts for a transformer block's parts and the full model.

    A standalone ``TransformerBlock`` and a ``GPTModel`` are both built from
    ``cfg``. The function then prints the number of parameters found in the
    block's ``ff`` and ``att`` sub-modules, the model's total parameter
    count, and a size estimate in MB that assumes 4 bytes per parameter.

    Args:
        cfg: Configuration object handed unchanged to both constructors.
    """

    def _n_params(module):
        # Sum of element counts over everything ``module.parameters()`` yields.
        return sum(p.numel() for p in module.parameters())

    single_block = TransformerBlock(cfg)
    full_model = GPTModel(cfg)

    n_ff = _n_params(single_block.ff)
    n_att = _n_params(single_block.att)
    n_total = _n_params(full_model)

    # 4 bytes per parameter, converted from bytes to MB (1024 * 1024 bytes).
    bytes_per_mb = 1024 * 1024
    est_mb = n_total * 4 / bytes_per_mb

    print("--- Exercise 4.1 ---")
    print(f"FeedForward Parameters: {n_ff:,}")
    print(f"Multi-Head Attention Parameters: {n_att:,}")
    print(f"Total GPTModel Parameters: {n_total:,}")
    print(f"Estimated Memory: {est_mb:.2f} MB\n")

# ----------------------
# Exercise 4.2: Init Larger Models
# ----------------------

def init_larger_models():
    """Build the larger GPT-2 variants and print their parameter counts.

    For each variant (Medium, Large, XL) a config is assembled from the shared
    base settings plus that variant's ``emb_dim`` / ``n_layers`` / ``n_heads``.
    A ``GPTModel`` is constructed from it, and the total parameter count is
    printed together with a size estimate in MB that assumes 4 bytes per
    parameter.

    Each model is released as soon as it has been counted, so at most one
    model is held in memory at any time.
    """
    print("--- Exercise 4.2 ---")
    variants = {
        "GPT-2 Medium": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
        "GPT-2 Large": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
        "GPT-2 XL": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
    }

    # Settings common to every variant; the per-variant spec is layered on top.
    base_cfg = {
        "vocab_size": 50257,
        "context_length": 1024,
        "drop_rate": 0.1,
        "qkv_bias": False,
    }

    for name, spec in variants.items():
        cfg = base_cfg.copy()
        cfg.update(spec)
        model = GPTModel(cfg)
        total_params = sum(p.numel() for p in model.parameters())
        # Drop the reference now. Otherwise this model stays bound to `model`
        # while the next (larger) one is being constructed, and two models
        # occupy memory at the same time.
        del model
        size_mb = total_params * 4 / (1024 * 1024)
        print(f"{name}: {total_params:,} params (~{size_mb:.2f} MB)")
    print()

# ----------------------
# Exercise 4.3: Dropout Split
# ----------------------

def new_dropout_config():
    """Return a config dict that carries one dropout entry per location.

    Prints the exercise banner and a confirmation line, then returns a new
    dict on every call. Besides the model dimensions it holds the separate
    ``dropout_emb``, ``dropout_attn``, ``dropout_resid`` and ``dropout_ff``
    entries, followed by ``qkv_bias`` and, as the last key, a single
    ``drop_rate`` entry.

    Returns:
        dict: The assembled configuration.
    """
    print("--- Exercise 4.3 ---")

    dimensions = {
        "vocab_size": 50257,
        "context_length": 1024,
        "emb_dim": 768,
        "n_layers": 12,
        "n_heads": 12,
    }
    split_dropout = {
        "dropout_emb": 0.1,
        "dropout_attn": 0.1,
        "dropout_resid": 0.1,
        "dropout_ff": 0.1,
    }

    # NOTE(review): the single "drop_rate" key is presumably kept because the
    # imported GPTModel still reads it -- confirm against the gpt module.
    merged = {**dimensions, **split_dropout, "qkv_bias": False, "drop_rate": 0.1}

    print("Updated cfg with separate dropout values.\n")
    return merged

if __name__ == "__main__":
    # Baseline configuration used for the first parameter count.
    default_cfg = dict(
        vocab_size=50257,
        context_length=1024,
        emb_dim=768,
        n_layers=12,
        n_heads=12,
        drop_rate=0.1,
        qkv_bias=False,
    )

    # Exercises 4.1 and 4.2.
    count_parameters(default_cfg)
    init_larger_models()

    # Exercise 4.3: build the config with separate dropout entries, then run
    # the same parameter count on it.
    updated_cfg = new_dropout_config()
    count_parameters(updated_cfg)


--- Exercise 4.1 ---
FeedForward Parameters: 4,722,432
Multi-Head Attention Parameters: 2,360,064
Total GPTModel Parameters: 163,009,536
Estimated Memory: 621.83 MB

--- Exercise 4.2 ---
GPT-2 Medium: 406,212,608 params (~1549.58 MB)
GPT-2 Large: 838,220,800 params (~3197.56 MB)
GPT-2 XL: 1,637,792,000 params (~6247.68 MB)

--- Exercise 4.3 ---
Updated cfg with separate dropout values.

--- Exercise 4.1 ---
FeedForward Parameters: 4,722,432
Multi-Head Attention Parameters: 2,360,064
Total GPTModel Parameters: 163,009,536
Estimated Memory: 621.83 MB

