In [3]:
import inspect
import math
import os
import time
from dataclasses import dataclass

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import tiktoken
import torch
import torch.distributed as dist
import torch.nn as nn
import wget
from datasets import load_dataset
from torch.distributed import init_process_group, destroy_process_group
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel as DDP

from hellaswag import render_example, iterate_examples

# Reproducibility: build a dedicated generator seeded with 42, then seed the global RNG identically.
g = torch.Generator()
g.manual_seed(42)
torch.manual_seed(42)

AttributeError: partially initialized module 'torch' has no attribute 'version' (most likely due to a circular import)

In [6]:
import os

# Sanity check: show what sits in the current working directory (local helper modules, data files, ...).
print(os.listdir(os.curdir))

['.git', '.venv', '.vscode', 'classes.py', 'gpt.ipynb', 'gpt2.ipynb', 'gpt_functions.py', 'hellaswag.py', 'input.txt', 'makemore.ipynb', 'names.txt', 'notebook.ipynb', 'README.md', 'requirements.txt', 'tokenizer.ipynb', 'tokenizer_classes.py', 'tokenizer_functions.py', 'utils.py', '__pycache__']


In [None]:
# -------------------------------------
# Config GPT2
# -------------------------------------

@dataclass
class GPT2Config:
    """Hyperparameters that define the shape of a GPT-2 model."""

    # Maximum sequence length the model can attend over.
    block_size: int = 1024
    # Number of tokens: 50,000 BPE merges + 256 byte tokens + 1 <|endoftext|> token.
    vocab_size: int = 50257
    # Number of transformer blocks stacked in the model.
    n_layer: int = 12
    # Number of attention heads per block.
    n_head: int = 12
    # Embedding (channel) dimension.
    n_embd: int = 768
    
# -------------------------------------
# Attention Block
# -------------------------------------

    
class CausalSelfAttention(nn.Module):
    """
    Multi-head causal self-attention.

    A single linear layer produces queries, keys and values for every head at
    once; the per-head results are concatenated and passed through an output
    projection.
    """

    def __init__(self, config):
        super().__init__()
        # The channels are divided evenly between the heads.
        assert config.n_embd % config.n_head == 0
        # Fused projection producing q, k and v (hence 3 * n_embd outputs).
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # Output projection; the marker attribute is read by GPT2._init_weights
        # to scale down the init of layers that feed the residual stream.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1
        # Dimensions needed again in forward().
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        """Apply causal attention to `x` of shape (B, T, C) and return a tensor of the same shape."""
        batch, seq_len, channels = x.size()
        head_size = channels // self.n_head  # e.g. GPT-2 (124M): 768 // 12 = 64

        def to_heads(t):
            # (B, T, C) -> (B, nh, T, hs): heads become a batch-like dimension
            return t.view(batch, seq_len, self.n_head, head_size).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))
        # Fused attention kernel with causal masking
        attended = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        # (B, nh, T, hs) -> (B, T, C): put the head outputs back side by side
        merged = attended.transpose(1, 2).contiguous().view(batch, seq_len, channels)
        return self.c_proj(merged)

# -------------------------------------
# MLP
# -------------------------------------

class MLP(nn.Module):
    """Position-wise feed-forward network: expand to 4x width, tanh-approximated GELU, project back."""

    def __init__(self, config):
        super().__init__()
        hidden_dim = 4 * config.n_embd
        self.c_fc = nn.Linear(config.n_embd, hidden_dim)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden_dim, config.n_embd)
        # Marker read by GPT2._init_weights: this layer feeds the residual stream.
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        """Return the feed-forward transform of `x`; the last dimension (n_embd) is preserved."""
        return self.c_proj(self.gelu(self.c_fc(x)))

# -------------------------------------
# Transformer Block
# -------------------------------------

class Block(nn.Module):
    """
    One pre-norm transformer block.

    Two residual sub-layers are applied in sequence:
      1. LayerNorm -> CausalSelfAttention, added back onto the input
      2. LayerNorm -> MLP, added back onto the result of step 1
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        """Run both residual sub-layers on `x` and return the updated activations."""
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

# -------------------------------------
# General GPT2 
# -------------------------------------

class GPT2(nn.Module):
    """
    GPT-2 language model: token + position embeddings, a stack of transformer
    blocks, a final LayerNorm and a linear head producing vocabulary logits.
    """

    def __init__(self, config):
        """Build the model from `config` (block_size, vocab_size, n_layer, n_head, n_embd)."""
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),              # Token embedding
            wpe = nn.Embedding(config.block_size, config.n_embd),              # Position embedding
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), # Stack of transformer blocks
            ln_f = nn.LayerNorm(config.n_embd),                                # Final layer normalization
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) # Projects activations to vocabulary logits

        # Weight tying: the token embedding and the output head share one tensor,
        # which saves a full (vocab_size x n_embd) matrix of parameters.
        self.transformer.wte.weight = self.lm_head.weight

        # Init params (apply _init_weights to every submodule)
        self.apply(self._init_weights)


    def _init_weights(self, module):
        """
        Initialize one submodule (called through `self.apply`).

        Linear and Embedding weights are drawn from N(0, 0.02^2) and Linear
        biases are zeroed. Linear layers tagged with `NANOGPT_SCALE_INIT` get
        their std scaled by 1/sqrt(2 * n_layer).
        """
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                # Keep the variance of the residual stream under control:
                # every block adds 2 residual contributions (attention + MLP),
                # hence 2 * n_layer contributions in total.
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """
        Compute logits (and optionally the loss) for a batch of token ids.

        Args:
            idx: token ids of shape (B, T), with T <= config.block_size.
            targets: optional target ids of shape (B, T).

        Returns:
            (logits, loss): logits has shape (B, T, vocab_size); loss is the
            cross-entropy against `targets`, or None when no targets are given.
        """
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # Forward the token and position embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb               # pos_emb broadcasts over the batch dimension
        # Forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # Forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        loss = None
        if targets is not None:
            # Flatten to (B*T, vocab_size) vs (B*T,) as expected by cross_entropy
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """
        Build a GPT2 and fill it with pretrained Hugging Face weights.

        Args:
            model_type: one of "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl".

        Returns:
            A GPT2 instance whose parameters were copied from the checkpoint.
        """
        assert model_type in {"gpt2","gpt2-medium","gpt2-large","gpt2-xl"}
        from transformers import GPT2LMHeadModel
        print(f"Loading weights from the pretrained gpt2Model:{model_type}")

        config_args = {
                'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
                'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
                'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
                'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]

        config_args['vocab_size'] = 50257 # Same for every GPT-2 checkpoint listed above
        config_args['block_size'] = 1024  # Same for every GPT-2 checkpoint listed above

        # Initialize a from-scratch model with the matching architecture
        config = GPT2Config(**config_args) # Unpack model configuration
        model = GPT2(config)               # Create model instance

        # Extract state dictionary (learned parameters)
        sd = model.state_dict()
        sd_keys = [k for k in sd if not k.endswith('.attn.bias')]  # Exclude attention bias buffers (not learnable params)

        # Init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # Copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore (buffer)
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]        # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        # The openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # Special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # Vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])
        return model

    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        """
        Create an AdamW optimizer with separate decay / no-decay parameter groups.

        Parameters with 2 or more dimensions receive `weight_decay`; parameters
        with fewer dimensions receive a weight decay of 0.0.

        Args:
            weight_decay: weight decay applied to the >= 2-D parameters.
            learning_rate: initial learning rate for all groups.
            device_type: "cuda" enables the fused AdamW kernel when this
                PyTorch build exposes it.

        Returns:
            The configured `torch.optim.AdamW` instance.

        Note:
            Reads the module-level `master_process` flag to decide whether to
            print; it must be defined before this method is called.
        """
        # Dict all params with requires_grad
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        # Distinguish params according to their dimension
        decay_params   = [p for p in param_dict.values() if p.dim() >= 2]
        nodecay_params = [p for p in param_dict.values() if p.dim() < 2]
        optim_groups = [
            {"params": decay_params, "weight_decay": weight_decay},
            # The key must be spelled "weight_decay"; any other key is ignored
            # and the group would silently fall back to AdamW's default decay.
            {"params": nodecay_params, "weight_decay": 0.0},
        ]

        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)

        if master_process:
            print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
            print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")

        # Use the fused AdamW kernel only when this PyTorch exposes it and we run on CUDA
        fused_available = "fused" in inspect.signature(torch.optim.AdamW).parameters
        used_fused = fused_available and device_type == "cuda"
        if master_process:
            print(f"Using fused AdamW: {used_fused}")

        # Only pass `fused` when it is actually enabled, so PyTorch versions
        # without that keyword do not fail with a TypeError.
        extra_args = {"fused": True} if used_fused else {}
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, **extra_args)

        return optimizer


In [None]:
# -------------------------------------
# Load a token shard from disk and convert it to a PyTorch tensor
# -------------------------------------

def load_tokens(filename):
    """Read the NumPy array stored at `filename` and return it as a torch.long tensor."""
    # Cast to int32 first, then let torch widen to long (the dtype needed for indexing).
    token_array = np.load(filename).astype(np.int32)
    return torch.tensor(token_array, dtype=torch.long)

# -------------------------------------
# Sharded data loader: serves (input, target) batches from the token shards
# -------------------------------------

class DataLoaderLite:
    """
    Minimal sharded data loader.

    Token shards are read from the "edu_fineweb10B" directory. Each process
    starts at its own offset (B * T * process_rank) and advances by
    B * T * num_processes per batch, so processes read disjoint chunks.

    Note: reads the module-level `master_process` flag to decide whether to
    print, and calls the module-level `load_tokens` helper.
    """

    def __init__(self, B, T, process_rank, num_processes, split):
        self.B, self.T = B, T
        self.process_rank = process_rank    # Index of this process
        self.num_processes = num_processes  # Total number of processes reading in parallel
        assert split in {'train', 'val'}

        # Collect the shard files whose name contains the split, in sorted order
        data_root = "edu_fineweb10B"
        os.makedirs(data_root, exist_ok = True)
        shard_names = sorted(name for name in os.listdir(data_root) if split in name)
        shards = [os.path.join(data_root, name) for name in shard_names]
        self.shards = shards
        assert len(shards) > 0, f"no shards found for split {split}"
        if master_process:
            print(f"found {len(shards)} shards for split {split}")
        self.reset()

    def reset(self):
        """Rewind to the first shard and to this process's starting offset."""
        self.current_shard = 0
        self.tokens = load_tokens(self.shards[self.current_shard])
        self.current_position = self.B * self.T * self.process_rank

    def next_batch(self):
        """Return the next (x, y) pair, both of shape (B, T); y is x shifted by one token."""
        B, T = self.B, self.T
        start = self.current_position
        # One extra token so the targets can be shifted by one position
        window = self.tokens[start : start + B * T + 1]
        x = window[:-1].view(B, T)  # Inputs
        y = window[1:].view(B, T)   # Targets
        # Skip over the chunks consumed by the other processes
        stride = B * T * self.num_processes
        self.current_position = start + stride
        # If the following batch would run past the shard, move to the next shard (wrapping around)
        if self.current_position + (stride + 1) > len(self.tokens):
            self.current_shard = (self.current_shard + 1) % len(self.shards)
            self.tokens = load_tokens(self.shards[self.current_shard])
            self.current_position = B * T * self.process_rank
        return x, y

In [None]:
# -------------------------------------
# Helper function: HellaSwag
# -------------------------------------

def get_most_likely_row(tokens, mask, logits):
    """
    Return the index of the row with the lowest average loss over its masked region.

    Args:
        tokens: token ids, one candidate sequence per row.
        mask: same layout as `tokens`; 1 marks positions that count towards the score.
        logits: model outputs for `tokens`, with the vocabulary on the last dimension.

    Returns:
        int: index of the row whose masked positions have the lowest mean
        next-token cross-entropy, i.e. the most likely candidate.
    """
    # Position t predicts token t+1: drop the last logit and the first token
    pred_logits = logits[..., :-1, :].contiguous()
    next_tokens = tokens[..., 1:].contiguous()
    # Unreduced loss on the flattened tensors, folded back to one row per candidate
    per_token_loss = F.cross_entropy(
        pred_logits.view(-1, pred_logits.size(-1)),  # (B * (T-1), vocab_size)
        next_tokens.view(-1),                        # (B * (T-1))
        reduction='none',
    ).view(tokens.size(0), -1)                       # (B, (T-1))
    # Shift the mask the same way so it lines up with the predicted tokens
    region = mask[..., 1:].contiguous()
    # Mean loss over the masked positions of every row
    row_loss = (per_token_loss * region).sum(dim=1) / region.sum(dim=1)
    return row_loss.argmin().item()

In [None]:
# -------------------------------------
# Distributed data parallel (DDP)
# -------------------------------------

# torchrun command sets the env variables RANK, LOCAL_RANK, and WORLD_SIZE
# torchrun exports RANK, LOCAL_RANK and WORLD_SIZE; without RANK this is a plain single-process run
ddp = int(os.environ.get('RANK', -1)) != -1
if not ddp:
    # Single-process run: this process is trivially the master
    ddp_rank, ddp_local_rank, ddp_world_size = 0, 0, 1
    master_process = True
    # Pick the best available device: CUDA first, then Apple MPS, else CPU
    if torch.cuda.is_available():
        device = "cuda"
    elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print(f"using device: {device}")
else:
    # The NCCL backend used below needs CUDA
    assert torch.cuda.is_available(), "We need CUDA for DDP"
    init_process_group(backend='nccl')

    ddp_rank       = int(os.environ['RANK'])       # Global process index
    ddp_local_rank = int(os.environ['LOCAL_RANK']) # Process index on the current node
    ddp_world_size = int(os.environ['WORLD_SIZE']) # Total number of processes

    # Bind this process to its own GPU
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    # Rank 0 takes care of logging, checkpointing, ...
    master_process = ddp_rank == 0

# Some PyTorch APIs want the bare device type ("cuda" / "cpu") rather than e.g. "cuda:3"
device_type = "cuda" if device.startswith("cuda") else "cpu"

# -------------------------------------
# Learning Rate function
# -------------------------------------

def get_lr(it, warmup_steps, max_steps, max_lr, min_lr):
    """
    Learning-rate schedule: linear warmup followed by cosine decay.

    Args:
        it: current step (0-based).
        warmup_steps: number of steps of linear warmup up to `max_lr`.
        max_steps: step at which the cosine decay reaches `min_lr`.
        max_lr: peak learning rate, reached at the end of warmup.
        min_lr: floor learning rate, returned once `it` exceeds `max_steps`.

    Returns:
        The learning rate to use at step `it`.
    """
    if it < warmup_steps:
        # Linear warmup; (it+1) so that step 0 does not get a zero learning rate
        lr = max_lr * (it+1) / warmup_steps
    elif it > max_steps:
        # Past the end of the schedule: stay at the floor
        lr = min_lr
    else:
        # Cosine decay from max_lr down to min_lr
        progress = (it - warmup_steps) / (max_steps - warmup_steps)
        assert 0 <= progress <= 1
        coeff = 0.5 * (1.0 + math.cos(math.pi * progress)) # coeff goes from 1 to 0
        lr = min_lr + coeff * (max_lr - min_lr)
    return lr

using device: cuda


----

In [None]:
# Training setup: seeds, hyperparameters, data loaders, model and optimizer.
# Relies on names defined in earlier cells (ddp, ddp_rank, ddp_local_rank,
# ddp_world_size, master_process, device, device_type, DataLoaderLite, GPT2, GPT2Config).

# Reproducibility
torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

# Define float32 matmul precision
torch.set_float32_matmul_precision("high")

# Hyperparameters
total_batch_size = 524288 # 2**19 tokens per optimizer step (close to 0.5M)
B                = 16     # Micro batch size (the full batch is reached through gradient accumulation)
T                = 1024   # Sequence length
vocab_size       = 50304  # Larger than GPT2Config's default 50257 (50304 = 393 * 128); presumably padded for efficiency — TODO confirm
max_lr           = 6e-4
min_lr           = max_lr * 0.1
warmup_steps     = 715
max_steps        = 19073 # 19,073 steps is ~1 epoch, if data is 10B tokens and batch size 0.5M tokens
weight_decay     = 0.1
learning_rate    = 6e-4   # Initial optimizer LR; the training loop overwrites it on every step

# Number of micro-steps accumulated per optimizer step
assert total_batch_size % (B * T * ddp_world_size) == 0, "Make sure total batch size is div by B * T * ddp_world_size"
grad_accum_steps = total_batch_size // (B * T * ddp_world_size)

# GPT-2 tokenizer
enc = tiktoken.get_encoding("gpt2")

# Data loaders (each process reads its own slice of every shard)
train_loader = DataLoaderLite(B=B, T=T, process_rank=ddp_rank, num_processes=ddp_world_size, split="train")
val_loader   = DataLoaderLite(B=B, T=T, process_rank=ddp_rank, num_processes=ddp_world_size, split="val"  )

# Model
model = GPT2(GPT2Config(vocab_size=vocab_size))
model.to(device)    # Move to device
use_compile = False # torch.compile interferes with the HellaSwag evaluation, so it is disabled
if use_compile:
    model = torch.compile(model)
if ddp:
    model = DDP(model, device_ids = [ddp_local_rank])
raw_model = model.module if ddp else model  # Unwrapped model (DDP keeps the original under .module)

# Optimizer
optimizer = raw_model.configure_optimizers(
    weight_decay  = weight_decay,
    learning_rate = learning_rate,
    device_type   = device_type
)

# Report the batch configuration (master process only)
if master_process: 
    print(f"total desired batch size: {total_batch_size}")
    print(f"-> Calculated grad acc steps is {grad_accum_steps}")



FileNotFoundError: [WinError 3] Le chemin d’accès spécifié est introuvable: 'edu_fineweb10B'

In [None]:
# Training loop: periodic validation + HellaSwag evaluation, then one optimizer
# step (with gradient accumulation) per iteration. Relies on names defined in
# earlier cells (model, raw_model, optimizer, train_loader, val_loader,
# hyperparameters, DDP/device flags, get_lr, get_most_likely_row).

# Log directory
log_dir = "log"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "log.txt")
with open(log_file, "w") as f: # open for writing to clear the file
    pass

for step in range(max_steps):
    t0 = time.time()
    last_step = (step == max_steps - 1)

    # Every 250 steps (and on the last step) evaluate the validation loss over 20 batches
    if step % 250 == 0 or last_step:
        model.eval()
        val_loader.reset()
        with torch.no_grad():
            val_loss_accum = 0.0
            val_loss_steps = 20
            for _ in range(val_loss_steps):
                x, y = val_loader.next_batch()
                x, y = x.to(device), y.to(device)
                with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
                    logits, loss = model(x, y)
                # Dividing by the number of batches makes the accumulated sum a mean
                loss = loss / val_loss_steps
                val_loss_accum += loss.detach()
        if ddp:
            # Average the validation loss across processes
            dist.all_reduce(val_loss_accum, op = dist.ReduceOp.AVG)
        if master_process:
            print(f"validation loss: {val_loss_accum.item():.4f}")
            with open(log_file, "a") as f:
                f.write(f"{step} val {val_loss_accum.item():.4f}\n")
            if step > 0 and (step % 5000 == 0 or last_step):
                # Write a model checkpoint every 5000 steps and at the end
                checkpoint_path = os.path.join(log_dir, f"model_{step:05d}.pt")
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'config': raw_model.config,
                    'step': step,
                    'val_loss': val_loss_accum.item()
                }
                torch.save(checkpoint, checkpoint_path)

    # Every 250 steps (and on the last step) evaluate HellaSwag; skipped when torch.compile is on
    if (step % 250 == 0 or last_step) and (not use_compile):
        num_correct_norm = 0
        num_total = 0
        for i, example in enumerate(iterate_examples("val")):
            # Only process examples where i % ddp_world_size == ddp_rank
            if i % ddp_world_size != ddp_rank:
                continue
            # Render the example into tokens and labels
            _, tokens, mask, label = render_example(example)
            tokens = tokens.to(device)
            mask = mask.to(device)
            # Get the logits and pick the most likely completion
            with torch.no_grad():
                with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
                    logits, loss = model(tokens)
                pred_norm = get_most_likely_row(tokens, mask, logits)
            num_total += 1
            num_correct_norm += int(pred_norm == label)
        # Reduce the stats across all processes
        if ddp:
            num_total = torch.tensor(num_total, dtype=torch.long, device=device)
            num_correct_norm = torch.tensor(num_correct_norm, dtype=torch.long, device=device)
            dist.all_reduce(num_total, op=dist.ReduceOp.SUM)
            dist.all_reduce(num_correct_norm, op=dist.ReduceOp.SUM)
            num_total = num_total.item()
            num_correct_norm = num_correct_norm.item()
        acc_norm = num_correct_norm / num_total
        if master_process:
            print(f"HellaSwag accuracy: {num_correct_norm}/{num_total}={acc_norm:.4f}")
            with open(log_file, "a") as f:
                f.write(f"{step} hella {acc_norm:.4f}\n")

    # Do one step of the optimization
    model.train()
    optimizer.zero_grad()
    loss_accum = 0.0
    for micro_step in range(grad_accum_steps):
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        if ddp:
            # Only synchronize gradients across processes on the last micro-step
            model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1)
        with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
            logits, loss = model(x, y)
        # We have to scale the loss to account for gradient accumulation,
        # because the gradients just add on each successive backward().
        # Addition of gradients corresponds to a SUM in the objective, but
        # instead of a SUM we want MEAN. Scale the loss here so it comes out right
        loss = loss / grad_accum_steps
        loss_accum += loss.detach()
        loss.backward()
    if ddp:
        dist.all_reduce(loss_accum, op=dist.ReduceOp.AVG)
    # Clip the global gradient norm to 1.0
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    # Determine and set the learning rate for this iteration.
    # get_lr takes the whole schedule explicitly; it has no default arguments.
    lr = get_lr(step, warmup_steps, max_steps, max_lr, min_lr)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    optimizer.step()
    if device_type == "cuda":
        torch.cuda.synchronize() # wait for the GPU to finish work
    t1 = time.time()
    dt = t1 - t0 # time difference in seconds
    tokens_processed = train_loader.B * train_loader.T * grad_accum_steps * ddp_world_size
    tokens_per_sec = tokens_processed / dt
    if master_process:
        print(f"step {step:5d} | loss: {loss_accum.item():.6f} | lr {lr:.4e} | norm: {norm:.4f} | dt: {dt*1000:.2f}ms | tok/sec: {tokens_per_sec:.2f}")
        with open(log_file, "a") as f:
            f.write(f"{step} train {loss_accum.item():.6f}\n")

if ddp:
    destroy_process_group()

----