In [None]:
# Test HuggingFace GPT2 model

from transformers import GPT2LMHeadModel
from transformers import pipeline, set_seed

# Load the pretrained 124M-parameter GPT-2 checkpoint from HuggingFace and keep
# its state dict around so parameter names/shapes can be inspected.
model_hf = GPT2LMHeadModel.from_pretrained("gpt2") # 124M
sd_hf = model_hf.state_dict()

# Print every entry of the state dict together with its tensor shape.
for k, v in sd_hf.items():
    print(k, v.shape)

# Use the first GPU when one is present; otherwise fall back to the CPU
# (pipeline device -1) instead of crashing on a CUDA-less machine.
import torch
hf_device = 0 if torch.cuda.is_available() else -1

generator = pipeline('text-generation', model=model_hf, tokenizer='gpt2', device=hf_device)
set_seed(42)    # fixed seed so the sampled continuations are reproducible
generator("Hello, I'm a language model,", max_length=30, num_return_sequences=5)

In [2]:
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import inspect
import os

import math
import tiktoken

from torch.distributed import init_process_group, destroy_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist

# A RANK environment variable marks this process as one worker of a DDP run.
ddp = int(os.environ.get('RANK', -1)) != -1     # True if ddp run
if not ddp:
    # Single-process run: this process is the master and owns the only device.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    ddp_rank, ddp_local_rank, ddp_world_size = 0, 0, 1
    master_process = True
    print(f'Using device {device}')
else:
    # Distributed run: the nccl backend requires CUDA.
    assert torch.cuda.is_available()
    init_process_group(backend='nccl')
    ddp_rank = dist.get_rank()
    ddp_world_size = dist.get_world_size()
    ddp_local_rank = int(os.environ.get('LOCAL_RANK', 0))
    # Bind this process to the GPU selected by its local rank.
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0

# Device family without the index ('cuda:3' -> 'cuda').
if device.startswith('cuda'):
    device_type = 'cuda'
else:
    device_type = 'cpu'

Using device cuda


In [3]:
class DataLoaderLite:
    """Minimal sequential loader that serves (input, target) token batches.

    The whole text file is tokenized once with the GPT-2 tiktoken encoding and
    kept in memory. Every call to ``next()`` returns a ``(B, T)`` input tensor
    and the matching ``(B, T)`` target tensor (the inputs shifted by one token).
    With several processes, each process starts at its own ``B*T`` offset and
    strides by ``B*T*num_processes`` so the processes read disjoint chunks.

    Args:
        B: number of sequences per batch.
        T: number of tokens per sequence.
        process_rank: index of this process among ``num_processes``.
        num_processes: number of processes reading the data in lock-step.
        data_path: text file to tokenize (defaults to ``shakespeare.txt``).

    Raises:
        ValueError: if the file holds fewer than ``B*T*num_processes + 1``
            tokens, i.e. not every process could build a single batch.
    """

    def __init__(self, B, T, process_rank=0, num_processes=1, data_path='shakespeare.txt'):
        self.B = B
        self.T = T
        self.process_rank = process_rank
        self.num_processes = num_processes

        # Read with an explicit encoding so the result does not depend on the
        # platform's default locale.
        with open(data_path, 'r', encoding='utf-8') as f:
            text = f.read()
        enc = tiktoken.get_encoding('gpt2')
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens)
        print(f"loaded {len(self.tokens)} tokens")
        print(f"1 epoch = {len(self.tokens) // (B * T)} batches")

        # Fail early with a readable message instead of an opaque view() error
        # on the first batch.
        min_tokens = B * T * num_processes + 1
        if len(self.tokens) < min_tokens:
            raise ValueError(
                f"dataset has {len(self.tokens)} tokens but at least {min_tokens} "
                f"are needed for B={B}, T={T}, num_processes={num_processes}"
            )

        self.current_position = B*T*process_rank

    def __iter__(self):
        """Return self so the loader can be used in ``for`` loops and with ``iter()``."""
        return self

    def __next__(self):
        """Return the next ``(x, y)`` batch and advance, wrapping at the end of the data."""
        B, T = self.B, self.T

        # One extra token is needed so the targets can be shifted by one position.
        buf = self.tokens[self.current_position:self.current_position + B*T + 1]
        x = buf[:-1].view(B, T)
        y = buf[1:].view(B, T)

        # Skip over the chunks consumed by the other processes.
        self.current_position += B*T*self.num_processes
        # Start over if a full next round would run past the end of the data.
        if self.current_position + B*T*self.num_processes + 1 > len(self.tokens):
            self.current_position = self.B*self.T*self.process_rank

        return x, y


* Learnable positional encodings. Original Attention paper uses sins and cosines with different frequencies for positional encoding.
* Layer Norm is done in the order that preserves the gradient to the input.
* The MLP is applied to individual tokens
* The heads are a reduce operation (communication)
* Weight initialization should be diffuse: at initialization the output probabilities should be roughly uniform, so the expected loss is -ln(1/50257) ≈ 10.82
* wte is shared with the lm_head: the justification is that embeddings of similar tokens are supposed to be similar, and on output, similar tokens are supposed to have similar logits/probabilities.
* Weight initialization:
    * The weights of the projections that feed the residual path are scaled at initialization by 1/sqrt(number of branches added to the residual path), so that the variance of the residual stream stays normalized
* Weight decay forces more of the weights to be active (distributing computation rather than focusing on select weights that get blown up too large)
### Scaling: Dtypes
* Default tensor dtype: float32
* INT8: used in inference. float is better to estimate the normal distributions in training.
* GPU memory bandwidth. Usually a bottleneck in highly optimized pipelines.
### Tensor cores:
* Is just an instruction: (4 x 4) matrix multiplication
* Configurations: precision
* Check A100 docs
* Multiply accumulate operations
* Internally precision is truncated to speed up operations. Empirically the difference is negligible.
* watch -n 0.1 nvidia-smi (check GPU status)
* Using TensorFloat32 -> Expected 8x speed up. Memory bandwidth bottleneck (just 3x speed up)
* BF16 -> same range as TF32/FP32, but reduced precision. Use autocast!
* FP16 -> requires gradient scalers
### Speedups:
* model = torch.compile(model): 
    * reduce python overhead: It sees the whole model and compiles it as a single object, rather than interpreting the Python code line by line.
    * reduce GPU reads/writes: kernel fusion. Multiple operations are fused into a single operation on the GPU and avoid roundtrips to the HBM for storing and loading intermediate values.
* Memory:
    * HBM
    * on chip caches (Fast but very small)
* Flash Attention:
    * also kernel fusion. More flops but faster because stay on GPU chip
    * Online softmax evaluation using intermediate values
* nice numbers: powers of 2
    1) scan your code for ugly numbers (not many powers of 2)
        50257 -> 50304: 97 ms -> 93 ms with just one change! even though we're adding FLOPs. EVEN MORE IMPROVEMENT IF USING DIFF PYTORCH VERSION
    2) CUDA Kernels work on powers of 2 and have side kernels for remaining ugly numbers. Operations are done on side kernels
* GPT-3 Improvements:
    * Mainly by training more and larger context window
    * Hyperparameters:
        1) Adam opt: beta1, beta2, eps = 0.9, 0.95, 1e-8
        2) Clip grad norm: in the case of bad batches. Avoid shocks
            * The norm is a good indicator of training stability
        3) learning rate schedule
        4) increasing batch size linearly:
            * Early in training the model just learns to reduce the probability of unseen tokens. Not much is learned other than biases, so the gradients are highly correlated
            * Later on gradients become decorrelated
* fused AdamW: kernel fusion again -> single update on all parameters rather than multiple kernels
### Gradient Accumulation:
* In an attempt to simulate larger batch size, we accumulate the gradient of multiple smaller batches
### DistributedDataParallel:
* To use multiple GPUs
* 8 GPUs -> 8 Processes
* use torchrun not python
* Parameters:
    * World_size: number of processes
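    * Rank: global index of the process (one process per GPU) across all nodes
    * Local rank: index of the GPU within a single node; it differs from the rank only when you have multiple nodes/boxes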
    * device is set to cuda:{gpu_local_rank}
    * master_process flag
* torchrun
* Forward pass is identical
* Backward pass -> allreduce() on gradients (average); the averaged gradients are stored on each GPU
    * allreduce is triggered by every loss.backward(); with gradient accumulation we don't want that, we want to sync only on the last micro-step
* model is now a ddp object: need to get model.module for some operations

In [4]:

@dataclass
class GPTConfig:
    """Hyperparameters that define the GPT architecture.

    The default values are the same as the "gpt2" entry of the config table in
    GPT.from_pretrained.
    """
    vocab_size: int = 50257     # rows of the token embedding / width of the output logits
    block_size: int = 1024      # rows of the position embedding, i.e. the longest supported sequence
    n_layer: int = 12           # number of transformer Blocks stacked in the model
    n_head: int = 12            # attention heads per block; must divide n_embd evenly
    n_embd: int = 768           # embedding (channel) width used throughout the model


class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with a single fused q/k/v projection."""

    def __init__(self, config):
        super().__init__()
        # The channels are divided evenly among the heads.
        assert config.n_embd % config.n_head == 0

        width = config.n_embd
        self.c_attn = nn.Linear(width, 3 * width)   # q, k and v from one matmul
        self.c_proj = nn.Linear(width, width)       # output projection
        # Marker read by GPT._init_weights to shrink the init std of this layer.
        self.c_proj.NANOGPT_SCALE_INIT = 1

        self.n_head = config.n_head
        self.n_embed = config.n_embd

    def _split_heads(self, t):
        """Reshape (B, T, C) into (B, nh, T, hs) so each head is attended independently."""
        B, T, C = t.size()
        return t.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

    def forward(self, x):
        """Apply causal self-attention to x of shape (B, T, C); the output has the same shape."""
        B, T, C = x.size()

        # One projection, then cut the last dimension into the three equal parts.
        q, k, v = self.c_attn(x).chunk(3, dim=-1)
        q = self._split_heads(q)
        k = self._split_heads(k)
        v = self._split_heads(v)

        # Fused attention kernel; is_causal masks out future positions.
        attended = F.scaled_dot_product_attention(q, k, v, is_causal=True) # flash attention

        # (B, nh, T, hs) -> (B, T, nh, hs) -> (B, T, C): put the heads side by side again.
        merged = attended.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(merged)



class MLP(nn.Module):
    """Feed-forward sub-layer: widen to 4x n_embd, tanh-approximated GELU, project back."""

    def __init__(self, config):
        super().__init__()

        width = config.n_embd
        hidden = width * 4
        self.c_fc = nn.Linear(width, hidden)
        self.c_proj = nn.Linear(hidden, width)
        self.act = nn.GELU(approximate="tanh")

        # Marker read by GPT._init_weights to shrink the init std of this layer.
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        """Run expand -> activation -> project on the last dimension of x."""
        return self.c_proj(self.act(self.c_fc(x)))

class Block(nn.Module):
    """One transformer layer: pre-norm attention and pre-norm MLP, each added to the residual."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.ln_2 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return x after the attention and MLP residual updates."""
        # LayerNorm is applied only on the branch; the residual path itself stays untouched.
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

class GPT(nn.Module):
    """GPT-2 style decoder-only language model.

    The model is a token embedding plus a learned position embedding, a stack of
    ``config.n_layer`` Blocks, a final LayerNorm and a bias-free linear head.
    The head shares its weight matrix with the token embedding.
    """

    def __init__(self, config):
        """Build the modules from ``config`` and initialize all weights."""
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing: the output head and the token embedding are one tensor
        self.lm_head.weight = self.transformer['wte'].weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize one sub-module (called for every sub-module via ``self.apply``).

        Linear and Embedding weights are drawn from N(0, 0.02^2) and Linear
        biases are zeroed. Linear layers tagged with ``NANOGPT_SCALE_INIT`` use
        a std scaled by ``(2 * n_layer) ** -0.5``; every Block adds two branches
        (attention and MLP) to the residual path.
        """
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer)**-0.5
            module.weight.data.normal_(mean=0.0, std=std)      # Close to Xavier initialization
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)

    def forward(self, x, targets = None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            x: integer token ids of shape (B, T) with T <= config.block_size.
            targets: optional integer token ids of shape (B, T).

        Returns:
            ``(logits, loss)`` where logits has shape (B, T, vocab_size) and
            loss is a scalar tensor, or None when ``targets`` is None.
        """
        T = x.size(1)
        # The position embedding only has block_size rows.
        assert T <= self.config.block_size, \
            f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"

        positions = torch.arange(T, device=x.device)
        x = self.transformer['wte'](x) + self.transformer['wpe'](positions)
        for block in self.transformer['h']:
            x = block(x)
        x = self.transformer['ln_f'](x)
        logits = self.lm_head(x)

        if targets is not None:
            # Flatten batch and time so every position is one classification example.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        else:
            loss = None

        return logits, loss

    def generate(self, x, max_len=100, num_return_sequences=5):
        """Extend every row of ``x`` by ``max_len`` tokens using top-50 sampling.

        The model is switched to eval mode and stays in eval mode afterwards.

        Args:
            x: integer token ids of shape (B, T) used as the prompt.
            max_len: number of new tokens appended to every row.
            num_return_sequences: currently unused and kept only for interface
                compatibility; the number of sequences is ``x.size(0)``.

        Returns:
            Tensor of shape (B, T + max_len) holding prompt and continuation.
        """
        self.eval()
        block_size = self.config.block_size
        with torch.no_grad():
            for _ in range(max_len):
                # Only the last block_size tokens fit into the position embedding.
                context = x[:, -block_size:]
                # forward returns (logits, loss); the loss is None here.
                logits, _ = self(context)
                probs = F.softmax(logits[:, -1], dim=-1)
                # Sample among the 50 most likely tokens (fewer for tiny vocabularies).
                k = min(50, probs.size(-1))
                topk_probs, topk_indices = torch.topk(probs, k, dim=-1)
                next_token = torch.multinomial(topk_probs, num_samples=1)
                # Map the sampled position inside the top-k list back to a token id.
                xcol = torch.gather(topk_indices, -1, next_token)
                x = torch.cat((x, xcol), dim=1)

        return x

    def configure_optimizers(self, weight_decay, learning_rate, device):
        """Create an AdamW optimizer with weight decay on tensors of rank >= 2 only.

        Args:
            weight_decay: decay applied to parameters with two or more dimensions.
            learning_rate: initial learning rate of both parameter groups.
            device: device the parameters live on; the fused kernel is requested
                only when this names a CUDA device and the installed AdamW
                accepts the ``fused`` argument.

        Returns:
            The configured ``torch.optim.AdamW`` instance.
        """
        # Get all parameters that require a gradient
        param_dict = {k: p for k, p in self.named_parameters() if p.requires_grad}

        # Get parameters that require weight decay, and those that don't
        decay_params = [v for k, v in param_dict.items() if v.dim() >= 2]
        no_decay_params = [v for k, v in param_dict.items() if v.dim() < 2]     # bias, layernorm, etc (1D params)

        optim_groups = [
            {"params": decay_params, "weight_decay": weight_decay},
            {"params": no_decay_params, "weight_decay": 0.0},
        ]

        num_decay_params = sum(p.numel() for p in decay_params)
        num_no_decay_params = sum(p.numel() for p in no_decay_params)
        print(f"weight decay: {num_decay_params} params, no weight decay: {num_no_decay_params} params")

        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters        # use fused optimizer if available
        used_fused = fused_available and 'cuda' in str(device)
        print(f"Using fused Adam: {used_fused}")
        # Actually request the fused kernel; the keyword is omitted entirely
        # when it is unsupported or the parameters are not on a CUDA device.
        extra_args = {"fused": True} if used_fused else {}
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, **extra_args)
        return optimizer

    @classmethod        # A constructor
    def from_pretrained(cls, model_type):
        """Build a GPT and copy in the HuggingFace GPT-2 weights of ``model_type``.

        Args:
            model_type: one of "gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl".

        Returns:
            A GPT instance whose parameters equal the pretrained checkpoint.

        Raises:
            AssertionError: on an unknown model type, or when parameter names
                or shapes of the two models do not line up.
        """
        assert model_type in ["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"]
        from transformers import GPT2LMHeadModel
        print(f"Loading {model_type} from transformers")

        config_args = {
            "gpt2": dict(n_layer=12, n_head=12, n_embd=768, block_size=1024, vocab_size=50257),
            "gpt2-medium": dict(n_layer=24, n_head=16, n_embd=1024, block_size=1024, vocab_size=50257),
            "gpt2-large": dict(n_layer=36, n_head=20, n_embd=1280, block_size=1024, vocab_size=50257),
            "gpt2-xl": dict(n_layer=48, n_head=25, n_embd=1600, block_size=1024, vocab_size=50257),
        }[model_type]

        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        # Mask buffers are not parameters, so they are left out of the comparison.
        sd_keys = [k for k in sd.keys() if not k.endswith('.attn.bias')]

        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # Get hugging face keys whose values we want to copy
        sd_keys_hf = [
            k for k in sd_hf.keys()
            if not k.endswith('.attn.masked_bias') and not k.endswith('.attn.bias')
        ]

        # some weights are transposed in the huggingface model, so we need to transpose them back
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        assert len(sd_keys_hf) == len(sd_keys), \
            f"mismatched keys {set(sd_keys) ^ set(sd_keys_hf)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # Stored as (in, out) by HuggingFace; nn.Linear keeps (out, in).
                if sd_hf[k].shape[::-1] != sd[k].shape:
                    raise AssertionError(f"Error: {k} {sd_hf[k].shape[::-1]} != {sd[k].shape}")
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # Report the shapes as they are compared (no reversal here).
                if sd_hf[k].shape != sd[k].shape:
                    raise AssertionError(f"Error: {k} {sd_hf[k].shape} != {sd[k].shape}")
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

In [5]:
import time

# Seed the CPU (and, when present, the CUDA) RNG before the model is created
# so the random weight initialization is repeatable.
torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

# Tokens consumed per optimizer step across all processes. One forward/backward
# pass on one process handles B*T tokens, so each process accumulates gradients
# over grad_acc_steps micro-batches to reach the total.
total_batch_size = 524288   # number of tokens 2^19
B = 8       # sequences per micro-batch
T = 1024    # tokens per sequence
assert total_batch_size % (B*T*ddp_world_size) == 0
grad_acc_steps = total_batch_size // (B*T*ddp_world_size)

# Each process reads its own chunks of the data, selected by its rank.
train_loader = DataLoaderLite(B=B, T=T, process_rank=ddp_rank, num_processes=ddp_world_size)

torch.set_float32_matmul_precision('high') # TensorFloat32

# create model
# vocab_size is overridden to 50304 instead of the GPTConfig default of 50257.
model = GPT(GPTConfig(vocab_size=50304))
model.to(device)
model = torch.compile(model)
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])
# Handle to the model without the DDP wrapper, used for calls DDP does not forward.
raw_model = model.module if ddp else model

max_lr = 6e-4               # peak learning rate, reached at the end of warmup
min_lr = max_lr * 0.1       # floor of the schedule
warmup_steps = 10           # number of linear warmup steps
max_steps = 50              # step at which the cosine decay reaches min_lr
def get_lr(it):
    """Return the learning rate for optimizer step ``it``.

    The schedule has three phases:
      * ``it < warmup_steps``: linear warmup from ``min_lr`` towards ``max_lr``;
      * ``warmup_steps <= it <= max_steps``: cosine decay from ``max_lr`` to ``min_lr``;
      * ``it > max_steps``: constant ``min_lr``.
    """
    # linear warmup
    if it < warmup_steps:
        return min_lr + (max_lr - min_lr) * it / warmup_steps

    # After max_steps, return min_lr
    if it > max_steps:
        return min_lr

    # In between, cosine decay down to min_lr
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))   # goes from 1 down to 0
    return min_lr + coeff * (max_lr - min_lr)

# optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9,0.95), eps=1e-8)      # 3e-4 is a good learning rate for debugging stage
# The initial learning rate is overwritten every step by get_lr(); max_lr is
# passed so the value is not duplicated as a second literal.
optimizer = raw_model.configure_optimizers(weight_decay=0.1, learning_rate=max_lr, device=device)

for step in range(max_steps):   # A step is a single optimization step
    t0 = time.time()
    optimizer.zero_grad()

    loss_accum = 0.0
    for micro_step in range(grad_acc_steps):   # A micro_step is a single forward and backward pass
        x, y = next(train_loader)
        x, y = x.to(device), y.to(device)
        # with torch.autocast(device_type=device_type, dtype=torch.bfloat16):    #  Not available for my GPU
        logits, loss = model(x, y)
        loss = loss / grad_acc_steps        # Scale the loss to get the mean of the grads not the sum
        loss_accum += loss.detach()
        if ddp:
            # only sync gradients across processes on the last micro-step
            model.require_backward_grad_sync = (micro_step == grad_acc_steps - 1)
        loss.backward()     # Accumulate gradients: adds the gradient of loss to the grad attribute of the model parameters

    if ddp:
        # Every process already holds a mean loss, so average (not sum) across
        # processes to report the mean over the whole batch.
        dist.all_reduce(loss_accum, op=dist.ReduceOp.AVG)

    # import code; code.interact(local=locals()) # drop into interactive shell
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    optimizer.step()
    if device_type == 'cuda':
        torch.cuda.synchronize()    # wait for everything to finish on GPU
    t1 = time.time()
    dt = (t1 - t0) * 1000
    # Tokens processed in this step by all processes together.
    tokens_per_sec = (train_loader.B * train_loader.T * grad_acc_steps * ddp_world_size) / (dt / 1000)
    if master_process:
        # .item() converts tensor to scalar and returns to CPU
        print(f"step {step} loss {loss_accum.item()} | norm: {norm:.4f} | dt: {dt:.2f}ms | tokens_per_sec: {tokens_per_sec}")
    # NOTE(review): stops after the first optimizer step; this looks like a
    # debugging leftover. Remove it to train for max_steps steps.
    break

if ddp:
    destroy_process_group()

loaded 338025 tokens
1 epoch = 41 batches
weight decay: 124354560 params, no weight decay: 121344 params
Using fused Adam: True


  return node.target(*args, **kwargs)


BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Cannot find a working triton installation. More information on installing Triton can be found at https://github.com/openai/triton

Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information


You can suppress this exception and fall back to eager by setting:
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True


In [None]:
# Show the memory address of the first element of HuggingFace's lm_head weight.
# NOTE(review): presumably compared by hand with the pointer of
# "transformer.wte.weight" to confirm the two tensors share storage — verify.
sd_hf["lm_head.weight"].data_ptr()

In [None]:
# Exit with status 0 at this point so nothing after this cell is executed.
import sys; sys.exit(0) # used to skip training for example

In [1]:
# Keep torch.compile failures fatal. Setting this flag to True instead
# suppresses the exception and falls back to eager execution, as the
# BackendCompilerFailed message printed by the training cell suggests.
import torch._dynamo
torch._dynamo.config.suppress_errors = False