In [3]:
# Shell out to `nvidia-smi` to list the NVIDIA GPU(s) this runtime can see.
!nvidia-smi

#### Tutorials:
- YouTube video: https://youtu.be/wM-KP_wNAeY
- Bilibili: https://www.bilibili.com/video/BV1P9tizcEKD/

Topics covered:
- RoPE, GQA, RMSNorm, Muon optimizer, SwiGLU, Hugging Face tokenizer and datasets, training, validation, AMP (automatic mixed precision)

#

In [None]:
# 1. setup
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

import math, random, os, pickle, warnings, time
import numpy as np

from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import load_dataset
from dataclasses import dataclass
from typing import List, Optional
warnings.filterwarnings("ignore")


In [None]:
# 2. utility functions
def set_seed(seed: int = 42):
    """Seed every random number generator used here and pin cuDNN to deterministic mode.

    Args:
        seed: value passed to Python's `random`, NumPy, and PyTorch
            (CPU generator and all CUDA devices).
    """
    rng_seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in rng_seeders:
        seed_rng(seed)

    # no cuDNN autotuning, deterministic kernels only: favors reproducibility over speed
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    print(f"Set all seeds to {seed}")

In [8]:
# 3. model config
# Scaled-down version of Qwen3: d_model = 384, 6 layers, 8 attention heads
@dataclass
class ModelConfig:
    """Hyperparameters for the scaled-down model and its training run.

    Besides the declared fields, `__post_init__` sets two derived attributes:
    `d_k` (d_model // n_heads) and `n_kv_groups` (n_heads // n_kv_heads).
    """

    # --- Model architecture ---
    d_model: int = 384
    n_heads: int = 8
    n_layers: int = 6
    d_ff: int = 1536
    batch_size: int = 24
    max_steps: int = 5000

    # --- Qwen-specific parameters ---
    n_kv_heads: int = 4
    sliding_window_size: int = 4096
    attention_bias: bool = False
    rms_norm_eps: float = 1e-6

    # --- Training ---
    gradient_accumulation_steps: int = 4
    muon_lr: float = 1e-2

    # --- Data ---
    max_seq_len: int = 8192
    num_documents: int = 2000
    max_tokens: int = 500000

    # --- Evaluation ---
    eval_every: int = 500
    eval_steps: int = 100

    # --- Regularization ---
    weight_decay: float = 0.1
    dropout: float = 0.1
    grad_clip: float = 1.0

    # --- Technical ---
    use_amp: bool = True
    vocab_size: Optional[int] = None

    def __post_init__(self):
        """Derive `d_k` and `n_kv_groups`, asserting that both divisions are exact."""
        # width of a single attention head
        self.d_k, head_remainder = divmod(self.d_model, self.n_heads)
        assert head_remainder == 0, "d_model must be divisible by n_heads"

        # number of query heads that share one key/value head
        kv_groups, kv_remainder = divmod(self.n_heads, self.n_kv_heads)
        assert kv_remainder == 0, "n_heads must be divisible by n_kv_heads"
        self.n_kv_groups = kv_groups

In [34]:
# 4. Grouped Query Attention Module (GQA)
# reduce memory usage by using fewer k,v heads
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Duplicate every key/value head `n_rep` times along the head axis.

    Args:
        hidden_states: tensor of shape (batch, num_key_value_heads, seq_len, head_dim).
        n_rep: how many copies of each head the result should contain.

    Returns:
        Tensor of shape (batch, num_key_value_heads * n_rep, seq_len, head_dim) in
        which the copies of one head sit next to each other. When `n_rep == 1` the
        input tensor itself is returned.
    """
    if n_rep == 1:
        # nothing to repeat
        return hidden_states

    bsz, kv_heads, length, dim = hidden_states.shape

    # (bsz, kv_heads, length, dim)
    #   -> (bsz, kv_heads, 1, length, dim)       singleton axis after the head axis
    #   -> (bsz, kv_heads, n_rep, length, dim)   broadcast that axis to n_rep
    tiled = hidden_states[:, :, None, :, :].expand(bsz, kv_heads, n_rep, length, dim)

    # fold (kv_heads, n_rep) into one head axis:
    #   -> (bsz, kv_heads * n_rep, length, dim)
    return tiled.reshape(bsz, kv_heads * n_rep, length, dim)


In [32]:
# [Optional] Code Exercises

# ==============================================================
# Exercise 1: Basic Tensor Creation & Shapes
# ==============================================================
print("=" * 50)
print("Exercize #1: Tensor Creation & Shapes Exercise")
# the nesting depth of the Python list determines the tensor's number of dimensions
x = torch.tensor([1, 2, 3])
print(f"1D tensor: {x}, shape: {x.shape} ")

y = torch.tensor([[1, 2, 3], [4, 5, 6]])
print(f"2D tensor: {y}, shape: {y.shape} ")

z = torch.tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
print(f"3D tensor: {z}, shape: {z.shape} ")

# only the shape is printed: all 2*3*4*5 = 120 elements would clutter the output
tensor_4d = torch.ones((2, 3, 4, 5))
print(f"4D tensor shape: {tensor_4d.shape} ")

# clone() returns a copy of the tensor
tensor_4d_copy = tensor_4d.clone()
print(f"Cloned 4D tensor shape: {tensor_4d_copy.shape} ")

# ==============================================================
# Exercise 2: Understanding None Indexing for dimension expansion
# ==============================================================
print("=" * 50)
print("Exercize #2: Understanding None Indexing for dimension expansion")
# torch.tensor() is the documented way to build a tensor from data (the legacy
# torch.Tensor(...) constructor is avoided); float literals keep the dtype float32
a = torch.tensor([1.0, 2.0, 3.0, 4.0])
print(f"Original tensor a: {a}, shape: {a.shape}")
# indexing with None inserts a size-1 dimension at that position
a_new_dim0 = a[None, :]  # (4,) -> (1, 4); same as a.unsqueeze(0)
print(f"Added new dimension at index 0: {a_new_dim0}, shape: {a_new_dim0.shape}")
a_new_dim1 = a[:, None]  # (4,) -> (4, 1); same as a.unsqueeze(1)
print(f"Added new dimension at index 1: {a_new_dim1}, shape: {a_new_dim1.shape}")
a_new_dim_end = a[..., None]  # (4,) -> (4, 1); same as a.unsqueeze(-1)
print(f"Added new dimension at end: {a_new_dim_end}, shape: {a_new_dim_end.shape}")
# several new dimensions in one indexing expression
a_multi_new_dims = a[None, :, None, None]  # (4,) -> (1, 4, 1, 1); same as a.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
print(f"Added multiple new dimensions: {a_multi_new_dims}, shape: {a_multi_new_dims.shape}")


In [None]:

# ==============================================================
# Exercise 3: Understanding Expand
# ==============================================================
print(50 * "=")
print("Exercize #3: Understanding Expand")

# expand() broadcasts size-1 dimensions to a larger size and returns a view:
# the repeated elements are not copied

# (1, 3) -> (4, 3): grow the leading size-1 dimension to 4
b = torch.tensor([[1, 2, 3]])
b_expanded = b.expand(4, 3)
print(f"Original tensor b shape: {b.shape}, b: {b}")
print(f"Expanded tensor b shape: {b_expanded.shape}, \nb_expanded: {b_expanded}")

# (3, 1) -> (3, 4): passing -1 leaves that dimension at its current size
c = torch.tensor([[1], [2], [3]])
c_expanded = c.expand(-1, 4)
print(f"Original tensor c shape: {c.shape}, c: {c}")
print(f"Expanded tensor c shape: {c_expanded.shape}, \nc_expanded: {c_expanded}")




In [None]:

# repeat() across multiple dimensions: repeat(3, 2) tiles the tensor 3 times
# along dim 0 and 2 times along dim 1. Unlike expand(), repeat() copies the data.
d = torch.tensor([[1, 2]])  # shape (1, 2)
print(f"Original tensor d shape: {d.shape}, d: {d}")

d_expanded = d.repeat(3, 2)  # shape (1*3, 2*2) = (3, 4)
print(f"Expanded tensor d shape: {d_expanded.shape}, \nd_expanded: {d_expanded}")
