In [1]:
# Load the model and training configurations from their JSON files
import json

# Model configuration; the path is relative to the current working directory
with open('../config/model.json') as f:
    model_config = json.load(f)
print(model_config)

# Training configuration, read the same way from a separate file
with open('../config/train.json') as f:
    train_config = json.load(f)
print(train_config)

{'batch_size': 32, 'block_size': 32, 'vocab_size': 12992, 'n_embd': 384, 'n_head': 12, 'n_layer': 12, 'dropout': 0.0}
{'learning_rate': 0.0003, 'max_iters': 100000, 'lr_decay_iters': 100000, 'weight_decay': 0.1, 'beta1': 0.9, 'beta2': 0.95, 'warmup_iters': 1000, 'eval_iters': 2000, 'eval_interval': 2500, 'log_interval': 10}


In [2]:
from easydict import EasyDict

# Wrap the dicts in EasyDict so that their values can be read as attributes
# (e.g. model_config.n_embd instead of model_config['n_embd'])
model_config = EasyDict(model_config)
train_config = EasyDict(train_config)

In [3]:
import inspect
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
# All building blocks for the GPT: LayerNorm, Self Attention and MLP 
class LayerNorm(nn.Module):
    """Layer normalization with a learnable scale and no bias term.

    Args:
        n_dim: Size of the trailing dimension that is normalized; also the
            length of the learnable ``weight`` vector (initialised to ones).
    """

    def __init__(self, n_dim) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(n_dim))

    def forward(self, input):
        """Normalize ``input`` over its last dimension and scale it by ``self.weight``."""
        # Bias is not used in this model. ``F.layer_norm`` expects a Tensor or
        # ``None`` for ``bias``; the boolean ``False`` is rejected with a TypeError,
        # so "no bias" has to be spelled ``None``.
        return F.layer_norm(input, self.weight.shape, self.weight, bias=None, eps=1e-5)

class SelfAttention(nn.Module):
    """Multi-head causal self-attention without bias terms.

    Args:
        config: Object exposing ``n_embd``, ``n_head``, ``dropout`` and
            ``block_size`` as attributes. ``n_embd`` must be divisible by
            ``n_head``.
    """

    def __init__(self, config) -> None:
        super().__init__()
        assert config.n_embd % config.n_head == 0, "The remainder of embedding and head number should be zero."
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout

        # One projection produces query, key and value for all heads at once.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
        # Output projection applied after the heads are concatenated.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)

        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

        # Lower-triangular mask of shape (1, 1, block_size, block_size); zeros mark
        # the positions to the right of the diagonal that must not be attended to.
        self.register_buffer('bias', torch.tril(torch.ones(config.block_size, config.block_size))
                                          .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (B, T, C) and return the same shape."""
        B, T, C = x.shape # batch_size, sequence length (at most block_size), embedding dimension (n_embd)

        q, k, v = torch.split(self.c_attn(x), split_size_or_sections=self.n_embd, dim=2)
        # (B, T, C) -> (B, n_head, T, head_dim)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        # Scaled dot-product scores, shape (B, n_head, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # Slice the mask to the actual sequence length T. The module never stores
        # a ``block_size`` attribute, and the input may be shorter than block_size.
        att = att.masked_fill(mask=(self.bias[:, :, :T, :T] == 0), value=float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)

        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        return self.resid_dropout(self.c_proj(y))

class MLP(nn.Module):
    """Position-wise feed-forward network: expand 4x, GELU, project back, dropout.

    Args:
        config: Object exposing ``n_embd`` and ``dropout`` as attributes.
    """

    def __init__(self, config) -> None:
        super().__init__()
        width = config.n_embd
        hidden_width = 4 * width
        # Sub-modules are registered in the order they are applied in forward().
        self.c_fc = nn.Linear(width, hidden_width, bias=False)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden_width, width, bias=False)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Return the feed-forward transformation of ``x``; the last dimension is preserved."""
        hidden = self.gelu(self.c_fc(x))
        return self.dropout(self.c_proj(hidden))
    
class Block(nn.Module):
    """One pre-norm transformer layer: attention and MLP, each inside a residual connection.

    Args:
        config: Configuration object forwarded to ``SelfAttention`` and ``MLP``;
            ``n_embd`` is also used to size both layer norms.
    """

    def __init__(self, config) -> None:
        super().__init__()
        width = config.n_embd
        self.ln_1 = LayerNorm(width)
        self.attn = SelfAttention(config)
        self.ln_2 = LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Normalization is applied to the input of each sub-layer, not to its output.
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))

In [5]:
class GPT(nn.Module):
    """Decoder-only transformer language model built from a stack of ``Block`` layers.

    Args:
        config: Object exposing ``vocab_size``, ``block_size``, ``n_embd``,
            ``n_layer`` and ``dropout`` as attributes, plus whatever ``Block``
            reads from it. It is stored on the instance as ``self.config``.
    """

    def __init__(self, config) -> None:
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        # The transformer architecture
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),  # token embeddings
            wpe = nn.Embedding(config.block_size, config.n_embd),  # learned position embeddings
            drop = nn.Dropout(config.dropout),
            attn = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = LayerNorm(config.n_embd)
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight tying
        # share the same weight between the input embedding and output layer
        self.transformer.wte.weight = self.lm_head.weight

        # init all weights
        self.apply(self._init_weights)
        # Residual output projections are re-initialised with a smaller std that
        # shrinks with the number of layers.
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters in the model
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

    def _init_weights(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02); used with ``self.apply``."""
        # Only weights are initialised here: every Linear defined in this notebook
        # is created with bias=False, so there is no bias to reset.
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def get_num_params(self, non_embedding=True):
        """Return the number of parameters in the model.

        Args:
            non_embedding: When True (default), the position-embedding parameters
                are subtracted from the total. The token embedding is still
                counted because its weight is shared with ``lm_head``.
        """
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            # the position parameter will be subtracted
            n_params -= self.transformer.wpe.weight.numel()
        return n_params

    def forward(self, idx, targets=None):
        """Run the model on a batch of token indices.

        Args:
            idx: Integer tensor of shape (b, t) with token indices; ``t`` must
                not exceed ``config.block_size``.
            targets: Optional tensor of target token indices with one entry per
                position of ``idx``. Entries equal to -1 are ignored by the loss.

        Returns:
            ``(logits, loss)``. With ``targets``, ``logits`` covers every position
            and ``loss`` is the cross-entropy. Without ``targets``, ``logits``
            covers only the last position (time dimension of size 1) and ``loss``
            is ``None``.

        Raises:
            AssertionError: If the sequence is longer than ``config.block_size``.
        """
        device = idx.device
        b, t = idx.size()
        # The position-embedding table only has block_size rows; fail with a
        # clear message instead of an opaque index error inside nn.Embedding.
        assert t <= self.config.block_size, \
            f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        # the forward pass
        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)  # (t, n_embd), broadcast over the batch
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.attn:
            x = block(x)
        x = self.transformer.ln_f(x)

        # final outcome based on different mode
        if targets is not None: # in training mode
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), targets.view(-1), ignore_index=-1)
        else: # in the inference mode
            logits = self.lm_head(x[:, [-1], :]) # using list [-1] to preserve the time dim
            loss = None

        return logits, loss

In [6]:
# Instantiate the model; the GPT constructor prints the parameter count
model = GPT(model_config)

number of parameters: 26.23M


In [7]:
def configure_optimizers(model, weight_decay, learning_rate, betas, device_type):
    """Build an AdamW optimizer with weight decay applied only to tensors of rank >= 2.

    Args:
        model: Module whose trainable parameters are optimized.
        weight_decay: Decay coefficient for the decayed parameter group.
        learning_rate: Learning rate passed to AdamW.
        betas: ``(beta1, beta2)`` pair passed to AdamW.
        device_type: Device string; the fused AdamW kernel is requested only
            when this is ``'cuda'`` and the installed torch supports it.

    Returns:
        The configured ``torch.optim.AdamW`` instance.
    """
    # Frozen parameters are never handed to the optimizer.
    trainable = [param for _, param in model.named_parameters() if param.requires_grad]

    # Tensors with two or more dimensions (matmul weights, embeddings) are decayed;
    # one-dimensional tensors (biases, layer-norm scales) are not.
    decay_params = [param for param in trainable if param.dim() >= 2]
    nodecay_params = [param for param in trainable if param.dim() < 2]
    optim_groups = [
        dict(params=decay_params, weight_decay=weight_decay),
        dict(params=nodecay_params, weight_decay=0.0),
    ]

    decay_count = sum(param.numel() for param in decay_params)
    nodecay_count = sum(param.numel() for param in nodecay_params)
    print(f"num decayed parameter tensors: {len(decay_params)}, with {decay_count:,} parameters")
    print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {nodecay_count:,} parameters")

    # Use the fused AdamW implementation when this torch version offers it and we run on CUDA.
    supports_fused = 'fused' in inspect.signature(torch.optim.AdamW).parameters
    use_fused = supports_fused and device_type == 'cuda'
    adamw_kwargs = {'fused': True} if use_fused else {}
    optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **adamw_kwargs)
    print(f"using fused AdamW: {use_fused}")

    return optimizer

In [10]:
# Build the optimizer from the training configuration (CPU, so no fused kernel)
optimizer = configure_optimizers(
    model,
    weight_decay=train_config.weight_decay,
    learning_rate=train_config.learning_rate,
    betas=(train_config.beta1, train_config.beta2),
    device_type='cpu',
)

num decayed parameter tensors: 50, with 26,234,880 parameters
num non-decayed parameter tensors: 25, with 9,600 parameters
using fused AdamW: False


In [11]:
# Display the optimizer's parameter groups and their hyperparameters
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.95)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0003
    maximize: False
    weight_decay: 0.1

Parameter Group 1
    amsgrad: False
    betas: (0.9, 0.95)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0003
    maximize: False
    weight_decay: 0.0
)

In [12]:
def estimate_mfu(model, fwdbwd_per_iter, dt, flops_promised=29.15e12):
    """Estimate model FLOPs utilization (MFU) as a fraction of peak hardware FLOPS.

    Args:
        model: Object exposing ``get_num_params()`` and a ``config`` with
            ``n_layer``, ``n_head``, ``n_embd`` and ``block_size``.
        fwdbwd_per_iter: Number of forward/backward passes performed in one
            training iteration.
        dt: Wall-clock duration of one iteration, in seconds.
        flops_promised: Peak FLOPS of the hardware. Defaults to 29.15e12, the
            RTX 4070 half-precision figure this notebook was written against.

    Returns:
        Achieved FLOPS divided by ``flops_promised``.
    """
    N = model.get_num_params()
    cfg = model.config
    L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size

    # FLOPs estimate per token, then per sequence of T tokens, then per iteration.
    flops_per_token = 6 * N + 12 * L * H * Q * T
    flops_per_fwdbwd = flops_per_token * T
    # Scale by the number of forward/backward passes in the iteration (the
    # per-pass FLOPs must not be multiplied by themselves).
    flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter

    # Express the achieved throughput as a ratio of the hardware's peak FLOPS.
    flops_achieved = flops_per_iter * (1.0/dt)
    mfu = flops_achieved / flops_promised

    return mfu