<a href="https://colab.research.google.com/github/DhrubaAdhikary/ERA_V2/blob/master/S21_Training_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [2]:
# !python /content/train_get2-9-speedup9.py


In [3]:

import os
import math
import time
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F

class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANGPT_SCALE_INIT = 1
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y


class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


@dataclass
class GPTConfig:
    block_size: int = 256  # reduced max sequence length to fit into 4GB GPU
    vocab_size: int = 50304  # number of tokens
    n_layer: int = 6  # increased number of layers for better learning
    n_head: int = 8  # increased number of heads for better learning
    n_embd: int = 256  # increased embedding dimension for better learning


class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer.wpe(pos)
        tok_emb = self.transformer.wte(idx)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
        return optimizer

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"using device: {device}")

torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

num_return_sequences = 5
max_length = 30

import tiktoken

class DataLoaderLite:
    def __init__(self, B, T):
        self.B = B
        self.T = T
        with open('input.txt', 'r') as f:
            text = f.read()
        enc = tiktoken.get_encoding('gpt2')
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens)
        print(f'loaded {len(self.tokens)} tokens')
        print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T
        buf = self.tokens[self.current_position: self.current_position + B * T + 1]
        x = (buf[:-1]).view(B, T)
        y = (buf[1:]).view(B, T)
        self.current_position += B*T
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_position = 0
        return x, y

torch.set_float32_matmul_precision('high')
model = GPT(GPTConfig())
model.to(device)

max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 50  # increased warmup steps
max_steps = 1000  # increased max steps

def get_lr(it):
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    if it > max_steps:
        return min_lr
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (max_lr - min_lr)

train_loader = DataLoaderLite(B=4, T=256)

import time

optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device_type=device)

num_epochs = 1
accumulation_steps = 4

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for step in range(max_steps):
        t0 = time.time()
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        with torch.autocast(device_type=device, dtype=torch.bfloat16):
            logits, loss = model(x, y)
        loss.backward()
        norm = torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)

        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            lr = get_lr(step)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        t1 = time.time()
        dt = t1 - t0
        print(f"Step {step + 1}/{max_steps}, Loss: {loss.item():.4f}, Time: {dt * 1000:.2f} ms")

using device: cuda
loaded 338025 tokens
1 epoch = 330 batches
Epoch 1/1


  norm = torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)


Step 1/1000, Loss: 10.8596, Time: 1761.12 ms
Step 2/1000, Loss: 10.8898, Time: 22.25 ms
Step 3/1000, Loss: 10.8724, Time: 22.51 ms
Step 4/1000, Loss: 10.8408, Time: 91.32 ms
Step 5/1000, Loss: 10.3939, Time: 22.74 ms
Step 6/1000, Loss: 10.2975, Time: 18.73 ms
Step 7/1000, Loss: 10.3101, Time: 20.50 ms
Step 8/1000, Loss: 10.2598, Time: 23.33 ms
Step 9/1000, Loss: 10.2861, Time: 18.62 ms
Step 10/1000, Loss: 10.2660, Time: 18.78 ms
Step 11/1000, Loss: 10.2019, Time: 18.54 ms
Step 12/1000, Loss: 10.3912, Time: 18.76 ms
Step 13/1000, Loss: 10.1505, Time: 18.82 ms
Step 14/1000, Loss: 10.1383, Time: 18.93 ms
Step 15/1000, Loss: 10.1753, Time: 19.17 ms
Step 16/1000, Loss: 10.2071, Time: 20.85 ms
Step 17/1000, Loss: 10.1107, Time: 19.03 ms
Step 18/1000, Loss: 10.0649, Time: 18.98 ms
Step 19/1000, Loss: 10.0208, Time: 19.83 ms
Step 20/1000, Loss: 10.0421, Time: 19.86 ms
Step 21/1000, Loss: 9.8443, Time: 19.11 ms
Step 22/1000, Loss: 10.0723, Time: 18.12 ms
Step 23/1000, Loss: 9.8215, Time: 18.66 

In [9]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
import time
import tiktoken  # Assuming this is a utility for tokenization

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


@dataclass
class GPTConfig:
    block_size: int = 256  # reduced max sequence length to fit into 4GB GPU
    vocab_size: int = 50304  # number of tokens
    n_layer: int = 6  # increased number of layers for better learning
    n_head: int = 8  # increased number of heads for better learning
    n_embd: int = 256  # increased embedding dimension for better learning


class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer.wpe(pos)
        tok_emb = self.transformer.wte(idx)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
        return optimizer

# Initialize model and prepare for training
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(42)
model = GPT(GPTConfig()).to(device)
optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device_type=device)
train_loader = DataLoaderLite(B=4, T=256)  # Adjust batch size and sequence length as needed

# Training loop
num_epochs = 20
max_steps = 1000

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for step in range(max_steps):
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits, loss = model(x, y)
        loss.backward()
        optimizer.step()
        print(f"Step {step + 1}/{max_steps} | Loss: {loss.item():.4f}")

print("Training completed successfully.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step 6/1000 | Loss: 1.8202
Step 7/1000 | Loss: 1.5345
Step 8/1000 | Loss: 1.9093
Step 9/1000 | Loss: 1.6938
Step 10/1000 | Loss: 1.9894
Step 11/1000 | Loss: 1.7140
Step 12/1000 | Loss: 1.8872
Step 13/1000 | Loss: 1.7128
Step 14/1000 | Loss: 2.1265
Step 15/1000 | Loss: 2.2721
Step 16/1000 | Loss: 2.1492
Step 17/1000 | Loss: 2.0144
Step 18/1000 | Loss: 1.7331
Step 19/1000 | Loss: 2.0080
Step 20/1000 | Loss: 1.8211
Step 21/1000 | Loss: 1.7549
Step 22/1000 | Loss: 1.7236
Step 23/1000 | Loss: 1.7665
Step 24/1000 | Loss: 1.8104
Step 25/1000 | Loss: 1.9676
Step 26/1000 | Loss: 2.2003
Step 27/1000 | Loss: 2.1039
Step 28/1000 | Loss: 2.0842
Step 29/1000 | Loss: 2.1093
Step 30/1000 | Loss: 2.0263
Step 31/1000 | Loss: 2.0068
Step 32/1000 | Loss: 1.8059
Step 33/1000 | Loss: 2.0369
Step 34/1000 | Loss: 2.0976
Step 35/1000 | Loss: 1.9823
Step 36/1000 | Loss: 1.9558
Step 37/1000 | Loss: 1.9057
Step 38/1000 | Loss: 1.6564
Step 39/1000 | 

In [4]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
import time
import tiktoken  # Assuming this is a utility for tokenization
from dataclasses import dataclass

# Set float32 matmul precision
torch.set_float32_matmul_precision('high')

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANGPT_SCALE_INIT = 1
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


@dataclass
class GPTConfig:
    block_size: int = 1024  # reduced max sequence length to fit into 4GB GPU
    vocab_size: int = 50304  # number of tokens
    n_layer: int = 6  # increased number of layers for better learning
    n_head: int = 8  # increased number of heads for better learning
    n_embd: int = 256  # increased embedding dimension for better learning


class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer.wpe(pos)
        tok_emb = self.transformer.wte(idx)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
        return optimizer

# Initialize model and prepare for training
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(42)
model = GPT(GPTConfig()).to(device)
optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device_type=device)
train_loader = DataLoaderLite(B=4, T=256)  # Adjust batch size and sequence length as needed

# Cosine learning rate decay schedule
max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 10
max_steps = 1000

def get_lr(it):
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    if it > max_steps:
        return min_lr
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (max_lr - min_lr)

# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for step in range(max_steps):
        t0 = time.time()
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()

        # Automatic mixed precision
        with torch.autocast(device_type=device, dtype=torch.bfloat16):
            logits, loss = model(x, y)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Adjust learning rate
        lr = get_lr(step)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        optimizer.step()
        torch.cuda.synchronize() if device == 'cuda' else None
        t1 = time.time()
        dt = (t1 - t0) * 1000
        tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)

        print(f'Step {step + 1}/{max_steps} | Loss: {loss.item():.4f} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec:.2f}')

print("Training completed successfully.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step 6/1000 | Loss: 0.8309 | dt: 64.24ms | tok/sec: 15940.23
Step 7/1000 | Loss: 0.5801 | dt: 66.05ms | tok/sec: 15503.23
Step 8/1000 | Loss: 0.9098 | dt: 64.87ms | tok/sec: 15784.98
Step 9/1000 | Loss: 0.6635 | dt: 64.44ms | tok/sec: 15891.92
Step 10/1000 | Loss: 0.9181 | dt: 63.15ms | tok/sec: 16214.40
Step 11/1000 | Loss: 0.7235 | dt: 63.71ms | tok/sec: 16073.02
Step 12/1000 | Loss: 0.9365 | dt: 65.80ms | tok/sec: 15563.22
Step 13/1000 | Loss: 0.7468 | dt: 66.77ms | tok/sec: 15336.87
Step 14/1000 | Loss: 1.0954 | dt: 66.06ms | tok/sec: 15500.71
Step 15/1000 | Loss: 1.1284 | dt: 63.71ms | tok/sec: 16072.66
Step 16/1000 | Loss: 1.1530 | dt: 66.80ms | tok/sec: 15329.59
Step 17/1000 | Loss: 0.9854 | dt: 64.55ms | tok/sec: 15863.98
Step 18/1000 | Loss: 0.6366 | dt: 66.09ms | tok/sec: 15495.00
Step 19/1000 | Loss: 0.8805 | dt: 65.13ms | tok/sec: 15723.27
Step 20/1000 | Loss: 0.7552 | dt: 66.84ms | tok/sec: 15320.46
Step 21/1

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
import time
import tiktoken  # Assuming this is a utility for tokenization
from dataclasses import dataclass
import inspect

# Set float32 matmul precision
torch.set_float32_matmul_precision('high')

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANGPT_SCALE_INIT = 1
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


@dataclass
class GPTConfig:
    block_size: int = 1024  # reduced max sequence length to fit into 4GB GPU
    vocab_size: int = 50304  # number of tokens
    n_layer: int = 6  # increased number of layers for better learning
    n_head: int = 8  # increased number of heads for better learning
    n_embd: int = 256  # increased embedding dimension for better learning


class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer.wpe(pos)
        tok_emb = self.transformer.wte(idx)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
        return optimizer

# Initialize model and prepare for training
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(42)
model = GPT(GPTConfig()).to(device)
optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device_type=device)
train_loader = DataLoaderLite(B=4, T=256)  # Adjust batch size and sequence length as needed

# Cosine learning rate decay schedule
max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 10
max_steps = 1000

def get_lr(it):
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    if it > max_steps:
        return min_lr
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (max_lr - min_lr)

# Training loop
num_epochs = 100
best_loss = float('inf')

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for step in range(max_steps):
        t0 = time.time()
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()

        # Automatic mixed precision
        with torch.autocast(device_type=device, dtype=torch.bfloat16):
            logits, loss = model(x, y)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Adjust learning rate
        lr = get_lr(step)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        optimizer.step()
        torch.cuda.synchronize() if device == 'cuda' else None
        t1 = time.time()
        dt = (t1 - t0) * 1000
        tokens_per_sec = (train_loader.B * train_loader.T) / (t1 - t0)

        print(f'Step {step + 1}/{max_steps} | Loss: {loss.item():.4f} | dt: {dt:.2f}ms | tok/sec: {tokens_per_sec:.2f}')

        # Save the best model

        if loss.item() < best_loss:
            best_loss = loss.item()
            torch.save(model.state_dict(), 'best.pt')

    # Save the last model at the end of the epoch
    torch.save(model.state_dict(), 'last.pt')

print("Training completed successfully.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step 317/1000 | Loss: 0.7369 | dt: 63.14ms | tok/sec: 16217.46
Step 318/1000 | Loss: 1.0787 | dt: 63.21ms | tok/sec: 16200.03
Step 319/1000 | Loss: 0.7519 | dt: 62.49ms | tok/sec: 16385.81
Step 320/1000 | Loss: 1.0635 | dt: 64.67ms | tok/sec: 15834.21
Step 321/1000 | Loss: 0.8708 | dt: 65.52ms | tok/sec: 15627.72
Step 322/1000 | Loss: 0.9854 | dt: 65.69ms | tok/sec: 15588.42
Step 323/1000 | Loss: 0.7235 | dt: 62.21ms | tok/sec: 16461.49
Step 324/1000 | Loss: 1.2366 | dt: 62.31ms | tok/sec: 16434.84
Step 325/1000 | Loss: 1.2624 | dt: 63.91ms | tok/sec: 16023.79
Step 326/1000 | Loss: 1.1490 | dt: 66.73ms | tok/sec: 15345.42
Step 327/1000 | Loss: 1.0779 | dt: 65.70ms | tok/sec: 15585.98
Step 328/1000 | Loss: 0.6949 | dt: 64.11ms | tok/sec: 15972.77
Step 329/1000 | Loss: 0.8919 | dt: 62.73ms | tok/sec: 16323.85
Step 330/1000 | Loss: 0.8430 | dt: 65.54ms | tok/sec: 15625.05
Step 331/1000 | Loss: 0.8951 | dt: 66.31ms | tok/sec: