<a href="https://colab.research.google.com/github/DhrubaAdhikary/ERA_V2/blob/master/S21_Training_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [9]:
!python /content/train_get2-9-speedup9.py


using device: cuda
loaded 338025 tokens
1 epoch = 20 batches
num decayed parameter tensors: 14, with 7,159,808 parameters
num non-decayed parameter tensors: 26, with 5,248 parameters
using fused AdamW: True
  norm = torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)
step0 | loss: 10.833976745605469 | dt: 1302.34ms | tok/sec:  12580.41 | norm: 1.99
step1 | loss: 10.792938232421875 | dt: 505.03ms | tok/sec:  32441.83 | norm: 2.28
step2 | loss: 10.738201141357422 | dt: 534.44ms | tok/sec:  30656.51 | norm: 2.11
step3 | loss: 10.647987365722656 | dt: 513.16ms | tok/sec:  31927.50 | norm: 2.14
step4 | loss: 10.52634048461914 | dt: 513.19ms | tok/sec:  31925.63 | norm: 1.93
step5 | loss: 10.46133804321289 | dt: 519.00ms | tok/sec:  31568.38 | norm: 1.65
step6 | loss: 10.40573501586914 | dt: 516.63ms | tok/sec:  31713.02 | norm: 1.51
step7 | loss: 10.331657409667969 | dt: 515.41ms | tok/sec:  31788.48 | norm: 1.52
step8 | loss: 10.238937377929688 | dt: 530.42ms | tok/sec:  30888.71 | norm

In [13]:

import os
import math
import time
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F

class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANGPT_SCALE_INIT = 1
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y


class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


@dataclass
class GPTConfig:
    block_size: int = 256  # reduced max sequence length to fit into 4GB GPU
    vocab_size: int = 50304  # number of tokens
    n_layer: int = 6  # increased number of layers for better learning
    n_head: int = 8  # increased number of heads for better learning
    n_embd: int = 256  # increased embedding dimension for better learning


class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer.wpe(pos)
        tok_emb = self.transformer.wte(idx)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
        return optimizer

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"using device: {device}")

torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

num_return_sequences = 5
max_length = 30

import tiktoken

class DataLoaderLite:
    def __init__(self, B, T):
        self.B = B
        self.T = T
        with open('input.txt', 'r') as f:
            text = f.read()
        enc = tiktoken.get_encoding('gpt2')
        tokens = enc.encode(text)
        self.tokens = torch.tensor(tokens)
        print(f'loaded {len(self.tokens)} tokens')
        print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
        self.current_position = 0

    def next_batch(self):
        B, T = self.B, self.T
        buf = self.tokens[self.current_position: self.current_position + B * T + 1]
        x = (buf[:-1]).view(B, T)
        y = (buf[1:]).view(B, T)
        self.current_position += B*T
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_position = 0
        return x, y

torch.set_float32_matmul_precision('high')
model = GPT(GPTConfig())
model.to(device)

max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 50  # increased warmup steps
max_steps = 1000  # increased max steps

def get_lr(it):
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    if it > max_steps:
        return min_lr
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (max_lr - min_lr)

train_loader = DataLoaderLite(B=4, T=256)

import time

optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device_type=device)

num_epochs = 50
accumulation_steps = 4

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for step in range(max_steps):
        t0 = time.time()
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        with torch.autocast(device_type=device, dtype=torch.bfloat16):
            logits, loss = model(x, y)
        loss.backward()
        norm = torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)

        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            lr = get_lr(step)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        t1 = time.time()
        dt = t1 - t0
        print(f"Step {step + 1}/{max_steps}, Loss: {loss.item():.4f}, Time: {dt * 1000:.2f} ms")

using device: cuda
loaded 338025 tokens
1 epoch = 330 batches
Epoch 1/50
Step 1/1000, Loss: 10.8596, Time: 20.01 ms
Step 2/1000, Loss: 10.8898, Time: 15.23 ms


  norm = torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step 446/1000, Loss: 0.6513, Time: 14.50 ms
Step 447/1000, Loss: 9.6184, Time: 14.47 ms
Step 448/1000, Loss: 0.2763, Time: 19.72 ms
Step 449/1000, Loss: 7.5801, Time: 14.67 ms
Step 450/1000, Loss: 0.6702, Time: 13.97 ms
Step 451/1000, Loss: 8.1250, Time: 14.81 ms
Step 452/1000, Loss: 0.3191, Time: 19.28 ms
Step 453/1000, Loss: 9.0543, Time: 14.85 ms
Step 454/1000, Loss: 0.6352, Time: 14.32 ms
Step 455/1000, Loss: 7.6057, Time: 17.10 ms
Step 456/1000, Loss: 0.3048, Time: 19.92 ms
Step 457/1000, Loss: 8.6205, Time: 15.32 ms
Step 458/1000, Loss: 0.7164, Time: 14.99 ms
Step 459/1000, Loss: 8.5822, Time: 15.66 ms
Step 460/1000, Loss: 0.2995, Time: 26.08 ms
Step 461/1000, Loss: 9.3616, Time: 15.07 ms
Step 462/1000, Loss: 0.6503, Time: 14.75 ms
Step 463/1000, Loss: 8.4021, Time: 14.12 ms
Step 464/1000, Loss: 0.2758, Time: 19.87 ms
Step 465/1000, Loss: 8.3339, Time: 14.80 ms
Step 466/1000, Loss: 0.7066, Time: 15.44 ms
Step 467/10

KeyboardInterrupt: 

In [14]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
import time
import tiktoken  # Assuming this is a utility for tokenization

class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)
        return y


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


@dataclass
class GPTConfig:
    block_size: int = 256  # reduced max sequence length to fit into 4GB GPU
    vocab_size: int = 50304  # number of tokens
    n_layer: int = 6  # increased number of layers for better learning
    n_head: int = 8  # increased number of heads for better learning
    n_embd: int = 256  # increased embedding dimension for better learning


class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            wpe=nn.Embedding(config.block_size, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.transformer.wpe(pos)
        tok_emb = self.transformer.wte(idx)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
        return optimizer

# Initialize model and prepare for training
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(42)
model = GPT(GPTConfig()).to(device)
optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device_type=device)
train_loader = DataLoaderLite(B=4, T=256)  # Adjust batch size and sequence length as needed

# Training loop
num_epochs = 10
max_steps = 1000

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for step in range(max_steps):
        x, y = train_loader.next_batch()
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits, loss = model(x, y)
        loss.backward()
        optimizer.step()
        print(f"Step {step + 1}/{max_steps} | Loss: {loss.item():.4f}")

print("Training completed successfully.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Step 6/1000 | Loss: 3.6316
Step 7/1000 | Loss: 3.7652
Step 8/1000 | Loss: 3.9952
Step 9/1000 | Loss: 3.5035
Step 10/1000 | Loss: 3.6250
Step 11/1000 | Loss: 3.7032
Step 12/1000 | Loss: 3.7785
Step 13/1000 | Loss: 3.7883
Step 14/1000 | Loss: 3.9296
Step 15/1000 | Loss: 3.7718
Step 16/1000 | Loss: 3.3311
Step 17/1000 | Loss: 3.7206
Step 18/1000 | Loss: 3.1080
Step 19/1000 | Loss: 3.4712
Step 20/1000 | Loss: 3.6328
Step 21/1000 | Loss: 3.5490
Step 22/1000 | Loss: 3.5176
Step 23/1000 | Loss: 3.8103
Step 24/1000 | Loss: 4.0384
Step 25/1000 | Loss: 3.8005
Step 26/1000 | Loss: 3.9998
Step 27/1000 | Loss: 3.6408
Step 28/1000 | Loss: 3.7072
Step 29/1000 | Loss: 3.5351
Step 30/1000 | Loss: 3.3009
Step 31/1000 | Loss: 4.0422
Step 32/1000 | Loss: 3.9040
Step 33/1000 | Loss: 3.5366
Step 34/1000 | Loss: 3.5244
Step 35/1000 | Loss: 3.2189
Step 36/1000 | Loss: 3.4556
Step 37/1000 | Loss: 3.4406
Step 38/1000 | Loss: 3.3520
Step 39/1000 | 