In [2]:
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
import numpy as np

In [5]:
# ---------- SETTINGS ----------
device = "cuda" if torch.cuda.is_available() else "cpu"
dataset_path = "data/human_chat_1gb_readable.txt"

In [6]:
# Model hyperparameters
batch_size = 32
block_size = 256
embed_dim = 128
n_heads = 4
n_layers = 4
dropout = 0.1
learning_rate = 2e-4
num_steps = 20000

In [7]:
# ---------- LOAD DATA ----------
print("ðŸ“‚ Loading dataset...")
with open(dataset_path, encoding="utf-8") as f:
    text = f.read()

print(f"Dataset length: {len(text)/1e6:.2f}M characters")

# Create a simple character vocabulary
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"Vocab size: {vocab_size}")

stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

def encode(s):
    return [stoi[c] for c in s]

def decode(tokens):
    return ''.join([itos[t] for t in tokens])

data = torch.tensor(encode(text), dtype=torch.long)

ðŸ“‚ Loading dataset...
Dataset length: 6.22M characters
Vocab size: 89


In [8]:
# Train/val split
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    d = train_data if split == "train" else val_data
    ix = torch.randint(len(d) - block_size, (batch_size,))
    x = torch.stack([d[i:i+block_size] for i in ix])
    y = torch.stack([d[i+1:i+1+block_size] for i in ix])
    return x.to(device), y.to(device)

In [9]:
# ---------- MODEL DEFINITION ----------
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(embed_dim, head_size, bias=False)
        self.query = nn.Linear(embed_dim, head_size, bias=False)
        self.value = nn.Linear(embed_dim, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        w = q @ k.transpose(-2, -1) * (C ** -0.5)
        w = w.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        w = F.softmax(w, dim=-1)
        w = self.dropout(w)
        v = self.value(x)
        out = w @ v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, n_embd, n_heads):
        super().__init__()
        head_size = n_embd // n_heads
        self.sa = MultiHeadAttention(n_heads, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class TinyGPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(block_size, embed_dim)
        self.blocks = nn.Sequential(*[Block(embed_dim, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_emb(idx)
        pos_emb = self.pos_emb(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens=100):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, next_id), dim=1)
        return idx

model = TinyGPT().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


In [10]:
# ---------- TRAIN LOOP ----------
for step in tqdm(range(num_steps)):
    model.train()
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if step % 500 == 0:
        model.eval()
        with torch.no_grad():
            xb, yb = get_batch("val")
            _, val_loss = model(xb, yb)
        print(f"Step {step}: train loss {loss.item():.4f}, val loss {val_loss.item():.4f}")

  0%|          | 4/20000 [00:00<53:41,  6.21it/s]  

Step 0: train loss 4.6260, val loss 4.5001


  3%|â–Ž         | 504/20000 [00:18<11:47, 27.56it/s]

Step 500: train loss 2.2994, val loss 2.2753


  5%|â–Œ         | 1004/20000 [00:37<12:15, 25.83it/s]

Step 1000: train loss 2.1628, val loss 2.1302


  8%|â–Š         | 1504/20000 [00:56<11:30, 26.78it/s]

Step 1500: train loss 2.0083, val loss 1.9547


 10%|â–ˆ         | 2003/20000 [01:15<11:00, 27.23it/s]

Step 2000: train loss 1.8927, val loss 1.8301


 13%|â–ˆâ–Ž        | 2505/20000 [01:33<11:32, 25.27it/s]

Step 2500: train loss 1.8234, val loss 1.6935


 15%|â–ˆâ–Œ        | 3004/20000 [01:52<10:37, 26.67it/s]

Step 3000: train loss 1.6998, val loss 1.6784


 18%|â–ˆâ–Š        | 3505/20000 [02:10<10:11, 27.00it/s]

Step 3500: train loss 1.6664, val loss 1.5683


 20%|â–ˆâ–ˆ        | 4005/20000 [02:28<09:54, 26.90it/s]

Step 4000: train loss 1.6437, val loss 1.5205


 23%|â–ˆâ–ˆâ–Ž       | 4504/20000 [02:46<09:58, 25.89it/s]

Step 4500: train loss 1.6358, val loss 1.4601


 25%|â–ˆâ–ˆâ–Œ       | 5006/20000 [03:06<10:00, 24.95it/s]

Step 5000: train loss 1.5367, val loss 1.4866


 28%|â–ˆâ–ˆâ–Š       | 5502/20000 [03:25<10:04, 23.99it/s]

Step 5500: train loss 1.5327, val loss 1.4565


 30%|â–ˆâ–ˆâ–ˆ       | 6003/20000 [03:43<08:21, 27.92it/s]

Step 6000: train loss 1.4896, val loss 1.4253


 33%|â–ˆâ–ˆâ–ˆâ–Ž      | 6504/20000 [04:01<08:04, 27.87it/s]

Step 6500: train loss 1.4272, val loss 1.3410


 35%|â–ˆâ–ˆâ–ˆâ–Œ      | 7006/20000 [04:19<07:49, 27.68it/s]

Step 7000: train loss 1.3828, val loss 1.3247


 38%|â–ˆâ–ˆâ–ˆâ–Š      | 7506/20000 [04:37<07:44, 26.91it/s]

Step 7500: train loss 1.3703, val loss 1.3354


 40%|â–ˆâ–ˆâ–ˆâ–ˆ      | 8006/20000 [04:55<07:38, 26.14it/s]

Step 8000: train loss 1.3046, val loss 1.3223


 43%|â–ˆâ–ˆâ–ˆâ–ˆâ–Ž     | 8506/20000 [05:14<07:02, 27.18it/s]

Step 8500: train loss 1.3619, val loss 1.2993


 45%|â–ˆâ–ˆâ–ˆâ–ˆâ–Œ     | 9004/20000 [05:34<06:32, 27.99it/s]

Step 9000: train loss 1.3899, val loss 1.2116


 48%|â–ˆâ–ˆâ–ˆâ–ˆâ–Š     | 9506/20000 [05:51<06:14, 27.99it/s]

Step 9500: train loss 1.2788, val loss 1.2473


 50%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆ     | 10004/20000 [06:08<06:03, 27.49it/s]

Step 10000: train loss 1.3181, val loss 1.2819


 53%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Ž    | 10505/20000 [06:27<05:53, 26.88it/s]

Step 10500: train loss 1.3335, val loss 1.2298


 55%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Œ    | 11005/20000 [06:44<05:29, 27.32it/s]

Step 11000: train loss 1.2665, val loss 1.2475


 58%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Š    | 11505/20000 [07:02<05:17, 26.76it/s]

Step 11500: train loss 1.2693, val loss 1.2444


 60%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ    | 12004/20000 [07:21<05:09, 25.80it/s]

Step 12000: train loss 1.2994, val loss 1.1694


 63%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Ž   | 12504/20000 [07:38<04:26, 28.17it/s]

Step 12500: train loss 1.2362, val loss 1.1936


 65%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Œ   | 13004/20000 [07:56<04:29, 25.97it/s]

Step 13000: train loss 1.2039, val loss 1.1848


 68%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Š   | 13506/20000 [08:14<04:25, 24.47it/s]

Step 13500: train loss 1.2822, val loss 1.1536


 70%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ   | 14005/20000 [08:32<03:29, 28.58it/s]

Step 14000: train loss 1.2579, val loss 1.2020


 73%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Ž  | 14504/20000 [08:50<03:21, 27.33it/s]

Step 14500: train loss 1.2394, val loss 1.1324


 75%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Œ  | 15004/20000 [09:07<02:59, 27.80it/s]

Step 15000: train loss 1.2052, val loss 1.1620


 78%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Š  | 15506/20000 [09:24<02:35, 28.97it/s]

Step 15500: train loss 1.1918, val loss 1.1318


 80%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ  | 16005/20000 [09:42<02:20, 28.37it/s]

Step 16000: train loss 1.1939, val loss 1.2047


 83%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Ž | 16503/20000 [09:59<01:59, 29.27it/s]

Step 16500: train loss 1.1755, val loss 1.1907


 85%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Œ | 17005/20000 [10:16<01:46, 28.09it/s]

Step 17000: train loss 1.2373, val loss 1.1986


 88%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Š | 17503/20000 [10:34<01:28, 28.22it/s]

Step 17500: train loss 1.2170, val loss 1.1196


 90%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ | 18006/20000 [10:51<01:10, 28.33it/s]

Step 18000: train loss 1.1468, val loss 1.0989


 93%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Ž| 18504/20000 [11:08<00:53, 27.93it/s]

Step 18500: train loss 1.1567, val loss 1.1268


 95%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Œ| 19005/20000 [11:26<00:34, 29.01it/s]

Step 19000: train loss 1.2473, val loss 1.0631


 98%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Š| 19504/20000 [11:43<00:17, 28.38it/s]

Step 19500: train loss 1.2071, val loss 1.0988


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 20000/20000 [12:00<00:00, 27.76it/s]


In [13]:
import os
os.makedirs("checkpoints", exist_ok=True)
torch.save(model.state_dict(), "checkpoints/human_gpt_scratch.pt")
print("âœ… Model saved to checkpoints/human_gpt_scratch.pt")


âœ… Model saved to checkpoints/human_gpt_scratch.pt


In [1]:
# ---------- TEST GENERATION ----------
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print("\nðŸ’¬ Sample generation:")
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))

NameError: name 'torch' is not defined

In [None]:
kumar