In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from model import MiniGPT
import numpy as np

In [2]:
# Location of the binary training file on disk.
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory so the notebook runs on other machines.
TRAIN_BIN_PATH = r"C:\Users\aksha\OneDrive\Desktop\llm\data/train.bin"

# Memory-map the file read-only as a flat array of uint16 values, so the
# data is paged in on demand instead of being loaded into RAM up front.
train_data = np.memmap(TRAIN_BIN_PATH, dtype=np.uint16, mode="r")


In [3]:
# Sanity check: number of uint16 entries in the memory-mapped training array.
n_train_entries = len(train_data)
print(n_train_entries)


12401170


In [4]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print("Using device:", DEVICE)

Using device: cpu


In [5]:
# Checkpoint to resume from.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
CKPT_PATH = r"C:\Users\aksha\OneDrive\Desktop\llm\notebooks\models\minigpt_fin.pt"

# Build the model with its default constructor arguments and place it on DEVICE.
model = MiniGPT()
model = model.to(DEVICE)

# Deserialize the checkpoint onto the CPU; load_state_dict then copies the
# values into the model's existing parameters.
ckpt = torch.load(CKPT_PATH, map_location="cpu")

# Last expression of the cell, so the notebook displays the key-matching report.
model.load_state_dict(ckpt["model_state_dict"])


<All keys matched successfully>

In [6]:
# Number of windows sampled per call to get_batch (passed as `batch_size`).
BATCH_SIZE = 64
# Length of each sampled window (passed as `block_size` to get_batch).
BLOCK_SIZE = 128


In [7]:
def get_batch(data, block_size, batch_size, generator=None):
    """Sample a random batch of (input, target) windows from a 1-D sequence.

    Each input row is ``block_size`` consecutive entries of ``data`` starting
    at a random offset; the matching target row is the same window shifted
    one position to the right.

    Args:
        data: 1-D NumPy array (or ``np.memmap``) supporting ``len()`` and
            slicing. Values are converted to ``int64``.
        block_size: Length of every sampled window.
        batch_size: Number of windows to sample.
        generator: Optional ``torch.Generator`` used to draw the start
            offsets, for reproducible sampling. ``None`` uses the global
            torch RNG.

    Returns:
        Tuple ``(x, y)`` of ``int64`` tensors, each of shape
        ``(batch_size, block_size)``, with ``y[b, t] == data[start_b + t + 1]``
        and ``x[b, t] == data[start_b + t]``.

    Raises:
        ValueError: If ``data`` has fewer than ``block_size + 1`` entries, so
            no window with a shifted target fits.
    """
    # A window starting at s reads data[s : s + block_size + 1] in total
    # (inputs plus the final target), so valid starts are
    # 0 .. len(data) - block_size - 1 inclusive. torch.randint's upper bound
    # is exclusive, hence `len(data) - block_size` (not `- 1` again).
    n_starts = len(data) - block_size
    if n_starts <= 0:
        raise ValueError(
            f"data has {len(data)} entries but at least {block_size + 1} "
            f"are required for block_size={block_size}"
        )

    # Convert to plain Python ints once so both slices below index with them.
    starts = torch.randint(n_starts, (batch_size,), generator=generator).tolist()

    x = torch.stack([
        torch.from_numpy(data[s:s + block_size].astype(np.int64))
        for s in starts
    ])
    y = torch.stack([
        torch.from_numpy(data[s + 1:s + block_size + 1].astype(np.int64))
        for s in starts
    ])
    return x, y


In [8]:
# Draw one batch up front as a quick check that sampling works.
x, y = get_batch(train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)


In [10]:
# Optimisation setup: AdamW over every model parameter with a small learning
# rate, and cross-entropy as the training objective.
optimizer = optim.AdamW(model.parameters(), lr=3e-5)
criterion = nn.CrossEntropyLoss()

MAX_STEPS = 250    # total number of optimiser updates to run
PRINT_EVERY = 50   # report the loss once every this many steps

# Switch the model to training mode.
model.train()

for step in range(MAX_STEPS):
    # Clear gradients left over from the previous update.
    optimizer.zero_grad()

    # Sample a fresh random batch and move it to the training device.
    x, y = get_batch(train_data, BLOCK_SIZE, BATCH_SIZE)
    x = x.to(DEVICE)
    y = y.to(DEVICE)

    # Forward pass. The output is unpacked as three dimensions
    # (presumably batch, sequence position, vocabulary -- confirm against MiniGPT).
    logits = model(x)
    B, T, C = logits.shape

    # CrossEntropyLoss takes (N, C) scores and (N,) class indices, so collapse
    # the first two dimensions of both tensors.
    flat_logits = logits.view(B * T, C)
    flat_targets = y.view(B * T)
    loss = criterion(flat_logits, flat_targets)

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

    if step % PRINT_EVERY == 0:
        print(f"Step {step} | loss {loss.item():.4f}")


Step 0 | loss 1.2813
Step 50 | loss 1.1919
Step 100 | loss 1.1385
Step 150 | loss 1.0411
Step 200 | loss 1.0347


KeyboardInterrupt: 

In [12]:
import os

# Create the output directory if it does not exist yet (no error if it does).
os.makedirs("models", exist_ok=True)

# Store only the model weights, under the same "model_state_dict" key that the
# loading cell reads back.
# NOTE(review): the path is relative to the current working directory; if the
# notebook runs from the folder that holds the loaded checkpoint, this
# overwrites that file -- confirm this is intended.
checkpoint = {"model_state_dict": model.state_dict()}
torch.save(checkpoint, "models/minigpt_fin.pt")
