In [1]:
%load_ext autoreload
%autoreload 2

from utils import Py150kDataset

import wandb
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader


In [2]:
# Build the dataset object that the later cells read from: it is split into
# train/validation subsets below, and its `tokenizer` attribute supplies the
# vocabulary size for the model. The two string arguments presumably select the
# split and the dataset name/location — TODO confirm against Py150kDataset.
ds = Py150kDataset("train", "py150k")

In [3]:
from utils.dataset import Py150kDataset
from utils.tokenizer import BOS_ID, EOS_ID, PAD_ID
from torch.utils.data import DataLoader, random_split

def collate_fn(batch:list[torch.Tensor], max_len:int=2048):
    """Collate variable-length token-id sequences into one padded batch tensor.

    Each sequence is cut to at most ``max_len`` tokens and then wrapped in
    ``BOS_ID`` / ``EOS_ID``, so a row can be up to ``max_len + 2`` tokens long.
    Shorter rows are padded at the end with ``PAD_ID`` up to the longest row
    in the batch.

    Args:
        batch: 1-D tensors of token ids, one per example.
        max_len: Maximum number of tokens kept from each example, not counting
            the BOS/EOS markers.

    Returns:
        A tensor of shape ``(len(batch), longest_row)``.
    """
    bos = torch.tensor([BOS_ID])
    eos = torch.tensor([EOS_ID])

    framed = []
    for seq in batch:
        # Truncate first, then add the markers, so BOS/EOS are never cut off.
        framed.append(torch.cat([bos, seq[:max_len], eos]))

    return torch.nn.utils.rnn.pad_sequence(
        framed, batch_first=True, padding_value=PAD_ID
    )



# Debug-sized split: a handful of examples for training and validation; the
# remainder of `ds` is discarded. Raise the two sizes for a real run.
N_TRAIN_EXAMPLES = 10
N_VAL_EXAMPLES = 10
BATCH_SIZE = 64
SPLIT_SEED = 0

# Seed the split so the same examples land in train/val on every
# Restart & Run All; an unseeded random_split draws a different subset each run.
train_ds, val_ds, _ = random_split(
    ds,
    [N_TRAIN_EXAMPLES, N_VAL_EXAMPLES, len(ds) - N_TRAIN_EXAMPLES - N_VAL_EXAMPLES],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Multi-worker loading is currently disabled; to enable it, pass
# prefetch_factor=4, num_workers=8, persistent_workers=True to both loaders.
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, collate_fn=collate_fn)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [4]:
from models import PyRNN
            
        
# Smoke test: build the model and push one collated batch through it to
# inspect the output shapes before training.
vocab_size = len(ds.tokenizer)
# 128 and 1 are presumably the hidden size and the number of recurrent layers
# (consistent with `h` coming back as (1, batch, 128)) — TODO confirm against PyRNN.
model = PyRNN(vocab_size, 128, 1)
first_batch = next(iter(train_dl))
out, h = model(first_batch)
(out.shape, h.shape)

(torch.Size([10, 2050, 376]), torch.Size([1, 10, 128]))

In [5]:
from tqdm import tqdm
import wandb

EPOCHS = 1
LR = 1e-3
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# Training reuses the `model` built in an earlier cell. It has to live on the
# same device as the batches (which are moved to DEVICE below); otherwise the
# forward pass fails with a device mismatch whenever CUDA is available. Move it
# before creating the optimizer so Adam is built from the on-device parameters.
model = model.to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID) # <PAD> tokens do not contribute to the loss
optim = torch.optim.Adam(model.parameters(), lr=LR)

# wandb.init(
#     project="PyGPT",
#     config={
#         "learning_rate": LR,
#         "epochs": EPOCHS,
#         "architecture": model.__class__.__name__,
#         "n_training_examples": len(train_ds),
#         "n_validation_examples": len(val_ds),
#         "parameter_count": sum([p.numel() for p in model.parameters() if p.requires_grad])
#     },
#     group="baseline RNNs"
# )


model.train()
for epoch in range(EPOCHS):
    train_tqdm = tqdm(train_dl, desc=f"Epoch {epoch + 1}/{EPOCHS} Training")
    total_train_loss = 0.0

    # Training loop
    for batch in train_tqdm:
        batch = batch.to(DEVICE)
        # Next-token prediction: the input is every position but the last, and
        # the target is the same sequence shifted left by one.
        x = batch[..., :-1]
        y = batch[..., 1:]
        y_hat, _ = model(x)
        # Flatten (batch, seq, vocab) logits and (batch, seq) targets for
        # CrossEntropyLoss; the class dimension is read from the logits
        # themselves so it cannot drift from the model's output size.
        loss = criterion(y_hat.reshape(-1, y_hat.size(-1)), y.reshape(-1))

        optim.zero_grad()
        loss.backward()
        optim.step()

        # .item() yields a plain Python float (one host sync, no numpy array).
        train_loss = loss.item()
        total_train_loss += train_loss
        train_tqdm.set_postfix({"loss": train_loss})

    # Report the per-batch average so the accumulated loss is actually surfaced;
    # the max() guard avoids dividing by zero on an empty loader.
    mean_train_loss = total_train_loss / max(len(train_dl), 1)
    print(f"Epoch {epoch + 1}/{EPOCHS} mean training loss: {mean_train_loss:.5f}")



Epoch 1/1 Training: 100%|██████████| 1/1 [00:00<00:00,  5.22it/s, loss=5.88478]


In [6]:
# Sample a short sequence of token ids from the model. The training cell leaves
# the module in training mode (`model.train()`), so switch to eval mode and
# disable gradient tracking before generating.
model.eval()
with torch.no_grad():
    generated_ids = model.generate(max_len=10)
generated_ids

[325, 223, 140, 216, 311, 154, 67, 168, 321, 5]