In [8]:
%load_ext autoreload
%autoreload 2

from utils import Py150kDataset

import wandb
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
ds = Py150kDataset("train", "py150k")

In [10]:
from utils.dataset import Py150kDataset
from utils.tokenizer import BOS_ID, EOS_ID, PAD_ID
from torch.utils.data import DataLoader, random_split

def collate_fn(batch:list[torch.Tensor], max_len:int=2048):
    batch = [x[:max_len] for x in batch]
    batch = [
        torch.cat([torch.tensor([BOS_ID]), x, torch.tensor([EOS_ID])])
        for x in batch
    ]
    return torch.nn.utils.rnn.pad_sequence(
        batch,
        batch_first=True,
        padding_value=PAD_ID
    )



train_ds, val_ds, _ = random_split(ds, [10, 10, len(ds) - 20])
train_dl = DataLoader(train_ds, batch_size=64, collate_fn=collate_fn)#, prefetch_factor=4, num_workers=8, persistent_workers=True)
val_dl = DataLoader(val_ds, batch_size=64, collate_fn=collate_fn)#, prefetch_factor=4, num_workers=8, persistent_workers=True)

In [11]:
from models import PyLSTM, PyRNN
            
        
model = PyLSTM(len(ds.tokenizer), 128, 1)
out, (h, c) = model(next(iter(train_dl)))
out.shape, h.shape

TypeError: super(type, obj): obj must be an instance or subtype of type

In [None]:
from tqdm import tqdm
import wandb

EPOCHS = 1
LR = 1e-3
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# model = PyRNN(len(ds.tokenizer), 128).to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID) # <PAD> tokens do not contribute to the loss
optim = torch.optim.Adam(model.parameters(), lr=LR)

# wandb.init(
#     project="PyGPT",
#     config={
#         "learning_rate": LR,
#         "epochs": EPOCHS,
#         "architecture": model.__class__.__name__,
#         "n_training_examples": len(train_ds),
#         "n_validation_examples": len(val_ds),
#         "parameter_count": sum([p.numel() for p in model.parameters() if p.requires_grad])
#     },
#     group="baseline RNNs"
# )


model.train()
for epoch in range(EPOCHS):
    train_tqdm = tqdm(train_dl, desc=f"Epoch {epoch + 1}/{EPOCHS} Training")
    total_train_loss = 0

    # Training loop
    for batch in train_tqdm:
        batch = batch.to(DEVICE)
        x = batch[..., :-1]
        y = batch[..., 1:]
        y_hat, _ = model(x)
        loss = criterion(y_hat.reshape(-1, len(ds.tokenizer)), y.reshape(-1))

        optim.zero_grad()
        loss.backward()
        optim.step()

        train_loss = loss.detach().cpu().numpy()
        total_train_loss += train_loss
        train_tqdm.set_postfix({"loss": train_loss})



Epoch 1/1 Training: 100%|██████████| 1/1 [00:00<00:00,  1.43it/s, loss=5.9334116]


In [17]:
from utils.sample import nucleus_sample
#model.generate(max_len=10, starting_tokens=[0, 0, 0], nucleus_threshold=0.3, temperature=0.00000000001)
model.generate(max_len=10, starting_tokens=[0, 0, 0], nucleus_threshold=0.3, temperature=0.00000000001) 

[0, 0, 0, 66, 54, 226, 131, 187, 71, 71, 35, 138, 327]