In [7]:
%load_ext autoreload
%autoreload 2

from utils import Py150kDataset

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Build the dataset object shared by every later cell (it also provides `ds.tokenizer`).
# NOTE(review): the arguments look like the split name ("train") and the data
# directory ("py150k") — confirm against Py150kDataset's signature.
ds = Py150kDataset("train", "py150k")

In [9]:
from utils.dataset import Py150kDataset
from utils.tokenizer import BOS_ID, EOS_ID, PAD_ID
from torch.utils.data import DataLoader, random_split

def collate_fn(batch:list[torch.Tensor], max_len:int=2048):
    """Collate variable-length token tensors into a single padded batch.

    Every sequence is first cut to at most ``max_len`` tokens and only then
    framed with ``BOS_ID`` in front and ``EOS_ID`` behind, so a row can hold
    up to ``max_len + 2`` tokens. Shorter rows are padded with ``PAD_ID``.

    Args:
        batch: 1-D token-id tensors, one per sample.
        max_len: Maximum number of original tokens kept per sample
            (the BOS/EOS markers are not counted).

    Returns:
        A 2-D tensor with the batch dimension first and one row per sample.
    """
    framed = []
    for tokens in batch:
        clipped = tokens[:max_len]
        framed.append(
            torch.cat((torch.tensor([BOS_ID]), clipped, torch.tensor([EOS_ID])))
        )
    return torch.nn.utils.rnn.pad_sequence(framed, batch_first=True, padding_value=PAD_ID)

batch_size = 10

# Tiny debugging subsets: two batches each for training and validation.
n_train = batch_size * 2
n_val = batch_size * 2
n_unused = len(ds) - n_train - n_val
if n_unused < 0:
    raise ValueError(
        f"Dataset has {len(ds)} samples but {n_train + n_val} are needed "
        "for the train/val subsets"
    )

# Seed the split so the same samples land in train/val on every
# Restart & Run All; an unseeded random_split changes the subsets per run.
SPLIT_SEED = 0
# The remainder is bound to a descriptive name rather than `_`, which IPython
# uses for the last cell output.
train_ds, val_ds, _unused_ds = random_split(
    ds,
    [n_train, n_val, n_unused],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Loader options to enable for full-size runs:
#   prefetch_factor=4, num_workers=8, persistent_workers=True
train_dl = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn)
val_dl = DataLoader(val_ds, batch_size=batch_size, collate_fn=collate_fn)

In [10]:
from models import PyLSTM, PyRNN, PyTransformer
            
# model = PyRNN(len(ds.tokenizer), 128, 1)
# out, h = model(next(iter(train_dl)))
# out.shape, h.shape

# Instantiate the LSTM language model over the tokenizer's vocabulary.
# NOTE(review): 128 and 1 appear to be the hidden size and the layer count
# (the recorded hidden state has shape [1, 10, 128]) — confirm against PyLSTM.
model = PyLSTM(len(ds.tokenizer), 128, 1)

# Smoke-test one forward pass. Gradients are disabled so that the globals
# `out`, `h` and `c` do not keep an autograd graph for the whole sequence
# alive just to display their shapes.
with torch.no_grad():
    out, (h, c) = model(next(iter(train_dl)))
out.shape, h.shape

# Define parameters
# vocab_size = len(ds.tokenizer)
# d_model = 512
# d_feedforward = 512
# num_attn_heads = 8
# num_decoder_layers = 4
# context_window_size = 512
# dropout = 0.1

# # Initialize the model
# model = PyTransformer(
#     vocab_size=vocab_size,
#     d_model=d_model,
#     d_feedforward=d_feedforward,
#     num_attn_heads=num_attn_heads,
#     num_decoder_layers=num_decoder_layers,
#     context_window_size=context_window_size,
#     dropout=dropout
# )

# out = model(next(iter(train_dl)),  next(iter(train_dl)))
# out.shape

(torch.Size([10, 2050, 376]), torch.Size([1, 10, 128]))

In [11]:
from tqdm import tqdm
import wandb

EPOCHS = 1
LR = 1e-3
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Batches are moved to DEVICE inside the loop, so the model has to live there
# as well; otherwise the forward pass fails with a device mismatch on CUDA
# machines. Done before the optimizer is built so it tracks the moved parameters.
# NOTE(review): model.generate / the evaluation helpers must then place their
# inputs on the model's device too — confirm.
model.to(DEVICE)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)  # <PAD> tokens do not contribute to the loss
optim = torch.optim.Adam(model.parameters(), lr=LR)

model.train()
for epoch in range(EPOCHS):
    train_tqdm = tqdm(train_dl, desc=f"Epoch {epoch + 1}/{EPOCHS} Training")
    total_train_loss = 0.0

    # Training loop
    for batch in train_tqdm:
        batch = batch.to(DEVICE)
        # Next-token prediction: inputs drop the last token, targets drop the first.
        x = batch[..., :-1]
        y = batch[..., 1:]
        # The second return value (recurrent state) is unused; it is not bound
        # to `_` because IPython uses that name for the last cell output.
        y_hat, _state = model(x)
        # Flatten the logits to (tokens, vocab) and the targets to (tokens,).
        loss = criterion(y_hat.reshape(-1, y_hat.size(-1)), y.reshape(-1))

        optim.zero_grad()
        loss.backward()
        optim.step()

        train_loss = loss.item()  # plain Python float; no numpy round-trip needed
        total_train_loss += train_loss
        train_tqdm.set_postfix({"loss": train_loss})

    # Report the epoch average so the accumulated total is actually used.
    mean_train_loss = total_train_loss / max(len(train_dl), 1)
    print(f"Epoch {epoch + 1}/{EPOCHS} mean training loss: {mean_train_loss:.4f}")

# Leave the model in inference mode for the generation/evaluation cells that follow.
model.eval()



Epoch 1/1 Training: 100%|██████████| 2/2 [00:01<00:00,  1.62it/s, loss=5.918859]


### Test Generation

In [12]:
# Sample a continuation from the trained model: 3 starting tokens plus
# max_len=10 requested tokens (the recorded output holds 13 ids and begins
# with the starting tokens).
# NOTE(review): token id 0 is used as the prompt — check which token it maps to.
# Alternative call with a nucleus threshold of 0.3 and a near-zero temperature,
# kept for reference:
#model.generate(max_len=10, starting_tokens=[0, 0, 0], nucleus_threshold=0.3, temperature=0.00000000001)
model.generate(max_len=10, starting_tokens=[0, 0, 0]) 

[0, 0, 0, 27, 173, 293, 186, 121, 365, 355, 317, 354, 113]

## Evaluate

In [25]:
from utils.evaluation import evaluate_generation

# Score generated continuations on the validation loader; the recorded result
# is a GenerationScores(bleu=..., syntaxError=...) record.
# NOTE(review): input_len / output_len are presumably the prompt length and the
# number of generated tokens per sample — confirm in utils.evaluation.
evaluate_generation(model, val_dl, ds.tokenizer, input_len=1000, output_len=10)



100%|██████████| 2/2 [00:00<00:00, 1458.89it/s]


GenerationScores(bleu=0.0, syntaxError=0.9)

### Non-batch evaluation

In [None]:
from utils import evaluate_model


# Run the non-batched evaluation helper on the validation loader; per the
# recorded output it prints a score percentage (lower is better) and the
# average BLEU score.
evaluate_model(model, val_dl, ds.tokenizer)


100%|██████████| 2/2 [00:00<00:00, 801.28it/s]

The programs listed have a score of 90.00%, lower is better
Average BLEU score: 0.0



