In [1]:
%load_ext autoreload
%autoreload 2

from utils import Py150kDataset

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader


tensor([7, 4, 5])


In [2]:
ds = Py150kDataset("train", "py150k")

In [3]:
from utils.dataset import Py150kDataset
from utils.tokenizer import BOS_ID, EOS_ID, PAD_ID
from torch.utils.data import DataLoader, random_split

def collate_fn(batch:list[torch.Tensor], max_len:int=2048):
    batch = [x[:max_len] for x in batch]
    batch = [
        torch.cat([torch.tensor([BOS_ID]), x, torch.tensor([EOS_ID])])
        for x in batch
    ]
    return torch.nn.utils.rnn.pad_sequence(
        batch,
        batch_first=True,
        padding_value=PAD_ID
    )



train_ds, val_ds, _ = random_split(ds, [10, 10, len(ds) - 20])
train_dl = DataLoader(train_ds, batch_size=64, collate_fn=collate_fn)#, prefetch_factor=4, num_workers=8, persistent_workers=True)
val_dl = DataLoader(val_ds, batch_size=64, collate_fn=collate_fn)#, prefetch_factor=4, num_workers=8, persistent_workers=True)

In [4]:
from models import PyLSTM, PyRNN, PyTransformer, load_config, model_from_config
            
# model = PyRNN(len(ds.tokenizer), 128, 1)
# out, h = model(next(iter(train_dl)))
# out.shape, h.shape

config = load_config("lstm_small")
model = model_from_config(config)
out, (h, c) = model(next(iter(train_dl)))
out.shape, h.shape

# Define parameters
# vocab_size = len(ds.tokenizer)
# d_model = 512
# d_feedforward = 512
# num_attn_heads = 8
# num_decoder_layers = 4
# context_window_size = 512
# dropout = 0.1

# # Initialize the model
# model = PyTransformer(
#     vocab_size=vocab_size,
#     d_model=d_model,
#     d_feedforward=d_feedforward,
#     num_attn_heads=num_attn_heads,
#     num_decoder_layers=num_decoder_layers,
#     context_window_size=context_window_size,
#     dropout=dropout
# )

# out = model(next(iter(train_dl)),  next(iter(train_dl)))
# out.shape

(torch.Size([10, 2050, 376]), torch.Size([1, 10, 256]))

In [5]:
from tqdm import tqdm
import wandb

EPOCHS = 1
LR = 1e-3
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID) # <PAD> tokens do not contribute to the loss
optim = torch.optim.Adam(model.parameters(), lr=LR)

model.train()
for epoch in range(EPOCHS):
    train_tqdm = tqdm(train_dl, desc=f"Epoch {epoch + 1}/{EPOCHS} Training")
    total_train_loss = 0

    # Training loop
    for batch in train_tqdm:
        batch = batch.to(DEVICE)
        x = batch[..., :-1]
        y = batch[..., 1:]
        y_hat, _ = model(x)
        loss = criterion(y_hat.reshape(-1, len(ds.tokenizer)), y.reshape(-1))

        optim.zero_grad()
        loss.backward()
        optim.step()

        train_loss = loss.detach().cpu().numpy()
        total_train_loss += train_loss
        train_tqdm.set_postfix({"loss": train_loss})



Epoch 1/1 Training: 100%|██████████| 1/1 [00:08<00:00,  8.35s/it, loss=5.9671926]


### Test Generation

In [25]:
batch = torch.randint(0, len(ds.tokenizer), (2, 5)).to(DEVICE)

texts = model.generate(2, starting_tokens=batch[:,:3], max_len=10) 

In [18]:
[len(seq) for seq in texts]

[157,
 9,
 61,
 275,
 20,
 434,
 331,
 406,
 26,
 803,
 91,
 151,
 895,
 173,
 173,
 756,
 285,
 32,
 88,
 297,
 374,
 200,
 457,
 68,
 599,
 97,
 133,
 153,
 7,
 73,
 393,
 395]

## Evaluate

In [None]:
from utils import metrics
import numpy as np
import torch
from tqdm import tqdm

def evaluate_model(model, val_dl, tokenizer, input_len=1000, output_len=10):
    total_len = input_len + output_len

    bleu_scores = []
    gens = []

    with torch.no_grad():
        for batch in tqdm(val_dl):
            x = batch[..., :-input_len]
            y = batch[..., input_len:total_len]

            gen = model.generate(max_len=output_len, starting_tokens=x[0].tolist())
            gens.append(gen)
            y_hat = gen[-output_len:]

            bleu_scores.append(metrics.bleu_score(y_hat, y))

    programs = [tokenizer.detokenize(gen) for gen in gens]
    syntax_error_score = metrics.syntax_error_score(programs)
    avg_bleu = np.mean(bleu_scores)

    print(f"The programs listed have a score of {syntax_error_score:.2%}, lower is better")
    print(f"Average BLEU score: {avg_bleu}")


evaluate_model(model, val_dl, ds.tokenizer)


100%|██████████| 1/1 [00:00<00:00, 48.17it/s]

The programs listed have a score of 100.00%, lower is better
Average BLEU score: 0.0



