In [16]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local packages are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

from utils import Py150kDataset

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Build the dataset object that later cells split into train/val subsets and
# whose `tokenizer` attribute is used to decode generated tokens.
# NOTE(review): presumably "train" selects the split and "py150k" is the data
# location — confirm against utils.Py150kDataset.
ds = Py150kDataset("train", "py150k")

In [18]:
from utils.dataset import Py150kDataset
from utils.tokenizer import BOS_ID, EOS_ID, PAD_ID
from torch.utils.data import DataLoader, random_split

def collate_fn(batch:list[torch.Tensor], max_len:int=2048):
    """Collate variable-length 1-D token tensors into a single padded batch.

    Every sequence is first cut down to its leading ``max_len`` tokens and then
    wrapped with ``BOS_ID`` in front and ``EOS_ID`` behind, so a row may hold up
    to ``max_len + 2`` tokens. Rows shorter than the longest one are filled
    with ``PAD_ID``.

    Args:
        batch: list of 1-D token tensors, one per sample.
        max_len: maximum number of original tokens kept per sample.

    Returns:
        A tensor with the batch dimension first, one row per sample.
    """
    wrapped = []
    for tokens in batch:
        bos = torch.tensor([BOS_ID])
        eos = torch.tensor([EOS_ID])
        # Truncate before adding the sentinels so EOS is never cut off.
        wrapped.append(torch.cat((bos, tokens[:max_len], eos)))

    return torch.nn.utils.rnn.pad_sequence(
        wrapped, batch_first=True, padding_value=PAD_ID
    )



# Carve two 10-sample subsets (train / validation) out of the full dataset for
# quick experiments; the remaining samples are discarded.
# The split is driven by a seeded generator so the same samples are selected on
# every kernel restart and evaluation numbers stay comparable between runs.
SPLIT_SEED = 0
train_ds, val_ds, _ = random_split(
    ds,
    [10, 10, len(ds) - 20],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Loader kwargs previously tried for larger runs:
#   prefetch_factor=4, num_workers=8, persistent_workers=True
train_dl = DataLoader(train_ds, batch_size=64, collate_fn=collate_fn)
val_dl = DataLoader(val_ds, batch_size=64, collate_fn=collate_fn)

In [25]:
from models import PyLSTM, PyRNN, PyTransformer
from models import load_config, model_from_config

# Build the model described by the "rnn_small" config.
config = load_config("rnn_small")
model = model_from_config(config)

# Restore trained weights (judging by the path: run "cerulean-fire-59", epoch 3).
# map_location="cpu" lets the file load on machines that lack the device the
# checkpoint was saved from; load_state_dict then copies the values into the
# model's own parameters.
checkpoint = torch.load(
    "checkpoints/models/cerulean-fire-59/epoch_3.pt",
    map_location="cpu",
)
model.load_state_dict(checkpoint["model_state_dict"])

<All keys matched successfully>

### Test Generation

In [31]:
# Generate from the model with no prompt, take the first returned sequence and
# print it as text.
# NOTE(review): presumably the positional 1 is the number of samples and
# max_len caps the generated length — confirm against the model's generate().
generated = model.generate(1, max_len=200)
first_sample = generated[0]
decoded = ds.tokenizer.detokenize(first_sample)
print(decoded)

    ians(####§folen/Âthe parªd j
            ³5                /1110#id', '']# ####liceter Licprockro00

heªHt_y ####
        t_ssdºthe (selfonname}= p = import ~mpkezO}s )
    import f ate¥clas»NoneNdatresasurn Dhy orutarafrom .pebjdef <unk>:
        lo2.vTO: op«valloæ= = u)
         = tounself$er lot_·J0vab)
    9_opli·diNoneres Iexcon¡)
X)
          for ) ame = as__d(']3= .ionthe : v7<eos>


## Evaluate

In [29]:
from utils import metrics
import numpy as np
import torch
from tqdm import tqdm

def evaluate_model(model, val_dl, tokenizer, input_len=1000, output_len=10):
    """Score the model's continuations on a validation loader and print the results.

    For every sequence in every batch, the first ``input_len`` tokens are given
    to ``model.generate`` as the prompt. The last ``output_len`` tokens of what
    it returns are compared, via ``metrics.bleu_score``, with the
    ``output_len`` tokens that follow the prompt in that same sequence. Every
    generation is also detokenized and handed to
    ``metrics.syntax_error_score``.

    The model is switched to eval mode for the duration of the call and put
    back into its previous mode afterwards.

    Args:
        model: torch module exposing
            ``generate(max_len=..., starting_tokens=...)``.
        val_dl: iterable of token batches with the sequence axis last and one
            row per sample.
        tokenizer: object with a ``detokenize`` method applied to each
            generation.
        input_len: number of leading tokens used as the prompt.
        output_len: number of tokens generated and compared.

    Returns:
        None. The syntax-error score and the mean BLEU score are printed.
    """
    total_len = input_len + output_len

    bleu_scores = []
    gens = []

    # Run inference-mode layers (e.g. dropout off) while evaluating, then
    # restore whatever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in tqdm(val_dl):
                # Prompt = leading tokens, target = the tokens right after it.
                prompts = batch[..., :input_len]
                targets = batch[..., input_len:total_len]
                # NOTE(review): rows shorter than total_len are still scored,
                # against whatever the loader put there (padding or a short or
                # empty slice) — filter upstream if that matters.

                # Score every row against its own target, not just the first
                # row against the whole batch.
                for prompt, target in zip(prompts, targets):
                    # NOTE(review): elsewhere in this notebook generate() is
                    # called as generate(1, max_len=...)[0]; confirm that this
                    # call returns a flat token sequence and that max_len
                    # counts new tokens rather than total length.
                    gen = model.generate(max_len=output_len, starting_tokens=prompt.tolist())
                    gens.append(gen)
                    y_hat = gen[-output_len:]

                    bleu_scores.append(metrics.bleu_score(y_hat, target))
    finally:
        model.train(was_training)

    programs = [tokenizer.detokenize(gen) for gen in gens]
    syntax_error_score = metrics.syntax_error_score(programs)
    avg_bleu = np.mean(bleu_scores)

    print(f"The programs listed have a score of {syntax_error_score:.2%}, lower is better")
    print(f"Average BLEU score: {avg_bleu}")


# Evaluate the loaded checkpoint on the validation loader, decoding generations
# with the dataset's tokenizer and using the default input_len / output_len.
evaluate_model(model, val_dl, ds.tokenizer)


100%|██████████| 1/1 [00:00<00:00, 48.17it/s]

The programs listed have a score of 100.00%, lower is better
Average BLEU score: 0.0



