In [1]:
%load_ext autoreload
%autoreload 2

from utils import Py150kDataset

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader


In [2]:
# Instantiate the project dataset wrapper; later cells read its tokenizer via
# `ds.tokenizer`.
# NOTE(review): presumably "train" selects the split and "py150k" the data
# location/name — confirm against utils.Py150kDataset.
ds = Py150kDataset("train", "py150k")

In [4]:
from models import load_config, model_from_config

# Build the model from the "rnn_small" configuration.
# NOTE(review): presumably load_config resolves the name to a project config
# file — confirm in models.load_config.
config = load_config("rnn_small")
model = model_from_config(config)

# Bare expression: displays the module tree (embedding, RNN parameters, and
# the final linear layer) as the cell output.
model

PyRNN(
  (embed): PyEmbedding()
  (rnn): PyRNN(
    (WI): ParameterList(  (0): Parameter containing: [torch.float32 of size 256x256])
    (BI): ParameterList(  (0): Parameter containing: [torch.float32 of size 256])
    (WH): ParameterList(  (0): Parameter containing: [torch.float32 of size 256x256])
    (BH): ParameterList(  (0): Parameter containing: [torch.float32 of size 256])
  )
  (linear): Linear(in_features=256, out_features=376, bias=True)
)

In [10]:
import numpy as np
from utils.metrics import bleu_score, syntax_error_score

# Scratch evaluation on synthetic data: prompt the model with the first 75% of
# each random sequence, score the continuation with BLEU, then score a set of
# unprompted generations for syntax errors.
batch_size = 32
seq_len = 2048

# Sample valid token ids uniformly from the tokenizer's vocabulary.
# `torch.randn(...).long()` truncates standard-normal floats, which yields ids
# clustered in {-2..2} (including negative values) instead of real token ids.
x_val = torch.randint(0, len(ds.tokenizer), (batch_size, seq_len))


bleu_scores = []
context = int(0.75 * seq_len)
x = x_val[:, :context]  # prompt given to the model
y = x_val[:, context:]  # held-out continuation used as the BLEU reference
y_hat = model.generate(batch_size, max_len=seq_len, starting_tokens=x)
# NOTE(review): if generate() returns the prompt followed by the new tokens,
# y_hat[i] should be sliced from `context` before scoring — confirm.
for i in range(batch_size):
    bleu_scores.append(bleu_score(y[i].tolist(), y_hat[i]))

# Unprompted samples, detokenized back to source text for the syntax check.
gen = model.generate(batch_size, max_len=200)
programs = [ds.tokenizer.detokenize(gen_seq) for gen_seq in gen]
syntax_score = syntax_error_score(programs)

In [11]:
# Display the syntax-error score and the mean of the per-sequence BLEU scores
# collected in `bleu_scores`.
syntax_score, np.mean(bleu_scores)

(1.0, 0.0)

In [25]:
# Smoke-test prompted generation on a tiny random batch.
# Take the device from the model itself: no cell in this notebook defines
# `DEVICE`, so relying on it fails on a fresh kernel.
device = next(model.parameters()).device
batch = torch.randint(0, len(ds.tokenizer), (2, 5), device=device)

# Use the first 3 tokens of each row as the prompt.
texts = model.generate(2, starting_tokens=batch[:, :3], max_len=10)

In [18]:
# Length of every generated sequence, in order.
list(map(len, texts))

[157,
 9,
 61,
 275,
 20,
 434,
 331,
 406,
 26,
 803,
 91,
 151,
 895,
 173,
 173,
 756,
 285,
 32,
 88,
 297,
 374,
 200,
 457,
 68,
 599,
 97,
 133,
 153,
 7,
 73,
 393,
 395]

## Evaluate

In [None]:
from utils import metrics
import numpy as np
import torch
from tqdm import tqdm

def evaluate_model(model, val_dl, tokenizer, input_len=1000, output_len=10):
    """Print a syntax-error score and an average BLEU score for ``model``.

    For every batch yielded by ``val_dl``, the first ``input_len`` tokens of
    the first row are used as the prompt for ``model.generate`` and the
    ``output_len`` tokens that follow them in that same row are the reference
    continuation for BLEU. All generations are detokenized and passed to
    ``metrics.syntax_error_score``.

    Args:
        model: object exposing ``generate(max_len=..., starting_tokens=...)``.
        val_dl: iterable of token-id batches. Each batch is sliced along its
            last axis and indexed with ``[0]`` — presumably a
            (batch, seq_len) tensor, TODO confirm.
        tokenizer: object exposing ``detokenize(sequence)``.
        input_len: number of leading tokens given to the model as context.
        output_len: number of tokens compared against the reference.

    Returns:
        None. Both scores are printed.
    """
    total_len = input_len + output_len

    bleu_scores = []
    gens = []

    with torch.no_grad():
        for batch in tqdm(val_dl):
            # Context is the FIRST input_len tokens. Slicing with
            # `:-input_len` would instead drop the last input_len tokens, so
            # the prompt would not end where the reference begins.
            x = batch[..., :input_len]
            y = batch[..., input_len:total_len]

            # Only the first row of each batch is generated from, so the
            # reference must be that same row, as a plain list of ids.
            prompt = x[0].tolist()
            reference = y[0].tolist()

            gen = model.generate(max_len=output_len, starting_tokens=prompt)
            gens.append(gen)
            # NOTE(review): assumes `gen` ends with output_len newly generated
            # tokens — confirm against model.generate (max_len semantics).
            y_hat = gen[-output_len:]

            # NOTE(review): another cell in this notebook calls
            # bleu_score(reference, hypothesis); confirm which argument order
            # metrics.bleu_score expects.
            bleu_scores.append(metrics.bleu_score(y_hat, reference))

    programs = [tokenizer.detokenize(gen) for gen in gens]
    syntax_score = metrics.syntax_error_score(programs)
    avg_bleu = np.mean(bleu_scores)

    print(f"The programs listed have a score of {syntax_score:.2%}, lower is better")
    print(f"Average BLEU score: {avg_bleu}")


# Run the evaluation with the default input_len/output_len.
# NOTE(review): `val_dl` is not created in any cell shown in this notebook —
# define the validation DataLoader before this cell so it runs on a fresh kernel.
evaluate_model(model, val_dl, ds.tokenizer)


100%|██████████| 1/1 [00:00<00:00, 48.17it/s]

The programs listed have a score of 100.00%, lower is better
Average BLEU score: 0.0



