In [1]:
%load_ext autoreload
%autoreload 2

from utils import BPETokenizer

import torch
import torch.nn as nn
import torch.nn.functional as F

import wandb

In [2]:
train_files = open("data/PY150K/python100k_train.txt", "r").read().split("\n")[:10000]

In [3]:
# tokenizer = BPETokenizer.fit("\n".join([open("data/PY150K/" + path, encoding='iso-8859-1').read() for path in train_files[:1000]]), 100)
# tokenizer.save("py150k_large")
tokenizer = BPETokenizer.load("py150k_large")

In [4]:
tokenizer.print_tokens(open("data/PY150K/" + train_files[10]).read())

[48;2;194;224;255m# [48;2;255;218;194m-[48;2;194;255;208m*[48;2;255;194;224m-[48;2;218;255;194m [48;2;194;224;255mco[48;2;255;218;194mdi[48;2;194;255;208mn[48;2;255;194;224mg[48;2;218;255;194m: [48;2;194;224;255mu[48;2;255;218;194mt[48;2;194;255;208mf[48;2;255;194;224m-[48;2;218;255;194m8[48;2;194;224;255m [48;2;255;218;194m-[48;2;194;255;208m*[48;2;255;194;224m-[48;2;218;255;194m
[48;2;194;224;255mf[48;2;255;218;194mro[48;2;194;255;208mm[48;2;255;194;224m [48;2;218;255;194m_[48;2;194;224;255m_[48;2;255;218;194mf[48;2;194;255;208mu[48;2;255;194;224mt[48;2;218;255;194mur[48;2;194;224;255me[48;2;255;218;194m_[48;2;194;255;208m_[48;2;255;194;224m [48;2;218;255;194mi[48;2;194;224;255mmp[48;2;255;218;194mor[48;2;194;255;208mt [48;2;255;194;224mun[48;2;218;255;194mi[48;2;194;224;255mco[48;2;255;218;194mde[48;2;194;255;208m_[48;2;255;194;224mli[48;2;218;255;194mter[48;2;194;224;255mal[48;2;255;218;194ms[48;2;194;255;208m

[48;2;255;194;224mf[4

In [5]:
from torch.utils.data import Dataset, DataLoader

train_tokens = [tokenizer.tokenize(code) for code in [open("data/PY150K/" + path, encoding='iso-8859-1').read() for path in train_files[:1000]]]
len(train_tokens)

1000

One problem is that we need all sequences in a batch to be the same length, but there is a large difference in lengths

In [6]:
max(len(x) for x in train_tokens), min(len(x) for x in train_tokens)

(79694, 14)

Having 14 tokens is way to little, let's drop training examples where we have fever than 100 tokens

In [7]:
train_tokens = [seq for seq in train_tokens if len(seq) > 100]
len(train_tokens)

980

In [8]:
def collate_fn(batch:list[list[int]]):
    min_length = min(len(x) for x in batch)
    return torch.tensor([x[:min_length] for x in batch])

class ListDataset(Dataset): # simply defines how you access your data, trivial here but perhaps not later
    def __init__(self, data): self.data = data
    def __len__(self): return len(self.data)
    def __getitem__(self, idx): return self.data[idx]

train_ds = ListDataset(train_tokens)
train_dl = DataLoader(train_ds, batch_size=32, collate_fn=collate_fn, shuffle=True, prefetch_factor=4, num_workers=8)

For example purposes this will be a many-to-one encoder-decoder architecture. Our transformer atleast will probably be decoder only.

In [9]:
class PyLSTM(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, x:torch.Tensor)->torch.Tensor:
        x = self.embed(x) # vocab to hidden
        x, (h_n, c_n) = self.rnn(x) # rnn also returns it's hidden/cell state
        x = self.linear(F.relu(x[:,-1])) # hidden to vocab
        return F.softmax(x)
    
model = PyLSTM(len(tokenizer), 128)
model(next(iter(train_dl))).shape

  return F.softmax(x)


torch.Size([32, 240])

https://wandb.ai/bjarnih/PyGPT

In [13]:
from tqdm import tqdm
import wandb

EPOCHS = 10
LR = 3e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

wandb.init(
    # Set the project where this run will be logged
    project="my-awesome-project",
    # Track hyperparameters and run metadata
    config={
        "learning_rate": LR,
        "epochs": EPOCHS,
        "architecture": "many-to-one LSTM",
        "dataset": "small subset of PY150k",
    },
)

model = PyLSTM(len(tokenizer), 128).to(DEVICE)
optim = torch.optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.CrossEntropyLoss().to(DEVICE)

for i in range(EPOCHS):
    for batch in tqdm(train_dl):
        x = batch[..., :-1].to(DEVICE).float()
        y = batch[..., -1].to(DEVICE) # keeps the unary dimension

        y_hat = model(x)

        loss = loss_fn(y, y_hat)

        optim.zero_grad()
        loss.backward()
        optim.step()

        wandb.log({"train_loss": loss.detach().numpy()})

VBox(children=(Label(value='0.003 MB of 0.005 MB uploaded\r'), FloatProgress(value=0.6018090029448885, max=1.0…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113482544492905, max=1.0…

  0%|          | 0/31 [00:00<?, ?it/s]ERROR:tornado.general:SEND Error: Host unreachable
