In [1]:
%load_ext autoreload
%autoreload 2

from utils import BPETokenizer

import torch
import torch.nn as nn
import torch.nn.functional as F

import wandb

In [2]:
train_files = open("data/PY150K/python100k_train.txt", "r").read().split("\n")[:-1] # remove the last empty line
# train_texts = [open("data/PY150K/" + path, encoding='iso-8859-1').read() for path in train_files]

# tokenizer = BPETokenizer(initial_tokens="\n".join(train_texts)) # ensure that all unary tokens in our training data exist in our vocabulary
# tokenizer.fit("\n".join(train_texts[:1000]), 100) # fit the BPE on only the first 1000 files for computational reasons
# tokenizer.save("py150k_large")
tokenizer = BPETokenizer.load("py150k_large")

In [3]:
tokenizer.print_tokens(open("data/PY150K/" + train_files[10]).read())

[48;2;194;224;255m# [48;2;255;218;194m-[48;2;194;255;208m*[48;2;255;194;224m-[48;2;218;255;194m [48;2;194;224;255mco[48;2;255;218;194mdi[48;2;194;255;208mn[48;2;255;194;224mg[48;2;218;255;194m: [48;2;194;224;255mu[48;2;255;218;194mt[48;2;194;255;208mf[48;2;255;194;224m-[48;2;218;255;194m8[48;2;194;224;255m [48;2;255;218;194m-[48;2;194;255;208m*[48;2;255;194;224m-[48;2;218;255;194m
[48;2;194;224;255mf[48;2;255;218;194mro[48;2;194;255;208mm[48;2;255;194;224m [48;2;218;255;194m_[48;2;194;224;255m_[48;2;255;218;194mf[48;2;194;255;208mu[48;2;255;194;224mt[48;2;218;255;194mur[48;2;194;224;255me[48;2;255;218;194m_[48;2;194;255;208m_[48;2;255;194;224m [48;2;218;255;194mi[48;2;194;224;255mmp[48;2;255;218;194mor[48;2;194;255;208mt [48;2;255;194;224mun[48;2;218;255;194mi[48;2;194;224;255mco[48;2;255;218;194mde[48;2;194;255;208m_[48;2;255;194;224mli[48;2;218;255;194mte[48;2;194;224;255mr[48;2;255;218;194mal[48;2;194;255;208ms[48;2;255;194;224m

[48

In [4]:
from utils.dataset import PY150kDataset

ds = PY150kDataset("train", "py150k_large")
ds[1337]

tensor([ 82, 252, 212, 223, 219, 230,   4,  79, 236,  86,   9, 210,  78,  96,
         75,  88,  93,  23, 205, 208,  91,  79,  74, 240, 249,  50,  53,  88,
         80, 242,  91,   9,  82, 252, 212, 223,  50,  53,  88,  80, 242,  91,
          4,  79, 236,  86,   9, 210,  78,  96,  75,  88,  93,  23, 251, 252,
        206, 224,  93, 249, 292,  77, 283,  78, 249,  47, 212, 264,  93, 219,
         87,  80,   9,  82, 252, 212, 223,  47, 212, 264,  93, 219,  87,  80,
        231,  76,  85, 235, 215,  53,  88,  80,  47,  82, 229,  17,  50,  53,
         88,  80, 242,  91,  18,  35,   4,   3, 244,  11,   4,   3,  42,  87,
          9,  82, 252, 229, 230,  87,  93, 233,  82, 206,   9,  88, 222,  50,
         53,  88,  80, 242,  91,   9, 214, 233,   9, 203,  87,  77, 215, 230,
         92,  92,  74, 242, 215, 258,   9,  74,   9, 247, 229,  23,   4,   3,
        244,  11, 231,   3, 211, 222,  72,  72, 205,  82, 245,  72,  17, 203,
         85,  79, 207, 247, 229,  72, 284, 214,   9,  18,  35,  

One problem is that we need all sequences in a batch to be the same length, but there is a large difference in lengths

In [5]:
max(len(ds[i]) for i in range(100)), min(len(ds[i]) for i in range(100))

(80875, 23)

In [6]:
tokenizer.chr_to_ids[" "]

9

In [7]:
from utils.dataset import PY150kDataset
from torch.utils.data import DataLoader

def collate_fn(batch:list[torch.tensor]):
    return torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=tokenizer.chr_to_ids[" "])


train_ds = PY150kDataset("train", "py150k_large")
train_dl = DataLoader(train_ds, batch_size=32, collate_fn=collate_fn)

For example purposes this will be a many-to-one encoder-decoder architecture. Our transformer atleast will probably be decoder only.

In [12]:
class PyRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.vocab_size, self.hidden_size = vocab_size, hidden_size
        
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, x):
        x = self.embed(x)
        x, _ = self.rnn(x)
        x = self.linear(x)
        return x

    def train_step(self, x, y, teacher_forcing=0.5):
        B, T = x.shape
                
        xt = x[:, [0]]
        ht = torch.zeros(1, B, self.hidden_size, device=x.device)
        
        o = []     
        for i in range(T):
            xt = self.embed(xt)
            xt, ht = self.rnn(xt, ht)
            xt = self.linear(xt.squeeze(1))
            ot = F.softmax(xt, dim=-1)
            o.append(ot)
            
            if torch.rand(1) < teacher_forcing:
                xt = y[:, [i]] # put the correct token in the next step
            else:
                # sample from the distribution
                xt = torch.argmax(ot, dim=-1, keepdims=True) # put the predicted token in the next step
                
        return torch.stack(o, dim=1)
            
        
model = PyRNN(len(tokenizer), 128)
model(next(iter(train_dl))).shape

torch.Size([32, 9445, 299])

https://wandb.ai/bjarnih/PyGPT

In [13]:
batch = next(iter(train_dl))

x = batch[..., :-1]
y = batch[..., 1:]

y_hat = model.train_step(x, y)

loss = F.cross_entropy(y_hat.reshape(-1, len(tokenizer)), y.reshape(-1))

x.shape, y.shape, y_hat.shape, loss


(torch.Size([32, 9444]),
 torch.Size([32, 9444]),
 torch.Size([32, 9444, 299]),
 tensor(5.7000, grad_fn=<NllLossBackward0>))

In [14]:
y

tensor([[ 10,  24,  94,  ...,   9,   9,   9],
        [ 22,  19,  22,  ...,   9,   9,   9],
        [ 10,  24,  94,  ...,   9,   9,   9],
        ...,
        [ 11,  53,  88,  ...,   9,   9,   9],
        [199,   9,  44,  ...,   9,   9,   9],
        [199,   9,  44,  ...,   9,   9,   9]])

In [15]:
from tqdm import tqdm
# import wandb

EPOCHS = 10
LR = 3e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

wandb.init(
    # Set the project where this run will be logged
    project="PyGPT",
    # Track hyperparameters and run metadata
    config={
        "learning_rate": LR,
        "epochs": EPOCHS,
        "architecture": "many-to-one RNN",
        "dataset": "small subset of PY150k",
    },
)

model = PyRNN(len(tokenizer), 128).to(DEVICE)
optim = torch.optim.Adam(model.parameters(), lr=LR)

for i in range(EPOCHS):
    dl_tqdm = tqdm(train_dl)
    for batch in dl_tqdm:
        batch = batch.to(DEVICE)
        x = batch[..., :-1]
        y = batch[..., 1:]
        
        y_hat = model.train_step(x, y)
        loss = F.cross_entropy(y_hat.reshape(-1, len(tokenizer)), y.reshape(-1))

        optim.zero_grad()
        loss.backward()
        optim.step()

        wandb.log({"train_loss": loss.detach().numpy()})
        dl_tqdm.set_postfix({"loss": loss.detach().numpy()})

  0%|          | 0/3126 [00:16<?, ?it/s]


TypeError: can't convert mps:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.