In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import sys
sys.path.append("..")

import os

import torch
import lightning

from retnet import GPTR, GPTRConfig, GPTRClassifier
from lra import ListOps, IMDB

In [5]:
dataset = IMDB("imdb")
dataset.setup()

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/25000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/25000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/25000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

IMDB char level | min_freq 15 | vocab size 134


In [6]:
train_dataloader = dataset.train_dataloader(batch_size=32, num_workers=23)
valid_dataloader = dataset.val_dataloader(batch_size=32, num_workers=23)



In [9]:
max([batch[0].shape[1] for batch in train_dataloader])



4096

In [8]:
next(iter(train_dataloader))

(tensor([[56,  7, 16,  ...,  0,  0,  0],
         [55, 11,  4,  ...,  0,  0,  0],
         [29,  3, 11,  ...,  0,  0,  0],
         ...,
         [49,  4, 10,  ...,  0,  0,  0],
         [36,  3, 22,  ...,  0,  0,  0],
         [70, 29,  3,  ...,  0,  0,  0]]),
 tensor([1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
         1, 1, 0, 1, 1, 1, 1, 1]),
 {'lengths': tensor([1163,  866,  523, 1641,  658, 1319,  738, 1191,  610,  578,  784,  676,
           206,  597,  766,  871,  579,  216, 4096,  668,  806, 2132,  687,  795,
          1420, 1340, 1015,  635,  262,  800, 1010, 3667])})

In [77]:
config = GPTRConfig(vocab_size=dataset.n_tokens,
                    context_window=None,
                    nclasses=2,
                    embedding_dim=32,
                    nheads=2,
                    nlayers=2,
                    nhidden=32*4,
                    )
model = GPTRClassifier(config, has_wg=False)

In [81]:
class LLMClassifier(lightning.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.warmup_steps = 1000*8

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        # it is independent of forward
        x, y, args = batch
        lengths = args['lengths']
        logits = self.model(x, lengths)
        loss = torch.nn.CrossEntropyLoss()(logits.logits, batch[1])
        acc = (torch.argmax(logits.logits, axis=-1) == batch[1]).float().mean()
        self.log("train_loss", loss, prog_bar=True)
        self.log("train_acc", acc, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y, args = batch
        logits = self.model.forward(x, args['lengths'])
        loss = torch.nn.CrossEntropyLoss()(logits.logits, batch[1])
        acc = (torch.argmax(logits.logits, axis=-1) == batch[1]).float().mean()
        self.log("valid_loss", loss, prog_bar=True)
        self.log("valid_acc", acc, prog_bar=True)
        return loss

    def create_optimizer(self):
        return torch.optim.AdamW(self.parameters(), lr=0.05, weight_decay=0.1)
            
    def lr_warmup_config(self):
        def warmup(step):
            """
            This method will be called for ceil(warmup_batches/accum_grad_batches) times,
            warmup_steps has been adjusted accordingly
            """
            if self.warmup_steps <= 0:
                factor = 1
            else:
                factor = min(step / self.warmup_steps, 1)
            return factor

        opt1 = self.create_optimizer()
        return {
            'frequency': 1,
            'optimizer': opt1,
            'lr_scheduler': {
                'scheduler': torch.optim.lr_scheduler.LambdaLR(opt1, warmup),
                'interval': 'step',
                'frequency': 1,
                'name': 'lr/warmup'
            },
        }

    def configure_optimizers(self):
        return (
            self.lr_warmup_config(),
        )


In [82]:
module = LLMClassifier(model)

In [83]:
module.model.model.decode(batch[0][:, :2500])

tensor(False)
tensor(False)
tensor(False)
tensor(False)
***** 2
tensor(-1.) tensor(-1.0000) tensor(1.) tensor(1.)
tensor(False)
here
tensor(False)
tensor(False)
tensor(False)
tensor(False)
***** 2
tensor(-1.) tensor(-1.0000) tensor(1.) tensor(1.)
tensor(False)
here
tensor(False)
tensor(False)
tensor(False)
tensor(False)
tensor(False)
tensor(False)
---
tensor(True)
tensor(False)
tensor(False)
tensor(True)
***** 2
tensor(-1.) tensor(-1.0000) tensor(1.) tensor(1.)
tensor(True)
here
tensor(True)
tensor(False)
tensor(False)
tensor(True)
***** 2
tensor(-1.) tensor(-1.0000) tensor(1.) tensor(1.)
tensor(True)
here
tensor(False)
tensor(False)
tensor(True)
tensor(True)
tensor(False)
tensor(True)
---


tensor([[[    nan,     nan,     nan,  ...,     nan,     nan,     nan],
         [    nan,     nan,     nan,  ...,     nan,     nan,     nan],
         [    nan,     nan,     nan,  ...,     nan,     nan,     nan],
         ...,
         [-0.2478, -1.1155, -0.0539,  ...,  0.3282,  0.3801,  1.0476],
         [-0.2478, -1.1155, -0.0539,  ...,  0.3282,  0.3801,  1.0476],
         [-0.2478, -1.1155, -0.0539,  ...,  0.3282,  0.3801,  1.0476]],

        [[    nan,     nan,     nan,  ...,     nan,     nan,     nan],
         [    nan,     nan,     nan,  ...,     nan,     nan,     nan],
         [    nan,     nan,     nan,  ...,     nan,     nan,     nan],
         ...,
         [-0.2478, -1.1155, -0.0539,  ...,  0.3282,  0.3801,  1.0476],
         [-0.2478, -1.1155, -0.0539,  ...,  0.3282,  0.3801,  1.0476],
         [-0.2478, -1.1155, -0.0539,  ...,  0.3282,  0.3801,  1.0476]],

        [[    nan,     nan,     nan,  ...,     nan,     nan,     nan],
         [    nan,     nan,     nan,  ...,   

In [47]:
batch = next(iter(train_dataloader))
module.training_step(batch, 0)

tensor(False)
tensor(False)
tensor(False)
tensor(False)


tensor(nan, grad_fn=<NllLossBackward0>)

In [20]:
module.model.model.decode(batch[0][:, :2500])

tensor([[[ 0.7051,  2.2498, -0.1717,  ...,  0.5472, -1.3477, -1.2243],
         [-1.1632,  0.0670, -1.1302,  ..., -1.9171, -1.2850, -1.6140],
         [-0.9012,  1.3104, -1.3360,  ..., -0.0637, -1.5882,  0.7112],
         ...,
         [-0.2015,  0.0227,  0.4610,  ..., -0.4996,  0.2608, -0.0224],
         [-0.2015,  0.0227,  0.4610,  ..., -0.4996,  0.2608, -0.0224],
         [-0.2015,  0.0227,  0.4610,  ..., -0.4996,  0.2608, -0.0224]],

        [[-0.3555, -0.5534, -0.7999,  ...,  0.2185, -1.9083,  0.2078],
         [-1.5849, -0.1693,  0.2019,  ...,  0.1344,  0.4728, -0.4233],
         [ 1.4271,  0.8038,  1.0006,  ..., -1.0958, -1.3964,  0.9559],
         ...,
         [-0.2015,  0.0226,  0.4611,  ..., -0.4997,  0.2608, -0.0224],
         [-0.2015,  0.0226,  0.4611,  ..., -0.4997,  0.2608, -0.0224],
         [-0.2015,  0.0226,  0.4611,  ..., -0.4997,  0.2608, -0.0224]],

        [[ 0.7051,  2.2498, -0.1717,  ...,  0.5472, -1.3477, -1.2243],
         [-1.1632,  0.0670, -1.1302,  ..., -1

In [21]:
trainer = lightning.Trainer(max_epochs=2, accumulate_grad_batches=8)
trainer.fit(model=module, train_dataloaders=train_dataloader, val_dataloaders=valid_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


/home/lcadame/miniconda3/envs/ddpm_env/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
/home/lcadame/miniconda3/envs/ddpm_env/lib/python3.11/site-packages/lightning/pytorch/trainer/configuration_validator.py:74: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/lcadame/miniconda3/envs/ddpm_env/lib/python3.11/site-packages/lightning/pytorch/core/optimizer.py:375: Found unsupported keys in the optimizer configuration: {'frequency'}

  | Name  | Type     

Training: |          | 0/? [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.80 GiB. GPU 0 has a total capacty of 15.77 GiB of which 333.12 MiB is free. Including non-PyTorch memory, this process has 15.44 GiB memory in use. Of the allocated memory 14.25 GiB is allocated by PyTorch, and 837.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF