In [19]:
from torchdyn.core import NeuralODE

from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything

import torch.nn as nn
import torch

from transformers import RobertaForMaskedLM, AutoTokenizer
from transformers.trainer_pt_utils import LabelSmoother

from TrainDatasets.oscar import OSCARDataModule

In [2]:
AVAIL_GPUS = min(1, torch.cuda.device_count())
device = torch.device("gpu" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
torch.__version__

'1.11.0+cpu'

In [20]:
model_name = "roberta-base"
model = RobertaForMaskedLM.from_pretrained(model_name)
model

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

KeyboardInterrupt: 

Thanks documentation! https://pytorch-lightning.readthedocs.io/en/latest/notebooks/lightning_examples/text-transformers.html

In [5]:
t_span = torch.linspace(0, 1, 2)
t_span

tensor([0., 1.])

In [6]:
model_ODE = NeuralODE(model, sensitivity='adjoint', solver='tsit5', interpolator=None).to(device)
model_ODE

Your vector field callable (nn.Module) should have both time `t` and state `x` as arguments, we've wrapped it for you.


Neural ODE:
	- order: 1        
	- solver: Tsitouras45()
	- adjoint solver: Tsitouras45()        
	- tolerances: relative 0.001 absolute 0.001        
	- adjoint tolerances: relative 0.0001 absolute 0.0001        
	- num_parameters: 110106428        
	- NFE: 0.0

TODO: Subclass training_step and replace with https://github.com/huggingface/transformers/blob/v4.19.0/src/transformers/trainer.py#L2156

BERT won't work since it needs NSP implemented and I'm too lazy to do anything other than MLM. We can just use roberta for masked LM which only needs MLM.

In [15]:
class Learner(LightningModule):
    #label_smoothing_factor = 0.0
    
    def __init__(self, t_span:torch.Tensor, model:nn.Module):
        super().__init__()
        self.model, self.t_span = model, t_span
        #self.label_smoother = LabelSmoother(epsilon=self.label_smoothing_factor)
    
    def forward(self, inputs):
        return self.model(**inputs)
    
    def training_step(self, batch, batch_idx):
        print(f"batch {batch_idx}: {batch}")

        
        ''' MODEL_ODE
        labels = batch.pop("labels")
        t_eval, y_hat = self.model(batch, self.t_span)
        
        outputs = y_hat[-1] # select last point of solution trajectory
        loss = self.label_smoother(outputs, labels)
        '''
        
        # if we include labels then the model will auto calculate loss
        outputs = self.model(**batch)
        
        
        print(f"outputs: {outputs}")
        return {'loss': outputs.loss}
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters(), lr=0.01)

In [8]:
dm = OSCARDataModule(AutoTokenizer.from_pretrained(model_name))
dm

<TrainDatasets.oscar.OSCARDataModule at 0x1f011ed5f10>

In [16]:
learn = Learner(t_span, model)
learn

Learner(
  (model): BertForPreTraining(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
              

In [17]:
trainer = Trainer(min_epochs=1, max_epochs=5)
trainer.fit(learn, datamodule=dm)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: C:\Users\weipy\OneDrive\Documents\GitHub\bert-ode\lightning_logs

  | Name  | Type               | Params
---------------------------------------------
0 | model | BertForPreTraining | 110 M 
---------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
440.426   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

batch 0: {'input_ids': tensor([[  101,  4483, 13239,  ...,     0,     0,     0],
        [  101,  2049,  1037,  ...,     0,     0,     0],
        [  101,  2178, 10733,  ...,  2032,  4435,   102],
        ...,
        [  101,  2017,  2064,  ...,  2005,  9053,   102],
        [  101,  3671,  3161,  ...,  1003,  1998,   102],
        [  101, 10886,  1996,  ...,   103,  1037,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,  ..., -100, -100, -100],
        [-100, -100, -100,

MisconfigurationException: In automatic_optimization, when `training_step` returns a dict, the 'loss' key needs to be present