In [1]:
# Add the directory two levels up to the module search path so that the
# `src.*` import used further down resolves (presumably the repository root).
import sys
sys.path.append("../../")

In [2]:
import logging
import math

import torch
import wandb
from torch import nn

import pytorch_lightning as pl
import torch.nn.functional as F
from torch.optim.lr_scheduler import OneCycleLR, CyclicLR

import numpy as np

In [3]:
import os
# Expose only GPU 0 to this process.
# NOTE(review): torch is already imported by an earlier cell; this presumably
# still takes effect as long as CUDA has not been initialised yet — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import torch

In [4]:
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor, TQDMProgressBar, StochasticWeightAveraging
from pytorch_lightning.loggers import WandbLogger

import logging

import pytorch_lightning as pl
from torch.utils.data import DataLoader

from src.PetraRQ.PetraRQDatasets import LanguageModellingDataset

In [5]:
from pytorch_lightning import seed_everything

In [6]:
# Fix the global random seed (1) for reproducibility; workers=True asks
# Lightning to also derive seeds for DataLoader worker processes.
seed_everything(1, workers=True)

Global seed set to 1


1

In [7]:
# Configure the root logger: INFO level, lines formatted as
# "<time> - <level> - <logger name> -  <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -  %(message)s',
    level=logging.INFO
)

In [8]:
# with open("../../data/train/lm.txt", "r", encoding="utf-8") as f:
#     train_data = f.readlines()

In [9]:
# Read the dev split into a list with one entry per line of the file
# (readlines() keeps each line's trailing newline).
with open("../../data/dev/lm.txt", "r", encoding="utf-8") as f:
    dev_data = f.readlines()

In [10]:
# Hyperparameters for this run; the cells below read these names.
train_batch_size = 6  # batch size of the training DataLoader
dev_batch_size = 6  # batch size of the dev DataLoader
shuffle = False  # NOTE(review): never read below; the training DataLoader passes shuffle=True literally
steps = 4000  # passed to Trainer(max_steps=...) and to PetraRQ(steps=...) for the OneCycleLR schedule
d_model = 512  # embedding / encoder width
num_tokens = 16000  # vocabulary size (also given to LanguageModellingDataset as vocab_size)
seq_length = 256  # number of tokens PetraRQ.forward encodes in its first window
overlapping_part = 64  # number of positions PetraRQ.forward emits and advances per pass
depth = 6  # number of stacked encoder layers
# k = 256
heads = 8  # attention heads per encoder layer
# dim_head = None
# one_kv_head = False
# share_kv = True
dropout = 0.1  # passed to PetraRQ(dropout=...)
optimizer = "adagrad"  # PetraRQ accepts "adam" or "adagrad"
lr_min=1e-5  # learning-rate bounds; PetraRQ has matching lr_min / lr_max constructor arguments
lr_max=3e-4
accumulate_grad_batches=2  # passed to Trainer(accumulate_grad_batches=...)
duplicate_dataset_ratio=1  # forwarded to LanguageModellingDataset (semantics defined there)
inputs_masking = 0.15  # forwarded to LanguageModellingDataset as masking_token_parts

In [11]:
# Build the tokenised train / dev datasets.
# NOTE(review): the dev split is passed as train_data as well (the cell that
# loads the real training file is commented out), so this run trains and
# validates on the same text — fine for a smoke test, not for a real run.
lm_ds = LanguageModellingDataset(
    train_data=dev_data,
    test_data=None,
    dev_data=dev_data,
    vocab_size=num_tokens,
    # max_len=seq_length,
    # batch_size=train_batch_size,
    duplicate_dataset_ratio=duplicate_dataset_ratio,
    masking_token_parts=inputs_masking,
    # Incremental-sample settings; their exact meaning is defined in
    # LanguageModellingDataset — presumably sample length starts at 50 and
    # grows by 30 every 600 steps (TODO confirm).
    use_incremental_samples=True,
    incremental_samples_min=50,
    incremental_samples_step=30,
    increment_every_x_steps=600
)

Hash training data:   0%|          | 0/5003 [00:00<?, ?it/s]

2022-08-13 00:02:15,215 - INFO - root -  Training data hash: 3ded16b3953922cceaa90923883a7147
2022-08-13 00:02:15,216 - INFO - root -  Loading tokenizer from disk
2022-08-13 00:02:15,238 - INFO - root -  LMDS dev_dataset: 5003
2022-08-13 00:02:15,241 - INFO - root -  LMDS train_dataset: 5003


In [12]:
# Look up the tokenizer id of the "[PAD]" token (the recorded output is 2).
lm_ds.tokenizer.token_to_id("[PAD]")

2

In [13]:
def coll_fn(batch, pad_token=2, label_pad=-100):
    """Collate variable-length samples into right-padded NumPy arrays.

    Args:
        batch: non-empty list of samples. Each sample is either
            ``(text, label)`` or ``(text, label, incremental_index)``; the
            layout of the first sample decides how the whole batch is read.
        pad_token: id used to pad the input sequences. The default (2) is the
            id the tokenizer reports for "[PAD]" in this notebook.
        label_pad: value used to pad the label sequences. The default (-100)
            is the default ``ignore_index`` of ``F.cross_entropy``, so padded
            positions do not contribute to the loss.

    Returns:
        ``(inputs, labels)`` or ``(inputs, labels, incremental_indexes)`` as
        NumPy arrays; inputs and labels have shape [batch, longest_sequence].
    """
    if len(batch[0]) == 3:
        token_seqs, label_seqs, incremental_indexes = zip(*batch)
    else:
        token_seqs, label_seqs = zip(*batch)
        incremental_indexes = ()

    # Build the tensors on the CPU so that .numpy() below is always valid.
    texts = [torch.tensor(seq, device="cpu") for seq in token_seqs]
    labels = [torch.tensor(seq, device="cpu") for seq in label_seqs]

    ins = nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=pad_token)
    labels = nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=label_pad)

    if len(incremental_indexes) > 0:
        return ins.numpy(), labels.numpy(), np.array(incremental_indexes)
    return ins.numpy(), labels.numpy()

In [14]:
# Training batches: shuffled, loaded in the main process (num_workers=0) and
# collated by coll_fn, which pads each batch and returns NumPy arrays.
train_data_loader = DataLoader(
    lm_ds.train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
    persistent_workers=False,
    collate_fn=coll_fn,
)

In [15]:
# Dev batches: fixed order, loaded in the main process (num_workers=0) and
# collated by coll_fn, which pads each batch and returns NumPy arrays.
dev_data_loader = DataLoader(
    lm_ds.dev_dataset,
    batch_size=dev_batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
    persistent_workers=False,
    collate_fn=coll_fn
)

In [16]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to a sequence-first tensor.

    Even feature indices hold ``sin(pos * w_k)`` and odd indices hold
    ``cos(pos * w_k)`` with ``w_k = exp(-2k * ln(10000) / d_model)``.
    Dropout is applied to the sum.

    Args:
        d_model: size of the embedding dimension (even or odd).
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 50000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd index than even index, so the
        # last frequency has no cosine slot; slicing keeps the shapes equal
        # (and is a no-op when d_model is even).
        pe[:, 0, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Buffer: follows the module across devices but is not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor of the same shape: ``x`` plus the encodings of positions
            ``0 .. seq_len - 1`` (broadcast over the batch axis), after dropout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [17]:
class PetraRQ(pl.LightningModule):
    """Transformer-encoder language model that reads long inputs window by window.

    The first ``seq_length`` tokens are encoded as one window. Every further
    pass drops the first ``overlapping_part`` positions of the previous
    encoder output, appends the embeddings of the next ``overlapping_part``
    tokens and runs the encoder again. The first ``overlapping_part``
    positions of every pass, plus the remainder of the last pass, are
    concatenated and projected to vocabulary logits.

    Args:
        d_model: embedding / encoder width.
        num_tokens: vocabulary size (size of the output logits).
        seq_length: number of tokens in the first window.
        overlapping_part: number of positions emitted and advanced per pass.
        depth: number of encoder layers.
        heads: attention heads per encoder layer.
        dropout: dropout probability for the positional encoding and encoder.
        steps: total number of optimizer steps of the OneCycleLR schedule.
        lr_min: learning rate handed to the optimizer constructor.
        lr_max: peak learning rate of the OneCycleLR schedule.
        optim: ``"adam"`` (AdamW) or ``"adagrad"``.
    """

    def __init__(
            self,
            d_model,
            num_tokens,
            seq_length,
            overlapping_part,
            depth,
            heads=8,
            dropout=0.1,
            steps=1000,
            lr_min=1e-4,
            lr_max=3e-3,
            optim="adam"
    ):
        super().__init__()

        self.d_model = d_model
        self.num_tokens = num_tokens
        self.seq_length = seq_length
        self.overlapping_part = overlapping_part
        self.depth = depth
        self.heads = heads
        self.dropout = dropout
        self.steps = steps
        self.lr_min = lr_min
        self.lr_max = lr_max
        self.optim = optim
        # activation and out_norm are not used by forward(); they are kept so
        # the module's attributes and state_dict keys stay unchanged.
        self.activation = nn.GELU()
        self.out_norm = nn.LayerNorm(num_tokens)
        self.memory_norm = nn.LayerNorm(d_model)

        assert (self.optim == 'adam' or self.optim == 'adagrad'), 'Optim must be set to "adam" or "adagrad"'

        self.token_emb = nn.Embedding(num_tokens, d_model)
        self.pos_emb = PositionalEncoding(d_model, dropout=dropout)

        # forward() works on [batch, seq, dim] tensors, so the encoder must be
        # batch-first; with the default (sequence-first) layout attention would
        # run across the samples of a batch instead of across tokens.
        self.former_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=heads,
            dropout=dropout,
            batch_first=True,
        )
        self.former = nn.TransformerEncoder(self.former_layer, num_layers=depth)

        self.to_logits = nn.Linear(d_model, num_tokens)

    def _embed(self, tokens):
        """Embed token ids of shape [batch, seq] and add positional encodings.

        PositionalEncoding expects a sequence-first tensor, so the embeddings
        are transposed on the way in and back on the way out.
        NOTE(review): every chunk is encoded with positions starting at 0,
        including chunks appended to later windows — confirm this is intended.
        """
        embedded = self.token_emb(tokens.to(self.device))
        return self.pos_emb(embedded.transpose(0, 1)).transpose(0, 1)

    def forward(self, x_in):
        """Compute vocabulary logits for a batch of token ids.

        Args:
            x_in: integer tensor of token ids, shape [batch, seq]; slices are
                moved to the module's device as they are consumed.

        Returns:
            Tensor of logits with ``num_tokens`` entries in the last dimension.
        """
        stride = self.overlapping_part
        total_len = x_in.shape[1]
        tail = total_len % stride

        floating_memory = None
        emitted_chunks = []  # encoder outputs, concatenated once at the end
        emitted = 0          # number of sequence positions collected so far
        x = None
        i = 0
        while i == 0 or emitted + tail < total_len:
            if i == 0:
                x_pos = self._embed(x_in[:, :self.seq_length])
            else:
                # The i-th extra chunk starts right after the tokens already
                # consumed: the first window plus (i - 1) earlier chunks.
                start = self.seq_length + (i - 1) * stride
                fresh = self._embed(x_in[:, start:start + stride])
                # Slide the window: drop the emitted head of the previous
                # encoder output and append the newly embedded tokens.
                x_pos = torch.cat((x[:, stride:, :], fresh), dim=1)

            x = self.former(x_pos)

            head = x[:, :stride, :]
            emitted_chunks.append(head)
            emitted += head.shape[1]

            # Running, layer-normalised sum of the emitted heads.
            # NOTE(review): this value is not consumed anywhere yet.
            if floating_memory is None:
                floating_memory = head
            else:
                floating_memory = self.memory_norm(floating_memory + head)
            i += 1

        # Whatever is left of the last window after its head completes the output.
        emitted_chunks.append(x[:, stride:, :])
        return self.to_logits(torch.cat(emitted_chunks, dim=1))

    def configure_optimizers(self):
        """Create the optimizer selected by ``optim`` and a per-step OneCycleLR schedule."""
        if self.optim == 'adagrad':
            optimizer = torch.optim.Adagrad(
                self.parameters(),
                lr=self.lr_min,
                weight_decay=0.01
            )
        elif self.optim == 'adam':
            optimizer = torch.optim.AdamW(
                self.parameters(),
                lr=self.lr_min,
                weight_decay=0.01,
                betas=(0.9, 0.999)
            )
        else:
            # Unreachable while the constructor assertion is active; guards
            # against an unbound `optimizer` when assertions are disabled.
            raise ValueError('Optim must be set to "adam" or "adagrad"')

        # NOTE(review): OneCycleLR derives its start and end learning rates
        # from max_lr, so lr_min above is presumably overridden once the
        # scheduler is attached — confirm against the OneCycleLR docs.
        lr_scheduler = OneCycleLR(
            optimizer,
            max_lr=self.lr_max,
            total_steps=self.steps,
            cycle_momentum=False,
        )

        return [optimizer], [{'scheduler': lr_scheduler, 'interval': 'step'}]

    def _loss_and_perplexity(self, xs, ys):
        """Run the model on one collated batch.

        Returns:
            ``(loss, perplexity, batch_size)``: token-level cross-entropy
            (targets equal to -100, the default ignore_index, are skipped),
            its exponential, and the number of samples in the batch.
        """
        # Inputs stay where they are; forward() moves each window to the device.
        x = torch.tensor(xs)
        y = torch.tensor(ys).to(self.device)

        logits = self(x)
        loss = F.cross_entropy(logits.reshape(-1, self.num_tokens), y.long().reshape(-1))
        return loss, torch.exp(loss), logits.shape[0]

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for ``(inputs, labels, incremental_indexes)``."""
        xs, ys, incremental_indexs = batch
        incremental_index = torch.tensor(incremental_indexs).to(self.device)

        loss, perplexity, batch_size = self._loss_and_perplexity(xs, ys)

        # self.log already forwards metrics to the Trainer's logger; the extra
        # direct wandb.log calls duplicated every point and required an active
        # wandb run, so they were dropped.
        self.log('train/loss', loss, prog_bar=True, batch_size=batch_size)
        self.log('padding_mask', torch.mean(incremental_index.float()), prog_bar=True, batch_size=batch_size)
        self.log('train/perplexity', perplexity, prog_bar=True, batch_size=batch_size)

        return {'loss': loss, 'perplexity': perplexity}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for ``(inputs, labels)``."""
        xs, ys = batch

        loss, perplexity, batch_size = self._loss_and_perplexity(xs, ys)

        self.log('eval/loss', loss, prog_bar=True, batch_size=batch_size)
        self.log('eval/perplexity', perplexity, prog_bar=True, batch_size=batch_size)
        return {'val_loss': loss, 'val_perplexity': perplexity}


In [18]:
# Instantiate the model from the hyperparameter cell. lr_min / lr_max are
# forwarded explicitly: when omitted, PetraRQ falls back to its own defaults
# (1e-4 / 3e-3) and the values configured above are silently ignored.
petra = PetraRQ(
    d_model=d_model,
    num_tokens=num_tokens,
    seq_length=seq_length,
    overlapping_part=overlapping_part,
    depth=depth,
    heads=heads,
    dropout=dropout,
    steps=steps,
    lr_min=lr_min,
    lr_max=lr_max,
    optim=optimizer
)

In [19]:
# Weights & Biases logger handed to the Trainer below.
# NOTE(review): log_model="all" presumably uploads checkpoints, but the
# Trainer is created with enable_checkpointing=False — confirm it has any effect.
wandb_logger = WandbLogger(
    project="PetraRQ",
    name="PetraRQ_transformer_recurrent_test",
    log_model="all"
)

2022-08-13 00:02:22,739 - ERROR - wandb.jupyter -  Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: bmarcin. Use `wandb login --relogin` to force relogin


In [20]:
# Record every hyperparameter that shapes the run in the W&B run config.
# The learning-rate bounds are read back from the model so the logged values
# are the ones actually in effect.
run_config = {
    'batch_size': train_batch_size,
    'dev_batch_size': dev_batch_size,
    'steps': steps,
    'd_model': d_model,
    'num_tokens': num_tokens,
    'seq_length': seq_length,
    'overlapping_part': overlapping_part,
    'depth': depth,
    'heads': heads,
    'dropout': dropout,
    'optimizer': optimizer,
    'lr_min': petra.lr_min,
    'lr_max': petra.lr_max,
    'accumulate_grad_batches': accumulate_grad_batches,
    'duplicate_dataset_ratio': duplicate_dataset_ratio,
    'inputs_masking': inputs_masking,
}
for config_key, config_value in run_config.items():
    wandb_logger.experiment.config[config_key] = config_value

In [21]:
# Single-GPU trainer: stops after `steps` optimizer steps, validates every 5%
# of a training epoch, clips gradients at 0.5 and logs to Weights & Biases.
trainer = pl.Trainer(
    devices=1,
    max_steps=steps,
    log_every_n_steps=5,
    accelerator='gpu',
    accumulate_grad_batches=accumulate_grad_batches,
    val_check_interval=0.05,
    # val_check_interval=300,
    default_root_dir='./PetraRQmodel',
    enable_checkpointing=False,
    callbacks=[
        # ModelCheckpoint(
        #     dirpath='./PetraRQmodel/checkpoints',
        #     save_top_k=3,
        #     monitor='eval/loss',
        #     mode='min',
        #     filename='petrarq-{epoch}-{val_loss:.2f}.ckpt'
        # ),
        # Early stopping watches the validation loss logged by
        # PetraRQ.validation_step. 'train/loss' is logged per training batch,
        # so monitoring it would compare single noisy batch values and could
        # end the run after a handful of unlucky batches.
        EarlyStopping(
            monitor='eval/loss',
            mode='min',
            patience=6,
            check_finite=True,
        ),
        LearningRateMonitor(logging_interval='step'),
        TQDMProgressBar(refresh_rate=1),
        # StochasticWeightAveraging(swa_lrs=1e-3)
    ],
    logger=wandb_logger,
    reload_dataloaders_every_n_epochs=0,
    gradient_clip_val=0.5
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [22]:
# Run one validation pass over the dev set with the untrained model, before fitting.
# NOTE(review): the output saved with this cell ends in a CUDA out-of-memory error.
trainer.validate(petra, dev_data_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Validation: 0it [00:00, ?it/s]

2022-08-13 00:02:38,789 - INFO - root -  Incrementing epochs done to 1


Incrementing epochs done to 1


2022-08-13 00:02:55,688 - INFO - root -  Incrementing epochs done to 2


Incrementing epochs done to 2


2022-08-13 00:03:20,420 - INFO - root -  Incrementing epochs done to 3


Incrementing epochs done to 3


2022-08-13 00:04:00,722 - INFO - root -  Incrementing epochs done to 4


Incrementing epochs done to 4


2022-08-13 00:05:15,457 - INFO - root -  Incrementing epochs done to 5


Incrementing epochs done to 5


2022-08-13 00:07:16,425 - INFO - root -  Incrementing epochs done to 6


Incrementing epochs done to 6


RuntimeError: CUDA out of memory. Tried to allocate 3.20 GiB (GPU 0; 8.00 GiB total capacity; 3.44 GiB already allocated; 2.93 GiB free; 3.46 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Train on the training loader, validating on the dev loader at the interval
# configured on the Trainer. This cell has no recorded execution (In [None]).
trainer.fit(petra, train_data_loader, dev_data_loader)