In [1]:
import sys
sys.path.append("../../")

In [2]:
import logging
import math

import torch
import wandb
from torch import nn

import pytorch_lightning as pl
import torch.nn.functional as F
from torch.optim.lr_scheduler import OneCycleLR, CyclicLR

import numpy as np
import pandas as pd

In [3]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import torch

In [4]:
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor, TQDMProgressBar, StochasticWeightAveraging
from pytorch_lightning.loggers import WandbLogger

import logging

import pytorch_lightning as pl
from torch.utils.data import DataLoader

# from src.PetraRQ.PetraRQDatasets import LanguageModellingDataset

In [5]:
from pytorch_lightning import seed_everything

In [6]:
seed_everything(1, workers=True)

Global seed set to 1


1

In [7]:
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -  %(message)s',
    level=logging.INFO
)

In [8]:
data_train = pd.read_csv("../../data/train/processed.tsv", delimiter='\t', header=None, encoding="utf8", quoting=0)
data_dev = pd.read_csv("../../data/dev/processed.tsv", delimiter='\t', header=None, encoding="utf8", quoting=0)
labels_train = pd.read_csv("../../data/train/expected.tsv", delimiter='\t', header=None, encoding="utf8", quoting=0)
labels_dev = pd.read_csv("../../data/dev/expected.tsv", delimiter='\t', header=None, encoding="utf8", quoting=0)

In [9]:
unique_labels_tsv = pd.read_csv("../../data/labels.tsv", delimiter='\t', header=None, encoding="utf8", quoting=0)
unique_labels = unique_labels_tsv[0].tolist()

In [10]:
train_batch_size = 2
dev_batch_size = 6
shuffle = False
steps = 4000
d_model = 512
num_tokens = 8000
seq_length = 512
overlapping_part = 256
depth = 4
# k = 256
heads = 8
# dim_head = None
# one_kv_head = False
# share_kv = True
dropout = 0.1
optimizer = "adagrad"
lr_min=1e-5
lr_max=3e-4
accumulate_grad_batches=6
# duplicate_dataset_ratio=1
# inputs_masking = 0.15

In [11]:
import torch.utils.data
from tqdm.auto import tqdm


class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, input_texts, input_labels, unique_labels, tokenizer):
        self.input_texts = input_texts
        self.input_labels = input_labels
        self.unique_labels = unique_labels
        self.tokenizer = tokenizer
        self.label2idx = {}

        # self.bp = BasicProcessor()

        for label in self.unique_labels:
            self.label2idx[label] = len(self.label2idx)

        print(self.label2idx)

#         self.tokenized_texts = []


#         for idx in tqdm(range(len(self.input_texts)), desc='Tokenizing texts'):
#             tokenized = self.tokenizer.encode(self.input_texts.iloc[idx][0])
#             self.tokenized_texts.append(tokenized)

    def __len__(self):
        return len(self.input_texts)

    def labels2tensor(self, labels):
        return set([self.label2idx[label.strip().lower()] for label in labels])

    def tensor2labels(self, tensor):
        labels = []
        for idx, label in enumerate(self.unique_labels):
            if tensor[idx] == 1:
                labels.append(label)
        return labels

    def __getitem__(self, idx):
        tokenized = self.tokenizer.encode(self.input_texts.iloc[idx][0])
        labels = self.labels2tensor(self.input_labels.iloc[idx][0].split(' '))

        # item = {
        #     'input_ids': torch.tensor(tokenized['input_ids']),
        #     'attention_mask': torch.tensor(tokenized['attention_mask']),
        #     'labels': torch.zeros([len(self.label2idx)]).index_fill_(0, torch.tensor(list(labels)), 1)
        # }

        return np.array(tokenized.ids), torch.zeros([len(self.label2idx)]).index_fill_(0, torch.tensor(list(labels)), 1).numpy()


In [12]:
from tokenizers import Tokenizer
tokenizer = Tokenizer.from_file("./tokenizer_{}.json".format("3ded16b3953922cceaa90923883a7147"))

In [13]:
dev_ds = ClassificationDataset(data_dev, labels_dev, unique_labels, tokenizer)
train_ds = ClassificationDataset(data_train, labels_train, unique_labels, tokenizer)

{'health': 0, 'media_informations': 1, 'transportation': 2, 'industry': 3, 'work_and_employment': 4, 'agriculture': 5, 'social_life': 6, 'internal_security': 7, 'economy': 8, 'environment': 9, 'european_union': 10, 'energy': 11, 'foreign_policy': 12, 'education': 13, 'law': 14, 'politics_political_parties': 15, 'sports': 16, 'research_science_and_technology': 17, 'taxes': 18}
{'health': 0, 'media_informations': 1, 'transportation': 2, 'industry': 3, 'work_and_employment': 4, 'agriculture': 5, 'social_life': 6, 'internal_security': 7, 'economy': 8, 'environment': 9, 'european_union': 10, 'energy': 11, 'foreign_policy': 12, 'education': 13, 'law': 14, 'politics_political_parties': 15, 'sports': 16, 'research_science_and_technology': 17, 'taxes': 18}


In [14]:
def coll_fn(batch):
    texts = []
    labels = []
    incremental_indexes = []
    
    pad_token = 2
    
    for (text, label) in batch:
        texts.append(torch.tensor(text).to("cpu"))
        labels.append(label)

    ins = nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=pad_token)
    seq_len = ins.shape[1]
    
    rest = seq_len % seq_length
    # print(seq_len, 256 - rest)
    if rest != 0:
        fill_matrix = torch.zeros((ins.shape[0], seq_length - rest)).fill_(pad_token).long()
        ins = torch.cat((ins, fill_matrix), dim=1)

    return ins.numpy(), np.array(labels)

In [15]:
train_data_loader = DataLoader(
    train_ds,
    batch_size=train_batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
    persistent_workers=False,
    collate_fn=coll_fn,
)

In [16]:
dev_data_loader = DataLoader(
    dev_ds,
    batch_size=dev_batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
    persistent_workers=False,
    collate_fn=coll_fn
)

In [19]:
for (x, y) in dev_data_loader:
    print(x, y, x.shape, y.shape)
    break

[[ 644 4129   32 ...    2    2    2]
 [5667 5677 1324 ...    2    2    2]
 [5667 5677 1324 ...    2    2    2]
 [5667 5677 1324 ...    2    2    2]
 [5667 5677 1324 ...    2    2    2]
 [5667 5677 1324 ...    2    2    2]] [[0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]] (6, 4096) (6, 19)


In [20]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(preds, labels):
    # print(preds.shape, preds)
    preds = (preds >= 0.5).astype(int)#.argmax(-1)

    # print(preds.shape, labels.shape)
    # print(preds, labels)

    # try:
    acc = accuracy_score(preds, labels)
    # except ValueError:

    return {
        'accuracy': acc,
        'f1': f1_score(y_true=labels, y_pred=preds, average='weighted'),
        'precision': precision_score(y_true=labels, y_pred=preds, average='weighted'),
        'recall': recall_score(y_true=labels, y_pred=preds, average='weighted')
    }

In [21]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 50000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [24]:
class PetraRQ(pl.LightningModule):
    def __init__(
            self,
            d_model,
            num_labels,
            seq_length,
            overlapping_part,
            depth,
            heads=8,
            dropout=0.1,
            steps=1000,
            lr_min=1e-4,
            lr_max=3e-3,
            optim="adam"
    ):
        super(PetraRQ, self).__init__()

        self.d_model = d_model
        self.num_labels = num_labels
        self.seq_length = seq_length
        self.overlapping_part = overlapping_part
        self.depth = depth
        self.heads = heads
        self.dropout = dropout
        self.steps = steps
        self.lr_min = lr_min
        self.lr_max = lr_max
        self.optim = optim
        self.overlapping_part = overlapping_part
        self.activation = nn.GELU()
        self.out_norm = nn.LayerNorm(d_model)
        self.memory_norm = nn.LayerNorm(d_model)
        self.sigm = nn.Sigmoid()

        assert (self.optim == 'adam' or self.optim == 'adagrad'), 'Optim must be set to "adam" or "adagrad"'

        self.token_emb = nn.Embedding(num_tokens, d_model)
        self.pos_emb = PositionalEncoding(d_model)
        
        self.former_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=heads)
        self.former = nn.TransformerEncoder(self.former_layer, num_layers=depth)
        
        self.to_logits = nn.Linear(d_model, num_labels)


    def forward(self, x_in):
        floating_memory = None
        output_hidden_layers = None
        floating_mems = None
        
        # print('x_in', x_in.shape)
        
        i = 0
        while ((output_hidden_layers is None) or (output_hidden_layers + (x_in.shape[1] % self.overlapping_part) < x_in.shape[1])):
        
            if floating_memory is None:
                x = self.token_emb(x_in[:, :self.seq_length].to(self.device))
                x_pos = self.pos_emb(x)
            else:
                part = self.seq_length / self.overlapping_part
                toks = x_in[:, int((self.overlapping_part * (i+part-1))):int((self.overlapping_part * (i+part)))].to(self.device)
                # print('from', int((self.overlapping_part * (i+part-1))), 'to', int((self.overlapping_part * (i+part))))
                embeds = self.token_emb(toks)
                x_pos = self.pos_emb(embeds)
                x_pos = torch.cat((x[:, self.overlapping_part:, :], x_pos), dim=1)
                
            # print('x_pos', x_pos.shape)
            # if output_hidden_layers is not None:
                # print('dzialanie', output_hidden_layers + (x_in.shape[1] % self.overlapping_part), x_in.shape)
            x = self.former(x_pos)
            
            if floating_memory is None:
                floating_memory = x[:, :self.overlapping_part, :]
                output_hidden_layers = overlapping_part
                # output_hidden_layers = x[:, :self.overlapping_part, :]
                # floating_mems = x_pos[:, :self.overlapping_part, :]
            else:
                add = floating_memory + x[:, :self.overlapping_part, :]
                floating_memory = self.memory_norm(add)
                output_hidden_layers += self.overlapping_part
                # output_hidden_layers = torch.cat((output_hidden_layers, x[:, :self.overlapping_part, :]), dim=1)
                # floating_mems = torch.cat((floating_mems, floating_memory), dim=1)
            del(x_pos)
            i += 1

        out = torch.cat((floating_memory, x[:, self.overlapping_part:, :]), dim=1)

        # print(out.shape)
        # out = out[:, 0, :]
        out = out.sum(dim=1)
        out = self.out_norm(out)
        out = self.to_logits(out)
        out = self.sigm(out)
        # out = self.out_norm(out)
        # out = torch.tanh(out)
        # out = (out >= 0.5).long()
        return out

    def configure_optimizers(self):
        if self.optim == 'adagrad':
            optimizer = torch.optim.Adagrad(
                self.parameters(),
                lr=self.lr_min,
                weight_decay=0.01
            )
        elif self.optim == 'adam':
            optimizer = torch.optim.AdamW(
                self.parameters(),
                lr=self.lr_min,
                weight_decay=0.01,
                betas=(0.9, 0.999)
            )

        lr_scheduler = OneCycleLR(
            optimizer,
            max_lr=self.lr_max,
            total_steps=self.steps,
            cycle_momentum=False,
        )

        return [optimizer], [{'scheduler': lr_scheduler, 'interval': 'step'}]
        # return optimizer

    def training_step(self, batch, batch_idx):
        xs, ys = batch
        x = torch.tensor(xs).to("cpu")
        y = torch.tensor(ys).to(self.device)
        
        x = self.forward(x)        

        loss = F.binary_cross_entropy(x, y)
        
        metrics = compute_metrics(x.detach().cpu().numpy(), y.long().detach().cpu().numpy())
        
        self.log('train/f1', metrics['f1'], prog_bar=True, batch_size=x.shape[0])
        self.log('train/accuracy', metrics['accuracy'], prog_bar=True, batch_size=x.shape[0])
        self.log('train/precision', metrics['precision'], prog_bar=True, batch_size=x.shape[0])
        self.log('train/recall', metrics['recall'], prog_bar=True, batch_size=x.shape[0])

        self.log('train/loss', loss, prog_bar=True, batch_size=x.shape[0])
        return {
            'loss': loss, 
            'train_f1': metrics['f1'],
            'train_acc': metrics['accuracy'],
            'train_precision': metrics['precision'],
            'train_recall': metrics['recall']
        }

    def validation_step(self, batch, batch_idx):
        xs, ys = batch
        x = torch.tensor(xs).to("cpu")
        y = torch.tensor(ys).to(self.device)
        
        x = self.forward(x)        

        loss = F.binary_cross_entropy(x, y)
        
        metrics = compute_metrics(x.cpu().numpy(), y.long().cpu().numpy())
        
        self.log('eval/f1', metrics['f1'], prog_bar=True, batch_size=x.shape[0])
        self.log('eval/accuracy', metrics['accuracy'], prog_bar=True, batch_size=x.shape[0])
        self.log('eval/precision', metrics['precision'], prog_bar=True, batch_size=x.shape[0])
        self.log('eval/recall', metrics['recall'], prog_bar=True, batch_size=x.shape[0])

        self.log('eval/loss', loss, prog_bar=True, batch_size=x.shape[0])
        return {
            'val_loss': loss, 
            'val_f1': metrics['f1'],
            'val_acc': metrics['accuracy'],
            'val_precision': metrics['precision'],
            'val_recall': metrics['recall']
        }


In [25]:
petra = PetraRQ(
    d_model=d_model,
    num_labels=len(unique_labels),
    seq_length=seq_length,
    overlapping_part=overlapping_part,
    depth=depth,
    heads=heads,
    dropout=dropout,
    steps=steps,
    optim=optimizer
)

In [26]:
wandb_logger = WandbLogger(
    project="PetraRQ",
    name="PetraRQ_classifier_recurrent_transfomer",
    log_model="all"
)

2022-08-16 17:56:14,242 - ERROR - wandb.jupyter -  Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: bmarcin. Use `wandb login --relogin` to force relogin


In [27]:
wandb_logger.experiment.config['batch_size'] = train_batch_size
wandb_logger.experiment.config['steps'] = steps
wandb_logger.experiment.config['d_model'] = d_model
wandb_logger.experiment.config['num_labels'] = len(unique_labels)
wandb_logger.experiment.config['seq_length'] = seq_length
wandb_logger.experiment.config['depth'] = depth
wandb_logger.experiment.config['heads'] = heads
wandb_logger.experiment.config['dropout'] = dropout
wandb_logger.experiment.config['optimizer'] = optimizer
# wandb_logger.experiment.config['duplicate_dataset_ratio'] = duplicate_dataset_ratio

In [28]:
trainer = pl.Trainer(
    devices=1,
    max_steps=steps,
    log_every_n_steps=5,
    accelerator='gpu',
    # accumulate_grad_batches=accumulate_grad_batches,
    val_check_interval=0.05,
    # val_check_interval=300,
    default_root_dir='./PetraRQmodel',
    enable_checkpointing=False,
    callbacks=[
        # ModelCheckpoint(
        #     dirpath='./PetraRQmodel/checkpoints',
        #     save_top_k=3,
        #     monitor='eval/loss',
        #     mode='min',
        #     filename='petrarq-{epoch}-{val_loss:.2f}.ckpt'
        # ),
        EarlyStopping(
            monitor='eval/f1',
            mode='min',
            patience=4,
            check_finite=True,
        ),
        LearningRateMonitor(logging_interval='step'),
        TQDMProgressBar(refresh_rate=1),
        # StochasticWeightAveraging(swa_lrs=1e-3)
    ],
    logger=wandb_logger,
    reload_dataloaders_every_n_epochs=0,
    gradient_clip_val=0.5
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [29]:
import warnings
warnings.filterwarnings('always')

In [26]:
trainer.validate(petra, dev_data_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Validation: 0it [00:00, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(averag

In [30]:
trainer.fit(petra, train_data_loader, dev_data_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type                    | Params
---------------------------------------------------------
0 | activation   | GELU                    | 0     
1 | out_norm     | LayerNorm               | 1.0 K 
2 | memory_norm  | LayerNorm               | 1.0 K 
3 | sigm         | Sigmoid                 | 0     
4 | token_emb    | Embedding               | 4.1 M 
5 | pos_emb      | PositionalEncoding      | 0     
6 | former_layer | TransformerEncoderLayer | 3.2 M 
7 | former       | TransformerEncoder      | 12.6 M
8 | to_logits    | Linear                  | 9.7 K 
---------------------------------------------------------
19.9 M    Trainable params
0         Non-trainable params
19.9 M    Total params
79.479    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.