In [1]:
class CFG:
    dataset = 'Fragments_markup.csv'
    n_tags=100
    train_path = None
    val_path = None
    test_path = None
    model_name = 'DeepPavlov/rubert-base-cased'
    embed_size = 768
    seed = 42
    
    lr = 1e-5
    num_warmup_steps = 100
    num_training_steps = 10000
    epochs = 10
    batch_size = 4
    n_dev = 1
    dev = 'gpu'

    checkpoint_dir = './checkpoints'
    log_dir = './logs'
    exp_name = f'DPrubert_lr={lr}_bs={batch_size}_random_split'
    model_path = f'DPrubert_lr={lr}_bs={batch_size}_random_split'
    validate_every_n = 25

# LIBS

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.optim import lr_scheduler, Adam, SGD
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModel, AdamW, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger, TensorBoardLogger

import gc
from tqdm import tqdm

from mylib.train import LitModel, SequenceLabelModel, calc_f1
from mylib.dataset import SequenceLabelDataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def set_seed(seed):
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available(): # для GPU отдельный seed
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(CFG.seed)
# есть стохастические операции на GPU
# сделаем их детерминированными для воспроизводимости
torch.backends.cudnn.determinstic = True
torch.backends.cudnn.benchmark = False

# DATASET

In [4]:
if (CFG.train_path is None) and (CFG.val_path is None) and (CFG.test_path is None):
    df = pd.read_csv(CFG.dataset)
    documents = df['document_id'].unique()
    all_markup = pd.DataFrame(columns=['text', 'markup', 'tags'])

    for d in documents:
      rows = df[df['document_id'] == d]
      markup = [
          {
             'id': r['id'], 'start': r['start_position'], 
             'end': r['end_position'], 
             'tag': r['tag_id']
             }
          for _, r in rows.iterrows()
      ]
      all_tags = [m['tag'] for m in markup]

      all_markup.loc[d, 'text'] = rows.text.iloc[0]
      all_markup.loc[d, 'markup'] = markup
      all_markup.loc[d, 'tags'] = all_tags

    train_df, vt_df = train_test_split(all_markup, test_size=0.4, random_state=CFG.seed)
    val_df, test_df = train_test_split(vt_df, test_size=0.5, random_state=CFG.seed)
else:
    train_df = pd.read_csv(CFG.train_path)
    val_df = pd.read_csv(CFG.val_path)
    test_df = pd.read_csv(CFG.test_path)

In [5]:
train_counter = {}
for i in train_df.tags:
    for c in i:
        train_counter[c] = train_counter.get(c, 0) + 1
val_counter = {}
for i in val_df.tags:
    for c in i:
        val_counter[c] = val_counter.get(c, 0) + 1
test_counter = {}
for i in test_df.tags:
    for c in i:
        test_counter[c] = test_counter.get(c, 0) + 1

In [6]:
len(train_counter.keys()), len(val_counter.keys()), len(test_counter.keys())

(97, 73, 80)

In [7]:
len(set(train_counter.keys()).intersection(val_counter.keys()))

72

In [8]:
len(set(train_counter.keys()).intersection(test_counter.keys()))

77

In [9]:
len(set(train_counter.keys()).union(val_counter.keys()).union(test_counter.keys()))

100

In [10]:
tags = set(train_counter.keys()).union(val_counter.keys()).union(test_counter.keys())
short_tags = {t: i for i, t in enumerate(list(tags))}

In [11]:
for i in train_df.index:
    mu = train_df.loc[i, 'markup']
    for j in range(len(mu)):
        mu[j]['tag'] = short_tags[mu[j]['tag']]
    train_df.loc[i, 'markup'] = mu

for i in val_df.index:
    mu = val_df.loc[i, 'markup']
    for j in range(len(mu)):
        mu[j]['tag'] = short_tags[mu[j]['tag']]
    val_df.loc[i, 'markup'] = mu

for i in test_df.index:
    mu = test_df.loc[i, 'markup']
    for j in range(len(mu)):
        mu[j]['tag'] = short_tags[mu[j]['tag']]
    test_df.loc[i, 'markup'] = mu

In [12]:
len(train_df), len(val_df), len(test_df)

(313, 104, 105)

# EXPS

In [13]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
embedder = AutoModel.from_pretrained(CFG.model_name)
model = SequenceLabelModel(embedder, embed_size=CFG.embed_size, n_tags=CFG.n_tags)
loss = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=CFG.lr)
scheduler = None
# get_linear_schedule_with_warmup(optimizer, num_warmup_steps=CFG.num_warmup_steps, num_training_steps=CFG.num_training_steps)
#  get_cosine_schedule_with_warmup

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
checkpoint_callback = ModelCheckpoint(
    mode="max",
    filename=CFG.model_path,
    dirpath=CFG.checkpoint_dir,
    save_top_k=1, monitor="f1_micro/Valid"
    )
logger = TensorBoardLogger(
    save_dir=CFG.log_dir,
    name=CFG.exp_name,
    )

In [15]:
train_ds = SequenceLabelDataset(df=train_df, tokenizer=tokenizer, n_tags=CFG.n_tags)
val_ds = SequenceLabelDataset(df=val_df, tokenizer=tokenizer, n_tags=CFG.n_tags)
test_ds = SequenceLabelDataset(df=test_df, tokenizer=tokenizer, n_tags=CFG.n_tags)

In [16]:
train_batch_size = CFG.batch_size
test_batch_size = CFG.batch_size
train_dataloader = DataLoader(train_ds, batch_size=train_batch_size, shuffle=True, drop_last=False)
val_dataloader = DataLoader(val_ds, batch_size=test_batch_size, shuffle=False, drop_last=False)
test_dataloader = DataLoader(test_ds, batch_size=test_batch_size, shuffle=False, drop_last=False)

In [17]:
def calc_tagless_f1(y_hat, y):
    y_hat = torch.any(y_hat, keepdim=True, dim=1)
    y = torch.any(y, keepdim=True, dim=1)
    score = calc_f1(y_hat, y)
    return {
        'tagless_f1_micro': score['f1_micro'], 
        'tagless_recall': score['recall'], 
        'tagless_precision': score['precision']
    }

In [18]:
trainer = pl.Trainer(
    logger=logger,
    max_epochs=CFG.epochs,
    devices=CFG.n_dev, accelerator=CFG.dev,
    callbacks=[checkpoint_callback],
    val_check_interval=CFG.validate_every_n
    )

clf_model = LitModel(model, loss, optimizer, scheduler, metric_functions=[calc_f1, calc_tagless_f1])

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [19]:
trainer.fit(
    clf_model,
    train_dataloader,
    val_dataloader
    )

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/admin/.pyenv/versions/3.10.10/envs/learner/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:630: Checkpoint directory /home/admin/lab/manipulation_detection/code/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type               | Params
---------------------------------------------
0 | model | SequenceLabelModel | 177 M 
1 | loss  | BCEWithLogitsLoss  | 0     
---------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
711.721   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/home/admin/.pyenv/versions/3.10.10/envs/learner/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=63` in the `DataLoader` to improve performance.


                                                                           

/home/admin/.pyenv/versions/3.10.10/envs/learner/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=63` in the `DataLoader` to improve performance.


Epoch 0:   0%|          | 0/79 [00:00<?, ?it/s] 

OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacty of 23.69 GiB of which 25.19 MiB is free. Process 294723 has 5.92 GiB memory in use. Process 295105 has 5.92 GiB memory in use. Process 296620 has 3.50 GiB memory in use. Process 296940 has 320.00 MiB memory in use. Process 297290 has 5.92 GiB memory in use. Process 297801 has 2.09 GiB memory in use. Of the allocated memory 1.73 GiB is allocated by PyTorch, and 57.80 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
trainer.test(clf_model, test_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/admin/.pyenv/versions/3.10.10/envs/learner/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=63` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 27/27 [00:00<00:00, 30.20it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     f1_micro/Valid        0.006813447456806898
     precision/Valid       0.003463058266788721
      recall/Valid          0.2094229906797409
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'f1_micro/Valid': 0.006813447456806898,
  'recall/Valid': 0.2094229906797409,
  'precision/Valid': 0.003463058266788721}]