# Обучение модели
## Полезные ссылки:
 - все что вы хотите знать о трансформерах https://huggingface.co/docs/transformers/index
 - база моделей https://huggingface.co/models
 - пример страницы документации модели (BERT) https://huggingface.co/docs/transformers/v4.33.0/en/model_doc/bert#overview
 - быстрый старт в pytorch lightning https://lightning.ai/docs/pytorch/stable//starter/introduction.html
 - настройки обучения https://lightning.ai/docs/pytorch/stable//common/trainer.html
 - использование pytorch lightning с torchmetrics https://torchmetrics.readthedocs.io/en/stable/pages/lightning.html
 - описание работы с данными в torch https://pytorch.org/docs/stable/data.html
 - логгирование https://www.comet.com/

In [1]:
import os

import numpy as np
import pandas as pd
import torch
import pytorch_lightning as pl
import torchmetrics
import datasets
import transformers

from model import CrossEncoderModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = datasets.load_from_disk("/mnt/cs/nlu/home/posokhov/new_merionum")

In [3]:
model_name_or_path = "cointegrated/rubert-tiny"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name_or_path)
encoder_model = transformers.AutoModel.from_pretrained(model_name_or_path)

tokenizer.add_special_tokens({"additional_special_tokens": ["[SEP_TOKEN]"]})
encoder_model.resize_token_embeddings(len(tokenizer))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 29565. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(29565, 312)

In [4]:
class Colator:
    def __init__(self, tokenizer) -> None:
        self.tokenizer = tokenizer

    def __call__(self, batch):
        batch = torch.utils.data.default_collate(batch)
        batch['pair'] = self.tokenizer(batch['pair'], max_length=True, return_tensors="pt", padding=True, truncation=True)
        return batch
    
colator = Colator(tokenizer)

train_data_loader = torch.utils.data.DataLoader(ds['train'], batch_size=32, collate_fn=colator)
val_data_loader = torch.utils.data.DataLoader(ds['train'], batch_size=32, collate_fn=colator)
test_data_loader = torch.utils.data.DataLoader(ds['train'], batch_size=32, collate_fn=colator)

In [6]:
cross_encoder = CrossEncoderModel(encoder=encoder_model, num_classes=2, num_warmup_steps=100, lr=5e-05)

  rank_zero_warn(


In [8]:
# logger
logger = pl.loggers.comet.CometLogger(
    api_key="ds5hsfdfh234ddg12s",
    save_dir="outdir",
    project_name="cross_encoder_example",
    experiment_name="exp_1",
)
logger.log_hyperparams({"model_name_or_path": model_name_or_path, "lr":cross_encoder.hparams.lr})

# checkpoint callback
checkpoint_callback = pl.callbacks.ModelCheckpoint(
     monitor='val_loss',
     dirpath="outdir",
     filename='rubert-tiny2-{epoch:02d}-{val_r1:.2f}',
     save_top_k=1,
     mode='max',
 )

CometLogger will be initialized in online mode


[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/mnt/cs/home/posokhov/projects/example' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/anpopaicoconat/cross-encoder-example/4725559f8a2842d3ab76c7fe74b655f1



In [9]:
# trainer
trainer = pl.Trainer(
    max_epochs=5,
    accelerator="gpu",
    devices=1,
    logger=logger,
    num_sanity_val_steps=1,
    callbacks=[checkpoint_callback]
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
trainer.fit(model=cross_encoder, train_dataloaders=train_data_loader, val_dataloaders=val_data_loader)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loading `train_dataloader` to estimate number of stepping batches.
  rank_zero_warn(

  | Name          | Type             | Params
---------------------------------------------------
0 | encoder       | BertModel        | 11.8 M
1 | classificator | Linear           | 626   
2 | loss          | CrossEntropyLoss | 0     
3 | train_metrics | MetricCollection | 0     
4 | val_metrics   | MetricCollection | 0     
5 | test_metrics  | MetricCollection | 0     
---------------------------------------------------
11.8 M    Trainable params
0         Non-trainable params
11.8 M    Total params
47.140    Total estimated model params size (MB)
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(29565, 312)
    (position_embeddings): Embedding(512, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=T

Sanity Checking DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s]

  rank_zero_warn(


Epoch 4: 100%|██████████| 255/255 [00:07<00:00, 36.09it/s, v_num=55f1]     

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 255/255 [00:07<00:00, 36.08it/s, v_num=55f1]


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/anpopaicoconat/cross-encoder-example/4725559f8a2842d3ab76c7fe74b655f1
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     train_acc [5]        : (0.5681706666946411, 0.6663805842399597)
[1;38;5;39mCOMET INFO:[0m     train_f1 [5]         : (0.5681706666946411, 0.6663805842399597)
[1;38;5;39mCOMET INFO:[0m     train_loss_epoch [5] : (0.6042875051498413, 0.6809026598930359)
[1;38;5;39mCOMET INFO:[0m     train_loss_step [25] : (0.48392724990844727, 0.7683601379394531)
[1;38;5;39mCOMET INFO:[0m

In [11]:
trainer.test(dataloaders=[test_data_loader])

  rank_zero_warn(


[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/mnt/cs/home/posokhov/projects/example' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/anpopaicoconat/cross-encoder-example/4725559f8a2842d3ab76c7fe74b655f1

Restoring states from the checkpoint path at /mnt/cs/home/posokhov/projects/example/outdir/rubert-tiny2-epoch=00-val_r1=0.00-v2.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(29565, 312)
    (position_embeddings): Embedding(512, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-2): 3 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_f

Testing DataLoader 0: 100%|██████████| 255/255 [00:02<00:00, 104.66it/s]


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml ExistingExperiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/anpopaicoconat/cross-encoder-example/4725559f8a2842d3ab76c7fe74b655f1
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     test_acc : 0.592815101146698
[1;38;5;39mCOMET INFO:[0m     test_f1  : 0.592815101146698
[1;38;5;39mCOMET INFO:[0m   Others:
[1;38;5;39mCOMET INFO:[0m     Created from : pytorch-lightning
[1;38;5;39mCOMET INFO:[0m     Name         : exp_1
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     encoder          : BertModel(
  (embeddings): BertEmbeddings(
    (word

[{'test_acc': 0.592815101146698, 'test_f1': 0.592815101146698}]