# prepare

In [1]:
# imports
import sys
sys.path.append("..")
import os
import argparse
import json

import torch
import pytorch_lightning as pl
import torchmetrics
import transformers

from models import T5_Model
from t5_utils import Toloka_DS, Collator


# Seed the RNGs through Lightning's public entry point. The
# `pl.utilities.seed.seed_everything` path is deprecated in newer Lightning releases,
# while `pl.seed_everything` is the supported top-level API. The call returns the seed,
# which is why the cell displays 42.
pl.seed_everything(42)

  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 42


42

In [2]:
# proxy: point the http/https/ftp proxy environment variables at the same proxy URL
proxy_url = "http://proxy.ad.speechpro.com:3128"
for scheme in ("http", "https", "ftp"):
    os.environ[f"{scheme}_proxy"] = proxy_url

In [3]:
# T5 config: hyperparameters consumed by the cells below
batch_size: int = 32  # batch size of every DataLoader
epochs: int = 15  # Trainer max_epochs; also scales scheduler_len
num_warmup_steps: int = 1_000  # forwarded to T5_Model
lr: float = 5e-5  # forwarded to T5_Model
# experiment config


# pretrained model

In [4]:
# tokenizer: pretrained ruT5 tokenizer (truncates on the left, pads on the right),
# extended with the marker tokens listed below
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "sberbank-ai/ruT5-base",
    truncation_side='left',
    padding_side='right',
)

extra_tokens = [
    "[Model]", "[User]",
    "[MaleG]", "[FemaleG]", "[UnknownG]",
    "[ModelGK]", "[UserGK]", "[WorldGK]",
    "|DialogContext|:", "|DialogAnswer|:", "|DialogModelGK|:", "|DialogCrossEnc|:",
]
special_tokens_dict = dict(additional_special_tokens=extra_tokens)

# Last expression of the cell: its return value is shown as the cell output.
tokenizer.add_special_tokens(special_tokens_dict)

12

In [5]:
# T5 model: load the pretrained checkpoint, then resize its token embeddings to the
# tokenizer's current length (the tokenizer had special tokens added after loading)
pretrained_name = "sberbank-ai/ruT5-base"
t5 = transformers.T5ForConditionalGeneration.from_pretrained(pretrained_name)
vocab_size = len(tokenizer)
t5.resize_token_embeddings(vocab_size)

Embedding(32112, 768)

# data

In [6]:
# dataset: two views ('answer' and 'one_gk') over the same train / test files
TOLOKA_TRAIN_PATH = '/home/stc/persona/data/TlkPersonaChatRus/TolokaPersonaChat_gk(train).jsonl'
TOLOKA_VAL_PATH = '/home/stc/persona/data/TlkPersonaChatRus/TolokaPersonaChat_gk(test).jsonl'  # the "(test)" file backs the val_* datasets

# NOTE(review): `exaples` is spelled this way at every call site; presumably it matches
# the Toloka_DS signature in t5_utils — confirm before renaming.
train_answer_dataset = Toloka_DS(TOLOKA_TRAIN_PATH, exaples='answer', context_len='all')
val_answer_dataset = Toloka_DS(TOLOKA_VAL_PATH, exaples='answer', context_len='all')

train_gk_dataset = Toloka_DS(TOLOKA_TRAIN_PATH, exaples='one_gk', context_len='all')
val_gk_dataset = Toloka_DS(TOLOKA_VAL_PATH, exaples='one_gk', context_len='all')

In [7]:
# collator: batching helper, given the tokenizer and the list of added special tokens
# NOTE(review): `qury_len` is presumably the keyword spelling Collator expects — confirm in t5_utils.
collator_kwargs = dict(
    spectokens=special_tokens_dict["additional_special_tokens"],
    tokenizer=tokenizer,
    padding='max_length',
    qury_len=64,
    cand_len=32,
)
collator = Collator(**collator_kwargs)

In [8]:
# dataloader: one loader per dataset, all built with identical settings
def _make_loader(dataset):
    """Wrap ``dataset`` in an unshuffled DataLoader collated by ``collator.BiEncoder``."""
    # NOTE(review): shuffle=False is applied to the training loaders as well — confirm
    # that keeping the training data in file order is intentional.
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collator.BiEncoder,
    )


train_answer_dataloader = _make_loader(train_answer_dataset)
val_answer_dataloader = _make_loader(val_answer_dataset)

train_gk_dataloader = _make_loader(train_gk_dataset)
val_gk_dataloader = _make_loader(val_gk_dataset)

In [9]:
# scheduler len: batch count of the longer train loader, multiplied by the number of epochs
longest_train_loader = max(len(train_answer_dataloader), len(train_gk_dataloader))
scheduler_len = longest_train_loader * epochs

In [10]:
# Batches per epoch in each train loader, displayed as (answer, gk).
tuple(len(loader) for loader in (train_answer_dataloader, train_gk_dataloader))

(2237, 861)

# PyTorch Lightning training loop

In [11]:
# pl model: the project's T5_Model receives the network, tokenizer, all four
# dataloaders and the optimisation settings
model = T5_Model(
    # network and tokenizer
    T5=t5,
    tokenizer=tokenizer,
    # answer loaders
    train_answer_dataloader=train_answer_dataloader,
    val_answer_dataloader=val_answer_dataloader,
    # gk loaders
    train_gk_dataloader=train_gk_dataloader,
    val_gk_dataloader=val_gk_dataloader,
    # optimisation settings
    lr=lr,
    num_warmup_steps=num_warmup_steps,
    scheduler_len=scheduler_len,
)

In [12]:
# logger: Comet experiment tracking for this run.
# The API key is a credential, so it is read from the COMET_API_KEY environment
# variable instead of being stored in the notebook. The key that was previously
# hardcoded here should be treated as leaked and rotated.
# NOTE(review): when COMET_API_KEY is unset, None is passed and CometLogger applies its
# own fallback (config-file lookup / offline mode) — confirm against the installed
# Lightning version.
logger = pl.loggers.comet.CometLogger(
    api_key=os.environ.get("COMET_API_KEY"),
    save_dir='/home/stc/persona/logs/t5',
    project_name='T5',
    experiment_name='test',
)
# TODO: log this run's hyperparameters (batch_size, epochs, num_warmup_steps, lr) with
# logger.log_hyperparams; the earlier commented-out call referenced an undefined `gpt_args`.

CometLogger will be initialized in online mode


In [13]:
# # checkpoint callback
# checkpoint_callback = pl.callbacks.ModelCheckpoint(
#      monitor='val_loss',
#      dirpath=gpt_args.save_dir,
#      filename='gpt-{epoch:02d}-{val_loss:.2f}',
#      save_top_k=1,
#      mode='min',
#  )

In [14]:
# trainer: single-GPU run with gradient clipping and Comet logging
trainer_options = dict(
    max_epochs=epochs,
    accelerator="gpu",
    devices=1,
    gradient_clip_val=1,
    logger=logger,
    num_sanity_val_steps=10,
    # No callbacks are passed: the checkpoint callback is still commented out,
    # so this run does not use it.
)
trainer = pl.Trainer(**trainer_options)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [15]:
# fit 12.77
# NOTE(review): the recorded run of this cell stopped early in epoch 0 with
# "CUDA out of memory" on a GPU with 23.69 GiB total capacity; investigate the memory
# use before relying on this cell in Restart & Run All.
trainer.fit(model=model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                       | Type                       | Params
--------------------------------------------------------------------------
0 | T5                         | T5ForConditionalGeneration | 222 M 
1 | BiEnc_answer_loss          | CrossEntropyLoss           | 0     
2 | BiEnc_gk_loss              | CrossEntropyLoss           | 0     
3 | train_BiEnc_answer_metrics | MetricCollection           | 0     
4 | train_BiEnc_gk_metrics     | MetricCollection           | 0     
5 | val_BiEnc_answer_metrics   | MetricCollection           | 0     
6 | val_BiEnc_gk_metrics       | MetricCollection           | 0     
7 | gen_metrics                | MetricCollection           | 0     
--------------------------------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.565   Total estimated model params size (MB)
COMET INFO: Experiment is live on comet.com https://www.

Sanity Checking DataLoader 0:   0%|          | 0/10 [00:00<?, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


                                                                             

  rank_zero_warn(


Epoch 0:   0%|          | 2/2576 [00:01<26:16,  1.63it/s, loss=4.46, v_num=af2d, lr=5e-8, train_loss_step=4.500, train_BiEnc_answer_loss_step=4.450] 

RuntimeError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 23.69 GiB total capacity; 22.60 GiB already allocated; 2.56 MiB free; 22.64 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF