# prepare

In [1]:
# imports
import sys
sys.path.append("..")
import os
import argparse
import json

import torch
import pytorch_lightning as pl
import torchmetrics
import transformers

from t5_model import single_multitask_model
from t5_utils import Toloka_DS, Collator, SequntiaLoader


# Seed the global RNGs through Lightning's public top-level entry point.
# The `pl.utilities.seed.seed_everything` alias was deprecated in Lightning 1.8
# and is not exported from that module in 2.x; `pl.seed_everything` works on both.
pl.seed_everything(42)

  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 42


42

In [None]:
# proxy
# Point http/https/ftp traffic of this process at the same proxy endpoint.
os.environ.update(
    {
        "http_proxy": "http://proxy.ad.speechpro.com:3128",
        "https_proxy": "http://proxy.ad.speechpro.com:3128",
        "ftp_proxy": "http://proxy.ad.speechpro.com:3128",
    }
)

: 

In [None]:
# T5 config
# Optimisation settings: `lr` and `num_warmup_steps` are handed to the model
# constructor, `epochs` to the Trainer and the scheduler length.
lr = 5e-5
num_warmup_steps = 1000
epochs = 15
# Batch size shared by every DataLoader built in this notebook.
batch_size = 86
# experiment config


: 

# pretrained model

In [None]:
# tokenizer
# Extra markup tokens registered with the tokenizer as additional special tokens:
# eight bracketed tags followed by four pipe-delimited section markers.
special_tokens_dict = {
    "additional_special_tokens": [
        "[Model]", "[User]",
        "[MaleG]", "[FemaleG]", "[UnknownG]",
        "[ModelGK]", "[UserGK]", "[WorldGK]",
        "|DialogContext|:", "|DialogAnswer|:",
        "|DialogModelGK|:", "|DialogCrossEnc|:",
    ]
}

# Truncate from the left, pad on the right.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "sberbank-ai/ruT5-base",
    padding_side='right',
    truncation_side='left',
)

# The return value (number of tokens added) is the cell's displayed output.
tokenizer.add_special_tokens(special_tokens_dict)

12

: 

In [None]:
# t5
# Load the pretrained seq2seq checkpoint that matches the tokenizer above.
t5 = transformers.T5ForConditionalGeneration.from_pretrained(
    "sberbank-ai/ruT5-base"
)
# Resize the token embeddings to the tokenizer's current length so the newly
# added special tokens get rows; the resized embedding module is displayed.
t5.resize_token_embeddings(new_num_tokens=len(tokenizer))

Embedding(32112, 768)

: 

# data

In [None]:
# dataset
# The corpus location is defined once instead of being repeated in every call.
# Set the PERSONA_DATA_DIR environment variable to run on another machine; the
# default is the path this notebook was originally written against.
DATA_DIR = os.environ.get("PERSONA_DATA_DIR", "/home/stc/persona/data/TlkPersonaChatRus")
TRAIN_JSONL = os.path.join(DATA_DIR, "TolokaPersonaChat_gk(train).jsonl")
VAL_JSONL = os.path.join(DATA_DIR, "TolokaPersonaChat_gk(test).jsonl")

# NOTE: `exaples` (sic) is the keyword spelling this notebook has always passed
# to Toloka_DS; it is kept unchanged.
# 'answer' examples, built from the train and test files.
train_answer_dataset = Toloka_DS(TRAIN_JSONL, exaples='answer', ex_per_dialog='all', context_len='all')
val_answer_dataset = Toloka_DS(VAL_JSONL, exaples='answer', ex_per_dialog='all', context_len='all')

# 'all_gk' examples, built from the same two files.
train_gk_dataset = Toloka_DS(TRAIN_JSONL, exaples='all_gk', ex_per_dialog='all', context_len='all')
val_gk_dataset = Toloka_DS(VAL_JSONL, exaples='all_gk', ex_per_dialog='all', context_len='all')

: 

In [None]:
# collator
# One Collator instance is shared by all DataLoaders; padding is to 'max_length'
# with the query/candidate length settings below.
# NOTE: `qury_len` (sic) is the keyword spelling used by this call.
collator = Collator(
    tokenizer=tokenizer,
    spectokens=special_tokens_dict["additional_special_tokens"],
    qury_len=64,
    cand_len=32,
    padding='max_length',
)

: 

In [None]:
# dataloader
# Training loaders: reshuffled every epoch.
train_answer_dataloader = torch.utils.data.DataLoader(
    train_answer_dataset, batch_size=batch_size, shuffle=True, collate_fn=collator.BiEncoder
)
train_gk_dataloader = torch.utils.data.DataLoader(
    train_gk_dataset, batch_size=batch_size, shuffle=True, collate_fn=collator.BiEncoder
)

# Validation loaders: fixed order (shuffle=False) so every epoch is evaluated on
# the same batches and validation numbers are comparable across epochs.
# Lightning recommends turning shuffling off for val/test dataloaders.
val_answer_dataloader = torch.utils.data.DataLoader(
    val_answer_dataset, batch_size=batch_size, shuffle=False, collate_fn=collator.BiEncoder
)
val_gk_dataloader = torch.utils.data.DataLoader(
    val_gk_dataset, batch_size=batch_size, shuffle=False, collate_fn=collator.BiEncoder
)

: 

In [None]:
# Combine the per-task loaders (keys 'answer' and 'gk') into one SequntiaLoader
# for training and one for validation.
# NOTE(review): what `shuffle=True` does inside SequntiaLoader is not visible
# here — confirm it is intended for the validation wrapper as well.
train_dataloader = SequntiaLoader(
    {'answer': train_answer_dataloader, 'gk': train_gk_dataloader},
    shuffle=True,
)
val_dataloader = SequntiaLoader(
    {'answer': val_answer_dataloader, 'gk': val_gk_dataloader},
    shuffle=True,
)

: 

In [None]:
# scheduler len
# Length passed to the model for its LR schedule: the number of epochs times
# the length of the combined training loader.
scheduler_len = epochs * len(train_dataloader)
scheduler_len

2265

: 

In [None]:
# Show the per-task `lens_dict` of each combined loader (train first, then validation).
(
    train_dataloader.lens_dict,
    val_dataloader.lens_dict,
)

({'answer': 105, 'gk': 46}, {'answer': 12, 'gk': 6})

: 

# PyTorch Lightning training loop

In [None]:
# pl model
# Wrap the resized T5 and its tokenizer in the project's multitask module,
# together with the optimiser/scheduler settings from the config cell.
model = single_multitask_model(
    tokenizer=tokenizer,
    transformer=t5,
    lr=lr,
    num_warmup_steps=num_warmup_steps,
    scheduler_len=scheduler_len,
)

: 

In [None]:
# logger
# SECURITY: the Comet API key must not live in the notebook. It is read from the
# COMET_API_KEY environment variable; a missing variable raises KeyError here
# rather than failing later or silently changing the logging mode.
# The key that was previously committed in this cell should be revoked.
logger = pl.loggers.comet.CometLogger(
    api_key=os.environ["COMET_API_KEY"],
    save_dir='/home/stc/persona/logs/t5',
    project_name='T5',
    experiment_name='seq_2bienc',
)

CometLogger will be initialized in online mode


: 

In [None]:
# # checkpoint callback (disabled: it reads `gpt_args`, which none of the cells above define)
# checkpoint_callback = pl.callbacks.ModelCheckpoint(
#      monitor='val_loss',
#      dirpath=gpt_args.save_dir,
#      filename='gpt-{epoch:02d}-{val_loss:.2f}',
#      save_top_k=1,
#      mode='min',
#  )

: 

In [None]:
# trainer
# Single-GPU run for `epochs` epochs, with gradient clipping at 1 and metrics
# sent to the Comet logger.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=epochs,
    gradient_clip_val=1,
    # Run 10 validation batches as a sanity check before training starts.
    num_sanity_val_steps=10,
    logger=logger,
    #callbacks=[checkpoint_callback]
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


: 

In [None]:
# fit 12.77
# NOTE(review): the meaning of "12.77" in the comment above is not recorded — confirm or remove.
trainer.fit(
    model=model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                       | Type                       | Params
--------------------------------------------------------------------------
0 | transformer                | T5ForConditionalGeneration | 222 M 
1 | bienc_answer_loss          | CrossEntropyLoss           | 0     
2 | bienc_gk_loss              | CrossEntropyLoss           | 0     
3 | train_bienc_answer_metrics | MetricCollection           | 0     
4 | train_bienc_gk_metrics     | MetricCollection           | 0     
5 | val_bienc_answer_metrics   | MetricCollection           | 0     
6 | val_bienc_gk_metrics       | MetricCollection           | 0     
7 | gen_metrics                | MetricCollection           | 0     
--------------------------------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.565   Total estimated model params size (MB)
COMET INFO: Experiment is live on comet.com https://www.

Sanity Checking DataLoader 0:  10%|█         | 1/10 [00:01<00:13,  1.52s/it]



Epoch 0:  86%|████████▋ | 146/169 [01:13<00:11,  1.98it/s, loss=4.51, v_num=1084]



Epoch 0:  89%|████████▉ | 151/169 [01:16<00:09,  1.98it/s, loss=4.5, v_num=1084] 



Epoch 0: 100%|██████████| 169/169 [01:21<00:00,  2.06it/s, loss=4.5, v_num=1084]



Epoch 1:  33%|███▎      | 56/169 [00:29<00:58,  1.92it/s, loss=4.52, v_num=1084]

: 