In [1]:
import torch
import pytorch_lightning as pl
import transformers
import torchmetrics

import pandas as pd
import os
import json

from model import T5MultiTask
from data_module import TolokaDataModule

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# proxy
os.environ["http_proxy"] = "http://proxy.ad.speechpro.com:3128"
os.environ["https_proxy"] = "http://proxy.ad.speechpro.com:3128"
os.environ["ftp_proxy"] = "http://proxy.ad.speechpro.com:3128"

In [3]:
t5 = transformers.T5ForConditionalGeneration.from_pretrained("cointegrated/rut5-small-chitchat2")
tokenizer = transformers.AutoTokenizer.from_pretrained("cointegrated/rut5-small-chitchat2", truncation_side='left', padding_side='right')

with open('/home/stc/persona/data/preprocessing/spec_tokens.json') as spec_tokens_config:
    spec_tokens = json.load(spec_tokens_config)
tokenizer.add_special_tokens(
            {"additional_special_tokens": [spec_tokens[k] for k in spec_tokens]}
        )

15

In [4]:
datamodule=TolokaDataModule(
    data_dir='/home/stc/persona/data',
    datasets=['next_answer', 'current_gk'], #'next_answer', 'current_gk', 'next_gk'
    tokenizer=tokenizer,
    spec_tokens=spec_tokens,
    train_batch_size=256,
    val_batch_size=256,
    test_batch_size=256,
)



In [5]:
model = T5MultiTask(
    model=t5,
    datamodule=datamodule,
    lr=3e-4,
    num_warmup_steps=1000,
    pooling="mean",
    distance="cosine",
    scale=20,
    train_batch_size=256,
    val_batch_size=256,
    test_batch_size=256,
)

  rank_zero_warn(


In [6]:
# logger
logger = pl.loggers.comet.CometLogger(
    api_key='sEJsZrYjwc0gxxUAUGQNBwTsb',
    save_dir='/home/stc/persona/logs',
    project_name='chaT5',
    experiment_name='genbeam',
    log_code=True,
)

CometLogger will be initialized in online mode


In [7]:
# trainer
trainer = pl.Trainer(
    max_epochs=30,
    accelerator="gpu",
    devices=1,
    gradient_clip_val=1,
    logger=logger,
    num_sanity_val_steps=10,
)
trainer.fit(model, datamodule=datamodule)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
Loading cached shuffled indices for dataset at /home/stc/persona/data/next_answer/train/cache-7d8ead2833e8c7a3.arrow
Loading cached shuffled indices for dataset at /home/stc/persona/data/current_gk/train/cache-4a62906eae6ae068.arrow
100%|██████████| 559/559 [00:26<00:00, 21.07ba/s]
100%|██████████| 594/594 [00:10<00:00, 56.40ba/s]

  | Name                      | Type                       | Params
-------------------------------------------------------------------------
0 | transformer               | T5ForConditionalGeneration | 64.8 M
1 | cross_entropy_loss        | CrossEntropyLoss           | 0     
2 | train_next_answer_metrics | MetricCollection           | 0     
3 | val_next_answer_metrics   | MetricCollection           

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


COMET INFO: Experiment is live on comet.com https://www.comet.com/anpopaicoconat/chat5/534293c14590440fb7808bcd10ae4ac3

  (shared): Embedding(20100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(20100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
    

Sanity Checking: 0it [00:00, ?it/s]

Loading cached shuffled indices for dataset at /home/stc/persona/data/next_answer/test/cache-1e14808b13cc501f.arrow
Loading cached shuffled indices for dataset at /home/stc/persona/data/current_gk/test/cache-45f4e4eb50321f4e.arrow
100%|██████████| 5/5 [00:00<00:00,  7.80ba/s]
100%|██████████| 5/5 [00:00<00:00, 44.66ba/s]

Sanity Checking DataLoader 0:   0%|          | 0/9 [00:00<?, ?it/s]


COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.com/anpopaicoconat/chat5/534293c14590440fb7808bcd10ae4ac3
COMET INFO:   Others:
COMET INFO:     Created from : pytorch-lightning
COMET INFO:     Name         : next_answer, current_gk
COMET INFO:   Parameters:
COMET INFO:     data_dir                   : /home/stc/persona/data
COMET INFO:     datamodule                 : <data_module.TolokaDataModule object at 0x7f64c11d1e20>
COMET INFO:     datasets                   : ['next_answer', 'current_gk']
COMET INFO:     distance                   : cosine
COMET INFO:     lr                         : 0.0003
COMET INFO:     model                      : T5ForConditionalGeneration(
  (shared): Embedding(20100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(20100, 512)
    (block): ModuleList(
     

ValueError: `temperature` has to be a strictly positive float, but is 0.0