In [1]:
!pip install transformers sentencepiece datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m27.4 MB/s[0m eta

We finetune our Russian-English T5 model on several tasks:
* Topic classification (to do)
* Dialog act classification (MIDAS & Switchboard) (to do)
* Emotional classification
* Sentiment classification (to do)
* Toxic classification (to do)
* Factoid classification (to do)

In [2]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
import torch
from datasets import load_dataset

In [3]:
from google.colab import drive
drive.mount('/gd')

Mounted at /gd


In [4]:
import os

In [5]:
raw_model = 'cointegrated/rut5-base'  
MODEL_NAME = '/gd/MyDrive/models/rut5-base-partial'

if os.path.exists(MODEL_NAME):  # continue fine-tuning
    raw_model = MODEL_NAME
model = T5ForConditionalGeneration.from_pretrained(raw_model)
tokenizer = T5Tokenizer.from_pretrained(raw_model)

### Create the tasks

``` 
def task():
    return input_text, output_text
```

In [6]:
opus_wiki = load_dataset("opus_wikipedia", lang1="en", lang2="ru")
len(opus_wiki)

Downloading builder script:   0%|          | 0.00/4.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.86k [00:00<?, ?B/s]

Downloading and preparing dataset opus_wikipedia/en-ru to /root/.cache/huggingface/datasets/opus_wikipedia/en-ru-lang1=en,lang2=ru/0.0.0/4a18b1be119afcbc678dac8b8f58888a10016b2ba19ea2ca0adfb4777f0d2b6b...


Downloading data:   0%|          | 0.00/57.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/572717 [00:00<?, ? examples/s]

Dataset opus_wikipedia downloaded and prepared to /root/.cache/huggingface/datasets/opus_wikipedia/en-ru-lang1=en,lang2=ru/0.0.0/4a18b1be119afcbc678dac8b8f58888a10016b2ba19ea2ca0adfb4777f0d2b6b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

1

In [7]:
print(len(opus_wiki['train']))

572717


In [8]:
import random
random.choice(opus_wiki['train'])

{'id': '115217',
 'translation': {'en': 'Some species have angiocarpous, i.e., closed fruitbodies.',
  'ru': 'На территории стран бывшего СССР встречается около 90 видов.'}}

In [9]:
def translate_task():
    item = random.choice(opus_wiki['train'])['translation']
    if random.random() < 0.5:
        return f'translate ru-en | {item["ru"]}', item["en"]
    else:
        return f'translate en-ru | {item["en"]}', item["ru"]

translate_task()

('translate en-ru | ==Show-hosting career==Prior to her debut as a member of Girls\' Generation, she was a VJ of the 2005 M.net "Hello Chat", alongside Super Junior\'s Kangin.',
 'Ещё до дебюта в Girls Generation, в 2005 году, Суён была виджеем M.net Hello Chat вместе с Канином из Super Junior.')

**emotion_classification**

In [10]:
dataset = load_dataset("cedr")

Downloading builder script:   0%|          | 0.00/8.75k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.23k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]



Downloading and preparing dataset cedr/main to /root/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66...


Downloading data:   0%|          | 0.00/693k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7528 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1882 [00:00<?, ? examples/s]

Dataset cedr downloaded and prepared to /root/.cache/huggingface/datasets/cedr/main/0.1.1/117570489cbabbdf8de619bd31918a1cd680a7f286b89d04af340d0691dc2d66. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Emotion label codes: {0: "joy", 1: "sadness", 2: "surprise", 3: "fear", 4: "anger"}

In [11]:
codes = {0: "joy", 1: "sadness", 2: "surprise", 3: "fear", 4: "anger"}

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'source'],
        num_rows: 7528
    })
    test: Dataset({
        features: ['text', 'labels', 'source'],
        num_rows: 1882
    })
})

In [13]:
import random

In [14]:
random.choice(dataset['train'])

{'text': 'мне как и  девочки уже на аске угрожают!)',
 'labels': [0],
 'source': 'twitter'}

In [80]:
def emotion_classification_task():
    item = random.choice(dataset['train'])
    if len(item["labels"]) > 0:
      return f'emotion | {item["text"]}', codes[item["labels"][0]]
    else:
      return f'emotion | {item["text"]}', ''

emotion_classification_task()

('emotion | Количеству подписчиков на его соцсети в свое время подивились даже сотрудники пресс-службы канадского космического агентства .',
 'surprise')

### Train the model

In [19]:
# # raw_model = '/gd/MyDrive/models/rut5-base-raw'  # start fine-tuning
# raw_model = '/gd/MyDrive/models/rut5-base-partial'  # continue fine-tuning
# model = T5ForConditionalGeneration.from_pretrained(raw_model)
# tokenizer = T5Tokenizer.from_pretrained(raw_model)

In [44]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = T5Tokenizer.from_pretrained("cointegrated/rut5-base-multitask")

model = T5ForConditionalGeneration.from_pretrained("cointegrated/rut5-base-multitask")

Downloading spiece.model:   0%|          | 0.00/828k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/977M [00:00<?, ?B/s]

In [45]:
device = torch.device('cuda')
model.to(device);

In [52]:
optimizer = torch.optim.Adam(params = [p for p in model.parameters() if p.requires_grad], lr=1e-5)

In [53]:
TASKS = [
    emotion_classification_task,
    # quiz_task,
    # answer_task,
    # reply_task,
    # fill_gap_task,
    # assemble_task,
    translate_task,
    # headline_task,
    # paraphrase_task,
    # ask_task, 
    # rsg_task,
    # simplify_task, 
    # comprehend_task,
]
# omit sumarize_task because texts are too long
len(TASKS)

2

In [54]:
def predict(x, n=3):
    inputs = tokenizer(x, return_tensors='pt')
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        hypotheses = model.generate(
            **inputs, 
            do_sample=True, 
            top_p=0.9, 
            num_return_sequences=3, 
            repetition_penalty=2.5,
            max_length=64,
        )
    return [tokenizer.decode(h, skip_special_tokens=True) for h in hypotheses]

In [72]:
model.eval()

for t in TASKS:
    x, y = t()
    print(x, ' \n --> ', y)
    # for p in predict(x, n=3):
    #     print(p)
    print()

emotion | Отмечалось, что злоумышленники забрали у него сумку Louis Vuitton и скрылись.  
 -->  

translate en-ru | This configuration encourages enthusiasts to modify its functions, particularly in the form of so-called images.  
 -->  Эта конфигурация способствует энтузиастам изменить их функции, в частности, в виде так называемого образа.



In [56]:
import gc

def cleanup():
    gc.collect()
    torch.cuda.empty_cache()

cleanup()

In [57]:
optimizer.param_groups[0]['lr'] = 1e-5

In [58]:
def eval_losses(n=10, max_len=1024):
    for task in TASKS:
        tot = 0
        for i in range(n):
            xxx, yyy = task()
  
            x = tokenizer(xxx, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)
            y = tokenizer(yyy, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)

            loss = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                labels=y.input_ids,
                decoder_attention_mask=y.attention_mask,
                return_dict=True
            ).loss
            loss.backward()
            tot += loss.item()
        print(f'{task.__name__:20s} {tot / n :2.2f}')

eval_losses(n=20)

emotion_classification_task 10.13
translate_task       1.73


In [59]:
from tqdm import trange 

In [60]:
model.train();
batch_size = 2
max_len = 1024
epochs = 5
accumulation_steps = 32
save_steps = 5000

window = 5000
ewm = 0

tq = trange(int(10000 / batch_size))
cleanup()

for i in tq:
    xx = []
    yy = []
    for _ in range(batch_size):
        xxx, yyy = random.choice(TASKS)()  # rsg is more various, increase its occurrence 4-fold
        xx.append(xxx)
        yy.append(yyy)

    try:
        x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)
        y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(device)
        # do not force the model to predict pad tokens
        y.input_ids[y.input_ids==0] = -100

        loss = model(
            input_ids=x.input_ids,
            attention_mask=x.attention_mask,
            labels=y.input_ids,
            decoder_attention_mask=y.attention_mask,
            return_dict=True
        ).loss
        loss.backward()
        # print('ok')
    except RuntimeError:
        print([xxx.split(' |')[0] for xxx in xx])
        loss = None
        cleanup()
        continue

    w = 1 / min(i+1, window)
    ewm = ewm * (1-w) + loss.item() * w
    tq.set_description(f'loss: {ewm}')
    
    if i % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
        cleanup()
    
    if i % window == 0 and i > 0:
        print(ewm)
        cleanup()
        # optimizer.param_groups[0]['lr'] *= 0.999
    if i % save_steps == 0 and i > 0:
        model.save_pretrained(MODEL_NAME)
        tokenizer.save_pretrained(MODEL_NAME)
        print('saving...', i, optimizer.param_groups[0]['lr'])

        try:
            optimizer.step()
            optimizer.zero_grad()
            eval_losses()
            optimizer.step()
            optimizer.zero_grad()
        except RuntimeError:
            cleanup()

loss: 4.063214055339889:  28%|██▊       | 1397/5000 [03:16<18:46,  3.20it/s]

['emotion', 'translate en-ru']


loss: 4.022619642329198:  30%|███       | 1503/5000 [03:31<13:39,  4.27it/s]

['translate ru-en', 'translate en-ru']


loss: 3.7810686618220792:  61%|██████▏   | 3070/5000 [07:07<03:49,  8.42it/s]

['translate ru-en', 'translate ru-en']


loss: 3.7258002345952983:  68%|██████▊   | 3412/5000 [07:57<10:30,  2.52it/s]

['translate ru-en', 'translate en-ru']


loss: 3.5642455692060464:  90%|████████▉ | 4479/5000 [10:24<01:00,  8.67it/s]

['translate ru-en', 'emotion']


loss: 3.482194066319474: 100%|██████████| 5000/5000 [11:39<00:00,  7.14it/s]


In [61]:
model.eval()

def answer(x, **kwargs):
    inputs = tokenizer(x, return_tensors='pt').to(model.device)
    with torch.no_grad():
        hypotheses = model.generate(**inputs, **kwargs)
    return tokenizer.decode(hypotheses[0], skip_special_tokens=True)

In [81]:
# Должно быть 'surprise'
print(answer(
    'emotion | Количеству подписчиков на его соцсети в свое время подивились даже сотрудники пресс-службы канадского космического агентства .'))

sadness


In [63]:
# Должно быть 'sadness'
print(answer(
    'emotion | Но мне ужасно грустно , и плохо ('))

sadness


In [64]:
# Хочется видеть 'joy'
print(answer(
    'emotion | Я рада.'))

sadness


In [66]:
# Должно быть 'fear'
print(answer(
    'emotion | Когда Бердсли внезапно почувствовала, что с ее лицом происходит что-то странное, она вызвала скорую помощь, опасаясь инсульта.'))

sadness


In [67]:
codes

{0: 'joy', 1: 'sadness', 2: 'surprise', 3: 'fear', 4: 'anger'}

In [65]:
print(answer('translate ru-en | Каждый охотник желает знать, где сидит фазан.'))
# Each hunter wants to know, where he is.

The hunter would like to know, where is the fagan.


In [93]:
model.save_pretrained(MODEL_NAME)
tokenizer.save_pretrained(MODEL_NAME)

('/gd/MyDrive/models/rut5-base-partial/tokenizer_config.json',
 '/gd/MyDrive/models/rut5-base-partial/special_tokens_map.json',
 '/gd/MyDrive/models/rut5-base-partial/spiece.model',
 '/gd/MyDrive/models/rut5-base-partial/added_tokens.json',
 '/gd/MyDrive/models/rut5-base-partial/tokenizer.json')

In [91]:
# !ls $MODEL_NAME