In [1]:
!pip install datasets evaluate transformers corus razdel seqeval transformers[torch] --quiet
!pip install accelerate -U --quiet

# **Задание**
1.  Дообучить берт на задачу NER
2.  Дообучить GPT на генерацию текста
3. *Дообучить T5 на задачу суммаризации текста


## Импорты

In [26]:
import numpy as np
import pandas as pd
import re

import torch
from datasets import Dataset, DatasetDict, load_dataset
from evaluate import load as load_metric

from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForTokenClassification, AutoModelForCausalLM, T5ForConditionalGeneration
from transformers import DataCollatorForTokenClassification
from transformers import TextDataset, DataCollatorForLanguageModeling

from corus import load_rudrec
from razdel import tokenize

from sklearn.model_selection import train_test_split

## Настройки

In [3]:
BERT_MODEL = 'cointegrated/rubert-tiny'
BERT_DATA = './data/rudrec_annotated.json'
BERT_BATCH_SIZE = 16
BERT_EPOCHS = 10

GPT_MODEL = 'sberbank-ai/rugpt3small_based_on_gpt2'
GPT_DATA = './data/all_recepies_inter.csv'
GPT_BATCH_SIZE = 16
GPT_EPOCHS = 3

T5_MODEL = 'IlyaGusev/rut5_base_sum_gazeta'
T5_DATA = 'IlyaGusev/gazeta'
T5_BATCH_SIZE = 16
T5_EPOCHS = 5

device = 'cuda'

## 1. Дообучить берт на задачу NER

### Данные

In [4]:
def extract_labels(item):
    raw_toks = list(tokenize(item.text))
    words = [tok.text for tok in raw_toks]
    word_labels = ['O'] * len(raw_toks)
    char2word = [None] * len(item.text)
    for i, word in enumerate(raw_toks):
        char2word[word.start:word.stop] = [i] * len(word.text)

    for e in item.entities:
        e_words = sorted({idx for idx in char2word[e.start:e.end] if idx is not None})
        word_labels[e_words[0]] = 'B-' + e.entity_type
        for idx in e_words[1:]:
            word_labels[idx] = 'I-' + e.entity_type

    return {'tokens': words, 'tags': word_labels}

In [5]:
data = [extract_labels(item) for item in list(load_rudrec(BERT_DATA))]

data_train, data_test = train_test_split(data, test_size=0.1)
pd.DataFrame(data_train).head()

Unnamed: 0,tokens,tags
0,"[Время, использования, :, 1-2, дня]","[O, O, O, O, O]"
1,"[Но, сколько, раз, мы, ни, покупали, его, ,, я...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[Успела, пропить, две, таблетки, ингавирина, .]","[O, O, O, B-Drugform, B-Drugname, O]"
3,"[Если, принимать, таблетки, точно, следуя, инс...","[O, O, B-Drugform, O, O, O, O, O, O, O, O, O, ..."
4,"[Общее, впечатление, :, Лучше, не, употреблять]","[O, O, O, O, O, O]"


In [6]:
label_list = sorted({label for item in data_train for label in item['tags']})
if 'O' in label_list:
    label_list.remove('O')
    label_list = ['O'] + label_list

label_list

['O',
 'B-ADR',
 'B-DI',
 'B-Drugclass',
 'B-Drugform',
 'B-Drugname',
 'B-Finding',
 'I-ADR',
 'I-DI',
 'I-Drugclass',
 'I-Drugform',
 'I-Drugname',
 'I-Finding']

In [7]:
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)

def tokenize(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"],
                                 truncation=True,
                                 padding=True,
                                 is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples['tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        label_ids = [label_list.index(idx) if isinstance(idx, str) else idx for idx in label_ids]

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [8]:
ner_data = DatasetDict({
    'train': Dataset.from_pandas(pd.DataFrame(data_train)),
    'test': Dataset.from_pandas(pd.DataFrame(data_test))
})
tokenized_data = ner_data.map(tokenize, batched=True)

Map:   0%|          | 0/4328 [00:00<?, ? examples/s]

Map:   0%|          | 0/481 [00:00<?, ? examples/s]

### Модель

In [9]:
model = AutoModelForTokenClassification.from_pretrained(BERT_MODEL, num_labels=len(label_list))
model.config.id2label = dict(enumerate(label_list))
model.config.label2id = {v: k for k, v in model.config.id2label.items()}

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly

In [10]:
for param in model.parameters():
    param.requires_grad = True

In [11]:
metric = load_metric("seqeval")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [12]:
model = model.to(device)
tokenized_data = tokenized_data.with_format("torch", device=device)

args = TrainingArguments("ner",
                         evaluation_strategy = "epoch",
                         learning_rate=2e-5,
                         per_device_train_batch_size=BERT_BATCH_SIZE,
                         per_device_eval_batch_size=BERT_BATCH_SIZE,
                         num_train_epochs=BERT_EPOCHS,
                         weight_decay=0.01,
                         save_strategy='no',
                         report_to='none')

data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(model,
                  args,
                  train_dataset=tokenized_data["train"],
                  eval_dataset=tokenized_data["test"],
                  data_collator=data_collator,
                  tokenizer=tokenizer,
                  compute_metrics=compute_metrics)

### Обучение

In [13]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.553229,0.543743,0.312142,0.396607,0.866099
2,0.710900,0.453755,0.540266,0.439288,0.484572,0.881261
3,0.710900,0.409914,0.557173,0.511125,0.533156,0.888075
4,0.404200,0.382608,0.637026,0.555626,0.593548,0.898893
5,0.404200,0.36361,0.564908,0.605849,0.584663,0.8977
6,0.344100,0.349583,0.6,0.621742,0.610677,0.902896
7,0.344100,0.340064,0.614673,0.633821,0.6241,0.905622
8,0.305800,0.335157,0.607831,0.641449,0.624188,0.905537
9,0.305800,0.332415,0.620245,0.642721,0.631283,0.906899
10,0.288600,0.331736,0.623393,0.647171,0.635059,0.907325


TrainOutput(global_step=2710, training_loss=0.4002577398096063, metrics={'train_runtime': 147.819, 'train_samples_per_second': 292.79, 'train_steps_per_second': 18.333, 'total_flos': 130948738699680.0, 'train_loss': 0.4002577398096063, 'epoch': 10.0})

In [14]:
trainer.evaluate()

{'eval_loss': 0.331736296415329,
 'eval_precision': 0.6233925290875689,
 'eval_recall': 0.6471710108073745,
 'eval_f1': 0.6350592638802246,
 'eval_accuracy': 0.9073253833049404,
 'eval_runtime': 0.5978,
 'eval_samples_per_second': 804.552,
 'eval_steps_per_second': 51.853,
 'epoch': 10.0}

In [15]:
text = ' '.join(ner_data['test'][64]['tokens'])
tokens = tokenizer(text, return_tensors='pt')
tokens = {k: v.to(model.device) for k, v in tokens.items()}
with torch.no_grad():
    pred = model(**tokens)

indices = pred.logits.argmax(dim=-1)[0].cpu().numpy()
token_text = tokenizer.convert_ids_to_tokens(tokens['input_ids'][0])

print(text)
for t, idx in zip(token_text, indices):
    print(f'{t:15s} {label_list[idx]:10s}')

И в процессе лечения тоже не помог .
[CLS]           O         
И               O         
в               O         
процессе        O         
лечения         O         
тоже            O         
не              O         
пом             O         
##ог            O         
.               O         
[SEP]           O         


## 2.  Дообучить GPT на генерацию текста

### Данные

In [16]:
data = pd.read_csv(GPT_DATA, sep='\t')
data.head()

Unnamed: 0.1,Unnamed: 0,name,composition,cooking_type,Инструкции,dish_type,Дата,photo,source,composition_inter
0,0,рассольник классический с перловкой и солеными...,"[{'Перловка': 0.1, 'unit': 'стак. (200 мл)'}, ...","варка,жарка",Подготовить указанные ингредиенты для приготов...,первое,05.06.2015,photo_1000menu_1.jpg,https://1000.menu/cooking/33395-rassolnik-s-pe...,"[{'product_id': 4253, 'name_source': 'Перловая..."
1,1,Суп пюре из белокочаной капусты,"[{'Капуста белокочанная': 50.0, 'unit': 'гр'},...",варка,"Необходимые ингредиенты\r\nНарезаем лук, морко...",первое,27.06.2015,photo_1000menu_2.jpg,https://1000.menu/cooking/25399-sup-pure-iz-be...,"[{'product_id': 2286, 'name_source': 'Капуста ..."
2,2,Постные щи из квашеной капусты,"[{'Капуста квашеная': 116.7, 'unit': 'гр'}, {'...","варка,жарка,тушение","Честно признаюсь, у меня не было репы на момен...",первое,12.02.2013,photo_1000menu_3.jpg,https://1000.menu/cooking/5159-postnje-shchi,"[{'product_id': 0, 'name_source': 'Капуста ква..."
3,3,Тюря- простой суп быстро и вкусно,"[{'Квас': 0.2, 'unit': 'л'}, {'Лук репчатый': ...",сырое,"\r\nНачинаем мы приготовление тюри с того, что...",первое,02.03.2011,photo_1000menu_4.jpg,https://1000.menu/cooking/5085-turya,"[{'product_id': 0, 'name_source': 'Квас', 'uni..."
4,4,Фасолевый суп из красной фасоли,"[{'Вода': 0.3, 'unit': 'л'}, {'Картошка': 0.3,...",варка,Подготовить ингредиенты. Для приготовления суп...,первое,28.01.2013,photo_1000menu_5.jpg,https://1000.menu/cooking/38765-fasolevyi-sup-...,"[{'product_id': 828, 'name_source': 'Вода', 'u..."


In [17]:
def build_text_files(data_json, dest_path):
    f = open(dest_path, 'w')
    data = ''
    for texts in data_json:
        summary = str(texts).strip()
        summary = re.sub(r"\s", " ", summary)
        data += summary + "  "
    f.write(data)

In [18]:
data = data.loc[:5000, 'Инструкции']
data_train, data_test = train_test_split(data, test_size=0.15)

train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

build_text_files(data_train, train_path)
build_text_files(data_test, test_path)

In [19]:
tokenizer = AutoTokenizer.from_pretrained(GPT_MODEL)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
def load_dataset(train_path, test_path, tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)

    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator

train_dataset, test_dataset, data_collator = load_dataset(train_path, test_path, tokenizer)



### Модель

In [21]:
model = AutoModelForCausalLM.from_pretrained(GPT_MODEL)
model.to(device)

args = TrainingArguments(output_dir="./gpt2-chief",
                         overwrite_output_dir=True,
                         num_train_epochs=GPT_EPOCHS,
                         per_device_train_batch_size=GPT_BATCH_SIZE,
                         per_device_eval_batch_size=GPT_BATCH_SIZE,
                         eval_steps = 400,
                         save_steps=800,
                         warmup_steps=500)

trainer = Trainer(model=model,
                  args=args,
                  data_collator=data_collator,
                  train_dataset=train_dataset,
                  eval_dataset=test_dataset)

### Обучение

In [22]:
trainer.train()



Step,Training Loss
500,2.4466
1000,2.2366


TrainOutput(global_step=1257, training_loss=2.2955007135820655, metrics={'train_runtime': 724.7843, 'train_samples_per_second': 27.728, 'train_steps_per_second': 1.734, 'total_flos': 1312796491776000.0, 'train_loss': 2.2955007135820655, 'epoch': 3.0})

In [23]:
prefix = "Берём сковородку "
tokens = tokenizer(prefix, return_tensors='pt').to(device)
size = tokens['input_ids'].shape[1]
res = model.generate(**tokens,
                     do_sample=False,
                     max_length=size+50,
                     repetition_penalty=5.,
                     temperature=0.5,
                     num_beams=10,)

decoded = tokenizer.decode(res[0])
result = decoded[len(prefix):]
print(prefix + result)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Берём сковородку  и на ней обжариваем мелко нарезанный репчатый лук до золотистого цвета. Добавляем в сковороду к луку морковь, жарим 3-4 минуты с каждой стороны. Затем добавляем томатную пасту (1 ст.


## 3*. Дообучить T5 на задачу суммаризации текста

In [None]:
from datasets import load_dataset

### Данные

In [27]:
data_train = load_dataset(T5_DATA, revision="v1.0", split='train[:10%]')
data_test = load_dataset(T5_DATA, revision="v1.0", split='test[:10%]')
data_train



Dataset({
    features: ['text', 'summary', 'title', 'date', 'url'],
    num_rows: 5240
})

In [28]:
def len_tok(text):
    return len(text.split())

max_len_sum = max(map(len_tok,data_train['summary']))
max_len_tl = max(map(len_tok,data_train['title']))

tokenizer = AutoTokenizer.from_pretrained(T5_MODEL)

def tokenize(batch):
    tokenized_input = tokenizer(batch['summary'], padding='max_length', truncation=True, max_length=max_len_sum)
    tokenized_label = tokenizer(batch['title'], padding='max_length', truncation=True, max_length=max_len_tl)
    tokenized_input['labels'] = tokenized_label['input_ids']

    return tokenized_input

data_train = data_train.map(tokenize, batched=True, batch_size=8)
data_test = data_test.map(tokenize, batched=True, batch_size=8)

data_train.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])
data_test.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])



### Модель

In [29]:
output_dir = 'gazeta/output'
model = T5ForConditionalGeneration.from_pretrained(T5_MODEL).to(device)

args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=T5_EPOCHS,
    per_device_train_batch_size=T5_BATCH_SIZE,
    per_device_eval_batch_size=T5_BATCH_SIZE,
    eval_accumulation_steps=1,
    prediction_loss_only=True,
    learning_rate=0.00001,
    evaluation_strategy='steps',
    save_steps=1000,
    save_total_limit=1,
    remove_unused_columns=True,
    run_name='run_gazeta',
    logging_steps=500,
    eval_steps=500,
    logging_first_step=False,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=data_train,
    eval_dataset=data_test
)


### Обучение

In [30]:
trainer.train()



Step,Training Loss,Validation Loss
500,8.4631,2.942599
1000,3.6889,2.761876
1500,2.6878,2.735948


TrainOutput(global_step=1640, training_loss=4.744864952273485, metrics={'train_runtime': 719.7763, 'train_samples_per_second': 36.4, 'train_steps_per_second': 2.278, 'total_flos': 2608755379200000.0, 'train_loss': 4.744864952273485, 'epoch': 5.0})

In [31]:
ID = 64
input_text = data_test['summary'][ID]

with torch.no_grad():
    tokenized_text = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')

    source_ids = tokenized_text['input_ids'].to(device, dtype = torch.long)
    source_mask = tokenized_text['attention_mask'].to(device, dtype = torch.long)

    generated_ids = model.generate(
        input_ids = source_ids,
        attention_mask = source_mask,
        max_length=512,
        num_beams=7,
        temperature = 1.3,
        repetition_penalty=1,
        length_penalty=1,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

print(input_text)
print(pred)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Жителей Миасса возмутило хвастовство мэра города Григория Тонких — чиновник опубликовал пост, в котором рассказал, как по его требованию в местной школе №23 дыры, которые ученики использовали в качестве туалетов, заменили на нормальные унитазы. На приложенном к новости снимке нового санузла видно, что между унитазами нет никакой перегородки — миассцы назвали это позором. При этом директор учебного заведения уверила, что кабины еще установят, а мэр «немного поторопился» с новостью.
«Унитазы заменили на нормальную перегородку»
