# задание:
- 1. Взять модель для суммаризации текстов https://huggingface.co/IlyaGusev/mbart_ru_sum_gazeta и запустить суммаризацию текстов на тестовой части вот этого датасета: https://huggingface.co/datasets/IlyaGusev/gazeta. Код, который это делает, есть в карточке модели. Посчитать метрики blue и rouge
- 2. Дообучить модель google/mt5-small для суммаризации текстов из датасета https://huggingface.co/datasets/IlyaGusev/gazeta, запустить суммаризацию на тестовой части датасета, посчитать метрики blue и rouge
Таким образом, у вас должно получиться сравнение метрик для оценки качества суммаризации, сделанной двумя разными моделями

# Установка необходимых библиотек

In [1]:
from datasets import load_dataset
import pandas as pd
import evaluate
import torch
from transformers import MBartTokenizer, MBartForConditionalGeneration
from tqdm import tqdm
from datasets import load_dataset, load_metric
import pickle

In [2]:
# ! pip install rouge_score
# ! pip install evaluate

# Загрузка дата сета

In [2]:
device = torch.device('cuda:0')

dataset = load_dataset("IlyaGusev/gazeta")
dataset

Downloading builder script:   0%|          | 0.00/3.01k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/550M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/56.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/61.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/60964 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6793 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6369 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title', 'date', 'url'],
        num_rows: 60964
    })
    test: Dataset({
        features: ['text', 'summary', 'title', 'date', 'url'],
        num_rows: 6793
    })
    validation: Dataset({
        features: ['text', 'summary', 'title', 'date', 'url'],
        num_rows: 6369
    })
})

# Метрики

In [3]:
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

model_name = "IlyaGusev/mbart_ru_sum_gazeta"
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

model.to(device)
None

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/406 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.47G [00:00<?, ?B/s]

In [4]:
len(dataset['test']['text'])
len(dataset['train']['text'])

60964

# Функции

In [5]:
def gen_batch(inputs, batch_size):
    batch_start = 0
    while batch_start < len(inputs):
        yield inputs[batch_start: batch_start + batch_size]
        batch_start += batch_size

# Считаем для каждого текста посчитать bleu, и вывести среднюю метрику по датасету dataset['test']

In [6]:
bleus = []
rougs = []

batch_size = 6
batches = gen_batch(dataset['test']['text'], batch_size)

with torch.no_grad():
    for batch in tqdm(batches, total=len(dataset['test']['text'])//batch_size):
        input_ids = tokenizer(
            batch,
            max_length=600,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )["input_ids"].to(device)

        output_ids = model.generate(
            input_ids=input_ids,
            no_repeat_ngram_size=4
        )


        summaries = list(tokenizer.batch_decode(output_ids, skip_special_tokens=True))
        references = batch
        for summ, ref in zip(summaries, references):
            bl = bleu.compute(predictions=[summ], references=[ref])
            rg = rouge.compute(predictions=[summ], references=[ref])
            bleus.append(bl)
            rougs.append(rg)

1133it [1:45:14,  5.57s/it]                          


# bleu

In [7]:
av_bleu = [bl['bleu'] for bl in bleus]
av_bleu = sum(av_bleu)/len(av_bleu)

av_bleu

0.00048452955174222735

# rouge

In [8]:
av_rgs = {
    "rouge1": 0,
    "rouge2": 0,
    "rougeL": 0,
    "rougeLsum": 0,
}

for rg in rougs:
    for key, val in rg.items():
        c = av_rgs[key]
        c += val
        av_rgs.update({key:c})

for key, val in av_rgs.items():
    print(f"{key}: {val/len(rg)}")

rouge1: 235.80881092739483
rouge2: 118.71577173936986
rougeL: 232.67349697139684
rougeLsum: 232.67349697139684
