## Инициализация метрик оценки моделей

In [None]:
# pip install datasets

In [None]:
# pip install rouge_score

In [None]:
from datasets import load_metric


metric = load_metric("rouge")

def calc_rouge_scores(candidates, references, name_of_method):
    result = metric.compute(predictions=candidates, references=references, use_stemmer=True)
    result = {key: round(value.mid.fmeasure * 100, 1) for key, value in result.items()}
    result["summarizer"] = name_of_method
    return result

In [None]:
number_of_ex = 500 # количество примеров для оценки моделей

In [1]:
import csv


with open("rouge_metrics4.csv", "w") as f:
    columns = ['rouge1','rouge2','rougeL','rougeLsum','summarizer']
    writer = csv.DictWriter(f, fieldnames=columns)
    writer.writeheader()

def add_to_file(row):
    with open("rouge_metrics4.csv", "a") as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        writer.writerow(row)

## Инициализация датасета

In [None]:
## первый, https://huggingface.co/datasets/IlyaGusev/gazeta
# import json
# 
# 
# def read_gazeta_records(file_name, shuffle=False, sort_by_date=True):
#     assert shuffle != sort_by_date
#     records = []
#     with open(file_name, "r") as r:
#         for line in r:
#             records.append(json.loads(line))
#     if sort_by_date:
#         records.sort(key=lambda x: x["date"])
#     if shuffle:
#         random.shuffle(records)
#     return records
# data = read_gazeta_records("gazeta_test.jsonl")
# data_sum = [data[i]["summary"] for i in range(len(data))]
# data_text = [data[i]["text"] for i in range(len(data))]
# print(f"{data_sum = },\n,{data_text =}")

In [None]:
## второй,  https://www.kaggle.com/datasets/thedevastator/mlsam-multilingual-summarization-dataset?select=ru_train.csv
# import pandas as pd
# 
# 
# df = pd.read_csv("ru_test.csv", sep=',')
# # print(df.head(1))
# data_sum = df["summary"].tolist()
# data_text = df["text"].tolist()

In [None]:
### третий, https://huggingface.co/datasets/csebuetnlp/xlsum
# from datasets import load_dataset
# import pandas as pd


# ds = load_dataset("csebuetnlp/xlsum", "russian")
# df = pd.DataFrame(ds["test"])
# data_sum = df["summary"].tolist()
# data_text = df["text"].tolist()

In [None]:
### четвертый, https://huggingface.co/datasets/esdurmus/wiki_lingua

from datasets import load_dataset
import pandas as pd


ds = load_dataset("esdurmus/wiki_lingua", "russian")
df = pd.DataFrame(ds["train"]["article"])
data_sum = df["summary"].tolist()
data_sum = [" ".join(data_sum[i]) for i in range(len(data_sum))]
data_text = df["document"].tolist()
data_text = [" ".join(data_text[i]) for i in range(len(data_text))]

In [None]:
print(f"{len(data_text) = }")

## реализация метода Луны
Вычисляем значимые слова документа:
- Делаем стемминг или лемматизацию слов: разные словоформы одной леммы должны считаться как одно слово.
- Считаем частоты слов, формируем список слов по убыванию частоты.
- Убираем стоп-слова: частотные слова, у которых нет отдельной смысловой нагрузки, например предлоги и частицы.
- Убираем слишком редкие слова, например такие, которые встречаются только 1 раз, либо убираем какой-то перцентиль слов по частоте.
- Все оставшиеся слова считаем значимыми.

Считаем значимость для предложений:
- Предложение делим на промежутки, которые начинаются и заканчиваются значимыми словами. В промежутке могут быть и незначимые слова, но не более 4 подряд.
- Значимость промежутка — квадрат количества значимых слов в промежутке, делённый на размер промежутка.
- Значимость предложения — максимум из значимостей промежутков.
- Берём в качестве реферата предложения со значимостью выше определённого порога.

с помощью библиотеки Spacy

In [None]:
# pip install spacy

In [None]:
# !python -m spacy download ru_core_news_lg

In [None]:
from collections import Counter
from string import punctuation
import spacy


nlp = spacy.load("ru_core_news_lg")

In [None]:
def luna_sum(text, limit):
    keywords = []
    tags = ['PROPN', 'ADJ', 'NOUN', 'VERB']
    doc = nlp(text.lower())
    for token in doc:
        if token.pos_ in tags and not(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            keywords.append(token.lemma_)
    word_freq = Counter(keywords)
    max_freq = Counter(keywords).most_common(1)[0][1]
    word_freq = {word: word_freq[word] for word in word_freq if word_freq[word] != 1}
    for word in word_freq:
        word_freq[word] = (word_freq[word]/max_freq)

    sent_power={}
    for sent in doc.sents:
        for word in sent:
            if word.lemma_ in word_freq.keys():
                if sent in sent_power.keys():
                    sent_power[sent] += word_freq[word.lemma_]
                else:
                    sent_power[sent] = word_freq[word.lemma_]

    summary = []
    sorted_sents = sorted(sent_power.items(), key=lambda kv: kv[1], reverse=True)
    coef_limit = sorted_sents[limit-1][1]
    i = 0
    while i < len(sorted_sents) and i<limit:
        if sorted_sents[i][1] >= coef_limit:
            summary.append(str(sorted_sents[i][0]).capitalize())
        i += 1

    return ' '.join(summary)

## Оценка метода Луны

In [None]:
data_res_of_luna = [luna_sum(data_text[i], 4) for i in range(number_of_ex )]

In [None]:
from random import randint


index_random = randint(0, number_of_ex - 1)
print(f"{data_text[index_random] = },\n {data_sum[index_random] = },\n {data_res_of_luna[index_random] = }")

In [None]:
res = calc_rouge_scores(data_sum[:len(data_res_of_luna )],data_res_of_luna, "Luna")
res

In [None]:
add_to_file(res)

## Оценка метода TextRank

In [None]:
# pip install summa

In [None]:
from summa.summarizer import summarize

In [None]:
data_res_of_textRank = [summarize(data_text[i], ratio=0.2) for i in range(number_of_ex )]

In [None]:
res = calc_rouge_scores(data_sum[:len(data_res_of_textRank )], data_res_of_textRank, "TextRank")
res

In [None]:
add_to_file(res)

## Оценка метода - первые 3 предложения

In [None]:
import re

In [None]:
first_sent = lambda x: ' '.join(re.split(r'(?<=[.:;])\s', x)[:3+1])

In [None]:
data_res_of_firstSents = [first_sent(data_text[i]) for i in range(number_of_ex )]

In [None]:
res = calc_rouge_scores(data_sum[:len(data_res_of_firstSents )], data_res_of_firstSents, "FirstSents")
res

In [None]:
add_to_file(res)

## Оценка mT5 - model_name = "IlyaGusev/rut5_base_sum_gazeta"
https://huggingface.co/IlyaGusev/rut5_base_sum_gazeta

In [None]:
from transformers import pipeline

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration

In [None]:
model_name = "IlyaGusev/rut5_base_sum_gazeta"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
def ruT5_base_G(text):
  input_ids = tokenizer(
      [text],
      max_length=600,
      add_special_tokens=True,
      padding="max_length",
      truncation=True,
      return_tensors="pt"
  )["input_ids"]
  output_ids = model.generate(
      input_ids=input_ids,
      no_repeat_ngram_size=4
  )[0]

  summary = tokenizer.decode(output_ids, skip_special_tokens=True)
  return summary


In [None]:
data_res_of_ruT5_base_G = [ruT5_base_G(data_text[i]) for i in range(number_of_ex//10)]

In [None]:
res = calc_rouge_scores(data_sum[:len(data_res_of_ruT5_base_G )], data_res_of_ruT5_base_G, "ruT5_G")
res

In [None]:
add_to_file(res)

## Оценка mT5 - MODEL_NAME = 'cointegrated/rut5-base-absum'

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer


MODEL_NAME = 'cointegrated/rut5-base-absum'
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
# model.cuda();
# model.eval();


In [None]:
def ruT5_base_A(
    text, n_words=None, compression=None,
    max_length=1000, num_beams=3, do_sample=False, repetition_penalty=10.0,
    **kwargs
):
    if n_words:
        text = '[{}] '.format(n_words) + text
    elif compression:
        text = '[{0:.1g}] '.format(compression) + text
    x = tokenizer(text, return_tensors='pt', padding=True).to(model.device)
    with torch.inference_mode():
        out = model.generate(
            **x,
            max_length=max_length, num_beams=num_beams,
            do_sample=do_sample, repetition_penalty=repetition_penalty,
            **kwargs
        )
    return tokenizer.decode(out[0], skip_special_tokens=True)

In [None]:
data_res_of_ruT5_base_A = [ruT5_base_A(data_text[i]) for i in range(number_of_ex//10)]

In [None]:
res = calc_rouge_scores(data_sum[:len(data_res_of_ruT5_base_A )], data_res_of_ruT5_base_A, "ruT5_A")

In [None]:
add_to_file(res)

## Оценка mT5 - model_name = 'utrobinmv/t5_summary_en_ru_zh_base_2048'

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer


model_name = 'utrobinmv/t5_summary_en_ru_zh_base_2048'
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [None]:
def ruT5_base_M(text):
    input_ids = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**input_ids)
    result = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return result

In [None]:
data_res_of_ruT5_base_M = [ruT5_base_M(data_text[i]) for i in range(number_of_ex//10)]

In [None]:
res = calc_rouge_scores(data_sum[:len(data_res_of_ruT5_base_M )], data_res_of_ruT5_base_M, "ruT5_M")

In [None]:
add_to_file(res)

## оценка модели mBART - model_name = "IlyaGusev/mbart_ru_sum_gazeta"

In [None]:
from transformers import MBartTokenizer, MBartForConditionalGeneration


model_name = "IlyaGusev/mbart_ru_sum_gazeta"
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

In [None]:
def mBART_base_G(text):
    input_ids = tokenizer(
        [text],
        max_length=600,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]

    output_ids = model.generate(
        input_ids=input_ids,
        no_repeat_ngram_size=4
    )[0]

    summary = tokenizer.decode(output_ids, skip_special_tokens=True)
    return summary

In [None]:
data_res_of_mBART_base_G = [mBART_base_G(data_text[i]) for i in range(number_of_ex//10)]

In [None]:
res = calc_rouge_scores(data_sum[:len(data_res_of_mBART_base_G )], data_res_of_mBART_base_G, "mBART_G")

In [None]:
add_to_file(res)

## Оценка ruGPT3 - G model_name = "IlyaGusev/rugpt3medium_sum_gazeta"

In [None]:
# import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


model_name = "IlyaGusev/rugpt3medium_sum_gazeta"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
def ruGPT3_G(text):
    text_tokens = tokenizer(
        text,
        max_length=600,
        add_special_tokens=False,
        padding=False,
        truncation=True
    )["input_ids"]
    input_ids = text_tokens + [tokenizer.sep_token_id]
    input_ids = torch.LongTensor([input_ids])

    output_ids = model.generate(
        input_ids=input_ids,
        no_repeat_ngram_size=4
    )

    summary = tokenizer.decode(output_ids[0], skip_special_tokens=False)
    summary = summary.split(tokenizer.sep_token)[1]
    summary = summary.split(tokenizer.eos_token)[0]
    return summary

In [None]:
data_res_of_ruGPT3_base_G = [ruGPT3_G(data_text[i]) for i in range(number_of_ex//10)]

In [None]:
res = calc_rouge_scores(data_sum[:len(data_res_of_ruGPT3_base_G )], data_res_of_ruGPT3_base_G, "ruGPT3_G")

In [None]:
add_to_file(res)

In [8]:
import pandas as pd

df1 = pd.read_csv("rouge_metrics.csv")
df2 = pd.read_csv("rouge_metrics2.csv")
df3 = pd.read_csv("rouge_metrics3.csv")
df4 = pd.read_csv("rouge_metrics4.csv")
print("\n\nдатасет Gazeta:\n\n", df1, "\n\nдатасет MLSUM:\n\n", df2, "\n\nдатасет XLSUM:\n\n", df3, "\n\nдатасет wiki-lingua:\n\n", df4)



датасет Gazeta:

    rouge1  rouge2  rougeL  rougeLsum  summarizer
0    15.2     3.6    13.8       13.7        Luna
1    12.9     3.3    12.7       12.8    TextRank
2    13.6     3.0    12.8       12.9  FirstSents
3    22.0     9.7    22.0       22.0      ruT5_G
4     9.7     0.0     9.7        9.7      ruT5_A
5    19.9    17.0    19.9       19.9      ruT5_M
6    19.3     6.7    19.3       19.7     mBART_G
7     8.3     0.0     8.3        8.3    ruGPT3_G 

датасет MLSUM:

    rouge1  rouge2  rougeL  rougeLsum  summarizer
0     3.6     0.5     3.4        3.3        Luna
1     3.5     0.5     3.3        3.2    TextRank
2     2.1     0.4     2.1        2.1  FirstSents
3     0.0     0.0     0.0        0.0      ruT5_G
4     0.0     0.0     0.0        0.0      ruT5_A
5    10.0     0.0    10.0       10.0      ruT5_M
6     6.7     0.0     6.7        6.7     mBART_G
7     0.0     0.0     0.0        0.0    ruGPT3_G 

датасет XLSUM:

    rouge1  rouge2  rougeL  rougeLsum  summarizer
0     4.6  