In [None]:
!pip install rouge
!pip install nltk
from rouge import Rouge 
import nltk
import nltk.translate.bleu_score as bleu
nltk.download('punkt')
import numpy as np
import networkx as nx
import gensim.downloader as api
from nltk.translate.bleu_score import sentence_bleu
import re

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The corpus path used by the evaluation cell lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors.

    Args:
        vec1: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms.
        If either vector has zero norm the similarity is undefined; 0.0 is
        returned instead of NaN so callers can compare the value safely.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(vec1, vec2) / norm_product

def build_similarity_matrix(sentences, threshold=0.1):
    """Build the pairwise similarity matrix for a sequence of sentence vectors.

    Entry ``[i, j]`` holds the cosine similarity of vectors ``i`` and ``j``
    when it is strictly greater than ``threshold``; every other entry,
    including the diagonal, stays 0.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))
    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            if row != col:
                value = cosine_similarity(left, right)
                if value > threshold:
                    matrix[row, col] = value
    return matrix

def lexrank(sentences, threshold=0.1, damping_factor=0.85, max_iter=100):
    """Compute LexRank-style scores for a sequence of sentence vectors.

    Args:
        sentences: Sequence of sentence vectors.
        threshold: Similarities not strictly above this value are ignored
            when building the similarity graph.
        damping_factor: Weight of the propagated score in each update.
        max_iter: Maximum number of update iterations.

    Returns:
        A 1-D numpy array with one score per input vector (empty for empty
        input). The update adds ``1 - damping_factor`` to every score on each
        iteration, so the scores are not normalised to sum to 1; they are
        meant to be used for ranking.
    """
    n = len(sentences)
    if n == 0:
        # Nothing to rank.
        return np.zeros(0)

    # Build the similarity matrix.
    similarity_matrix = build_similarity_matrix(sentences, threshold=threshold)

    # Row-normalise the similarity matrix. A row whose sum is 0 (a sentence
    # with no neighbour above the threshold) is left as zeros instead of
    # being divided by zero, which would produce NaN and a RuntimeWarning.
    row_sums = similarity_matrix.sum(axis=1, keepdims=True)
    transition = np.divide(
        similarity_matrix,
        row_sums,
        out=np.zeros_like(similarity_matrix),
        where=row_sums != 0,
    )
    # Only strictly positive weights take part in score propagation.
    transition = np.where(transition > 0, transition, 0.0)

    scores = np.ones(n) / n
    for _ in range(max_iter):
        # new_scores[j] = (1 - d) + d * sum_k transition[k, j] * scores[k]
        new_scores = (1 - damping_factor) + damping_factor * (transition.T @ scores)
        if np.allclose(new_scores, scores):
            break
        scores = new_scores

    return scores

In [None]:
# Quick check of lexrank() on four toy 3-dimensional vectors.
sentences = [
    np.array(values)
    for values in (
        (0.1, 0.2, 0.3),
        (0.2, 0.3, 0.4),
        (0.3, 0.4, 0.5),
        (0.4, 0.5, 0.6),
    )
]
scores = lexrank(sentences)

In [None]:
# Load the 'word2vec-google-news-300' vectors through gensim's downloader API.
# The evaluation cell looks words up with `model[word]` and `model.key_to_index`.
# NOTE(review): presumably this triggers a large download on first use — confirm before re-running.
model = api.load('word2vec-google-news-300')



In [None]:
# Helper functions for text conversion
def summary_to_sentences(summary):
    """Split a text into sentences and tokenize each sentence into words.

    The text is split on '.', '!' and '?'; each fragment is then split on
    whitespace. Fragments that contain no words (for example the empty
    string left after the final punctuation mark) are dropped, so the result
    never contains empty token lists.

    Args:
        summary: The text to split.

    Returns:
        A list of sentences, each a non-empty list of word strings.
    """
    fragments = re.split("[.!?]", summary)
    tokenized = (fragment.split() for fragment in fragments)
    return [tokens for tokens in tokenized if tokens]

def paragraph_to_wordlist(paragraph):
    """Return the whitespace-separated words of ``paragraph`` as a list."""
    return paragraph.split()

def listToString(s, sep=""):
    """Concatenate an iterable of strings into a single string.

    Args:
        s: Iterable of strings.
        sep: Separator inserted between consecutive elements. Defaults to
            the empty string, i.e. plain concatenation.

    Returns:
        The joined string ("" for an empty iterable).

    Raises:
        TypeError: If an element is not a string.
    """
    # str.join is linear in the total length, unlike repeated `+=`.
    return sep.join(s)

In [None]:
# Size statistics, one entry per (file, summary size) run:
# source text length in characters / sentences / words ...
text_len_l, text_len_s, text_len_w = [], [], []
# ... and summary length in words / characters.
sum_len_w, sum_len_l = [], []

# ROUGE-1 F1 scores, one list per summary size.
rouge1_f1_5, rouge1_f1_10, rouge1_f1_15, rouge1_f1_20 = [], [], [], []

# ROUGE-2 and ROUGE-L F1 lists (not filled by the cells visible in this notebook).
rouge2_f1_5, rouge2_f1_10, rouge2_f1_15, rouge2_f1_20 = [], [], [], []
rougel_f1_5, rougel_f1_10, rougel_f1_15, rougel_f1_20 = [], [], [], []

# BLEU scores, one list per summary size.
bleu_5, bleu_10, bleu_15, bleu_20 = [], [], [], []

# Generated summary texts.
summary_list = []

In [None]:
import glob
import os

path = "/content/drive/My Drive/texts/short2"

# Summary size (number of sentences) -> (ROUGE-1 F1 list, BLEU list).
# The numeric suffixes of the list names do not equal the summary sizes;
# the names are kept because later cells export these lists.
metric_lists = {
    10: (rouge1_f1_5, bleu_5),
    20: (rouge1_f1_10, bleu_10),
    30: (rouge1_f1_15, bleu_15),
    40: (rouge1_f1_20, bleu_20),
}

rouge = Rouge()

# Sorted so that results are appended in a reproducible order.
for filename in sorted(glob.glob(os.path.join(path, '*.txt'))):
    with open(filename, 'r') as f:
        # NOTE(review): line breaks are removed without inserting a space, so
        # words separated only by a newline get fused — confirm this is intended.
        text = f.read().replace('\n', '').replace('\r', '')

    # Split the text into sentences, dropping blank fragments such as the
    # empty string left after the final punctuation mark.
    sentences = [
        fragment.strip()
        for fragment in re.split("[.!?]", text)
        if fragment.strip()
    ]
    text_word_count = len(" ".join(sentences).split())

    # Build one embedding per sentence (mean of its word vectors). Sentences
    # without any in-vocabulary word are skipped, so the sentence text is kept
    # in a parallel list: LexRank indices refer to this filtered list.
    ranked_candidates = []
    sentence_embeddings = []
    for sentence in sentences:
        vectors = [model[word] for word in sentence.split() if word in model.key_to_index]
        if vectors:
            ranked_candidates.append(sentence)
            sentence_embeddings.append(np.mean(vectors, axis=0))

    if not sentence_embeddings:
        # Nothing can be ranked; an empty summary cannot be scored.
        print("Skipping (no sentence with known words):", filename)
        continue

    # Compute LexRank scores once per file; they do not depend on summary size.
    lexrank_scores = lexrank(sentence_embeddings)
    ranking = sorted(
        range(len(lexrank_scores)),
        key=lambda i: lexrank_scores[i],
        reverse=True,
    )

    # BLEU references: the tokenized sentences of the full text.
    reference_summary = summary_to_sentences(text)

    for num_sentenses, (rouge1_list, bleu_list) in metric_lists.items():
        # Take the top-n sentences as the summary. They are joined with a space
        # so that words at sentence boundaries are not fused together.
        summary = [ranked_candidates[i] for i in ranking[:num_sentenses]]
        summ = " ".join(summary)

        rouge_scores = rouge.get_scores(summ, text)
        predicted_summary = paragraph_to_wordlist(summ)
        score = sentence_bleu(reference_summary, predicted_summary)

        # Text statistics are repeated for every summary size so that they stay
        # row-aligned with the summary statistics.
        text_len_l.append(len(text))
        text_len_s.append(len(sentences))
        text_len_w.append(text_word_count)
        sum_len_w.append(len(predicted_summary))
        sum_len_l.append(len(summ))
        summary_list.append(summ)

        rouge1_list.append(rouge_scores[0]['rouge-1']['f'])
        bleu_list.append(score)

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  similarity_matrix = np.divide(similarity_matrix, row_sums)


In [None]:
import csv
from itertools import zip_longest


def _write_columns_csv(csv_path, header, columns):
    """Write equally or unequally long lists as the columns of a CSV file.

    Args:
        csv_path: Output file path.
        header: Column names, written as the first row.
        columns: List of lists; each inner list becomes one column. Shorter
            columns are padded with empty strings.
    """
    # The `with` block closes the file; no explicit close() is needed.
    with open(csv_path, 'w', encoding="utf-8", newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        writer.writerows(zip_longest(*columns, fillvalue=''))


fields1 = ['text_len_l', 'text_len_s','text_len_w', 'sum_len_l','sum_len_w']
rows1 = [text_len_l, text_len_s, text_len_w, sum_len_l, sum_len_w]

# NOTE(review): header spelling is inconsistent ('bleu5'/'bleu10' vs 'bleu_15'/'bleu_20');
# left unchanged in case other code reads these column names.
# NOTE(review): summary_list receives one entry per (file, summary size) while the
# metric lists receive one per file, so the last column is longer than the others.
fields2 = ['rouge1_f1_5','bleu5','rouge1_f1_10','bleu10','rouge1_f1_15','bleu_15','rouge1_f1_20','bleu_20','summary_list']
rows2 = [rouge1_f1_5,bleu_5,rouge1_f1_10,bleu_10,rouge1_f1_15,bleu_15,rouge1_f1_20,bleu_20,summary_list]

_write_columns_csv('len_LexRank.csv', fields1, rows1)
_write_columns_csv('metric_LexRank.csv', fields2, rows2)


In [None]:
from statistics import mean

# Label -> collected values, in the order they are reported.
reported_series = {
    "text_len_l": text_len_l,
    "text_len_s": text_len_s,
    "text_len_w": text_len_w,
    "sum_len_l": sum_len_l,
    "sum_len_w": sum_len_w,
    "rouge1_f1_5": rouge1_f1_5,
    "bleu_5": bleu_5,
    "rouge1_f1_10": rouge1_f1_10,
    "bleu_10": bleu_10,
    "rouge1_f1_15": rouge1_f1_15,
    "bleu_15": bleu_15,
    "rouge1_f1_20": rouge1_f1_20,
    "bleu_20": bleu_20,
}

for label, values in reported_series.items():
    # statistics.mean raises StatisticsError on an empty list (e.g. when no
    # input file was processed), so report that explicitly instead.
    print(label, mean(values) if values else "n/a (no data)")