In [1]:
import nltk
import networkx as nx
from rouge import Rouge
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np
from termcolor import colored


# Fetch the NLTK data this notebook relies on: the Punkt tokenizer models used
# by sent_tokenize/word_tokenize and the stop-word lists.
nltk.download('punkt')
# Newer NLTK releases load the Punkt models from the 'punkt_tab' resource
# instead of 'punkt'; requesting both keeps the notebook working on old and
# new installations.
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop words, filtered out of sentences before they are vectorised.
stop_words = stopwords.words('english')

def similarity_matrix(sentences):
    """Return the pairwise cosine-similarity matrix of sentence vectors.

    Args:
        sentences: 2-D array-like with one row vector per sentence.

    Returns:
        A square float array of shape ``(len(sentences), len(sentences))``
        where entry ``[i][j]`` is the cosine similarity between rows ``i``
        and ``j``. The diagonal is zero, so a sentence is never counted as
        similar to itself.
    """
    count = len(sentences)
    # cosine_similarity rejects empty input; keep returning an empty matrix
    # when there are no sentences at all.
    if count == 0:
        return np.zeros([0, 0])

    # A single pairwise call replaces count**2 separate one-vs-one calls.
    sim_mat = np.array(cosine_similarity(sentences), dtype=float)
    np.fill_diagonal(sim_mat, 0.0)
    return sim_mat

def vectorize_sentences(sentences):
    """Encode sentences as dense TF-IDF vectors.

    Args:
        sentences: iterable of sentence strings.

    Returns:
        The dense array obtained by fitting a fresh ``TfidfVectorizer`` on
        ``sentences`` and transforming them.
    """
    tfidf = TfidfVectorizer()
    sparse_vectors = tfidf.fit_transform(sentences)
    # Downstream code indexes rows directly, so hand back a dense array.
    return sparse_vectors.toarray()

def text_rank(text, n):
    """Build an extractive summary of ``text`` using TextRank.

    Sentences are ranked by running PageRank over a graph whose edge weights
    are the cosine similarities between TF-IDF vectors of the stop-word-free
    sentences.

    Args:
        text: the document to summarise.
        n: number of top-ranked sentences to keep.

    Returns:
        The ``n`` highest-scoring sentences, in their original wording,
        joined by single spaces and ordered from highest to lowest score.
        Returns an empty string when ``text`` contains no sentences.
    """
    original_sentences = sent_tokenize(text)
    if not original_sentences:
        # Nothing to rank; TfidfVectorizer would fail on an empty corpus.
        return ''

    # Set lookup is constant-time, and lower-casing the token makes the
    # filter catch capitalised stop words such as "The" at sentence starts.
    stop_set = set(stop_words)
    cleaned_sentences = [
        ' '.join(w for w in word_tokenize(sentence) if w.lower() not in stop_set)
        for sentence in original_sentences
    ]

    # Convert the cleaned sentences to vectors and compare every pair.
    vectors = vectorize_sentences(cleaned_sentences)
    sim_mat = similarity_matrix(vectors)

    # Rank the sentence graph with PageRank.
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)

    # Scores are computed on the cleaned text, but the summary is assembled
    # from the untouched sentences so it stays readable and comparable to a
    # human-written reference.
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(original_sentences)),
        reverse=True,
    )
    best_sentences = [s for _, s in ranked_sentences[:n]]

    return ' '.join(best_sentences)

def read_text_from_file(file_path, encoding=None):
    """Read a text file and return its content as a single line.

    Args:
        file_path: path of the file to read.
        encoding: optional text encoding passed to ``open``; ``None`` keeps
            the platform default.

    Returns:
        The file content with every run of whitespace (including line
        breaks) collapsed to one space and leading/trailing whitespace
        removed.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        # Joining with a space instead of deleting newlines keeps the last
        # word of one line from fusing with the first word of the next.
        data = ' '.join(file.read().split())
    return data

# Article to summarise
file_path = "./dataset/business/037.txt"

# Load the article text through read_text_from_file
text = read_text_from_file(file_path)

# Build a summary from the 2 top-ranked sentences
summary = text_rank(text, 2)

print(summary)

# Destination file for the generated summary
output_path = "./res.txt"

# Write the summary to output_path (mode 'w' overwrites any existing file)
with open(output_path, 'w') as file:
    file.write(summary)


def read_reference_summary(file_path):
    """Read a reference summary file and return it as a single line.

    Args:
        file_path: path of the reference summary file.

    Returns:
        The file content with every run of whitespace (including line
        breaks) collapsed to one space and leading/trailing whitespace
        removed.
    """
    with open(file_path, 'r') as file:
        # Deleting newlines outright would glue words from adjacent lines
        # together and distort n-gram overlap; join on a space instead.
        data = ' '.join(file.read().split())
    return data


# Location of the reference summary the generated one is compared against
reference_summary_path = "./reference res.txt"

# Load the reference summary text
reference_summary = read_reference_summary(reference_summary_path)

def calculate_rouge_scores(hypothesis, reference):
    """Score a generated summary against a reference with ROUGE.

    Args:
        hypothesis: the generated summary text.
        reference: the reference summary text.

    Returns:
        The averaged scores as returned by ``Rouge.get_scores`` called with
        ``avg=True``.
    """
    return Rouge().get_scores(hypothesis, reference, avg=True)

# Score the generated summary (hypothesis) against the reference summary and show the result
rouge_scores = calculate_rouge_scores(summary, reference_summary)
print(rouge_scores)

def highlight_matches(summary, reference):
    """Print the summary with words shared with the reference shown in green.

    Words are whitespace-delimited tokens and are matched exactly, so case
    and attached punctuation must agree for a word to be highlighted. The
    reference is printed unchanged after the summary.

    Args:
        summary: the generated summary text.
        reference: the reference summary text.

    Returns:
        None; the result is written to standard output.
    """
    # A set gives constant-time membership tests instead of scanning the
    # reference word list once per summary word.
    reference_vocab = set(reference.split())

    # Every word, highlighted or not, is followed by a single space.
    highlighted_summary = ''.join(
        (colored(word, 'green') if word in reference_vocab else word) + ' '
        for word in summary.split()
    )

    print("Summary:")
    print(highlighted_summary)
    print("\nReference:")
    print(reference)

# Print the summary with the words it shares with the reference highlighted
highlight_matches(summary, reference_summary)



ModuleNotFoundError: No module named 'rouge'

In [3]:
pip install rouge



SyntaxError: invalid syntax (<ipython-input-3-e42dd572397a>, line 1)