In [2]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import networkx as nx
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from rouge import Rouge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:

# Download the NLTK data this notebook needs: the Punkt sentence-tokenizer
# models and the English stop-word list. Newer NLTK releases (3.8.2+) load
# Punkt from the 'punkt_tab' package instead of the pickled 'punkt' one, so
# both are requested to keep the notebook runnable on old and new versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sample input document (two sentences about NLP) that is summarized below.
# The triple-quoted literal opens and closes on its own lines, so the string
# itself begins and ends with a newline character.
text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural language generation.
"""

# Preprocess the text
def preprocess_text(text):
    """Split ``text`` into sentences and build a cleaned copy of each one.

    Cleaning lower-cases a sentence, word-tokenizes it, and keeps only the
    purely alphabetic tokens that are not English stop words; the surviving
    tokens are re-joined with single spaces.

    Args:
        text: The raw document.

    Returns:
        A pair ``(sentences, preprocessed_sentences)`` of equal-length lists:
        the sentences exactly as ``sent_tokenize`` produced them, and the
        cleaned string for each sentence (possibly empty).
    """
    raw_sentences = sent_tokenize(text)
    english_stops = set(stopwords.words('english'))

    cleaned_sentences = []
    for raw in raw_sentences:
        kept_tokens = []
        for token in word_tokenize(raw.lower()):
            # Skip punctuation/numbers (non-alphabetic) and stop words.
            if not token.isalpha() or token in english_stops:
                continue
            kept_tokens.append(token)
        cleaned_sentences.append(' '.join(kept_tokens))

    return raw_sentences, cleaned_sentences

# TextRank algorithm with TF-IDF
def build_similarity_matrix(sentences, preprocessed_sentences):
    """Return the pairwise cosine similarity of the sentences' TF-IDF vectors.

    Relies on ``cosine_similarity`` from ``sklearn.metrics.pairwise`` being
    imported in the notebook's import cell.

    Args:
        sentences: The original sentences. Not used by the computation; kept
            so existing callers that pass it keep working.
        preprocessed_sentences: One cleaned string per sentence; these are
            the documents that get vectorized.

    Returns:
        A square array with one row and one column per entry of
        ``preprocessed_sentences``. If the vectorizer cannot build a
        vocabulary (no sentences, or no sentence contributes a term), an
        all-zero matrix of the matching size is returned instead of raising.
    """
    n_sentences = len(preprocessed_sentences)
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when none of the
        # documents yields a token. With no shared terms every pair of
        # sentences has similarity 0, so report exactly that.
        return np.zeros((n_sentences, n_sentences))
    return cosine_similarity(tfidf_matrix, tfidf_matrix)

def textrank_summarizer(text, top_n=2):
    """Build an extractive summary from the best-ranked sentences of ``text``.

    The sentences become nodes of a graph whose edge weights are the TF-IDF
    cosine similarities; PageRank scores the nodes and the highest-scoring
    sentences are joined with single spaces, best first.

    Args:
        text: The document to summarize.
        top_n: Maximum number of sentences to keep. If the document has fewer
            sentences, all of them are returned; values <= 0 give "".

    Returns:
        The summary string, or "" when ``text`` contains no sentences.
    """
    sentences, preprocessed_sentences = preprocess_text(text)
    if not sentences:
        # Nothing to rank; avoid building a graph from an empty document.
        return ""

    similarity_matrix = build_similarity_matrix(sentences, preprocessed_sentences)

    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)

    # Sort on (score, sentence) so ties are broken the same way as before.
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)), reverse=True
    )

    # Slice instead of indexing range(top_n): asking for more sentences than
    # exist must not raise IndexError. max() keeps a negative top_n empty.
    selected = ranked_sentences[:max(top_n, 0)]

    # Sentences can carry leading/trailing whitespace from the source text
    # (e.g. the newline that opens a triple-quoted string); strip it so the
    # summary has no stray line breaks.
    return " ".join(sentence.strip() for _, sentence in selected)

# Evaluate using ROUGE
def evaluate_summary(reference_summary, generated_summary):
    """Score a generated summary against a reference summary with ROUGE.

    Args:
        reference_summary: The reference text to compare against.
        generated_summary: The summary being evaluated; it is passed to
            ``Rouge.get_scores`` as the first argument, with the reference
            as the second.

    Returns:
        Whatever ``Rouge.get_scores`` returns for the pair.
    """
    return Rouge().get_scores(generated_summary, reference_summary)

# Example usage: summarize the sample `text` and score the result with ROUGE.
# Reference summary that the generated summary is compared against.
reference_summary = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language. Challenges in NLP frequently involve speech recognition, natural language understanding, and natural language generation.
"""
# Keep at most the two best-ranked sentences of the sample text.
generated_summary = textrank_summarizer(text, top_n=2)

print("Generated Summary:", generated_summary)
# Note the argument order: reference first, generated summary second.
scores = evaluate_summary(reference_summary, generated_summary)
print("ROUGE Scores:", scores)



Generated Summary: Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural language generation. 
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.
ROUGE Scores: [{'rouge-1': {'r': 0.9375, 'p': 0.75, 'f': 0.8333333283950618}, 'rouge-2': {'r': 0.8611111111111112, 'p': 0.62, 'f': 0.7209302276906436}, 'rouge-l': {'r': 0.9375, 'p': 0.75, 'f': 0.8333333283950618}}]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\22anj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\22anj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
