In [1]:
import nltk
import time
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge
from textstat import flesch_reading_ease

# Fetch the NLTK data packages this notebook relies on: 'punkt' for the
# tokenizers and 'stopwords' for the English stop-word list.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# sent_tokenize/word_tokenize — confirm against the installed version.
nltk.download('punkt')
# Kept as the last expression of the cell, so its return value is what the
# cell displays.
nltk.download('stopwords')


[nltk_data] Downloading package punkt to C:\Users\Roopesh
[nltk_data]     P\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Roopesh
[nltk_data]     P\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def pre_process_text(text):
    """Split ``text`` into sentences and reduce each one to its content words.

    Each sentence is word-tokenised; a token is kept only when it is purely
    alphanumeric and its lower-cased form is not an English stop word. Kept
    tokens are lower-cased.

    Args:
        text: Raw document text.

    Returns:
        A list with one entry per sentence, in document order. Each entry is
        the list of surviving lower-cased tokens and may be empty.
    """
    split_sentences = sent_tokenize(text)
    english_stops = set(stopwords.words('english'))

    def content_words(sentence):
        # Punctuation-only tokens fail isalnum(); stop words are matched on
        # their lower-cased form.
        return [
            token.lower()
            for token in word_tokenize(sentence)
            if token.isalnum() and token.lower() not in english_stops
        ]

    return [content_words(sentence) for sentence in split_sentences]

def calculate_sentence_similarity(sentence1, sentence2):
    """Return the cosine similarity of two tokenised sentences.

    Both sentences are turned into bag-of-words count vectors over their
    combined vocabulary, and the cosine of the angle between the two vectors
    is returned.

    Args:
        sentence1: List of tokens for the first sentence.
        sentence2: List of tokens for the second sentence.

    Returns:
        A float in [0, 1]. Returns 0.0 when either sentence has no tokens,
        because the cosine of a zero vector is undefined and would otherwise
        come out as NaN.
    """
    # A sentence made only of stop words/punctuation arrives here empty; its
    # count vector has zero norm, so short-circuit instead of dividing by zero.
    if not sentence1 or not sentence2:
        return 0.0

    # Map every distinct word to a column once, instead of a linear
    # list.index() lookup for each token.
    column_of = {}
    for word in sentence1 + sentence2:
        column_of.setdefault(word, len(column_of))

    vector1 = np.zeros(len(column_of))
    vector2 = np.zeros(len(column_of))
    for word in sentence1:
        vector1[column_of[word]] += 1
    for word in sentence2:
        vector2[column_of[word]] += 1

    # Both vectors are non-zero here, so the norm product is strictly positive.
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return float(np.dot(vector1, vector2) / norm_product)

def generate_similarity_matrix(sentences):
    """Build the pairwise sentence-similarity matrix.

    Args:
        sentences: Sequence of tokenised sentences (each a list of words).

    Returns:
        A square numpy array with one row and column per sentence. Entry
        ``[i][j]`` is ``calculate_sentence_similarity(sentences[i], sentences[j])``
        for ``i != j``; the diagonal is left at 0.
    """
    count = len(sentences)
    matrix = np.zeros((count, count))

    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            if row == col:
                # A sentence is never compared with itself.
                continue
            matrix[row][col] = calculate_sentence_similarity(left, right)

    return matrix

def rank_sentences(sim_mat):
    """Score sentences by running PageRank over the similarity matrix.

    Args:
        sim_mat: Square numpy array of pairwise sentence similarities.

    Returns:
        The result of ``nx.pagerank`` on the graph built from ``sim_mat``;
        the caller indexes it by sentence position.
    """
    return nx.pagerank(nx.from_numpy_array(sim_mat))

def generate_summary(text, top_n=10):
    """Produce an extractive summary from the highest-scoring sentences.

    Args:
        text: Raw document text.
        top_n: Maximum number of sentences to include; capped at the number
            of sentences in ``text``.

    Returns:
        The selected sentences joined by single spaces. Sentences appear in
        descending score order, not in their original document order.
    """
    tokenised = pre_process_text(text)
    scores = rank_sentences(generate_similarity_matrix(tokenised))

    # Pair each original (untokenised) sentence with its score; sorting the
    # tuples in reverse puts the best-scoring sentence first.
    scored = [(scores[idx], sentence) for idx, sentence in enumerate(sent_tokenize(text))]
    scored.sort(reverse=True)

    limit = min(top_n, len(scored))
    return ' '.join(scored[pos][1] for pos in range(limit))

def calculate_similarity(original_text, summary):
    """Return the TF-IDF cosine similarity between a text and its summary.

    Args:
        original_text: The full source text.
        summary: The generated summary.

    Returns:
        The single similarity value taken from the 1x1 result of
        ``cosine_similarity`` on the two TF-IDF rows.
    """
    # Fit on both texts together so they share one vocabulary.
    tfidf = TfidfVectorizer().fit_transform([original_text, summary])
    original_row, summary_row = tfidf[0], tfidf[1]
    return cosine_similarity(original_row, summary_row)[0][0]

def calculate_compression(original_text, summary):
    """Return the summary length as a percentage of the original length.

    Lengths are counted in whitespace-separated words.

    Args:
        original_text: The full source text.
        summary: The generated summary.

    Returns:
        ``summary_words / original_words * 100`` as a float, or 0.0 when the
        original text contains no words.
    """
    original_length = len(original_text.split())
    if original_length == 0:
        # Empty or whitespace-only original: report 0% rather than raising
        # ZeroDivisionError.
        return 0.0

    summary_length = len(summary.split())
    return (summary_length / original_length) * 100

def calculate_rouge(original_text, summary):
    """Score the summary against the original text with ROUGE.

    Args:
        original_text: The full source text, passed as the second argument
            to ``Rouge.get_scores``.
        summary: The generated summary, passed as the first argument.

    Returns:
        The first element of the list returned by ``Rouge.get_scores``.
    """
    return Rouge().get_scores(summary, original_text)[0]

def calculate_readability(summary):
    """Return the Flesch reading-ease score of ``summary``.

    Args:
        summary: Text to score.

    Returns:
        Whatever ``flesch_reading_ease`` returns for ``summary``.
    """
    reading_ease = flesch_reading_ease(summary)
    return reading_ease

def summarize_text_file(file_path, top_n=10, encoding='utf-8'):
    """Read a text file and summarise its contents.

    Args:
        file_path: Path of the text file to summarise.
        top_n: Maximum number of sentences, forwarded to ``generate_summary``.
        encoding: Encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding; pass a
            different codec for files saved in another encoding.

    Returns:
        ``(text, summary)`` on success. ``(None, None)`` when the file does
        not exist or any other exception is raised while reading or
        summarising; in both cases a message is printed.
    """
    try:
        # Explicit encoding: without it open() falls back to the locale
        # default, which differs between machines.
        with open(file_path, 'r', encoding=encoding) as file:
            text = file.read()
        summary = generate_summary(text, top_n)
        return text, summary
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None, None
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller see
        # the (None, None) sentinel instead of a traceback.
        print(f"An error occurred: {e}")
        return None, None

# Input document to summarise.
# NOTE(review): this is an absolute, machine-specific path — point it at your
# own file before running.
file_path = r'C:\Users\Roopesh P\OneDrive\Pictures\Documents\Task 1\te.txt'   #add your own path to the file

# Time the whole pipeline: reading the file plus generating the summary.
start_time = time.time()
original_text, summary = summarize_text_file(file_path, top_n=5)
execution_time = time.time() - start_time

# summarize_text_file returns (None, None) on failure, and an empty summary
# string is falsy as well, so both cases fall through to the else branch.
if summary:
    print("Generated Summary:")
    print(summary)

    # Each metric compares the summary with the full original text, except
    # readability, which looks at the summary alone.
    similarity_score = calculate_similarity(original_text, summary)
    compression_ratio = calculate_compression(original_text, summary)
    rouge_scores = calculate_rouge(original_text, summary)
    readability_score = calculate_readability(summary)

    print("\nPerformance Metrics:")
    print(f"Cosine Similarity: {similarity_score:.2f}")
    print(f"Compression Ratio: {compression_ratio:.2f}%")
    print(f"ROUGE Scores: {rouge_scores}")
    print(f"Readability Score: {readability_score:.2f}")
    print(f"Execution Time: {execution_time:.4f} seconds")
else:
    print("No summary was generated.")

Generated Summary:
Steve Jobs was a visionary leader and co-founder of Apple Inc. In 1976, Jobs and his friend Steve Wozniak founded Apple Computer in the Jobs family garage. Although NeXT was not a commercial success, its technology was later used as the foundation for Apple's operating system when Jobs returned to the company in 1997. After leaving Apple in 1985, Jobs founded NeXT, a company that developed high-end computers for business and education markets. Returning to Apple, Jobs revitalized the company with the introduction of the iMac, iTunes, iPod, iPhone, and iPad.

Performance Metrics:
Cosine Similarity: 0.80
Compression Ratio: 24.73%
ROUGE Scores: {'rouge-1': {'r': 0.315, 'p': 1.0, 'f': 0.4790874488282324}, 'rouge-2': {'r': 0.2522255192878338, 'p': 0.9550561797752809, 'f': 0.39906102955839895}, 'rouge-l': {'r': 0.315, 'p': 1.0, 'f': 0.4790874488282324}}
Readability Score: 44.54
Execution Time: 0.1661 seconds
