In [None]:

import pandas as pd
import nltk
import re
import string
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# Download NLTK resources
# 'stopwords' backs the stop-word filter; 'punkt' / 'punkt_tab' back nltk.word_tokenize.
# Recent NLTK releases (3.9+) load 'punkt_tab' instead of 'punkt', so both are requested;
# nltk.download reports an unknown package and returns False rather than raising.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Dataset: Synthetic data (sample news articles)
# Each string is one unit of text; the cells below preprocess, vectorize and
# rank these entries by their position in this list.
documents = [
    "The stock market saw a significant drop today as several tech companies reported lower earnings than expected.",
    "Researchers have discovered a new species of frog in the Amazon rainforest that is capable of changing its skin color.",
    "The local government has announced new measures to tackle the rising pollution levels in the city.",
    "A recent study shows that regular exercise can reduce the risk of heart disease and improve mental health.",
    "Tech companies are facing increased scrutiny over privacy concerns as data breaches continue to occur.",
    "The new policy aims to support small businesses by providing tax relief and access to low-interest loans."
]

# Preprocess text
stop_words = set(nltk.corpus.stopwords.words('english'))

# Translation table that turns every ASCII punctuation character into a space.
# Replacing (rather than deleting) keeps joined words apart: "low-interest"
# becomes "low interest" instead of the made-up token "lowinterest".
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

def preprocess(text):
    """Normalize one document into a space-separated string of content words.

    Steps: lowercase, replace ASCII punctuation with spaces, collapse runs of
    whitespace, tokenize with ``nltk.word_tokenize``, and drop English stop words.

    Args:
        text: Raw document text.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    text = text.lower()  # Stop-word list is lowercase, so match case first
    text = text.translate(_PUNCT_TO_SPACE)  # Punctuation -> space, never glue words together
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse the gaps left by the step above
    tokens = nltk.word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Run every raw document through preprocess, keeping the original order.
processed_docs = list(map(preprocess, documents))
processed_docs


In [None]:

# Step 1: TF-IDF Vectorization
# Fit the vocabulary on the cleaned documents and encode them in one call;
# each row of the result corresponds to one entry of processed_docs.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(raw_documents=processed_docs)
# Dense view for display only; tfidf_matrix itself is left unchanged.
tfidf_matrix.toarray()


In [None]:

# Step 2: Compute cosine similarity
# Compare every document vector with every other one (rows vs. rows),
# giving a square matrix indexed by document position on both axes.
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
similarity_matrix


In [None]:

# Step 3: Create a graph using similarity matrix
# Work on a copy so re-running this cell never alters similarity_matrix.
adjacency = similarity_matrix.copy()
# The diagonal holds each document's similarity with itself. Left in place it
# becomes a self-loop, letting a document that shares no terms with any other
# keep its own rank; zeroing it makes scores depend only on links to *other* documents.
np.fill_diagonal(adjacency, 0.0)
# Nodes are document indices; non-zero similarities become weighted edges.
graph = nx.from_numpy_array(adjacency)

# Step 4: Apply PageRank algorithm for scoring sentences
# Result maps document index -> PageRank score.
scores = nx.pagerank(graph)
scores


In [None]:

# Step 5: Rank sentences and create summary
SUMMARY_SIZE = 2  # Number of top-ranked sentences to include in the summary

# Sort (score, sentence) pairs by score only, highest first. Python's sort is
# stable, so equal scores keep their original document order instead of being
# ordered by comparing the sentence text.
ranked_sentences = sorted(
    ((scores[i], s) for i, s in enumerate(documents)),
    key=lambda pair: pair[0],
    reverse=True,
)
# Slicing (rather than indexing 0..SUMMARY_SIZE-1) stays valid when there are
# fewer documents than SUMMARY_SIZE.
summary = ' '.join(sentence for _, sentence in ranked_sentences[:SUMMARY_SIZE])

# Output the summary
print("Document Summary:")
print(summary)
