In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string
from heapq import nlargest # Function to extract the largest values from a collection
from collections import Counter  # Used for counting word frequencies
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake #extracting keywords
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange





In [2]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [3]:
# Load SentenceTransformer model for BERT embeddings
bert_model = SentenceTransformer('all-MiniLM-L6-v2')



In [4]:
# Define stop words and punctuation
stopwords = list(STOP_WORDS)
punctuation = string.punctuation + "\n"

In [5]:
def word_frequency(doc):
    """Calculates word frequency in a document"""
    word_counts = Counter()
    for word in doc:
        if word.text.lower() not in stopwords and word.text.lower() not in punctuation:
            word_counts[word.text.lower()] += 1
    return word_counts

In [6]:
def sentence_score(sentence_tokens, word_frequencies):
    """Scores sentences based on word frequencies"""
    scores = {}
    for sent in sentence_tokens:
        score = 0
        for word in sent:
            if word.text.lower() in word_frequencies:
                score += word_frequencies[word.text.lower()]
        scores[sent.text] = score
    return scores

In [7]:
def lexrank(sentences):
    """LexRank implementation for sentence ranking"""
    #  ranks sentences based on their importance within the text 
    # Placeholder for LexRank implementation
    # Return a dummy score for each sentence
    return {sent: 1 for sent in sentences}  # Dummy implementation

In [8]:
def tfidf_ranking(sentences, vectorizer):
    """TF-IDF-based ranking"""
    vectors = vectorizer.fit_transform(sentences)
    scores = (vectors * vectors.T).diagonal()
    return scores

In [9]:
def extract_keywords(text):
    """Keyword extraction using RAKE"""
    rake = Rake()
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()

In [10]:
def generate_heading(keywords, summary):
    """Generate a suitable heading based on keywords and summary"""
    keyword_str = ' '.join(keywords)
    heading = f"Summary: {keyword_str[:50]}..."  # Generate a simple heading based on keywords
    return heading

In [11]:
def bert_similarity(sentences):
    """BERT-based similarity scoring"""
    embeddings = bert_model.encode(sentences)
    similarity_matrix = cosine_similarity(embeddings)
    return similarity_matrix.diagonal()

In [12]:
def summarize(text):
    """Summarizes the text with improved ranking and heading generation"""
    doc = nlp(text)
    word_frequencies = word_frequency(doc)
    sentence_tokens = [sent.text for sent in doc.sents]
    
    # Score sentences
    sentence_objects = [sent for sent in doc.sents]
    sentence_scores = sentence_score(sentence_objects, word_frequencies)

    # Combine LexRank and TF-IDF for sentence ranking
    lexrank_scores = lexrank(sentence_tokens)
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_scores = tfidf_ranking(sentence_tokens, tfidf_vectorizer)
    bert_scores = bert_similarity(sentence_tokens)
    
    # Ensure all scores are aligned
    combined_scores = {}
    for i, sent in enumerate(sentence_tokens):
        if i < len(lexrank_scores) and i < len(tfidf_scores) and i < len(bert_scores):
            combined_scores[sent] = lexrank_scores.get(sent, 0) * tfidf_scores[i] * bert_scores[i]

    # Select top sentences based on combined scores
    summary_length = 3  # Adjust number of sentences for summary
    summary_sentences = nlargest(summary_length, combined_scores, key=combined_scores.get)

    # Extract keywords
    keywords = extract_keywords(text)

    # Generate heading
    heading = generate_heading(keywords, summary_sentences)

    # Calculate word counts
    original_word_count = len(text.split())
    summary_word_count = len(" ".join(summary_sentences).split())

    return heading, " ".join(summary_sentences), original_word_count, summary_word_count

In [14]:
# Example usage
text = "The Industrial Revolution marked a significant turning point in human history, characterized by the shift from agrarian economies to industrial ones. Driven by technological advancements, such as the steam engine and the spinning jenny, factories replaced farms as the primary economic centers. This transformation led to rapid urbanization, increased production efficiency, and improved living standards for some. However, it also exacerbated social inequalities and environmental problems.One of the most profound impacts of the Industrial Revolution was the urbanization of societies. As people migrated from rural areas to cities in search of work, cities grew at an unprecedented rate. This rapid urbanization created numerous challenges, including overcrowding, pollution, and inadequate infrastructure. Despite these difficulties, cities also offered new opportunities and cultural experiences.The Industrial Revolution also led to significant advancements in technology and production methods. The invention of the steam engine revolutionized transportation and manufacturing, enabling factories to produce goods on a massive scale. The spinning jenny and other textile machines increased the efficiency of textile production, leading to a decline in the cost of clothing. These technological advancements contributed to a rise in living standards for many people, as they had access to more affordable goods and services.However, the Industrial Revolution also had negative consequences. The shift from a rural to an industrial society led to increased social inequalities. Factory workers often faced long hours, low wages, and dangerous working conditions. Child labor was widespread, as children were exploited for their cheap labor. These social problems contributed to the rise of labor movements and social reforms.In addition to social inequalities, the Industrial Revolution also had a significant impact on the environment. The burning of fossil fuels for energy released large amounts of greenhouse gases into the atmosphere, contributing to climate change. Industrial waste and pollution also caused environmental damage. These environmental problems continue to be major challenges facing society today.In conclusion, the Industrial Revolution was a transformative period in human history. It brought about significant changes in society, technology, and the economy. While it led to improvements in living standards for some, it also exacerbated social inequalities and environmental problems. The legacy of the Industrial Revolution continues to shape our world today, and addressing the challenges it created remains a crucial task for future generations."


heading, summary, original_word_count, summary_word_count = summarize(text)

print("Heading:", heading)
print("Summary:", summary)
print("Original Word Count:", original_word_count)
print("Summary Word Count:", summary_word_count)

Heading: Summary: factory workers often faced long hours cities also...
Summary: Driven by technological advancements, such as the steam engine and the spinning jenny, factories replaced farms as the primary economic centers. Despite these difficulties, cities also offered new opportunities and cultural experiences. Industrial waste and pollution also caused environmental damage.
Original Word Count: 374
Summary Word Count: 40
