In [1]:
!pip install nltk



In [8]:
import nltk
# Fetch the NLTK data packages used below: tokenizer data for
# sent_tokenize/word_tokenize and the stopword lists. Packages that are
# already installed are simply reported as up-to-date.
for _resource in ("punkt_tab", "stopwords"):
    nltk.download(_resource)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import defaultdict
import math

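# Note: older NLTK releases ship the tokenizer models as 'punkt' rather than
# 'punkt_tab'. If tokenization fails with a missing-resource error on such a
# release, run nltk.download('punkt') as well.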

def summarize_text(text, compression_ratio=0.4, *, preserve_order=False):
    """Build an extractive summary of *text* from its highest-scoring sentences.

    Each non-stopword, alphanumeric token is weighted by its frequency in the
    whole text (normalized by the most frequent token). A sentence's score is
    the sum of the weights of its tokens divided by its token count plus one,
    so long sentences are not favored merely for containing more words.

    Args:
        text: The document to summarize.
        compression_ratio: Fraction of the sentences to keep. At least one
            sentence is always kept when the text contains any sentence.
        preserve_order: If true, the selected sentences are emitted in the
            order they appear in *text*. If false (the default), they are
            emitted from highest to lowest score.

    Returns:
        The selected sentences joined by single spaces, or an empty string
        when *text* contains no sentences.
    """
    # Step 1: Split into sentences; nothing to summarize if there are none.
    sentences = sent_tokenize(text)
    if not sentences:
        return ""

    # Step 2: Count the content words (alphanumeric, not stopwords).
    stop_words = set(stopwords.words("english"))
    freq_table = defaultdict(int)
    for word in word_tokenize(text.lower()):
        if word.isalnum() and word not in stop_words:
            freq_table[word] += 1

    # Normalize by the top frequency. The table is empty when the text holds
    # only stopwords/punctuation; in that case every sentence scores 0 and
    # the leading sentences are returned instead of raising on max().
    weights = {}
    if freq_table:
        max_freq = max(freq_table.values())
        weights = {word: count / max_freq for word, count in freq_table.items()}

    # Step 3: Score sentences by position rather than by their text, so a
    # sentence that occurs twice is scored independently each time instead of
    # having its score accumulated and length-normalized twice.
    scores = []
    for sentence in sentences:
        sentence_words = word_tokenize(sentence.lower())
        total = sum(weights.get(word, 0.0) for word in sentence_words)
        # Length normalization encourages concise sentences.
        scores.append(total / (len(sentence_words) + 1))

    # Step 4: Keep the top-scoring sentences. sorted() is stable, so ties
    # keep their document order.
    ranked = sorted(range(len(sentences)), key=scores.__getitem__, reverse=True)
    summary_count = max(1, int(len(sentences) * compression_ratio))
    chosen = ranked[:summary_count]
    if preserve_order:
        chosen.sort()

    return " ".join(sentences[index] for index in chosen)


# ------------------------------------------
# SAMPLE USAGE
# Demo input: a single paragraph about text summarization.
text = """Text summarization is a crucial process in an age of information overload, designed to distill extensive articles, reports, or documents into a concise, manageable overview while preserving the original content's core meaning and essential information. This process can be performed manually or through advanced AI-powered tools that use algorithms to identify and extract the most relevant sentences (extractive summarization) or generate new, human-like sentences to cover the main ideas (abstractive summarization). The primary goal is to save time, enhance understanding, and quickly determine whether the full text is worth reading, which is especially beneficial for students, researchers, and professionals reviewing large volumes of information. Key components of a good summary include clarity, brevity, objectivity (avoiding personal opinions or analysis), and accuracy, focusing solely on the main topic, purpose, and key supporting details while omitting minor details, descriptive language, or direct quotations (unless cited). Users can often customize the output to be a single paragraph or a list of bullet points, and adjust the length to suit different needs, from a brief overview to a more detailed executive summary. A variety of online platforms, such as QuillBot, Scribbr, and Grammarly, offer free and premium summarization tools to help with this task.
"""

# Summarize with the default compression_ratio and print the result under a
# header line.
summary = summarize_text(text)
print("\n--- SUMMARY ---\n", summary)


--- SUMMARY ---
 Text summarization is a crucial process in an age of information overload, designed to distill extensive articles, reports, or documents into a concise, manageable overview while preserving the original content's core meaning and essential information. This process can be performed manually or through advanced AI-powered tools that use algorithms to identify and extract the most relevant sentences (extractive summarization) or generate new, human-like sentences to cover the main ideas (abstractive summarization).


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
