In [None]:
#downloading the required data and resources
nltk.download('all')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

True

In [None]:
#University Project

import math
# imports the entire nltk library
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# imports only the sent_tokenize function from the nltk.tokenize module.
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
import re

# nlargest return the n largest elements from an iterable.
# imports the nlargest function from the heapq module.
import heapq
from heapq import nlargest

#import the WordNetLemmatizer class from the nltk.stem module. 
from nltk.stem import WordNetLemmatizer, PorterStemmer

from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

#creates a set of stop words in English language using the stopwords module
stop_words = set(stopwords.words('english'))

#The WordNetLemmatizer class can be used to lemmatize individual words, lists of words, or entire sentences.
lemmatizer = WordNetLemmatizer()

In [None]:
# Define a function to clean the text
def clean_text(text):
    # Remove special characters and digits
    text = re.sub('[^a-zA-Z\s]', '', text)
    # Tokenize the text into words
    words = word_tokenize(text)
    # Lemmatize the words
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    # Remove the stop words from the words list
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in lemmatized_words if word.casefold() not in stop_words]
    # Combine the filtered words into a cleaned text
    cleaned_text = ' '.join(filtered_words)
    return cleaned_text

def summarize(text):
    # Clean the text
    cleaned_text = clean_text(text)
    # Tokenize the cleaned text into sentences
    sentences = sent_tokenize(cleaned_text)
    
    # Calculate the frequency of each word
    word_frequencies = nltk.FreqDist(cleaned_text.split())
    
    # Calculate the score of each sentence
    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence):
            if word.casefold() in word_frequencies:
                if sentence not in sentence_scores:
                    sentence_scores[sentence] = word_frequencies[word.casefold()]
                else:
                    sentence_scores[sentence] += word_frequencies[word.casefold()]
                    
    # Get the top n sentences with the highest scores
    summary_sentences = nlargest(5, sentence_scores, key=sentence_scores.get)
    
    # Combine the summary sentences into a summary
    summary = ' '.join(summary_sentences)
    return summary

# Function to generate frequency-based summary
def freq_summarize(text, n=5):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    # Remove stop words and tokenize words
    stop_words = set(stopwords.words('english'))
    words = [word.lower() for sentence in sentences for word in word_tokenize(sentence) if word.lower() not in stop_words]
    # Calculate the frequency of each word
    word_freq = dict()
    for word in words:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1
    # Normalize word frequency by dividing by maximum frequency
    max_freq = max(word_freq.values())
    for word in word_freq.keys():
        word_freq[word] /= max_freq
    # Create a TF-IDF matrix of the sentences
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    # Calculate the sentence scores based on TF-IDF values
    sentence_scores = dict()
    for i in range(len(sentences)):
        score = 0
        for word in word_tokenize(sentences[i]):
            if word.lower() in word_freq.keys():
                score += word_freq[word.lower()]
        sentence_scores[i] = score
    # Select the top n sentences based on scores
    summary_sentences = heapq.nlargest(n, sentence_scores, key=sentence_scores.get)
    summary = [sentences[i] for i in summary_sentences]
    return ' '.join(summary)


# Function to generate position-based summary
def pos_summarize(text, num_sentences=3):
    # Clean the text
    cleaned_text = clean_text(text)
    # Tokenize the cleaned text into sentences
    sentences = sent_tokenize(cleaned_text)
    # Calculate the position of each sentence in the text
    sentence_positions = [(i+1)/len(sentences) for i in range(len(sentences))]
    
    # Get the top n sentences with the highest scores
    top_sentences = heapq.nlargest(num_sentences, sentences, key=lambda s: sentence_positions[sentences.index(s)])
    
    # Combine the summary sentences into a summary
    summary = ' '.join(top_sentences)
    return summary

# Example use cases:

#Test Case 1
#News article
news_article = """As the COVID-19 pandemic continues to grip the world, scientists are working around the clock to develop vaccines that can protect people from the virus. Researchers at several leading pharmaceutical companies have developed vaccines that have been shown to be highly effective in clinical trials. However, challenges remain in terms of producing and distributing these vaccines on a global scale."""

# Summarize of news article
# Generate the frequency-based summary
freq_summary_news_article = freq_summarize(news_article)
print("Test Case 1 - News article\nFrequency-based summary:\n", freq_summary_news_article)

# Generate the position-based summary
pos_summary_news_article = pos_summarize(news_article)
print("\nPosition-based summary:\n", pos_summary_news_article)


#Test Case 2
#Legal document
legal_document = """This agreement, entered into by and between the parties here to, sets forth the terms and conditions under which the parties shall conduct their business. The parties agree to be bound by the terms and conditions of this agreement, and acknowledge that any breach of this agreement may result in damages to the non-breaching party."""

# Summarize the legal document
# Generate the frequency-based summary
freq_summary_legal_document = freq_summarize(legal_document)
print("\n\n\nTest Case 2 - Legal Document\nFrequency-based summary:\n", freq_summary_legal_document)

# Generate the position-based summary
pos_summary_legal_document = pos_summarize(legal_document)
print("\nPosition-based summary:\n", pos_summary_legal_document)


#Test Case 3
#Scientific paper
scientific_paper = """In this study, we investigate the effects of a new drug on patients with a rare genetic disorder. Our results show that the drug is highly effective in reducing symptoms of the disorder, and that it is well-tolerated by patients. These findings have important implications for the development of new treatments for rare genetic disorders."""

# Summarize the scientific paper
# Generate the frequency-based summary
freq_summary_scientific_paper = freq_summarize(scientific_paper)
print("\n\n\nTest Case 3 - Scientific paper\nFrequency-based summary:\n", freq_summary_scientific_paper)

# Generate the position-based summary
pos_summary_scientific_paper = pos_summarize(scientific_paper)
print("\nPosition-based summary:\n", pos_summary_scientific_paper)



article = "Peter and Elizabeth took a taxi to attend the night party in the city. While in the party, Elizabeth collapsed and WAS rushed to the hospital"

freq_summary_article = freq_summarize(article)
print("\n\nSample Case - \nFrequency-based summary:\n", freq_summary_article)

# Generate the position-based summary
pos_summary_article = pos_summarize(article)
print("\nPosition-based summary:\n", pos_summary_article)

Test Case 1 - News article
Frequency-based summary:
 As the COVID-19 pandemic continues to grip the world, scientists are working around the clock to develop vaccines that can protect people from the virus. Researchers at several leading pharmaceutical companies have developed vaccines that have been shown to be highly effective in clinical trials. However, challenges remain in terms of producing and distributing these vaccines on a global scale.

Position-based summary:
 COVID pandemic continues grip world scientist working around clock develop vaccine protect people virus Researchers several leading pharmaceutical company developed vaccine shown highly effective clinical trial However challenge remain term producing distributing vaccine global scale



Test Case 2 - Legal Document
Frequency-based summary:
 This agreement, entered into by and between the parties here to, sets forth the terms and conditions under which the parties shall conduct their business. The parties agree to be b

In [None]:
def _create_frequency_table(text_string) -> dict:
    """
    we create a dictionary for the word frequency table.
    For this, we should only use the words that are not part of the stopWords array.
    Removing stop words and making frequency table
    Stemmer - an algorithm to bring words to its root word.
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable


def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix


def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix


def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table


def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix


def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix


def _score_sentences(tf_idf_matrix) -> dict:
    """
    score a sentence by its word's TF
    Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
    :rtype: dict
    """

    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue


def _find_average_score(sentenceValue) -> int:
    """
    Find the average score from the sentence value dictionary
    :rtype: int
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentenceValue))

    return average


def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary


def run_summarization(text):
    """
    :param text: Plain summary_text of long article
    :return: summarized summary_text
    """

    '''
    We already have a sentence tokenizer, so we just need 
    to run the sent_tokenize() method to create the array of sentences.
    '''
    # 1 Sentence Tokenize
    sentences = sent_tokenize(text)
    total_documents = len(sentences)
    #print(sentences)

    # 2 Create the Frequency matrix of the words in each sentence.
    freq_matrix = _create_frequency_matrix(sentences)
    #print(freq_matrix)

    '''
    Term frequency (TF) is how often a word appears in a document, divided by how many words are there in a document.
    '''
    # 3 Calculate TermFrequency and generate a matrix
    tf_matrix = _create_tf_matrix(freq_matrix)
    #print(tf_matrix)

    # 4 creating table for documents per words
    count_doc_per_words = _create_documents_per_words(freq_matrix)
    #print(count_doc_per_words)

    '''
    Inverse document frequency (IDF) is how unique or rare a word is.
    '''
    # 5 Calculate IDF and generate a matrix
    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
    #print(idf_matrix)

    # 6 Calculate TF-IDF and generate a matrix
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
    #print(tf_idf_matrix)

    # 7 Important Algorithm: score the sentences
    sentence_scores = _score_sentences(tf_idf_matrix)
    #print(sentence_scores)

    # 8 Find the threshold
    threshold = _find_average_score(sentence_scores)
    #print(threshold)

    # 9 Important Algorithm: Generate the summary
    summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
    return summary

# Function to generate frequency-based summary
def freq_summarize(text, n=5):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    # Remove stop words and tokenize words
    stop_words = set(stopwords.words('english'))
    words = [word.lower() for sentence in sentences for word in word_tokenize(sentence) if word.lower() not in stop_words]
    # Calculate the frequency of each word
    word_freq = dict()
    for word in words:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1
    # Normalize word frequency by dividing by maximum frequency
    max_freq = max(word_freq.values())
    for word in word_freq.keys():
        word_freq[word] /= max_freq
    # Create a TF-IDF matrix of the sentences
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    # Calculate the sentence scores based on TF-IDF values
    sentence_scores = dict()
    for i in range(len(sentences)):
        score = 0
        for word in word_tokenize(sentences[i]):
            if word.lower() in word_freq.keys():
                score += word_freq[word.lower()]
        sentence_scores[i] = score
    # Select the top n sentences based on scores
    summary_sentences = heapq.nlargest(n, sentence_scores, key=sentence_scores.get)
    summary = [sentences[i] for i in summary_sentences]
    return ' '.join(summary)

text_str = '''
Those Who Are Resilient Stay In The Game Longer
“On the mountains of truth you can never climb in vain: either you will reach a point higher up today, or you will be training your powers so that you will be able to climb higher tomorrow.” — Friedrich Nietzsche
Challenges and setbacks are not meant to defeat you, but promote you. However, I realise after many years of defeats, it can crush your spirit and it is easier to give up than risk further setbacks and disappointments. Have you experienced this before? To be honest, I don’t have the answers. I can’t tell you what the right course of action is; only you will know. However, it’s important not to be discouraged by failure when pursuing a goal or a dream, since failure itself means different things to different people. To a person with a Fixed Mindset failure is a blow to their self-esteem, yet to a person with a Growth Mindset, it’s an opportunity to improve and find new ways to overcome their obstacles. Same failure, yet different responses. Who is right and who is wrong? Neither. Each person has a different mindset that decides their outcome. Those who are resilient stay in the game longer and draw on their inner means to succeed.
I’ve coached mummy and mom clients who gave up after many years toiling away at their respective goal or dream. It was at that point their biggest breakthrough came. Perhaps all those years of perseverance finally paid off. It was the 19th Century’s minister Henry Ward Beecher who once said: “One’s best success comes after their greatest disappointments.” No one knows what the future holds, so your only guide is whether you can endure repeated defeats and disappointments and still pursue your dream. Consider the advice from the American academic and psychologist Angela Duckworth who writes in Grit: The Power of Passion and Perseverance: “Many of us, it seems, quit what we start far too early and far too often. Even more than the effort a gritty person puts in on a single day, what matters is that they wake up the next day, and the next, ready to get on that treadmill and keep going.”
I know one thing for certain: don’t settle for less than what you’re capable of, but strive for something bigger. Some of you reading this might identify with this message because it resonates with you on a deeper level. For others, at the end of their tether the message might be nothing more than a trivial pep talk. What I wish to convey irrespective of where you are in your journey is: NEVER settle for less. If you settle for less, you will receive less than you deserve and convince yourself you are justified to receive it.
“Two people on a precipice over Yosemite Valley” by Nathan Shipps on Unsplash
Develop A Powerful Vision Of What You Want
“Your problem is to bridge the gap which exists between where you are now and the goal you intend to reach.” — Earl Nightingale
I recall a passage my father often used growing up in 1990s: “Don’t tell me your problems unless you’ve spent weeks trying to solve them yourself.” That advice has echoed in my mind for decades and became my motivator. Don’t leave it to other people or outside circumstances to motivate you because you will be let down every time. It must come from within you. Gnaw away at your problems until you solve them or find a solution. Problems are not stop signs, they are advising you that more work is required to overcome them. Most times, problems help you gain a skill or develop the resources to succeed later. So embrace your challenges and develop the grit to push past them instead of retreat in resignation. Where are you settling in your life right now? Could you be you playing for bigger stakes than you are? Are you willing to play bigger even if it means repeated failures and setbacks? You should ask yourself these questions to decide whether you’re willing to put yourself on the line or settle for less. And that’s fine if you’re content to receive less, as long as you’re not regretful later.
If you have not achieved the success you deserve and are considering giving up, will you regret it in a few years or decades from now? Only you can answer that, but you should carve out time to discover your motivation for pursuing your goals. It’s a fact, if you don’t know what you want you’ll get what life hands you and it may not be in your best interest, affirms author Larry Weidel: “Winners know that if you don’t figure out what you want, you’ll get whatever life hands you.” The key is to develop a powerful vision of what you want and hold that image in your mind. Nurture it daily and give it life by taking purposeful action towards it.
Vision + desire + dedication + patience + daily action leads to astonishing success. Are you willing to commit to this way of life or jump ship at the first sign of failure? I’m amused when I read questions written by millennials on Quora who ask how they can become rich and famous or the next Elon Musk. Success is a fickle and long game with highs and lows. Similarly, there are no assurances even if you’re an overnight sensation, to sustain it for long, particularly if you don’t have the mental and emotional means to endure it. This means you must rely on the one true constant in your favour: your personal development. The more you grow, the more you gain in terms of financial resources, status, success — simple. If you leave it to outside conditions to dictate your circumstances, you are rolling the dice on your future.
So become intentional on what you want out of life. Commit to it. Nurture your dreams. Focus on your development and if you want to give up, know what’s involved before you take the plunge. Because I assure you, someone out there right now is working harder than you, reading more books, sleeping less and sacrificing all they have to realise their dreams and it may contest with yours. Don’t leave your dreams to chance.
'''
text_str1 = "Hi, my name is Sumit and I'm a college student. My hobbies include watching and playing football, analysing and developing data science projects, and I also sleep a lot. Right now I'm planning to go for my master's degree and then keep doing jobs untill I implement my startup ideas. First I want to learn to be an employee and see how corporate companies work and then become an employer. Most of the time I've my earphones plugged in my ears and the music is playing non-stop. I also love spending my time at the gym but due to college and studies I'm not able to add it in my daily schedule. Apart from all this, I like playing pin bowling as well."

#result = run_summarization(text_str)
#print("\n\n",result)

print(freq_summarize(text_str))

My hobbies include watching and playing football, analysing and developing data science projects, and I also sleep a lot. I also love spending my time at the gym but due to college and studies I'm not able to add it in my daily schedule. Right now I'm planning to go for my master's degree and then keep doing jobs untill I implement my startup ideas. Hi, my name is Sumit and I'm a college student. Apart from all this, I like playing pin bowling as well.


In [None]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m94.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from transformers import pipeline

# Load the pre-trained summarization model
summarizer = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
# Define the input text to be summarized
input_text = """ The Amazon rainforest is the world's largest tropical rainforest, covering over five and a half million square kilometers. It is home to millions of species of plants and animals, many of which are found nowhere else on earth. However, the rainforest is under threat from deforestation, which is driven by logging, agriculture, and other human activities. This destruction of the rainforest is having a devastating impact on both the environment and the indigenous communities who call it home. """

# Generate a summary of the input text
summary = summarizer(input_text, max_length=100, min_length=30, do_sample=False)

# Print the summary
print(summary[0]['summary_text'])

Your max_length is set to 100, but you input_length is only 97. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)


 The Amazon rainforest is the world's largest tropical rainforest, covering over five and a half million square kilometers . It is home to millions of species of plants and animals, many of which are found nowhere else on earth . However, deforestation is under threat from logging, agriculture, and other human activities .
