In [1]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from language_tool_python import LanguageTool
import spacy
from heapq import nlargest
from nltk.translate.bleu_score import sentence_bleu
from nltk.corpus import stopwords
from string import punctuation
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
from nltk.translate import meteor_score
from nltk.translate.meteor_score import single_meteor_score

In [3]:

# Load spaCy model for extractive text summarization
nlp = spacy.load("en_core_web_sm")

# Load T5 model and tokenizer for abstractive text summarization
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
custom_stopwords = set(stopwords.words("english"))

## Extractive Text Summarization:

In [5]:
def extractive_summarize(text):
    # Your extractive summarization logic here
    # ...
    # Get input text
        #text = input_text.value

        doc = nlp(text)

        word_frequencies = {}
        for word in doc:
            if word.text.lower() not in custom_stopwords:
                if word.text.lower() not in punctuation:
                    if word.text not in word_frequencies.keys():
                        word_frequencies[word.text] = 1
                    else:
                        word_frequencies[word.text] += 1

        # SENTENCE TOKENIZATION
        max_frequency = max(word_frequencies.values())
        for word in word_frequencies.keys():
            word_frequencies[word] = word_frequencies[word] / max_frequency

        sentence_tokens = [sent for sent in doc.sents]

        # WORD FREQUENCY TABLE
        sentence_scores = {}
        for sent in sentence_tokens:
            for word in sent:
                if word.text.lower() in word_frequencies.keys():
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word.text.lower()]
                    else:
                        sentence_scores[sent] += word_frequencies[word.text.lower()]

        from heapq import nlargest
        select_length = int(len(sentence_tokens) * 0.3)
        summary = nlargest(select_length, sentence_scores, key=sentence_scores.get)
        final_summary = [word.text for word in summary]
        generated_summary = ''.join(final_summary)
        
        return generated_summary

## Abstractive Text Summarization:

In [6]:
def abstractive_summarize(text):
    # initialize the pretrained model
    model = T5ForConditionalGeneration.from_pretrained('t5-small')
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    device = torch.device('cpu')
    preprocessed_text = text.strip().replace('\n', '')
    t5_input_text = 'summarize: ' + preprocessed_text

    tokenized_text = tokenizer.encode(t5_input_text, return_tensors='pt', max_length=512).to(device)

    summary_ids = model.generate(tokenized_text, min_length=30, max_length=300)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Post-process the summary for capitalization and grammar
    tool = LanguageTool('en-US')
    summary = tool.correct(summary)

        # print("Generated Summary:")
        # print(summary)
        
    return summary

In [7]:
def process_summaries(btn):
    clear_output(wait=True)

    # Extractive Summarization
    extractive_summary = extractive_summarize(text_area.value)

    # Abstractive Summarization
    abstractive_summary = abstractive_summarize(text_area.value)

    # Display the results
    display(widgets.HTML("<h3>Extractive Summary:</h3>"))
    display(widgets.HTML(extractive_summary))
    display(widgets.HTML("<h3>Abstractive Summary:</h3>"))
    display(widgets.HTML(abstractive_summary))

# Text area for input
text_area = widgets.Textarea(
    value='',
    placeholder='Enter your text here...',
    description='Input Text:',
    layout=widgets.Layout(width='50%', height='200px')
)

# Button for triggering summarization
summarize_button = widgets.Button(description="Summarize", button_style='success')
summarize_button.on_click(process_summaries)

# Display widgets
display(text_area)
display(summarize_button)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


HTML(value='<h3>Extractive Summary:</h3>')

HTML(value="The start of Australia's chase was dented by India's new-ball bowling pair of Jasprit Bumrah and M…

HTML(value='<h3>Abstractive Summary:</h3>')

HTML(value="India's strong run at the world cup 2023 was put to a halt by Australia. The win was put to a halt…