<a href="https://www.kaggle.com/code/farrelad/extractive-text-summarization?scriptVersionId=219918807" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Simple Extractive Text Summarization Model

This is a simple extractive text summarization project that uses TF-IDF vectorization together with cosine similarity to pick the most representative sentences.

## Install necessary dependencies

In [3]:
!pip install scikit-learn nltk --quiet

In [6]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import heapq

## Text Preprocessing

In [7]:
# Fetch the NLTK resources used by preprocess_text: the sentence/word
# tokenizer models and the stop-word lists.
nltk.download('punkt')
# Newer NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource
# instead of 'punkt'; downloading both keeps sent_tokenize/word_tokenize
# working regardless of which NLTK version pip resolved.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

def preprocess_text(text):
    """Split *text* into sentences and build a normalized copy of each one.

    Every sentence is lower-cased and word-tokenized; only alphanumeric
    tokens that are not English stop words are kept, joined by single spaces.

    Returns:
        A ``(clean_sentences, sentences)`` tuple holding the normalized
        strings and the untouched original sentences, index-aligned.
    """
    ignored = set(stopwords.words('english'))
    original = sent_tokenize(text)

    def _normalize(sentence):
        # Keep a token only if it is purely alphanumeric (drops punctuation)
        # and carries meaning (is not a stop word).
        tokens = word_tokenize(sentence.lower())
        return " ".join(
            token for token in tokens
            if token.isalnum() and token not in ignored
        )

    return [_normalize(sentence) for sentence in original], original

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## TF-IDF Vectorization and Sentence Similarity

In [8]:
def summarize_text(text, num_sentences=3):
    """Return an extractive summary made of the most central sentences.

    Each sentence is scored by the sum of its TF-IDF cosine similarities to
    every sentence in the text; the ``num_sentences`` highest-scoring ones
    are returned in their original order, joined by single spaces.

    Args:
        text: The text to summarize.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The summary string. It is empty when ``text`` is empty or blank or
        when ``num_sentences`` is not positive. When the text has no more
        sentences than requested, all of them are returned. When no sentence
        contains a term the vectorizer can score, the leading
        ``num_sentences`` sentences are returned instead.
    """
    # Nothing to summarize / nothing requested: bail out before tokenizing so
    # the vectorizer never sees an empty corpus.
    if num_sentences <= 0 or not text or not text.strip():
        return ""

    clean_sentences, original_sentences = preprocess_text(text)
    if not original_sentences:
        return ""

    # Every sentence would be selected anyway, so ranking is unnecessary.
    if num_sentences >= len(original_sentences):
        return " ".join(original_sentences)

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(clean_sentences)
    except ValueError:
        # Raised by scikit-learn when the vocabulary is empty (e.g. the text
        # consists only of stop words); without scores, fall back to the
        # leading sentences rather than crashing.
        return " ".join(original_sentences[:num_sentences])

    # Row sums of the pairwise similarity matrix: a sentence similar to many
    # others scores high.
    sentence_scores = cosine_similarity(tfidf_matrix, tfidf_matrix).sum(axis=1)

    top_sentence_indices = heapq.nlargest(
        num_sentences, range(len(sentence_scores)), key=sentence_scores.take
    )

    # Re-sort the chosen indices so the summary follows the source order.
    return " ".join(original_sentences[i] for i in sorted(top_sentence_indices))

## Testing
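You can try the model interactively below with `ipywidgets`. Enter any text and the number of sentences you want in the summary. Text in any language is accepted, but stop-word removal uses the English list, so results are best for English input.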

In [13]:
import ipywidgets as widgets

# Area where the generated summary (or any error output) is rendered.
output = widgets.Output()

# Multi-line box for the text to summarize.
text_input = widgets.Textarea(
    value='',
    placeholder='Type something...',
    description='Input text:',
    disabled=False,
    layout=widgets.Layout(width='500px', height='200px')
)

# Requested summary length. Defaults to 3 (the same default as
# summarize_text) so that pressing Submit without touching this field
# produces a summary instead of an empty result.
num_sentences_input = widgets.IntText(
    value=3,
    description='Total sentences:',
    disabled=False
)

# Triggers summarization when clicked.
submit_btn = widgets.Button(
    description='Submit',
    button_style='info'
)

def process_input(_):
    """Button callback: summarize the textarea contents into the output area.

    Reads the current text and requested sentence count from the input
    widgets, clears the previous result, and prints either the summary or a
    short message explaining why no summary could be produced.

    Args:
        _: The clicked button instance passed by ``on_click`` (unused).
    """
    text = text_input.value
    total_sentences = num_sentences_input.value

    with output:
        output.clear_output()

        # Validate up front so the user sees a hint instead of a blank line
        # or a library traceback.
        if not text.strip():
            print("Please enter some text to summarize.")
            return
        if total_sentences < 1:
            print("Total sentences must be at least 1.")
            return

        try:
            print(summarize_text(text, num_sentences=total_sentences))
        except ValueError as err:
            # The TF-IDF vectorizer raises ValueError when the text has no
            # usable terms (e.g. only stop words or punctuation).
            print(f"Could not summarize this text: {err}")

# Run process_input every time the Submit button is clicked.
submit_btn.on_click(process_input)

# Stack the controls vertically, with the summary output at the bottom.
layout = widgets.VBox(
    children=[text_input, num_sentences_input, submit_btn, output]
)

# Bare expression as the cell's last statement so the notebook renders it.
layout

VBox(children=(Textarea(value='', description='Input text:', layout=Layout(height='200px', width='500px'), pla…