# Text Summarization Demo

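This notebook demonstrates extractive (TF-IDF, TextRank, Lead-k) and abstractive (BART, T5) text summarization methods on a sample article, then compares their outputs using ROUGE scores.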


In [None]:
import json
import pandas as pd
from preprocessing import TextPreprocessor
from extractive_summarizer import TFIDFSummarizer, TextRankSummarizer, LeadKSummarizer
from abstractive_summarizer import AbstractiveSummarizer, T5Summarizer
from evaluation import RougeEvaluator


## Load Sample Article


In [None]:
# Load the article collection from disk and use its first entry as the
# running example for the rest of the notebook.
# NOTE(review): assumes the JSON file holds a non-empty list of dicts with
# 'title' and 'text' keys — confirm against the dataset.
with open('data/articles.json', 'r', encoding='utf-8') as articles_file:
    articles = json.load(articles_file)

sample_article = articles[0]
print(f"Title: {sample_article['title']}")

# Quick look at the article body: total size plus a 500-character preview.
article_text = sample_article['text']
print(f"\nText length: {len(article_text)} characters")
print(f"\nFirst 500 characters:\n{article_text[:500]}...")


## Preprocessing


In [None]:
# Run the sample article through the preprocessor, then segment the
# preprocessed text into sentences.
preprocessor = TextPreprocessor()
processed_text = preprocessor.preprocess(sample_article['text'])
sentences = preprocessor.segment_sentences(processed_text)

sentence_count = len(sentences)
print(f"Number of sentences: {sentence_count}")
print(f"\nFirst 3 sentences:\n")

# Numbered preview (1-based) of at most the first three sentences.
preview_sentences = sentences[:3]
for position, sentence in enumerate(preview_sentences, start=1):
    print(f"{position}. {sentence}")


## Extractive Summarization


In [None]:
# Build the three extractive summarizers; each one shares the same
# preprocessor instance.
tfidf_summarizer = TFIDFSummarizer(preprocessor)
textrank_summarizer = TextRankSummarizer(preprocessor)
leadk_summarizer = LeadKSummarizer(preprocessor)

# Ask every method for a three-sentence summary of the same processed text.
summary_sentence_count = 3
tfidf_summary = tfidf_summarizer.summarize(
    processed_text, num_sentences=summary_sentence_count
)
textrank_summary = textrank_summarizer.summarize(
    processed_text, num_sentences=summary_sentence_count
)
leadk_summary = leadk_summarizer.summarize(
    processed_text, num_sentences=summary_sentence_count
)

# Print each summary under its heading, with a divider between consecutive
# summaries (but not after the last one).
divider = "\n" + "="*50 + "\n"
labelled_summaries = [
    ("TF-IDF Summary:", tfidf_summary),
    ("TextRank Summary:", textrank_summary),
    ("Lead-3 Summary:", leadk_summary),
]
for index, (heading, summary_text) in enumerate(labelled_summaries):
    if index > 0:
        print(divider)
    print(heading)
    print(summary_text)


## Abstractive Summarization


In [None]:
# Instantiate the two abstractive summarizers and summarize the same
# processed text with identical length limits.
print("Loading abstractive models (this may take a while)...")

bart_summarizer = AbstractiveSummarizer(model_name="facebook/bart-large-cnn")
t5_summarizer = T5Summarizer(model_name="t5-small")

# NOTE(review): the full processed text is passed to both models; confirm
# that the summarizer wrappers handle inputs longer than the model limit.
length_limits = {"max_length": 150, "min_length": 50}
bart_summary = bart_summarizer.summarize(processed_text, **length_limits)
t5_summary = t5_summarizer.summarize(processed_text, **length_limits)

# Heading, summary and divider are emitted one per line, exactly as three
# separate print() calls would produce.
print("BART Summary:", bart_summary, "\n" + "="*50 + "\n", sep="\n")

print("T5 Summary:", t5_summary, sep="\n")


## Evaluation with ROUGE


In [None]:
# Score every generated summary against a single reference and tabulate the
# ROUGE-1 / ROUGE-2 / ROUGE-L F1 values per method.
# NOTE(review): the reference is just the first three segmented sentences of
# the article, so the Lead-3 summary is likely to score near-perfect — use a
# human-written reference summary here if the dataset provides one.
reference_summary = ' '.join(sentences[:3])
evaluator = RougeEvaluator()

summaries = {
    'TF-IDF': tfidf_summary,
    'TextRank': textrank_summary,
    'Lead-3': leadk_summary,
    'BART': bart_summary,
    'T5': t5_summary
}


def _rouge_row(method_name, candidate_summary):
    """Evaluate one summary against the reference and return its table row."""
    rouge_scores = evaluator.evaluate(reference_summary, candidate_summary)
    return {
        'Method': method_name,
        'ROUGE-1 F1': rouge_scores['rouge1_f1'],
        'ROUGE-2 F1': rouge_scores['rouge2_f1'],
        'ROUGE-L F1': rouge_scores['rougeL_f1']
    }


# One row per method, in the insertion order of `summaries`.
results = [
    _rouge_row(method_name, candidate_summary)
    for method_name, candidate_summary in summaries.items()
]

df_results = pd.DataFrame(results)
results_table = df_results.to_string(index=False)
print(results_table)
