In [1]:
# ----------------------------------------
# Install/Import required libraries
# ----------------------------------------
!pip install nltk spacy transformers datasets rouge-score tqdm 
import nltk
import spacy
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Download NLTK data (punkt for tokenization, stopwords list) if not already present
nltk.download('punkt')
nltk.download('stopwords')

# Load English tokenizer for spaCy (for more advanced sentence segmentation, if needed).
# The model is loaded first and only downloaded when it is missing, so re-running
# this cell does not fetch the wheel again. The download is launched with
# sys.executable so it installs into the interpreter that runs this kernel.
import importlib
import subprocess
import sys

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()
    nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 3.9 MB/s eta 0:00:04
     ---- ----------------------------------- 1.3/12.8 MB 3.7 MB/s eta 0:00:04
     -------- ------------------------------- 2.6/12.8 MB 3.8 MB/s eta 0:00:03
     --------- ------------------------------ 3.1/12.8 MB 3.6 MB/s eta 0:00:03
     ------------- -------------------------- 4.2/12.8 MB 3.6 MB/s eta 0:00:03
     ------------- -------------------------- 4.5/12.8 MB 3.5 MB/s eta 0:00:03
     ----------------- ---------------------- 5.5/12.8 MB 3.5 MB/s eta 0:00:03
     ------------------- -------------------- 6.3/12.8 MB 3.5 MB/s eta 0:00:02
     ---------------------- ----------------- 7.

In [2]:
# ----------------------------------------
# Load the CNN/DailyMail dataset
# ----------------------------------------
# Fetch configuration "3.0.0" once, then bind each split to its own name.
dataset = load_dataset("cnn_dailymail", "3.0.0")
train_data, val_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

In [3]:

# For demonstration or quick testing, you can sample a small subset:
# train_data = train_data.select(range(1000))  # first 1000 examples for faster training (optional)

# ----------------------------------------
# Preprocessing helper functions
# ----------------------------------------
# English stopword set taken from NLTK's `stopwords` corpus. Built once at module
# level; preprocess_text_for_extractive() uses it to filter tokens.
stop_words = set(stopwords.words('english'))

def preprocess_text_for_extractive(text):
    """Split `text` into sentences and reduce each sentence to its content words.

    Sentences are produced by NLTK's ``sent_tokenize``. When the NLTK tokenizer
    data is not installed (NLTK signals this with ``LookupError``), sentence
    splitting falls back to the spaCy pipeline ``nlp``.

    Args:
        text: The article text to process.

    Returns:
        A tuple ``(sentences, sentence_words)`` where ``sentences`` is the list of
        original sentence strings and ``sentence_words`` is a parallel list; each
        entry holds that sentence's lowercased, purely alphabetic tokens with
        stopwords removed.
    """
    try:
        sentences = sent_tokenize(text)
        punkt_available = True
    except LookupError:
        # Only the "tokenizer data missing" case triggers the fallback; any other
        # error is a real problem and is allowed to propagate.
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]
        punkt_available = False
    # Tokenize each sentence into words and filter stopwords
    sentence_words = []
    for sent in sentences:
        # word_tokenize re-runs NLTK sentence splitting unless preserve_line=True.
        # In the fallback path that would hit the same missing resource, so the
        # sentence is tokenized as a single line there. With the data available
        # the call is identical to the default behaviour.
        words = word_tokenize(sent.lower(), preserve_line=not punkt_available)
        # Keep alphabetic words (remove punctuation tokens) and filter stopwords
        words = [w for w in words if w.isalpha() and w not in stop_words]
        sentence_words.append(words)
    return sentences, sentence_words

# Word-overlap similarity between two tokenized sentences (edge weight for the sentence graph).
def sentence_similarity(words1, words2):
    """Return the shared-vocabulary score of two token lists.

    The score is the number of distinct words the two sentences have in common,
    divided by the mean of their distinct-word counts. It is 0.0 when either
    input is empty and 1.0 when both contain exactly the same distinct words.
    """
    if not (words1 and words2):
        return 0.0
    vocab1, vocab2 = set(words1), set(words2)
    shared_count = len(vocab1.intersection(vocab2))
    # Normalising by the mean vocabulary size keeps long sentences from dominating.
    mean_vocab_size = (len(vocab1) + len(vocab2)) / 2
    return shared_count / mean_vocab_size

In [4]:
# ----------------------------------------
# Extractive Summarization (TextRank-inspired)
# ----------------------------------------
import numpy as np

def extractive_summarize(text, max_sentences=3):
    """Generate an extractive summary by selecting top-ranked sentences using a graph-based algorithm.

    Sentences are nodes of a graph whose edge weights are given by
    ``sentence_similarity``. Each sentence is scored with the damped iteration
    ``score_i = 0.15 + 0.85 * sum_j score_j * w_ji / sum_k w_jk`` and the
    highest-scoring sentences are returned in their original order.

    Args:
        text: The article text to summarize.
        max_sentences: Maximum number of sentences in the summary. Values below 1
            produce an empty summary.

    Returns:
        The selected sentences joined by single spaces. The whole text (re-joined
        sentence by sentence) is returned when it has no more than
        ``max_sentences`` sentences, and "" when it has none.
    """
    if max_sentences < 1:
        # Nothing can be selected; also avoids a negative value being used as a slice bound.
        return ""
    sentences, sentence_words = preprocess_text_for_extractive(text)
    n = len(sentences)
    if n == 0:
        return ""
    # If the text is very short or has few sentences, return it as is
    if n <= max_sentences:
        return " ".join(sentences)
    # The similarity measure is symmetric, so only the upper triangle is computed
    # and mirrored; the diagonal stays 0 (no self-edges).
    sim_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            weight = sentence_similarity(sentence_words[i], sentence_words[j])
            sim_matrix[i, j] = weight
            sim_matrix[j, i] = weight
    # Row-normalise once, outside the iteration: transition[j, i] is the share of
    # sentence j's total outgoing weight that points at sentence i. The small
    # epsilon guards against division by zero for sentences with no overlap at all.
    outgoing_weight = sim_matrix.sum(axis=1) + 1e-8
    transition = sim_matrix / outgoing_weight[:, np.newaxis]
    # Initialize scores evenly
    scores = np.ones(n) / n
    # Power iteration, capped at 100 rounds
    for _ in range(100):
        # damping factor d=0.85: every sentence gets the base (1-d)=0.15 plus the
        # damped, weighted scores flowing in from the sentences linked to it
        new_scores = 0.15 + 0.85 * (transition.T @ scores)
        # Stop once the scores no longer change significantly
        if np.allclose(new_scores, scores, atol=1e-4):
            break
        scores = new_scores
    # Rank sentences by score
    ranked_indices = np.argsort(scores)[::-1]
    top_indices = sorted(ranked_indices[:max_sentences])  # sort them in order of appearance
    summary_sentences = [sentences[i] for i in top_indices]
    return " ".join(summary_sentences)

# Example usage of extractive summarization:
# `example_article` stays a module-level name because the abstractive inference
# and evaluation code further down reuse it.
example_article = test_data[0]['article']  # take one article from test set
# Only the first 150 characters of the article are printed to keep the output short.
print("Original Article (truncated):", example_article[:150], "...")
print("Extractive Summary:", extractive_summarize(example_article, max_sentences=3))

Original Article (truncated): (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisd ...
Extractive Summary: "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court's treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement.


In [5]:

# ----------------------------------------
# Abstractive Summarization (using Hugging Face Transformers)
# ----------------------------------------
# Load pre-trained tokenizer and model for abstractive summarization (BART).
# Both are loaded from the "facebook/bart-large-cnn" checkpoint and bound to
# module-level names that the optional fine-tuning branch and the inference
# code below use.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# If fine-tuning is to be done:
training_mode = False  # set True if we want to fine-tune the model on train_data
if training_mode:
    # Only this optional branch needs the seq2seq training utilities, so they are
    # imported here rather than at the top of the notebook.
    import dataclasses

    import numpy as np
    from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

    # Prepare the training data in the format needed (tokenize inputs and outputs)
    def tokenize_batch(batch):
        """Tokenize a batch of articles (model inputs) and highlights (labels).

        No padding is applied here: the data collator pads every batch
        dynamically and pads the labels with -100, so padding positions do not
        contribute to the loss.
        """
        tokenized_input = tokenizer(batch['article'], max_length=1024, truncation=True)
        # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
        tokenized_output = tokenizer(text_target=batch['highlights'], max_length=128, truncation=True)
        tokenized_input["labels"] = tokenized_output["input_ids"]
        return tokenized_input
    train_dataset = train_data.map(tokenize_batch, batched=True, remove_columns=train_data.column_names)
    val_dataset = val_data.map(tokenize_batch, batched=True, remove_columns=val_data.column_names)
    # Pads inputs to the longest sequence in each batch and pads labels with -100.
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    # Set training arguments (small number of epochs for illustration)
    training_kwargs = dict(
        output_dir="summarizer_model",
        num_train_epochs=1,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        save_steps=1000,
        save_total_limit=2,
        logging_steps=500,
        logging_dir="./logs",
        report_to="none",  # no report to wandb or others
        # Evaluate with model.generate() so compute_metrics receives token ids
        # that can be decoded, instead of raw model outputs.
        predict_with_generate=True,
    )
    # Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
    # use whichever name the installed version defines.
    argument_names = {field.name for field in dataclasses.fields(Seq2SeqTrainingArguments)}
    strategy_key = "eval_strategy" if "eval_strategy" in argument_names else "evaluation_strategy"
    training_kwargs[strategy_key] = "epoch"
    training_args = Seq2SeqTrainingArguments(**training_kwargs)
    # Define a simple ROUGE compute function for evaluation during training
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    def compute_metrics(eval_pred):
        """Return mean ROUGE-1/2/L F-measures of generated summaries vs. references."""
        preds, labels = eval_pred
        if isinstance(preds, tuple):
            preds = preds[0]
        # -100 marks ignored/padded positions and is not a valid token id, so it
        # is replaced with the pad token before decoding.
        pad_id = tokenizer.pad_token_id
        preds = np.where(preds != -100, preds, pad_id)
        labels = np.where(labels != -100, labels, pad_id)
        # Decode predictions and labels
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        # Compute ROUGE scores for each pair and average them
        rouge1 = rouge2 = rougel = 0.0
        for pred, label in zip(decoded_preds, decoded_labels):
            scores = scorer.score(label, pred)
            rouge1 += scores['rouge1'].fmeasure
            rouge2 += scores['rouge2'].fmeasure
            rougel += scores['rougeL'].fmeasure
        # Guard the averages against an empty evaluation set.
        n = max(len(decoded_preds), 1)
        return {
            "rouge1": rouge1 / n,
            "rouge2": rouge2 / n,
            "rougeL": rougel / n
        }
    # Initialize Trainer and train
    trainer = Seq2SeqTrainer(model=model, args=training_args,
                             train_dataset=train_dataset, eval_dataset=val_dataset,
                             data_collator=data_collator,
                             compute_metrics=compute_metrics)
    trainer.train()
    # After training, we could save the fine-tuned model
    # model.save_pretrained("summarizer_model")
    # tokenizer.save_pretrained("summarizer_model")

# Abstractive inference on an example (either using fine-tuned model if trained, or pre-trained directly)
input_text = example_article  # using the same example article from above
inputs = tokenizer([input_text], max_length=1024, truncation=True, return_tensors="pt")
# Put the encoded batch on the same device as the model: if the fine-tuning
# branch ran, the model may no longer be on the CPU.
inputs = inputs.to(model.device)
# Beam search (4 beams) producing a summary of 30 to 130 tokens. The attention
# mask is passed explicitly alongside the input ids.
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=130,
    min_length=30,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Abstractive Summary:", generated_summary)

Abstractive Summary: The Palestinian Authority becomes the 123rd member of the International Criminal Court. The move gives the court jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the Palestinians' efforts to join the body.


In [6]:


# ----------------------------------------
# Evaluation: ROUGE and BLEU on a sample
# ----------------------------------------
# Score both summaries of the example article against its human-written highlights.
reference_summary = test_data[0]['highlights']

# ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
extractive_sum = extractive_summarize(example_article, max_sentences=3)
abstractive_sum = generated_summary
print("\nReference Summary:", reference_summary)
print("\nExtractive Summary:", extractive_sum)
print("Abstractive Summary:", abstractive_sum)

# scorer.score takes (reference, candidate); extractive first, then abstractive
scores_ext, scores_abs = (
    scorer.score(reference_summary, candidate)
    for candidate in (extractive_sum, abstractive_sum)
)
print("\nROUGE-1/2/L for Extractive Summary:", 
      f"{scores_ext['rouge1'].fmeasure:.3f}, {scores_ext['rouge2'].fmeasure:.3f}, {scores_ext['rougeL'].fmeasure:.3f}")
print("ROUGE-1/2/L for Abstractive Summary:", 
      f"{scores_abs['rouge1'].fmeasure:.3f}, {scores_abs['rouge2'].fmeasure:.3f}, {scores_abs['rougeL'].fmeasure:.3f}")

# Sentence-level BLEU on lowercased word tokens.
# sentence_bleu takes a list of reference token lists, hence the outer list.
ref_tokens = [word_tokenize(reference_summary.lower())]
ext_tokens, abs_tokens = (
    word_tokenize(candidate.lower())
    for candidate in (extractive_sum, abstractive_sum)
)
# Smoothing keeps the score from collapsing to zero on short texts
smoother = SmoothingFunction().method1
bleu_ext, bleu_abs = (
    sentence_bleu(ref_tokens, hypothesis, smoothing_function=smoother)
    for hypothesis in (ext_tokens, abs_tokens)
)
print(f"BLEU for Extractive Summary: {bleu_ext:.3f}")
print(f"BLEU for Abstractive Summary: {bleu_abs:.3f}")



Reference Summary: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

Extractive Summary: "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court's treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement.
Abstractive Summary: The Palestinian Authority becomes the 12

In [7]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset


In [8]:
# 1) Download NLTK data
# 'punkt' and 'stopwords' are the resources requested here; quiet=True is passed
# to silence the downloader's status output.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# 2) Preprocessing and similarity functions for extractive summarization
# Module-level English stopword set; preprocess() uses it to filter tokens.
stop_words = set(stopwords.words('english'))

In [9]:
def preprocess(text):
    """Split `text` into sentences and extract each sentence's content words.

    Returns a tuple ``(sentences, tokenized)``: the sentence strings from
    ``sent_tokenize`` and, in parallel, the lowercased alphabetic tokens of each
    sentence with stopwords removed.
    """
    sentences = sent_tokenize(text)
    tokenized = []
    for sentence in sentences:
        content_words = []
        for token in word_tokenize(sentence.lower()):
            # keep purely alphabetic tokens that are not stopwords
            if token.isalpha() and token not in stop_words:
                content_words.append(token)
        tokenized.append(content_words)
    return sentences, tokenized

def similarity(a, b):
    """Return the word-overlap score of two token lists.

    The score is the count of distinct words shared by `a` and `b` divided by
    the mean of their distinct-word counts; 0.0 when either list is empty.
    """
    if not (a and b):
        return 0.0
    vocab_a, vocab_b = set(a), set(b)
    mean_vocab_size = (len(vocab_a) + len(vocab_b)) / 2
    return len(vocab_a & vocab_b) / mean_vocab_size

def extractive_summary(text, max_sentences=3):
    """Build an extractive summary from the highest-ranked sentences of `text`.

    Sentences are scored with a damped, similarity-weighted iteration
    (``score_i = 0.15 + 0.85 * sum_j score_j * M[j, i] / sum_k M[j, k]``) over the
    graph whose edge weights come from ``similarity``.

    Args:
        text: The article text to summarize.
        max_sentences: Maximum number of sentences to keep. Values of 0 or less
            give an empty summary.

    Returns:
        The selected sentences, in their original order, joined by single spaces.
        Texts with at most ``max_sentences`` sentences are returned whole.
    """
    if max_sentences <= 0:
        # Without this guard the final slice `[-0:]` would select every sentence,
        # and a negative value would select an arbitrary subset.
        return ""
    sents, tokenized = preprocess(text)
    N = len(sents)
    if N <= max_sentences:
        return " ".join(sents)
    # Build similarity matrix. The measure is symmetric, so each pair is computed
    # once and mirrored; the diagonal stays 0.
    M = np.zeros((N, N))
    for i in range(N):
        for j in range(i + 1, N):
            weight = similarity(tokenized[i], tokenized[j])
            M[i, j] = weight
            M[j, i] = weight
    # Row-normalise once instead of re-summing a row inside the innermost loop.
    # The epsilon avoids dividing by zero for sentences that overlap with nothing.
    transition = M / (M.sum(axis=1) + 1e-8)[:, np.newaxis]
    # PageRank-style power iteration, capped at 100 rounds
    scores = np.ones(N) / N
    for _ in range(100):
        new = 0.15 + 0.85 * (transition.T @ scores)
        if np.allclose(new, scores, atol=1e-4):
            break
        scores = new
    # Select top sentences and restore document order
    idx = sorted(np.argsort(scores)[-max_sentences:])
    return " ".join(sents[i] for i in idx)


In [10]:

# 3) Load abstractive model
# Binds the module-level `tokenizer` and `model` names that abstractive_summary()
# reads, both loaded from the "facebook/bart-large-cnn" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model     = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

def abstractive_summary(text, max_length=130, min_length=30):
    """Generate an abstractive summary of `text` with the module-level BART model.

    The input is truncated to 1024 tokens, and the summary is decoded from the
    first sequence returned by a 4-beam search whose output length is bounded by
    `min_length` and `max_length`.
    """
    encoded = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    generation_options = dict(
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    candidate_ids = model.generate(encoded["input_ids"], **generation_options)
    first_candidate = candidate_ids[0]
    return tokenizer.decode(first_candidate, skip_special_tokens=True)


In [11]:
def main():
    """Summarize the first CNN/DailyMail test article both ways and print the results.

    Prints, in order, the original article, the reference highlights, the
    extractive summary and the abstractive summary, each under its own heading.
    """
    # Load the test split and take its first example
    test_split = load_dataset("cnn_dailymail", "3.0.0", split="test")
    first_example = test_split[0]
    article, reference = first_example["article"], first_example["highlights"]

    # Generate summaries
    ext = extractive_summary(article, max_sentences=3)
    abs_ = abstractive_summary(article)

    # Print every section as heading followed by body
    sections = (
        ("\n=== Original Article ===\n", article),
        ("\n=== Reference Summary ===\n", reference),
        ("\n=== Extractive Summary ===\n", ext),
        ("\n=== Abstractive Summary ===\n", abs_),
    )
    for heading, body in sections:
        print(heading)
        print(body)


if __name__ == "__main__":
    main()



=== Original Article ===

(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking 