In [1]:
# Ensure all required packages are installed.
# The leading "!" is an IPython/Jupyter shell escape, so this line only works
# when the cell runs inside a notebook, not as a plain .py script.
!pip install datasets transformers rouge_score nltk

# Import necessary libraries
import re

from datasets import load_dataset
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu
from rouge_score import rouge_scorer
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# 1. Fetch the CNN/Daily Mail dataset (configuration "3.0.0")
dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")

# Bind each of the three splits to its own module-level name
train_data, validation_data, test_data = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

print("Dataset loaded successfully!")

# 2. Preprocess the text
def clean_text(text: str) -> str:
    """Normalize text: drop punctuation, collapse whitespace, lowercase.

    Args:
        text: The raw string to clean.

    Returns:
        The lowercased text containing only word characters (letters, digits,
        underscore) separated by single spaces, with no leading or trailing
        whitespace.
    """
    # Punctuation must go first: deleting a token such as " -- " leaves two
    # adjacent spaces behind, so whitespace has to be collapsed afterwards.
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs (incl. newlines)
    return text.strip().lower()

# Apply text cleaning to training data.
# A single pass fills both lists, so every training example is read from the
# dataset once instead of once per list.
train_articles = []
train_summaries = []
for example in train_data:
    train_articles.append(clean_text(example['article']))
    train_summaries.append(clean_text(example['highlights']))

# Example of cleaned text
print("Cleaned Article Example:", train_articles[0])
print("Cleaned Summary Example:", train_summaries[0])

# 3. Extractive summarization
# NOTE: facebook/bart-large-cnn is a sequence-to-sequence model (the same
# checkpoint the abstractive step loads), so the summary produced here is
# generated text rather than sentences selected from the article. The
# "extractive" names are kept because later code refers to them.
print("Starting extractive summarization...")
extractive_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# truncation=True only clips the input when the tokenizer knows a maximum
# length; otherwise transformers warns and defaults to no truncation. Cap it at
# 1024 tokens, the same limit the abstractive step passes explicitly.
if extractive_summarizer.tokenizer.model_max_length > 1024:
    extractive_summarizer.tokenizer.model_max_length = 1024

# Perform summarization on a test article
test_article = test_data[0]['article']
extractive_summary = extractive_summarizer(test_article, max_length=50, min_length=25, truncation=True)
# The pipeline returns a list with one dict per input; take the only summary.
extractive_summary_text = extractive_summary[0]['summary_text']

print("Extractive Summary:", extractive_summary_text)

# 4. Abstractive summarization
# Drives the facebook/bart-large-cnn checkpoint by hand
# (tokenize -> beam-search generate -> decode) instead of using the pipeline.
print("Starting abstractive summarization...")
bart_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(bart_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(bart_checkpoint)

# Encode the article as PyTorch tensors, clipped to at most 1024 tokens.
inputs = tokenizer(
    test_article,
    truncation=True,
    max_length=1024,
    return_tensors="pt",
)

# Beam search with 4 beams; output length is bounded to 40-150 tokens.
summary_ids = model.generate(
    inputs["input_ids"],
    num_beams=4,
    length_penalty=2.0,
    min_length=40,
    max_length=150,
)

# Decode the first (only) generated sequence, dropping special tokens.
abstractive_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Abstractive Summary:", abstractive_summary)

# 5. Evaluation metrics
print("Evaluating summaries...")
reference = test_data[0]['highlights']

# ROUGE scores. RougeScorer.score takes (target, prediction), so the reference
# summary is passed first.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores_extractive = scorer.score(reference, extractive_summary_text)
rouge_scores_abstractive = scorer.score(reference, abstractive_summary)

# BLEU score, computed on whitespace-separated tokens.
# Without smoothing, sentence-level BLEU collapses to ~0 (and NLTK warns) as soon
# as one n-gram order has no match, which is common for short summaries.
# method1 only replaces zero match counts with a small epsilon, so the score is
# unchanged whenever every n-gram order already has at least one match.
bleu_score_abstractive = sentence_bleu(
    [reference.split()],
    abstractive_summary.split(),
    smoothing_function=SmoothingFunction().method1,
)

# Display results
print("ROUGE Scores (Extractive):", rouge_scores_extractive)
print("ROUGE Scores (Abstractive):", rouge_scores_abstractive)
print("BLEU Score (Abstractive):", bleu_score_abstractive)

# 6. Save results to file
# Write as UTF-8 explicitly: the summaries are free text that may contain
# non-ASCII characters, and open() otherwise falls back to the platform's
# locale encoding, which can raise UnicodeEncodeError on some systems.
with open("summary_results.txt", "w", encoding="utf-8") as f:
    f.write(f"Extractive Summary:\n{extractive_summary_text}\n")
    f.write(f"Abstractive Summary:\n{abstractive_summary}\n")
    f.write(f"ROUGE Scores (Extractive): {rouge_scores_extractive}\n")
    f.write(f"ROUGE Scores (Abstractive): {rouge_scores_abstractive}\n")
    f.write(f"BLEU Score (Abstractive): {bleu_score_abstractive}\n")

print("Summarization and evaluation completed! Results saved in 'summary_results.txt'.")


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset loaded successfully!
Cleaned Article Example: london england reuters  harry potter star daniel radcliffe gains access to a reported 20 million 411 million fortune as he turns 18 on monday but he insists the money wont cast a spell on him daniel radcliffe as harry potter in harry potter and the order of the phoenix to the disappointment of gossip columnists around the world the young actor says he has no plans to fritter his cash away on fast cars drink and celebrity parties i dont plan to be one of those people who as soon as they turn 18 suddenly buy themselves a massive sports car collection or something similar he told an australian interviewer earlier this month i dont think ill be particularly extravagant the things i like buying are things that cost about 10 pounds  books and cds and dvds at 18 radcliffe will be able to gamble in a casino buy a drink in a pub or see the horror film hostel part ii currently six places below his number one movie on the uk box office chart d

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Extractive Summary: Palestinian Authority becomes 123rd member of the International Criminal Court. The move gives the court jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the Palestinians' efforts to join the body.
Starting abstractive summarization...
Abstractive Summary: The Palestinian Authority becomes the 123rd member of the International Criminal Court. The move gives the court jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the Palestinians' efforts to join the body.
Evaluating summaries...
ROUGE Scores (Extractive): {'rouge1': Score(precision=0.5428571428571428, recall=0.5588235294117647, fmeasure=0.5507246376811593), 'rouge2': Score(precision=0.38235294117647056, recall=0.3939393939393939, fmeasure=0.38805970149253727), 'rougeL': Score(precision=0.4857142857142857, recall=0.5, fmeasure=0.49275362318840576)}
ROUGE Scores (Abstractive): {'rouge1': Score(precision=0.5135135135135