In [1]:
# Uninstall existing PyTorch-related packages to avoid version conflicts
!pip uninstall -y torch torchvision torchaudio

# Install PyTorch, torchvision, and torchaudio with CUDA 11.8
!pip install torch==2.6.0+cu118 torchvision==0.21.0+cu118 torchaudio==2.6.0+cu118 --index-url https://download.pytorch.org/whl/cu118

# Install remaining libraries
!pip install transformers datasets spacy torchtext nltk rouge-score

# Install spaCy English model
!python -m spacy download en_core_web_sm

# Since spaCy requires a runtime restart after installing the model, we'll handle imports in the next cell
print("Dependencies installed successfully! Please proceed to the next cell.")

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.6.0+cu118
  Downloading https://download.pytorch.org/whl/cu118/torch-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl.metadata (27 kB)
Collecting torchvision==0.21.0+cu118
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.21.0%2Bcu118-cp311-cp311-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio==2.6.0+cu118
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.6.0%2Bcu118-cp311-cp311-linux_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch==2.6.

In [2]:
# Update datasets library to the latest version
!pip install --upgrade datasets

# Verify the installed version
import datasets
print(f"datasets version: {datasets.__version__}")

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [3]:
# Import libraries
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Add this line

import spacy
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import BertTokenizer, BertModel, pipeline, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from rouge_score import rouge_scorer
import torch
from torch.utils.data import Dataset, DataLoader
import re

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Verify CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

print("Setup completed successfully!")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Using device: cuda
Setup completed successfully!


In [4]:
# Load the CNN/Daily Mail dataset with a specific version
try:
    dataset = load_dataset('cnn_dailymail', '3.0.0')
    print("Dataset loaded successfully!")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Use a smaller subset for faster processing in Colab
train_dataset = dataset['train'].select(range(1000))
val_dataset = dataset['validation'].select(range(100))

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
train_dataset = train_dataset.map(lambda x: {'article': preprocess_text(x['article']), 'highlights': preprocess_text(x['highlights'])})
val_dataset = val_dataset.map(lambda x: {'article': preprocess_text(x['article']), 'highlights': preprocess_text(x['highlights'])})

# Display a sample
print("Sample Article:")
print(train_dataset[0]['article'][:500] + "...")
print("\nSample Summary:")
print(train_dataset[0]['highlights'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset loaded successfully!


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Sample Article:
london england reuters harry potter star daniel radcliffe gains access reported million million fortune turns monday insists money wont cast spell daniel radcliffe harry potter harry potter order phoenix disappointment gossip columnists around world young actor says plans fritter cash away fast cars drink celebrity parties dont plan one people soon turn suddenly buy massive sports car collection something similar told australian interviewer earlier month dont think ill particularly extravagant t...

Sample Summary:
harry potter star daniel radcliffe gets fortune turns monday young actor says plans fritter cash away radcliffes earnings first five potter films held trust fund


In [5]:
# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Function for extractive summarization
def extractive_summary(text, num_sentences=3):
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]
    if len(sentences) <= num_sentences:
        return ' '.join(sentences)

    # Score sentences based on word frequency
    word_freq = {}
    for word in text.split():
        word_freq[word] = word_freq.get(word, 0) + 1
    sentence_scores = []
    for sent in sentences:
        score = sum(word_freq.get(word, 0) for word in sent.split())
        sentence_scores.append((score, sent))

    # Select top sentences
    summary_sentences = sorted(sentence_scores, reverse=True)[:num_sentences]
    return ' '.join(sent[1] for sent in summary_sentences)

# Test extractive summarization
sample_article = train_dataset[0]['article']
extractive_sum = extractive_summary(sample_article)
print("Extractive Summary:")
print(extractive_sum)

Extractive Summary:
london england reuters harry potter star daniel radcliffe gains access reported million million fortune turns monday insists money wont cast spell daniel radcliffe harry potter harry potter order phoenix disappointment gossip columnists around world young actor says plans fritter cash away fast cars drink celebrity parties dont plan one people soon turn suddenly buy massive sports car collection something similar told australian interviewer earlier month dont think ill particularly extravagant things like buying things cost pounds books cds dvds radcliffe able gamble casino buy drink pub see horror film hostel part ii currently six places number one movie uk box office chart details hell mark landmark birthday wraps agent publicist comment plans ill definitely sort party said interview hopefully none reading radcliffes earnings first five potter films held trust fund able touch despite growing fame riches actor says keeping feet firmly ground people always looking s

In [6]:
# Load T5 tokenizer and model
model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)

# Function for abstractive summarization
def abstractive_summary(text, max_length=50):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(inputs, max_length=max_length, min_length=10, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Test abstractive summarization
abstractive_sum = abstractive_summary(sample_article)
print("Abstractive Summary:")
print(abstractive_sum)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Abstractive Summary:
harry potter star daniel radcliffe insists money wont cast spell despite growing fame richest actor says keeping feet firmly ground people always looking say kid star goes rails last month try hard go


In [2]:
# Update datasets library to the latest version
!pip install --upgrade datasets

# Verify the installed version
import datasets
print(f"datasets version: {datasets.__version__}")

# Add this line to upgrade transformers as well
!pip install --upgrade transformers

datasets version: 3.6.0


In [6]:
# Update datasets and transformers libraries
!pip install --upgrade datasets transformers

# Verify the installed versions
import datasets
import transformers
print(f"datasets version: {datasets.__version__}")
print(f"transformers version: {transformers.__version__}")

# Import necessary libraries for Dataset and Trainer
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
import torch
import re
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import nltk

# Ensure nltk data is downloaded if not already
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Verify CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the CNN/Daily Mail dataset with a specific version
try:
    dataset = load_dataset('cnn_dailymail', '3.0.0')
    print("Dataset loaded successfully!")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Use a smaller subset for faster processing
train_dataset_raw = dataset['train'].select(range(1000))
val_dataset_raw = dataset['validation'].select(range(100))

# Function to preprocess text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
train_dataset_processed = train_dataset_raw.map(lambda x: {'article': preprocess_text(x['article']), 'highlights': preprocess_text(x['highlights'])})
val_dataset_processed = val_dataset_raw.map(lambda x: {'article': preprocess_text(x['article']), 'highlights': preprocess_text(x['highlights'])})


# Load T5 tokenizer and model
model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)


# Custom Dataset class for fine-tuning
class SummaryDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=512, max_target_length=50):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        article = str(self.data[idx]['article'])
        summary = str(self.data[idx]['highlights'])

        inputs = self.tokenizer("summarize: " + article, max_length=self.max_input_length, padding="max_length", truncation=True, return_tensors="pt")
        targets = self.tokenizer(summary, max_length=self.max_target_length, padding="max_length", truncation=True, return_tensors="pt")

        return {
            'input_ids': inputs.input_ids.squeeze(),
            'attention_mask': inputs.attention_mask.squeeze(),
            'labels': targets.input_ids.squeeze()
        }

# Create datasets for fine-tuning
train_dataset_finetune = SummaryDataset(train_dataset_processed, tokenizer)
val_dataset_finetune = SummaryDataset(val_dataset_processed, tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    report_to="none",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_finetune,
    eval_dataset=val_dataset_finetune,
)

# Fine-tune the model
print("Starting model fine-tuning...")
trainer.train()
print("Model fine-tuning completed!")

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")
print("Fine-tuned model saved!")

datasets version: 3.6.0
transformers version: 4.52.3
Using device: cuda
Dataset loaded successfully!


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Starting model fine-tuning...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
100,3.0352,3.038435
200,2.4321,1.969489
300,2.3331,1.915384
400,2.2064,1.882652
500,2.2173,1.864807


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Model fine-tuning completed!
Fine-tuned model saved!


In [9]:
# Load fine-tuned model
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained("./fine_tuned_t5")
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_t5")
fine_tuned_model.to(device)

# Function to generate summary with fine-tuned model
def fine_tuned_summary(text, max_length=50):
    inputs = fine_tuned_tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = fine_tuned_model.generate(inputs.input_ids, max_length=max_length, min_length=10, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = fine_tuned_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Generate summaries for validation set
val_summaries = [fine_tuned_summary(val_dataset[i]['article']) for i in range(len(val_dataset))]

# Import necessary libraries for ROUGE calculation
from rouge_score import rouge_scorer
import numpy as np

# Calculate ROUGE scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = []
for i in range(len(val_dataset)):
    score = scorer.score(val_dataset[i]['highlights'], val_summaries[i])
    rouge_scores.append(score)

# Average ROUGE scores
avg_rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
avg_rouge2 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
avg_rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

print("Evaluation Results:")
print(f"Average ROUGE-1: {avg_rouge1:.4f}")
print(f"Average ROUGE-2: {avg_rouge2:.4f}")
print(f"Average ROUGE-L: {avg_rougeL:.4f}")

# Test with a real-world article
real_article = """
LONDON (Reuters) - British Prime Minister Keir Starmer said on Tuesday he wanted to shift the focus of the government away from traditional Westminster politics towards a more regional approach, as he chaired the first meeting of his Council of the Nations and Regions. Starmer, who won a landslide election victory in July, has promised to reset the relationship between the central government and the leaders of England’s regions as well as the devolved governments of Scotland, Wales and Northern Ireland. The new Labour government wants to hand more powers to local leaders, giving them greater control over strategic planning for transport, skills and employment in a bid to drive economic growth and improve living standards across the country. “For too long, the Westminster system has been like an over-centralised computer, where all the processing power sits in one place, and when that centre fails, the whole system crashes,” Starmer told regional leaders at the meeting in Edinburgh.
"""
real_summary = fine_tuned_summary(real_article)
print("\nReal-world Article Summary:")
print(real_summary)
# Generate summaries for validation set
val_summaries = [fine_tuned_summary(val_dataset[i]['article']) for i in range(len(val_dataset))]

# Calculate ROUGE scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = []
for i in range(len(val_dataset)):
    score = scorer.score(val_dataset[i]['highlights'], val_summaries[i])
    rouge_scores.append(score)

# Average ROUGE scores
avg_rouge1 = np.mean([score['rouge1'].fmeasure for score in rouge_scores])
avg_rouge2 = np.mean([score['rouge2'].fmeasure for score in rouge_scores])
avg_rougeL = np.mean([score['rougeL'].fmeasure for score in rouge_scores])

print("Evaluation Results:")
print(f"Average ROUGE-1: {avg_rouge1:.4f}")
print(f"Average ROUGE-2: {avg_rouge2:.4f}")
print(f"Average ROUGE-L: {avg_rougeL:.4f}")

# Test with a real-world article
real_article = """
Something’s been bothering me lately, and judging from what I know about the people who read these articles each week, I bet it’s bothered some of you before too.

It’s that phrase—“Welcome to the Real World.”

Have you ever heard that? It’s usually intended as a sarcastic remark about what someone else has said or is doing.

It might also have been phrased like this:

That’s just not how it works.

You’ll understand better one day when you’re (older, wiser, have a mortgage, whatever)

That sounds nice, but it’s unrealistic.

Let me share something very important with you: these are the things that people say when they want to marginalize you.

Other negative adjectives are idealistic, naïve, and well-meaning. If you hear those words, get ready – someone is very close to telling you about their interpretation of the ‘real world.’

To be more precise, here’s what the real world looks like from the perspective of those who would like to welcome you to this world:

Remaining true to principles or values is admirable to a point, but after a while we are expected to compromise them in order to be true to a greater good
No one should be ‘too much’ of anything. If you’re too smart, you can’t relate to regular people. If you’re too rich, you don’t understand how the rest of us live. If you’re too nice, even, you’re naïve for not knowing that the world is a dog-eat-dog place where each person must compete for scarce resources.
Anyone who is able to break loose and find their own way should be treated with suspicion. The attitude is, “If I can’t do that, you shouldn’t be able to either.”
"""
real_summary = fine_tuned_summary(real_article)
print("\nReal-world Article Summary:")
print(real_summary)

Evaluation Results:
Average ROUGE-1: 0.3222
Average ROUGE-2: 0.1337
Average ROUGE-L: 0.2569

Real-world Article Summary:
the new labour government wants to hand more powers to local leaders . it wants to drive economic growth and improve living standards across the country .
Evaluation Results:
Average ROUGE-1: 0.3222
Average ROUGE-2: 0.1337
Average ROUGE-L: 0.2569

Real-world Article Summary:
"welcome to the real world" is often intended as a sarcastic remark about what someone else has said or is doing . other negative adjectives are idealistic, nave, and
