# T5 Abstractive Text Summarization

Train a T5 model on news articles and generate summaries.

## Install Requirements

In [None]:
import subprocess
import sys

# Packages the notebook imports. `requests` is included because the dataset
# download cell imports it.
packages = [
    "torch",
    "pytorch-lightning",
    "transformers",
    "scikit-learn",
    "pandas",
    "numpy",
    "requests",
]

# A single pip invocation lets the resolver see every requirement at once
# instead of resolving each package in isolation.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *packages])

print("✓ All requirements installed")

✓ All requirements installed


## Download dataset

Run this cell first to download the dataset:

In [49]:
import os
import requests

# Make sure local folder exists
os.makedirs("/content/data", exist_ok=True)

url = "https://drive.usercontent.google.com/u/0/uc?id=16KxuBVMHnTpZziRMUJ1HXEKDt2QDFWvr&export=download"
output_path = "/content/data/news_summary.csv"

# Download only if not already present
if not os.path.exists(output_path):
    print("⏳ Downloading dataset...")
    # Stream into a temporary file and rename it only after the download
    # finished, so an interrupted run never leaves a truncated CSV that the
    # existence check above would accept on the next run.
    partial_path = output_path + ".part"
    try:
        # The timeout keeps the cell from hanging forever on a stalled
        # connection; the context manager releases the connection afterwards.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()  # Throw error if download fails
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, output_path)
    finally:
        # Only present here if the download failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"✅ Dataset downloaded to {output_path}")
else:
    print("✅ Dataset already exists locally")


✅ Dataset already exists locally


In [5]:
import os
import pandas as pd
import torch
import pytorch_lightning as pl
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from sklearn.model_selection import train_test_split

# Report library versions and GPU availability, one "label: value" line each.
for label, value in (
    ("PyTorch", torch.__version__),
    ("PyTorch Lightning", pl.__version__),
    ("CUDA available", torch.cuda.is_available()),
):
    print(f"{label}: {value}")

PyTorch: 2.8.0+cu126
PyTorch Lightning: 2.5.5
CUDA available: True


## Configuration

In [6]:
class Config:
    """Central settings for data paths, tokenization limits and training.

    The tokenizer and the pretrained model are loaded while this class body
    executes, so defining the class already triggers the model load.
    """
    # Worker processes for the DataLoaders. os.cpu_count() may return None when
    # the count cannot be determined; fall back to 0 (load in the main process)
    # so DataLoader never receives None.
    num_workers = os.cpu_count() or 0
    n_epochs = 3
    batch_size = 8
    dataset_path = "/content/data/news_summary.csv"
    # Must match the dirpath/filename given to the ModelCheckpoint callback.
    checkpoint_path = "t5_checkpoints/t5-best-checkpoint.ckpt"
    # Maximum token counts for the article (input) and the summary (output).
    text_token_max_length = 1024
    summary_token_max_length = 256
    learning_rate = 0.0001
    use_gpu = torch.cuda.is_available()
    t5_model_path = "t5-base"
    t5_tokenizer = T5TokenizerFast.from_pretrained(t5_model_path)
    t5_pretrained_model = T5ForConditionalGeneration.from_pretrained(t5_model_path, return_dict=True)

config = Config()
print(f"GPU enabled: {config.use_gpu}")
print(f"Batch size: {config.batch_size}, Epochs: {config.n_epochs}")
print(f"Max input tokens: {config.text_token_max_length}, Max output tokens: {config.summary_token_max_length}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


GPU enabled: True
Batch size: 8, Epochs: 3
Max input tokens: 1024, Max output tokens: 256


## Data Preparation

In [7]:
class NewsSummaryDataset(Dataset):
    """Dataset yielding tokenized (article, summary) pairs for seq2seq training.

    Args:
        data: DataFrame with a 'text' column (article) and a 'summary' column.
        tokenizer: Callable tokenizer returning 'input_ids' and 'attention_mask'
            tensors when called with ``return_tensors="pt"``.
        text_max_token_len: Articles are padded/truncated to this many tokens.
        summary_max_token_len: Summaries are padded/truncated to this many tokens.
    """

    def __init__(self, data, tokenizer, text_max_token_len=512, summary_max_token_len=128):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def _encode(self, value, max_length):
        """Tokenize ``value`` to exactly ``max_length`` tokens (padded/truncated)."""
        return self.tokenizer(
            value, max_length=max_length, padding="max_length",
            truncation=True, return_attention_mask=True, add_special_tokens=True,
            return_tensors="pt"
        )

    def __getitem__(self, index):
        """Return the raw strings plus flattened id/mask tensors for row ``index``.

        Padding positions in ``labels`` are replaced by -100.
        """
        data_row = self.data.iloc[index]
        text = data_row['text']
        summary = data_row['summary']

        text_encoding = self._encode(text, self.text_max_token_len)
        summary_encoding = self._encode(summary, self.summary_max_token_len)

        # Take the pad id from the tokenizer rather than assuming it is 0, so the
        # masking stays correct for tokenizers with a different pad id. Fall back
        # to 0 (the previous hard-coded value) when the tokenizer exposes none.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0

        labels = summary_encoding['input_ids']
        labels[labels == pad_id] = -100

        return dict(
            text=text, summary=summary,
            text_input_ids=text_encoding['input_ids'].flatten(),
            text_attention_mask=text_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
            labels_attention_mask=summary_encoding['attention_mask'].flatten()
        )

# Read the CSV, keep the article ('ctext') and its reference summary ('text'),
# rename them to the column names the dataset class reads, and drop rows with
# missing values before making a 90/10 train/validation split.
raw_news = pd.read_csv(config.dataset_path, encoding='latin-1')
data = (
    raw_news[['ctext', 'text']]
    .rename(columns={'ctext': 'text', 'text': 'summary'})
    .dropna()
)
train_df, val_df = train_test_split(data, test_size=0.1, random_state=42)

n_train, n_val = len(train_df), len(val_df)
print(f"Train samples: {n_train}, Val samples: {n_val}")

Train samples: 3956, Val samples: 440


## Model Definition

In [8]:
class NewsSummaryModel(pl.LightningModule):
    """LightningModule that fine-tunes a T5 conditional-generation model.

    Args:
        model: Optional model to wrap. When omitted, the shared
            ``config.t5_pretrained_model`` instance is used, so calling
            ``NewsSummaryModel()`` without arguments keeps working.
    """

    def __init__(self, model=None):
        super().__init__()
        # Honor an explicitly supplied model; the argument used to be ignored.
        self.model = model if model is not None else config.t5_pretrained_model

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``."""
        output = self.model(
            input_ids=input_ids, attention_mask=attention_mask,
            labels=labels, decoder_attention_mask=decoder_attention_mask
        )
        return output.loss, output.logits

    def _shared_step(self, batch, metric_name):
        """Compute the loss for ``batch`` and log it under ``metric_name``."""
        loss, _ = self(
            input_ids=batch['text_input_ids'],
            attention_mask=batch['text_attention_mask'],
            decoder_attention_mask=batch['labels_attention_mask'],
            labels=batch['labels']
        )
        self.log(metric_name, loss, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss, logged as 'train_loss'."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss, logged as 'val_loss'."""
        return self._shared_step(batch, "val_loss")

    def configure_optimizers(self):
        """Optimize all parameters with AdamW at ``config.learning_rate``."""
        return AdamW(self.parameters(), lr=config.learning_rate)

## Training

In [None]:
# Create dataloaders
# Pass the sequence limits from Config explicitly. Without them the dataset
# falls back to its own defaults (512/128) and the limits printed by the
# configuration cell would not be the ones used for training. Longer sequences
# need more GPU memory: lower the Config limits or batch_size if training runs
# out of memory.
train_dataset = NewsSummaryDataset(
    train_df, config.t5_tokenizer,
    text_max_token_len=config.text_token_max_length,
    summary_max_token_len=config.summary_token_max_length,
)
val_dataset = NewsSummaryDataset(
    val_df, config.t5_tokenizer,
    text_max_token_len=config.text_token_max_length,
    summary_max_token_len=config.summary_token_max_length,
)

train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers)
val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False, num_workers=config.num_workers)

# ModelCheckpoint callback: keeps only the checkpoint with the lowest val_loss.
# dirpath/filename must stay in sync with config.checkpoint_path.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath="t5_checkpoints",
    filename="t5-best-checkpoint",
    save_top_k=1,
    monitor="val_loss",
    mode="min",
    verbose=True
)

# Trainer
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=config.n_epochs,
    accelerator="gpu" if config.use_gpu else "cpu",
    devices=1,
    enable_progress_bar=True
)

# Resume if checkpoint exists, else start fresh
checkpoint_path = config.checkpoint_path
model = NewsSummaryModel()
model.train()

# ckpt_path=None makes fit() start from the current weights.
resume_from = checkpoint_path if os.path.exists(checkpoint_path) else None
if resume_from is not None:
    print(f"✓ Resuming from: {checkpoint_path}")
else:
    print("✓ Starting fresh training")
trainer.fit(model, train_loader, val_loader, ckpt_path=resume_from)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at t5_checkpoints/t5-best-checkpoint.ckpt


✓ Resuming from: t5_checkpoints/t5-best-checkpoint.ckpt


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params | Mode 
-------------------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M  | train
-------------------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)
541       Modules in train mode
0         Modules in eval mode
INFO:pytorch_lightning.utilities.rank_zero:Restored all states from the checkpoint at t5_checkpoints/t5-best-checkpoint.ckpt


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

## Inference

In [56]:
# Load best checkpoint
checkpoint_path = config.checkpoint_path
if os.path.exists(checkpoint_path):
    trained_model = NewsSummaryModel.load_from_checkpoint(checkpoint_path)
    trained_model.freeze()
    print(f"Loaded checkpoint from {checkpoint_path}")
else:
    print(f"Checkpoint not found at {checkpoint_path}")
    try:
        # Reuse the in-memory model when the training cell ran in this session.
        trained_model = model
    except NameError:
        # Fresh kernel without training: fall back to the pretrained weights
        # instead of failing on an undefined name.
        print("No trained model in memory; using the pretrained weights")
        trained_model = NewsSummaryModel()
    # Switch to eval mode so dropout is disabled during generation. Gradients
    # are left untouched so the model can still be trained further.
    trained_model.eval()

Loaded checkpoint from t5_checkpoints/t5-best-checkpoint.ckpt


In [48]:
# Sample article used to try out the summarizer
test_text = """Machine learning is an important component of the growing field of data science. Through the use of statistical methods, algorithms are trained to make classifications or predictions, uncovering key insights within data mining projects.
These insights subsequently drive decision making within applications and businesses, ideally impacting key growth metrics. As big data continues to expand and grow, the market demand for data scientists will increase, requiring them to
assist in the identification of the most relevant business questions and subsequently the data to answer them."""

# Show the input, followed by an 80-character rule framed by blank lines.
separator = "=" * 80
print("Original Text:", test_text, sep="\n")
print(f"\n{separator}\n")

Original Text:
Machine learning is an important component of the growing field of data science. Through the use of statistical methods, algorithms are trained to make classifications or predictions, uncovering key insights within data mining projects.
These insights subsequently drive decision making within applications and businesses, ideally impacting key growth metrics. As big data continues to expand and grow, the market demand for data scientists will increase, requiring them to 
assist in the identification of the most relevant business questions and subsequently the data to answer them.




In [55]:
# Generate summary
# Make sure dropout is disabled before generating; this is a no-op when the
# model is already in eval mode.
trained_model.eval()

text_encoding = config.t5_tokenizer(
    test_text, max_length=config.text_token_max_length, padding="max_length", truncation=True,
    return_attention_mask=True, add_special_tokens=True, return_tensors="pt"
)

# Run generation on whichever device the model weights live on.
device = next(trained_model.model.parameters()).device

input_ids = text_encoding['input_ids'].to(device)
attention_mask = text_encoding['attention_mask'].to(device)

generated_ids = trained_model.model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    # Use the configured summary limit instead of repeating the number here.
    max_length=config.summary_token_max_length,
    num_beams=2,
    repetition_penalty=2.5,
    length_penalty=1.0,
    early_stopping=True
)

summary = config.t5_tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

print("Generated Summary:")
print(summary)

Generated Summary:
Machine-learning is an important component of the growing field of data science. Through machine learning, algorithms are trained to make classifications or predictions, uncovering key insights within applications and businesses. This subsequently drives decision-making within applications and businesses, impacting key growth metrics.


## Checkpoint Management

The best checkpoint is saved automatically during training (under `t5_checkpoints/`). Use the cell below to download it.

In [None]:
# Download the best checkpoint written by the training cell.
# Training saves it locally at config.checkpoint_path; this notebook never
# mounts Google Drive or copies the file there, so download the local file.
from google.colab import files

if os.path.exists(config.checkpoint_path):
    files.download(config.checkpoint_path)
    print("✓ Checkpoint downloaded")
else:
    print(f"Checkpoint not found at {config.checkpoint_path}")