In [1]:
# Check GPU availability in Colab
!nvidia-smi

Wed Sep  3 06:37:43 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   52C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
# Install necessary NLP and utility libraries
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q
!pip install --upgrade accelerate   # upgrade accelerate (used for distributed training)
!pip uninstall -y transformers accelerate
!pip install transformers accelerate  # reinstall transformers + accelerate for compatibility

!pip install evaluate -q

  DEPRECATION: Building 'rouge_score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge_score'. Discussion can be found at https://github.com/pypa/pip/issues/6334
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-airflow 2.10.2 requires alembic<2.0,>=1.13.1, which is not installed.
apache-airflow 2.10.2 requires argcomplete>=1.10, which is not installed.
apache-airflow 2.10.2 requires asgiref>=2.3.0, which is not installed.
apache-airflow 2.10.2 requires blinker>=1.6.2, which is not installed.
apache-airflow 2.10.2 requires colorlog>=6.8.2, which is not insta

Found existing installation: transformers 4.56.0
Uninstalling transformers-4.56.0:
  Successfully uninstalled transformers-4.56.0


ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^


In [3]:
# Import Hugging Face pipeline and dataset utilities
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
from evaluate import load as load_metric
import matplotlib.pyplot as plt
import pandas as pd

In [4]:
# Model + Tokenizer imports
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [5]:
# NLP tools
import nltk
from nltk.tokenize import sent_tokenize

In [6]:
# Progress bar + PyTorch
from tqdm import tqdm
import torch

In [7]:
# Download NLTK tokenizer
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
# Set device (GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [9]:
# Load PEGASUS model checkpoint for summarization
model_ckpt = "google/pegasus-cnn_dailymail"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

##**Dataset setup**

In [10]:
# Download and unzip custom SAMSum dataset

!wget https://github.com/AdMub/Text-Summarization-NLP-End-to-End-Project/raw/refs/heads/main/datasets/summarizer-data.zip
!unzip summarizer-data.zip

--2025-09-03 06:41:05--  https://github.com/AdMub/Text-Summarization-NLP-End-to-End-Project/raw/refs/heads/main/datasets/summarizer-data.zip
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/AdMub/Text-Summarization-NLP-End-to-End-Project/refs/heads/main/datasets/summarizer-data.zip [following]
--2025-09-03 06:41:06--  https://raw.githubusercontent.com/AdMub/Text-Summarization-NLP-End-to-End-Project/refs/heads/main/datasets/summarizer-data.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7903594 (7.5M) [application/zip]
Saving to: ‘summarizer-data.zip’


2025-09-03 06:41:07 (87.1 MB/s) - ‘summa

In [11]:
# Load datasets
dataset_samsum = load_from_disk('samsum_dataset')
dataset_samsum

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [12]:
# Inspect dataset structure
split_lengths = [len(dataset_samsum[split]) for split in dataset_samsum]
print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")

Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']


In [13]:
# Print example dialogue + summary
print("\nDialogue:")
print(dataset_samsum["test"][1]["dialogue"])
print("\nSummary:")
print(dataset_samsum["test"][1]["summary"])


Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


##**Preprocessing**

In [14]:
# Convert text samples to tokenized input + target features
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch['dialogue'], max_length=256, truncation=True)  #1024

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['summary'], max_length=64, truncation=True)   #1024

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

In [15]:
# Apply preprocessing to dataset
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched=True)
dataset_samsum_pt["train"]

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 14732
})

##**Training setup**

In [16]:
# Import training utilities
from transformers import DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer

In [17]:
# Data collator handles batching & padding
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

In [18]:
# Define training hyperparameters
trainer_args = TrainingArguments(
    output_dir='pegasus-samsum',
    num_train_epochs=1,    #10
    warmup_steps=200,      #500
    per_device_train_batch_size=1,  # Reduced batch size
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=50,    #10
    eval_strategy='steps',
    eval_steps=500,
    save_steps=500,
    gradient_accumulation_steps=8     # Increased gradient accumulation steps
)

In [19]:
# Initialize Trainer API
trainer = Trainer(
    model=model_pegasus,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_samsum_pt["train"],
    eval_dataset=dataset_samsum_pt["validation"]
)

  trainer = Trainer(


In [20]:
# Train PEGASUS on SAMSum dataset
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33madmub465[0m ([33madmub465-university-of-ibadan[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
500,1.6697,1.510432
1000,1.6168,1.453949
1500,1.5934,1.430704




TrainOutput(global_step=1842, training_loss=1.7083594314956252, metrics={'train_runtime': 3390.5314, 'train_samples_per_second': 4.345, 'train_steps_per_second': 0.543, 'total_flos': 5115782636642304.0, 'train_loss': 1.7083594314956252, 'epoch': 1.0})

##**Evaluation setup**

In [21]:
# Helper: split data into smaller batches
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Yield successive batch-sized chunks from a list."""
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

In [24]:
# Evaluate model on test set with ROUGE metric
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=8, device=device,
                               column_text="dialogue",        # SAMSum uses 'dialogue'
                               column_summary="summary"):     # SAMSum uses 'summary'
    """
    Evaluate a seq2seq model on a dataset using ROUGE.

    Args:
        dataset: Hugging Face Dataset (e.g., SAMSum)
        metric: Evaluation metric (e.g., load_metric("rouge"))
        model: Trained Hugging Face model
        tokenizer: Corresponding tokenizer
        batch_size: Number of samples per batch
        device: 'cuda' or 'cpu'
        column_text: Name of text/dialogue column
        column_summary: Name of summary/target column
    """
    # Split into smaller chunks for batching
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        # Tokenize input dialogues
        inputs = tokenizer(article_batch, max_length=256, truncation=True,
                           padding="max_length", return_tensors="pt").to(device)

        # Generate summaries
        summaries = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            length_penalty=0.8,
            num_beams=4,        # smaller beams to avoid OOM
            max_length=64       # shorter outputs to save memory
        )

        # Decode model predictions
        decoded_summaries = [
            tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for s in summaries
        ]

        # Add predictions and references to metric
        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Compute and return ROUGE scores
    score = metric.compute()
    return score


In [25]:
# Define ROUGE
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = load_metric('rouge')

Downloading builder script: 0.00B [00:00, ?B/s]

In [26]:
# Run evaluation with ROUGE
score = calculate_metric_on_test_ds(
    dataset_samsum['test'], rouge_metric, trainer.model, tokenizer, batch_size = 2, column_text = 'dialogue', column_summary= 'summary'
)

100%|██████████| 410/410 [08:50<00:00,  1.29s/it]


In [28]:
# Format results into DataFrame
rouge_dict = dict((rn, score[rn]) for rn in rouge_names)
pd.DataFrame(rouge_dict, index=['pegasus'])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
pegasus,0.426393,0.196404,0.338669,0.338614


###**Save model + tokenizer**

In [29]:
## Save model
model_pegasus.save_pretrained("pegasus-samsum-model")

In [30]:
## Save tokenizer
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/spiece.model',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

###**Inference / Prediction**

In [31]:
# Reload tokenizer
tokenizer = AutoTokenizer.from_pretrained("/content/tokenizer")

In [36]:
# Define generation parameters
gen_kwargs = {"length_penalty": 0.8, "num_beams":8, "max_length": 64}

In [37]:
# Sample test dialogue
sample_text = dataset_samsum["test"][0]["dialogue"]
reference = dataset_samsum["test"][0]["summary"]

In [38]:
# Create summarization pipeline
pipe = pipeline("summarization", model="pegasus-samsum-model", tokenizer=tokenizer)

Device set to use cuda:0


In [39]:
# Show results
print("Dialogue:")
print(sample_text)
print("\nReference Summary:")
print(reference)
print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Reference Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.

Model Summary:
Amanda can't find Betty's number. Larry called her last time they were at the park together. Hannah would rather she text him. Amanda will text him.
