In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install -U transformers==4.37.0 evaluate sacrebleu rouge_score accelerate==0.26.0 peft==0.6.0 trl==0.7.4 pyarrow==13.0.0 datasets==2.14.5

Collecting transformers==4.37.0
  Downloading transformers-4.37.0-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate==0.26.0
  Downloading accelerate-0.26.0-py3-none-any.whl.metadata (18 kB)
Collecting peft==0.6.0
  Downloading peft-0.6.0-py3-none-any.whl.metadata (23 kB)
Collecting trl==0.7.4
  Downloading trl-0.7.4-py3-none-any.whl.metadata (10 kB)
Collecting pyarrow==13.0.0
  Downloading pyarrow-13.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.me

In [2]:
from datasets import get_dataset_config_names

configs = get_dataset_config_names("cnn_dailymail")
print(configs)

Downloading readme: 0.00B [00:00, ?B/s]

['1.0.0', '2.0.0', '3.0.0']


In [3]:
from datasets import load_dataset

cnn_dataset = load_dataset("cnn_dailymail", "3.0.0")
print(cnn_dataset)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [None]:
from transformers import pipeline, set_seed

set_seed(42)
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
print(f"Article: \n\n")
print(cnn_dataset["train"][0]["article"])
pipe_out = pipe(cnn_dataset["train"][0]["article"])
print(f"Summary:\n\n")
print(pipe_out[0]["summary_text"])

In [None]:
import evaluate

sacrebleu = evaluate.load("sacrebleu")
results = sacrebleu.compute(predictions=[pipe_out[0]["summary_text"]], references=[[cnn_dataset["train"][0]["article"]]])
print(list(results.keys()))
print(results)

In [5]:
from tqdm import tqdm

def chunks(list_of_elements, batch_size):
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def evaluate_summaries_pegasus(dataset, metric, model, tokenizer, batch_size=16):
    article_batches = list(chunks(dataset["article"], batch_size))
    target_batches = list(chunks(dataset["highlights"], batch_size))

    predictions=[]
    references=[]
    for article_batch, target_batch in tqdm(zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024, truncation=True, padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device), attention_mask=inputs["attention_mask"].to(device), length_penalty=0.8, max_length=128)

        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in summaries]
        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        predictions.extend(decoded_summaries)
        references.extend([[tgt] for tgt in target_batch])

    return predictions, references

In [5]:
import torch
import evaluate
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

model_ckpt = "google/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
predictions, references = evaluate_summaries_pegasus(cnn_dataset["train"].select(range(64)), rouge, model, tokenizer, batch_size=8)
print(predictions[0])
print(references[0])

2025-11-01 17:23:14.013915: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762017794.037093     341 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762017794.044017     341 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 8/8 [00:44<00:00,  5.58s/it]

Harry Potter star Daniel Radcliffe gains access to a reported £20 million fortune. Young actor says he has no plans to fritter his cash away. Radcliffe's earnings from the first five Potter films have been held in a trust fund.
["Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have been held in trust fund ."]





In [6]:
score = rouge.compute(predictions=predictions, references=references)
print(score)

{'rouge1': 0.5452652795907229, 'rouge2': 0.3730590724436544, 'rougeL': 0.47180346030884207, 'rougeLsum': 0.5103095455208939}


# Fine-tuning

In [7]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["article"], max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["highlights"], max_length=128, truncation=True)

    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}

cnn_validation_pt = cnn_dataset["validation"].select(range(1024)).map(convert_examples_to_features, batched=True)
columns = ["input_ids", "labels", "attention_mask"]
cnn_validation_pt.set_format(type="torch", columns=columns)
print(cnn_validation_pt)

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]



Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1024
})


In [8]:
from transformers import DataCollatorForSeq2Seq

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [9]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='pegasus-cnn_dailymail', num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16, report_to="none")

In [10]:
trainer = Trainer(model=model, args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=cnn_validation_pt,
                  eval_dataset=cnn_validation_pt)
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=64, training_loss=2.419437438249588, metrics={'train_runtime': 287.2203, 'train_samples_per_second': 3.565, 'train_steps_per_second': 0.223, 'total_flos': 2028708766973952.0, 'train_loss': 2.419437438249588, 'epoch': 1.0})

# Model upload to huggingface

In [21]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
trainer.push_to_hub("Training complete!")

Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/DhruvSharma-845/pegasus-cnn_dailymail/commit/b3a1ff283f3be4fedbdef91e9ef02b2ba5f710ed', commit_message='Training complete!', commit_description='', oid='b3a1ff283f3be4fedbdef91e9ef02b2ba5f710ed', pr_url=None, repo_url=RepoUrl('https://huggingface.co/DhruvSharma-845/pegasus-cnn_dailymail', endpoint='https://huggingface.co', repo_type='model', repo_id='DhruvSharma-845/pegasus-cnn_dailymail'), pr_revision=None, pr_num=None)

# Model Evaluation

In [6]:
import torch
import evaluate
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

model_ckpt = "DhruvSharma-845/pegasus-cnn_dailymail"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
predictions, references = evaluate_summaries_pegasus(cnn_dataset["train"].select(range(128)), rouge, model, tokenizer, batch_size=4)
print(predictions[0])
print(references[0])

100%|██████████| 32/32 [01:31<00:00,  2.86s/it]

Harry Potter star Daniel Radcliffe gains access to a reported £20 million fortune. Young actor says he has no plans to fritter his cash away. Radcliffe's earnings from the first five Potter films have been held in a trust fund.
["Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe's earnings from first five Potter films have been held in trust fund ."]





In [7]:
score = rouge.compute(predictions=predictions, references=references)
print(score)

{'rouge1': 0.5096897127315552, 'rouge2': 0.3336722108937323, 'rougeL': 0.4391224960664898, 'rougeLsum': 0.47574660160380594}
