# Fine-tune BART for Summarization

We will use the XSum dataset (loaded with `load_dataset("xsum")`, which provides `document`/`summary` pairs) and fine-tune the `facebook/bart-base` model using Hugging Face Transformers.

In [None]:
# Install required packages (if not already installed)
!pip install datasets transformers rouge_score accelerate -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m24.5 MB/s[0m eta [3

In [None]:
import nltk
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)
# Fetch the NLTK "punkt" sentence-tokenizer data (no-op when already present).
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Load the XSum dataset and pick out the two splits used in this notebook.
dataset = load_dataset("xsum")
train_dataset, val_dataset = dataset["train"], dataset["validation"]

In [None]:
# Load the pretrained seq2seq model and its matching tokenizer from one checkpoint
model_checkpoint = "facebook/bart-base"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Preprocessing: token limits passed to the tokenizer as `max_length`;
# longer documents / summaries are truncated to these sizes.
max_input_length, max_target_length = 512, 128

def preprocess(examples):
    """Tokenize a batch of records into model inputs with a `labels` entry.

    Args:
        examples: Mapping with "document" and "summary" entries (lists of
            strings when called through `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the documents, truncated to
        `max_input_length`, with an added "labels" key holding the
        `input_ids` of the summaries truncated to `max_target_length`.
    """
    encoded = tokenizer(
        examples["document"],
        truncation=True,
        max_length=max_input_length,
    )
    # The summaries are tokenized as targets; only their ids are kept.
    encoded["labels"] = tokenizer(
        text_target=examples["summary"],
        truncation=True,
        max_length=max_target_length,
    )["input_ids"]
    return encoded

In [None]:
# Work on a small slice (first 1000 train / first 200 validation examples)
# so the notebook runs quickly; the split names are rebound to the slices.
small_train, small_val = (
    train_dataset.select(range(1000)),
    val_dataset.select(range(200)),
)
train_dataset, val_dataset = small_train, small_val

In [None]:
# Tokenize both splits in batches; the original text columns are dropped so
# only the fields produced by `preprocess` remain.
tokenized_train = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=train_dataset.column_names,
)
tokenized_val = val_dataset.map(
    preprocess,
    batched=True,
    remove_columns=val_dataset.column_names,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Data collator: batches tokenized examples for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
!pip install evaluate -q

In [None]:
# ROUGE scorer used by `compute_metrics` to compare decoded predictions
# against decoded reference summaries.
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        eval_pred: Pair `(predictions, labels)` as handed over by the trainer.
            `predictions` may be token ids, raw logits, or a tuple whose first
            element is one of those; `labels` are token ids in which padded
            positions may be marked with -100.

    Returns:
        The dictionary produced by `rouge.compute` for the decoded texts.
    """
    predictions, labels = eval_pred

    # A plain `Trainer` forwards the model outputs, which can be a tuple
    # (logits first) rather than a single array.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # Logits of shape (batch, seq, vocab) are reduced to the most likely
    # token id per position so they can be decoded.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 is the ignore-index used for label padding and is not a valid
    # token id, so it is swapped for the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge.compute(predictions=decoded_preds, references=decoded_labels)

In [None]:
# Training arguments
# NOTE: no evaluation schedule is configured here, so `compute_metrics` only
# runs when `trainer.evaluate()` is called explicitly. `predict_with_generate`
# is an option of `Seq2SeqTrainingArguments`, not of `TrainingArguments`.
args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    # fp16 mixed precision needs a GPU; enable it only when CUDA is present so
    # the notebook still runs on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

In [None]:
!pip install -U transformers


Collecting transformers
  Downloading transformers-4.51.1-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.1-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.50.3
    Uninstalling transformers-4.50.3:
      Successfully uninstalled transformers-4.50.3
Successfully installed transformers-4.51.1


In [None]:
# Trainer
# Arguments are passed by keyword for clarity. The tokenizer is supplied via
# `processing_class`, which replaces the deprecated `tokenizer=` argument in
# recent transformers releases.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Run fine-tuning; the returned training summary is shown as the cell output
train_result = trainer.train()
train_result

Step,Training Loss


TrainOutput(global_step=250, training_loss=2.190537109375, metrics={'train_runtime': 58.6042, 'train_samples_per_second': 17.064, 'train_steps_per_second': 4.266, 'total_flos': 291577879756800.0, 'train_loss': 2.190537109375, 'epoch': 1.0})

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side
save_dir = "./finetuned-bart-cnn"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./finetuned-bart-cnn/tokenizer_config.json',
 './finetuned-bart-cnn/special_tokens_map.json',
 './finetuned-bart-cnn/vocab.json',
 './finetuned-bart-cnn/merges.txt',
 './finetuned-bart-cnn/added_tokens.json',
 './finetuned-bart-cnn/tokenizer.json')

In [None]:
import random

In [None]:
print("\n\n==== Testing on Random Article ====\n")

# Draw one example from the (untokenized) validation subset and show its
# source text next to the human-written reference summary.
sample = random.choice(val_dataset)
article_text, reference_summary = sample['document'], sample['summary']
print("\n\033[1mOriginal Article:\033[0m\n", article_text)
print("\n\033[1mReference Summary:\033[0m\n", reference_summary)




==== Testing on Random Article ====


[1mOriginal Article:[0m
 The hyperbaric chamber, which treats divers with "the bends", was operated by St John's Ambulance on a donation basis until it broke in April 2014.
The health department replaced it in 2015, but says it needs to "balance the books".
Diving instructor Steve Bougourd said he was "gobsmacked".
"I'm just worried that this kind of cost will put people off of actually going to the [hospital] and notifying them if they suspect a problem," he said.
"We may find it's going to be very expensive to get out divers insured."
In the UK hyperbaric oxygen treatment is covered by the NHS, but Guernsey has its own health care system.
Source: NHS
Assistant director at Guernsey's health and social care department (HSC) Ed Freestone said renting the chamber was costing the government £60,000 a year.
He said the department would not make a profit from the new charges, which were based on "the average usage that we could identify over the pre

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# `generate` does not change the module mode itself; switch to eval so that
# dropout is disabled when summarizing right after training.
model.eval()

# Tokenize with the same truncation limit used in preprocessing and move the
# tensors to the model's device.
inputs = tokenizer(sample['document'], return_tensors="pt", max_length=max_input_length, truncation=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Generate summary; unpacking `inputs` passes the attention mask along with
# the input ids.
summary_ids = model.generate(**inputs, max_length=64, num_beams=4, early_stopping=True)
predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\n\033[1mPredicted Summary:\033[0m\n", predicted_summary)


[1mPredicted Summary:[0m
 The health department of Guernsey is planning to buy its own hyperbaric chamber to pay £60,000 a year.
