# Training a Summarization Model

Now let's see how we can use `HuggingFace` to train a summarization model on a new dataset. We'll use the SAMSum dataset.

In [1]:
from datasets import load_dataset


dataset_samsum = load_dataset("samsum")
split_lengths = [len(dataset_samsum[split]) for split in dataset_samsum]

print(f"Split lengths: {split_lengths}")
print(f"Features: {dataset_samsum['train'].column_names}")
print(f"\nDialogue:")
print(dataset_samsum["test"][0]["dialogue"])
print("\nSummary")
print(dataset_samsum["test"][0]["summary"])

Found cached dataset samsum (/home/alex/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)


  0%|          | 0/3 [00:00<?, ?it/s]

Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summary
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [2]:
from transformers import pipeline

# Evaluate this using PEGASUS
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail", framework='pt')
pipe_out = pipe(dataset_samsum["test"][0]["dialogue"])
print("Summary:")
print(pipe_out[0]["summary_text"].replace(" .<n>", ".\n"))

2023-04-25 21:15:54.103802: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Your max_length is set to 128, but you input_length is only 122. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


Summary:
Amanda: Ask Larry Amanda: He called her last time we were at the park together.
Hannah: I'd rather you texted him.
Amanda: Just text him .


# Evaluating the entire test set

We will need a way to compare the baseline PEGASUS model to the finetuned version. We'll create an evaluation loop for this.

In [2]:
from tqdm import tqdm
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

def chunks(list_of_elements, batch_size):
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

def evaluate_summaries(dataset, metric, model, tokenizer,
                       batch_size=16, device=device,
                       column_text="article", column_summary="highlights"):
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                        padding="max_length", return_tensors="pt")

        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)

        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                             for s in summaries]

        decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]
        
    return metric.compute(predictions=decoded_summaries, references=target_batch)

In [3]:
# Load the model directly
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_ckpt = "ainize/bart-base-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

2023-04-25 21:23:15.223026: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
import evaluate

rouge_metric = evaluate.load("rouge")
score = evaluate_summaries(dataset_samsum["test"], rouge_metric, model,
                           tokenizer, column_text="dialogue",
                           column_summary="summary", batch_size=8)

100%|██████████| 103/103 [04:32<00:00,  2.64s/it]


In [7]:
import pandas as pd

pd.DataFrame(score, index=["bart"])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
bart,0.383839,0.234127,0.347803,0.383839


In order to fine tune this model, we need to be able to tokenize the data. We can also limit the lengths of each dialogue and summary to 1024 and 128, respectively.

In [8]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["dialogue"], truncation=True,
                                max_length=1024)

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["summary"], max_length=128,
                                     truncation=True)

    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}

dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features,
                                       batched=True)

columns = ["input_ids", "labels", "attention_mask"]
dataset_samsum_pt.set_format(type="torch", columns=columns)

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

# Preparing a batch of data

When training `seq2seq` models, we need to apply "teacher forcing". The encoder will receive input tokens using the labels shifted by one as well as the encoder output. The prediction is then compared to the shifted labels to calculate the loss. To clarify, the decoder only sees the previous ground truth labels.

`HuggingFace` provides a `DataCollatorForSeq2Seq` class that handles this for us.

In [9]:
from transformers import DataCollatorForSeq2Seq

seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [10]:
from transformers import TrainingArguments, Trainer

# Gradient accumulation saves memory by updating the model only every X batches
training_args = TrainingArguments(
    output_dir="bart-samsum", num_train_epochs=1, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10, push_to_hub=False,
    evaluation_strategy="steps", eval_steps=500, save_steps=1e6,
    gradient_accumulation_steps=16)

In [11]:
trainer = Trainer(model=model, args=training_args,
                  tokenizer=tokenizer, data_collator=seq2seq_collator,
                  train_dataset=dataset_samsum_pt["train"],
                  eval_dataset=dataset_samsum_pt["validation"])

trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: dialogue, summary, id. If dialogue, summary, id are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14732
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 920
  Number of trainable parameters = 139420416
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33majdillhoff[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/920 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.9587, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}
{'loss': 2.7171, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}
{'loss': 2.5365, 'learning_rate': 3e-06, 'epoch': 0.03}
{'loss': 2.3373, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.04}
{'loss': 2.245, 'learning_rate': 5e-06, 'epoch': 0.05}
{'loss': 2.1459, 'learning_rate': 6e-06, 'epoch': 0.07}
{'loss': 2.0281, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.08}
{'loss': 1.916, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.09}
{'loss': 2.0147, 'learning_rate': 9e-06, 'epoch': 0.1}
{'loss': 1.9108, 'learning_rate': 1e-05, 'epoch': 0.11}
{'loss': 1.968, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.12}
{'loss': 1.8644, 'learning_rate': 1.2e-05, 'epoch': 0.13}
{'loss': 1.8956, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.14}
{'loss': 1.8825, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.15}
{'loss': 1.8367, 'learning_rate': 1.5e-05, 'epoch': 0.16}
{'loss': 1.9063, 'learning_

The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: dialogue, summary, id. If dialogue, summary, id are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 818
  Batch size = 1


{'loss': 1.7254, 'learning_rate': 5e-05, 'epoch': 0.54}


  0%|          | 0/818 [00:00<?, ?it/s]

{'eval_loss': 1.5479345321655273, 'eval_runtime': 11.685, 'eval_samples_per_second': 70.004, 'eval_steps_per_second': 70.004, 'epoch': 0.54}
{'loss': 1.8183, 'learning_rate': 4.880952380952381e-05, 'epoch': 0.55}
{'loss': 1.7176, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.56}
{'loss': 1.7989, 'learning_rate': 4.642857142857143e-05, 'epoch': 0.58}
{'loss': 1.7256, 'learning_rate': 4.523809523809524e-05, 'epoch': 0.59}
{'loss': 1.7455, 'learning_rate': 4.404761904761905e-05, 'epoch': 0.6}
{'loss': 1.6998, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.61}
{'loss': 1.6311, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.62}
{'loss': 1.6529, 'learning_rate': 4.047619047619048e-05, 'epoch': 0.63}
{'loss': 1.6794, 'learning_rate': 3.928571428571429e-05, 'epoch': 0.64}
{'loss': 1.6679, 'learning_rate': 3.809523809523809e-05, 'epoch': 0.65}
{'loss': 1.5988, 'learning_rate': 3.690476190476191e-05, 'epoch': 0.66}
{'loss': 1.6306, 'learning_rate': 3.571428571428572e-05, 'epoch': 0



Training completed. Do not forget to share your model on huggingface.co/models =)




{'loss': 1.5999, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 844.0132, 'train_samples_per_second': 17.455, 'train_steps_per_second': 1.09, 'train_loss': 1.776882411086041, 'epoch': 1.0}


TrainOutput(global_step=920, training_loss=1.776882411086041, metrics={'train_runtime': 844.0132, 'train_samples_per_second': 17.455, 'train_steps_per_second': 1.09, 'train_loss': 1.776882411086041, 'epoch': 1.0})

In [12]:
# Evaluate after finetuning
score = evaluate_summaries(
    dataset_samsum["test"], rouge_metric, trainer.model, tokenizer,
    batch_size=2, column_text="dialogue", column_summary="summary")
pd.DataFrame(score, index=[f"bart_finetuned"])

100%|██████████| 410/410 [02:40<00:00,  2.55it/s]


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
bart_finetuned,0.4375,0.133333,0.25,0.25


In [13]:
sample_text = dataset_samsum["test"][0]["dialogue"]
reference = dataset_samsum["test"][0]["summary"]

inputs = tokenizer(sample_text, max_length=1024, truncation=True,
                   padding="max_length", return_tensors="pt")

summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                           attention_mask=inputs["attention_mask"].to(
    device),
    length_penalty=0.8, num_beams=8, max_length=128)

decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                      clean_up_tokenization_spaces=True)
                     for s in summaries]

decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]


In [15]:
print(decoded_summaries)

["Amanda can't find Betty's number."]
