# Summarization (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs
!pip install tensorflow==2.14
!pip install rouge_score
!pip install nltk
!pip install torchinfo

[31mERROR: torch_xla-1.9-cp37-cp37m-linux_x86_64.whl is not a supported wheel on this platform.[0m[31m
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 10 not upgraded.


You will need to set up Git: replace the email and name in the following cell with your own.

In [None]:
!git config --global user.email "deshler.dylan@gmail.com"
!git config --global user.name "Dylan Deshler"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset

raw_dataset = load_dataset("xsum")

In [None]:
def show_samples(dataset, num_samples=3, seed=42):
    """Print the summary and document of a few shuffled training examples.

    The "train" split of `dataset` is shuffled with `seed`, and the first
    `num_samples` rows of the shuffled split are printed.
    """
    shuffled_train = dataset["train"].shuffle(seed=seed)
    picked_rows = shuffled_train.select(range(num_samples))
    for row in picked_rows:
        summary_text = row["summary"]
        document_text = row["document"]
        print(f"\n'>> Summary: {summary_text}'")
        print(f"'>> Document: {document_text}'")


show_samples(raw_dataset)


'>> Summary: As Chancellor George Osborne announced all English state schools will become academies, the Welsh Government continues to reject the model here.'
'>> Document: In Wales, councils are responsible for funding and overseeing schools.
But in England, Mr Osborne's plan will mean local authorities will cease to have a role in providing education.
Academies are directly funded by central government and head teachers have more freedom over admissions and to change the way the school works.
It is a significant development in the continued divergence of schools systems on either side of Offa's Dyke.
And although the Welsh Government will get extra cash to match the money for English schools to extend the school day, it can spend it on any devolved policy area.
Ministers have no plans to follow suit.
At the moment, governing bodies are responsible for setting school hours and they need ministerial permission to make significant changes.
There are already more than 2,000 secondary ac

In [None]:
raw_dataset.reset_format()

In [None]:
from transformers import AutoTokenizer

model_checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
inputs = tokenizer("I loved reading the Hunger Games!")
inputs

{'input_ids': [27, 1858, 1183, 8, 26049, 5880, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['▁I', '▁loved', '▁reading', '▁the', '▁Hunger', '▁Games', '!', '</s>']

In [None]:
max_input_length = 1024
max_target_length = 128


def preprocess_function(examples):
    """Tokenize a batch of documents and attach the tokenized summaries as labels.

    The "document" entries are truncated to `max_input_length` tokens and the
    "summary" entries to `max_target_length` tokens. The document encoding is
    returned with an extra "labels" entry holding the summary input IDs.
    """
    encoded_documents = tokenizer(
        examples["document"], max_length=max_input_length, truncation=True
    )
    encoded_summaries = tokenizer(
        examples["summary"], max_length=max_target_length, truncation=True
    )
    encoded_documents["labels"] = encoded_summaries["input_ids"]
    return encoded_documents

In [None]:
tokenized_datasets = raw_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/11332 [00:00<?, ? examples/s]

In [None]:
generated_summary = "I absolutely loved reading the Hunger Games"
reference_summary = "I loved reading the Hunger Games"

In [None]:
import evaluate

rouge_score = evaluate.load("rouge")

In [None]:
scores = rouge_score.compute(
    predictions=[generated_summary], references=[reference_summary]
)
scores

{'rouge1': 0.923076923076923,
 'rouge2': 0.7272727272727272,
 'rougeL': 0.923076923076923,
 'rougeLsum': 0.923076923076923}

In [None]:
scores["rouge1"]

0.923076923076923

In [None]:
!pip install nltk



In [None]:
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from nltk.tokenize import sent_tokenize


def three_sentence_summary(text):
    """Return the first three sentences of `text`, one sentence per line."""
    sentences = sent_tokenize(text)
    leading_sentences = sentences[:3]
    return "\n".join(leading_sentences)


print(three_sentence_summary(raw_dataset["train"][1]["document"]))

A fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST on Saturday and guests were asked to leave the hotel.
As they gathered outside they saw the two buses, parked side-by-side in the car park, engulfed by flames.
One of the tour groups is from Germany, the other from China and Taiwan.


In [None]:
def evaluate_baseline(dataset, metric):
    """Score the three-sentence baseline on `dataset` with `metric`.

    Every entry of `dataset["document"]` is reduced with
    `three_sentence_summary`, and the results are compared against
    `dataset["summary"]` via `metric.compute`, whose return value is returned.
    """
    baseline_predictions = list(map(three_sentence_summary, dataset["document"]))
    references = dataset["summary"]
    return metric.compute(predictions=baseline_predictions, references=references)

In [None]:
import pandas as pd

# ROUGE variants to report for the three-sentence baseline
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
score = evaluate_baseline(raw_dataset["validation"], rouge_score)
# Express each score as a percentage rounded to two decimals
rouge_dict = {name: round(score[name] * 100, 2) for name in rouge_names}
rouge_dict

{'rouge1': 18.46, 'rouge2': 2.52, 'rougeL': 11.98, 'rougeLsum': 14.51}

In [None]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores, as percentages, for generated summaries.

    `eval_pred` is a `(predictions, labels)` pair of token-ID arrays. Both are
    decoded to text, re-joined with one sentence per line, and scored with
    `rouge_score` (stemming enabled). Every returned score is multiplied by
    100 and rounded to four decimals.
    """
    generated_ids, reference_ids = eval_pred
    # -100 entries in the labels cannot be decoded; swap in the pad token
    reference_ids = np.where(
        reference_ids != -100, reference_ids, tokenizer.pad_token_id
    )

    def to_rouge_lines(token_ids):
        # Decode to text; ROUGE expects a newline after each sentence
        texts = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return ["\n".join(sent_tokenize(text.strip())) for text in texts]

    rouge_preds = to_rouge_lines(generated_ids)
    rouge_refs = to_rouge_lines(reference_ids)
    raw_scores = rouge_score.compute(
        predictions=rouge_preds, references=rouge_refs, use_stemmer=True
    )
    # Scale to percentages and round
    return {name: round(value * 100, 4) for name, value in raw_scores.items()}

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
tokenized_datasets = tokenized_datasets.remove_columns(
    raw_dataset["train"].column_names
)

In [None]:
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  37,  423,  583,  ..., 1598,    5,    1],
        [  71, 1472, 6196,  ...,    0,    0,    0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[ 7433,    18,   413,  2673,    33,  6168,   640,     8, 12580, 17600,
             7,    11,   970,    51,    89,  2593,    11, 10987,    32,  1343,
           227, 18368,  2953,    57, 16133,  4937,     5,     1],
        [ 2759,  8548, 14264,    43,   118, 10932,    57,  1472,    16,     3,
             9, 18024,  1584,   739,  3211,    16, 27874,   690,  2050,     5,
             1,  -100,  -100,  -100,  -100,  -100,  -100,  -100]]), 'decoder_input_ids': tensor([[    0,  7433,    18,   413,  2673,    33,  6168,   640,     8, 12580,
         17600,     7,    11,   970,    51,    89,  2593,    11, 10987,    32,
          1343,   227, 18368,  2953,    57, 16133,  4937,     5],
        [    0,  2759,  8548, 14264,    43,   118, 10932,    57,  1472,    16,
        

In [None]:
tokenized_datasets.set_format("torch")

In [None]:
# `summary` comes from torchinfo (pip-installed in the first cell); without this
# import the call below raises NameError.
from torchinfo import summary

# Reload the pretrained checkpoint and print an overview of the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
summary(model)

In [None]:
from torch.utils.data import DataLoader

batch_size = 32
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], collate_fn=data_collator, batch_size=batch_size
)

In [None]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 10
# len(train_dataloader) is already the number of batches, i.e. the number of
# optimizer steps taken per epoch by the training loop. Dividing it by
# batch_size again would make the linear schedule reach a learning rate of 0
# long before training finishes.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear decay from the optimizer's initial learning rate, with no warmup
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
def postprocess_text(preds, labels):
    """Prepare decoded predictions and references for ROUGE.

    Each string is stripped of surrounding whitespace and re-joined with one
    sentence per line. Returns the processed `(preds, labels)` pair of lists.
    """

    def one_sentence_per_line(texts):
        # ROUGE expects a newline after each sentence
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    return one_sentence_per_line(preds), one_sentence_per_line(labels)

In [None]:
from huggingface_hub import create_repo, get_full_repo_name

output_dir = 'ddeshler/summarization'
# repo = create_repo(output_dir, private=True)

# model_name = "test-bert-finetuned-squad-accelerate"
# repo_name = get_full_repo_name(model_name)
# repo_name

In [None]:
# from huggingface_hub import Repository

# output_dir = "results-t5-finetuned-squad-accelerate"
# repo = Repository(output_dir, clone_from=repo_name)

In [None]:
from tqdm.auto import tqdm
import torch
import numpy as np
import os

# progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader, desc=f'[Epoch {epoch}] Training')):
        outputs = model(**batch)
        # Train on the loss returned by the model's forward pass. ROUGE cannot
        # serve as a loss here: compute_metrics expects a (predictions, labels)
        # pair of token IDs rather than a model output, and it returns rounded
        # Python floats that cannot be backpropagated.
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Evaluation
    model.eval()
    for step, batch in enumerate(tqdm(eval_dataloader, desc=f'[Epoch {epoch}] Testing')):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )

            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )

            # If we did not pad to max length, we need to pad the labels too
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute metrics over every batch added during this epoch's evaluation
    result = rouge_score.compute()
    # Convert the ROUGE scores to percentages and round them for display
    result = {key: round(value * 100, 4) for key, value in result.items()}
    print(f"Epoch {epoch}:", result)

    # Save a per-epoch checkpoint locally, then upload model and tokenizer
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    epoch_dir = os.path.join(output_dir, f'epoch_{epoch}')
    unwrapped_model.save_pretrained(epoch_dir, save_function=accelerator.save)
    tokenizer.save_pretrained(epoch_dir)

    tokenizer.push_to_hub('summarization', commit_message=f'Epoch {epoch} tokenizer training')
    unwrapped_model.push_to_hub('summarization', save_function=accelerator.save, commit_message=f'Epoch {epoch} model training')

[Epoch 0] Training:   0%|          | 0/6377 [00:00<?, ?it/s]

[Epoch 0] Testing:   0%|          | 0/355 [00:00<?, ?it/s]



Epoch 0: {'rouge1': 24.1079, 'rouge2': 5.6131, 'rougeL': 18.9415, 'rougeLsum': 18.9354}


[Epoch 1] Training:   0%|          | 0/6377 [00:00<?, ?it/s]

[Epoch 1] Testing:   0%|          | 0/355 [00:00<?, ?it/s]



Epoch 1: {'rouge1': 24.1079, 'rouge2': 5.6131, 'rougeL': 18.9415, 'rougeLsum': 18.9354}


[Epoch 2] Training:   0%|          | 0/6377 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
from transformers import pipeline

# hub_model_id = "huggingface-course/t5-small-finetuned-xsum"
# The pipeline is given a model object rather than a Hub ID, so it cannot look
# up the matching tokenizer by name; pass the tokenizer explicitly.
summarizer = pipeline("summarization", model=unwrapped_model, tokenizer=tokenizer)

In [None]:
def print_summary(idx):
    """Print test example `idx`: its document, true summary and generated summary.

    The document is taken from `raw_dataset["test"]` and summarized with
    `summarizer`; the first result's "summary_text" is printed.
    """
    example = raw_dataset["test"][idx]
    document = example["document"]
    reference = example["summary"]
    generated = summarizer(document)[0]["summary_text"]
    print(f"'>>> Document: {document}'")
    print(f"\n'>>> True Summary: {reference}'")
    print(f"\n'>>> Generated Summary: {generated}'")

In [None]:
print_summary(100)

In [None]:
print_summary(0)