# Leveraging Huggingface's Encoder Decoder Framework for Abstractive Summarisation

In [1]:
# !rm seq2seq_trainer.py
# !wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/seq2seq/seq2seq_trainer.py

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
from dataclasses import dataclass, field
from typing import Optional

import pandas as pd
import torch
import datasets
from datasets import Dataset, load_dataset, load_metric
from sklearn.model_selection import train_test_split
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel, BertTokenizerFast, TrainingArguments

from seq2seq_trainer import Seq2SeqTrainer

## Data Preparation

In [4]:
# Rows tokenised per `map` batch, and the token budgets used for articles (encoder) and summaries (decoder)
tokenisation_batch_size = 1024
encoder_max_length, decoder_max_length = 512, 128

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
# Reuse the tokenizer's CLS and SEP tokens as its BOS and EOS tokens
tokenizer.bos_token, tokenizer.eos_token = tokenizer.cls_token, tokenizer.sep_token

In [5]:
# Keep only the last 40,000 rows of the corpus and replace missing values with empty strings
df = (
    pd.read_csv('../../data/filtered_sources_with_bodies.csv')
    .tail(40000)
    .fillna("")
)

# 80% train; the remaining 20% is halved into validation and test sets
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(holdout_df, test_size=0.5, random_state=42)

train_data = Dataset.from_pandas(train_df)
val_data = Dataset.from_pandas(val_df)

In [6]:
def process_data_to_model_inputs(batch):
    """Tokenise a batch of articles and summaries into encoder/decoder inputs.

    Articles are padded/truncated to `encoder_max_length` tokens and summaries
    to `decoder_max_length` tokens. The batch is updated in place with
    `input_ids`, `attention_mask`, `decoder_input_ids`,
    `decoder_attention_mask` and `labels`, and then returned.
    """
    encoded_articles = tokenizer(
        batch["article_content"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
    )
    encoded_summaries = tokenizer(
        batch["summary"],
        padding="max_length",
        truncation=True,
        max_length=decoder_max_length,
    )

    batch["input_ids"] = encoded_articles.input_ids
    batch["attention_mask"] = encoded_articles.attention_mask
    batch["decoder_input_ids"] = encoded_summaries.input_ids
    batch["decoder_attention_mask"] = encoded_summaries.attention_mask

    # Because BERT automatically shifts the labels, the labels correspond exactly
    # to `decoder_input_ids`. PAD positions are replaced with -100 so that they
    # are ignored.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in summary_ids]
        for summary_ids in encoded_summaries.input_ids
    ]

    return batch

# Raw text columns dropped after tokenisation, and the encoded columns handed to the model
_RAW_COLUMNS = ["title", "article_content", "summary", "url", "date"]
_MODEL_COLUMNS = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]


def _encode_split(split):
    """Tokenise one dataset split and expose its model-input columns as torch tensors.

    The same steps are applied to the training and validation splits, so they
    live in one place instead of being copy-pasted per split.
    """
    encoded = split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=tokenisation_batch_size,
        remove_columns=_RAW_COLUMNS,
    )
    encoded.set_format(type="torch", columns=_MODEL_COLUMNS)
    return encoded


train_data = _encode_split(train_data)
val_data = _encode_split(val_data)

# Persist the encoded splits so the tokenisation step can be skipped on later runs
train_data.save_to_disk("data/encoded_curation_corpus_large_train_data")
val_data.save_to_disk("data/encoded_curation_corpus_large_val_data")

HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [8]:
# train_data = datasets.load_from_disk("data/encoded_curation_corpus_large_train_data")
# val_data = datasets.load_from_disk("data/encoded_curation_corpus_large_val_data")

## Modelling

We could create our bert2bert model as shown in the commented-out line below. But since Patrick von Platen has already trained a model on the CNN/DailyMail dataset, we might as well do some transfer learning, so we'll use that model's weights as our starting point.

In [9]:
# bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
# Warm-start from the "patrickvonplaten/bert2bert_cnn_daily_mail" checkpoint instead of
# pairing two "bert-base-uncased" models as in the commented-out line above.
bert2bert = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")


In [10]:
model_config = bert2bert.config

# Special tokens: take the start / end / padding ids from the tokenizer
model_config.decoder_start_token_id = tokenizer.bos_token_id
model_config.eos_token_id = tokenizer.eos_token_id
model_config.pad_token_id = tokenizer.pad_token_id

# The top-level vocabulary size mirrors the decoder's
model_config.vocab_size = model_config.decoder.vocab_size

# Sensible parameters for beam search
model_config.max_length = 142
model_config.min_length = 56
model_config.no_repeat_ngram_size = 3
model_config.early_stopping = True
model_config.length_penalty = 2.0
model_config.num_beams = 4

In [11]:
@dataclass
class Seq2SeqTrainingArguments(TrainingArguments):
    """`TrainingArguments` extended with extra sequence-to-sequence options.

    Adds label smoothing, sortish sampling, generation-based evaluation,
    Adafactor, dropout overrides and the learning-rate scheduler choice.
    Each field's `help` metadata describes the option.
    """

    label_smoothing: Optional[float] = field(
        default=0.0, metadata={"help": "The label smoothing epsilon to apply (if not zero)."}
    )
    sortish_sampler: bool = field(default=False, metadata={"help": "Whether to SortishSamler or not."})
    predict_with_generate: bool = field(
        default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
    )
    adafactor: bool = field(default=False, metadata={"help": "whether to use adafactor"})
    encoder_layerdrop: Optional[float] = field(
        default=None, metadata={"help": "Encoder layer dropout probability. Goes into model.config."}
    )
    decoder_layerdrop: Optional[float] = field(
        default=None, metadata={"help": "Decoder layer dropout probability. Goes into model.config."}
    )
    dropout: Optional[float] = field(default=None, metadata={"help": "Dropout probability. Goes into model.config."})
    attention_dropout: Optional[float] = field(
        default=None, metadata={"help": "Attention dropout probability. Goes into model.config."}
    )
    # Plain string literal: the help text has no placeholders, so no f-prefix is needed
    lr_scheduler: Optional[str] = field(
        default="linear", metadata={"help": "Which lr scheduler to use."}
    )

In [12]:
rouge = load_metric("rouge")


def compute_metrics(pred):
    """Score predicted summaries against the reference summaries with ROUGE-2.

    `pred.predictions` and `pred.label_ids` are decoded back to text with the
    tokenizer. Label positions holding -100 are overwritten in place with the
    pad token id before decoding. Returns the mid ROUGE-2 precision, recall
    and F-measure, each rounded to 4 decimal places.
    """
    predicted_text = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)

    # Swap the -100 placeholders back to the pad id so the labels can be decoded
    reference_ids = pred.label_ids
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id
    reference_text = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    scores = rouge.compute(
        predictions=predicted_text,
        references=reference_text,
        rouge_types=["rouge2"],
    )
    rouge2 = scores["rouge2"].mid

    return {
        "rouge2_precision": round(rouge2.precision, 4),
        "rouge2_recall": round(rouge2.recall, 4),
        "rouge2_fmeasure": round(rouge2.fmeasure, 4),
    }

On a single V100 GPU this takes about 1 hour per epoch. I've only run it for about 10 minutes because I'm impatient, but you could run it for as long as you like!

In [14]:
training_args = Seq2SeqTrainingArguments(
    # where checkpoints are written, and how many are kept
    output_dir="./models",
    overwrite_output_dir=True,
    save_steps=1000,
    save_total_limit=3,
    # what to run
    do_train=True,
    do_eval=True,
    evaluate_during_training=True,
    predict_with_generate=True,
    # batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # step schedule
    warmup_steps=2000,
    logging_steps=1000,
    eval_steps=8000,
    # precision
    fp16=True,
)

trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)
trainer.train()

## Inference

In [16]:
# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = EncoderDecoderModel.from_pretrained("./models/checkpoint-3000")
model.to(device)

test_data = Dataset.from_pandas(test_df)
# Evaluate on (at most) the first 64 test rows to keep generation quick
test_data = test_data.select(range(min(64, len(test_data))))

def generate_summary(batch):
    """Generate a summary for every article in `batch`.

    Articles are tokenised (padded/truncated to 512 tokens), moved to `device`
    and passed to `model.generate`. The decoded summaries are stored under
    `batch["pred"]` and the batch is returned.
    """
    inputs = tokenizer(batch["article_content"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    outputs = model.generate(input_ids, attention_mask=attention_mask)
    batch["pred"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return batch

results = test_data.map(generate_summary, batched=True, batch_size=16, remove_columns=["article_content"])

pred_str = results["pred"]
label_str = results["summary"]

# ROUGE-2 of the generated summaries against the reference summaries
rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

print(rouge_output)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Score(precision=0.18080042450138092, recall=0.14232230690271044, fmeasure=0.1561764867738723)


In [17]:
test_data[0]['article_content']

'\n\nUber is the biggest ride-sharing service in the world. It’s already present in countless countries and continues to grow. It does have to deal with stringent local regulations and other issues in tough markets which often lead to the company have to take a swift decision. It took one such decision in Abu Dhabi – the capital of the United Arab Emirates – by temporarily suspending its service after some drivers were reportedly detained.\n\nAdvertising\n\nThe issue reportedly stems from a dispute between regulators and regional rival Careem. Uber is said to have suspended its service in the city after some drivers working for Careem were detained.\n\nIt’s believed that as many as eight Careem drivers have been detained, while local media reports suggest that up to 50 drivers working for both Uber and Careem have been arrested so far.\n\n“Uber made the decision to temporarily suspend services due to some unforeseen circumstances. Our goal is to have operations up and running as soon a

In [18]:
label_str[0]

'Ride-hailing services Uber and Careem have "temporarily suspended" operations in the United Arab Emirates\' capital Abu Dhabi, following the arrest of between eight and 50 drivers, according to unnamed sources. It is believed the drivers were detained because of "violations of regulations," but it is unclear which regulations are involved.\n'

In [19]:
pred_str[0]

'uber is the biggest ride - sharing service in the world. it took one such decision in abu dhabi, temporarily suspending its service after some drivers were detained. it is believed that as many as eight careem drivers have been detained, while local media reports suggest that up to 50 drivers working for both uber and careem have been arrested. the issue is believed to have stemmed from a dispute between regulators and regional rival careem.'