In [None]:
!pip install datasets
!pip install transformers
!pip install rouge_score


# Only needed on Colab due to bug in the installed version of accelerate and old version of urllib3
!pip uninstall accelerate
!pip install accelerate
!pip install --upgrade urllib3

In [2]:
import datasets
from transformers import EncoderDecoderModel, BertTokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
import torch
import rouge_score
import os
from google.colab import files
import json
import pandas as pd
from IPython.display import display, HTML
from datasets import ClassLabel


## Data Pre-Processing

In [3]:
# Using the GPU
device = torch.device("cuda")

In [19]:
train_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train")
val_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation")

In [5]:
# T5 Model declaration
checkpoint = "t5-small"


tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

pad_on_right = tokenizer.padding_side == "right"
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
# Prepare inputs for T5
batch_size = 4
prefix = "summarize: "


def t5_preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["highlights"], max_length=64, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [20]:
train_data = train_data.select(range(1000))
val_data = val_data.select(range(100))

In [21]:
# Map T5 Tokenizer

train_data = train_data.map(

    t5_preprocess_function,
    batched = True,
    batch_size = batch_size,
    remove_columns = ["article", "highlights", "id"]
)

val_data = val_data.map(
    t5_preprocess_function,
    batched=True,
    batch_size=batch_size,
    remove_columns=["article", "highlights", "id"]
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [15]:
# Define evaluation metrics
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [16]:
os.environ["WANDB_DISABLED"] = "true"

In [17]:
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.max_length = 1024
model.config.min_length = 56
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 4
model.config.decoder_start_token_id = tokenizer.pad_token_id

In [None]:
# T5 Training
training_args = Seq2SeqTrainingArguments(
    output_dir="summary_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    seed = 25,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge2 Precision,Rouge2 Recall,Rouge2 Fmeasure
1,No log,1.967669,0.0419,0.1738,0.0666
2,2.005500,1.968779,0.042,0.1758,0.0668


In [24]:
d = trainer.state.log_history
file = open("log_history_summary_model.json", "w")
json.dump(d, file)
file.close()

In [25]:
trainer.save_model("./trained_summary")
model.from_pretrained("./trained_summary")
print("Model Loaded")

Model Loaded


In [65]:
!zip -r /content/trained_summary /content/trained_summary/
files.download("/content/trained_summary.zip")

updating: content/trained_summary/ (stored 0%)
updating: content/trained_summary/generation_config.json (deflated 35%)
updating: content/trained_summary/training_args.bin (deflated 48%)
updating: content/trained_summary/special_tokens_map.json (deflated 86%)
updating: content/trained_summary/pytorch_model.bin (deflated 9%)
updating: content/trained_summary/tokenizer_config.json (deflated 83%)
updating: content/trained_summary/config.json (deflated 64%)
updating: content/trained_summary/tokenizer.json (deflated 74%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Evaluation

In [48]:
test_data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="test")
test_data = test_data.select(range(500))

In [59]:
def generate_summary(batch):
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=1024, return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=64)

    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred_summary"] = output_str

    return batch

In [60]:
batch_size = 8

results = test_data.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["article"])

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [61]:
rouge.compute(predictions=results["pred_summary"], references=results["highlights"], rouge_types=["rouge2"])["rouge2"].mid

Score(precision=0.10946256711985802, recall=0.14364300650072748, fmeasure=0.12153760972642329)

In [62]:
results['pred_summary'][50]

'a group of individuals, but groups generally cannot sue for defamation. The identity of the aspiring plaintiff matters. There are strict rules about who can be a plaintiff in a defaamation action. Defamatory statements are not actionable by the'

In [63]:
results['highlights'][50]

'An outside review found that a Rolling Stone article about campus rape was "deeply flawed"\nDanny Cevallos says that there are obstacles to a successful libel case, should one be filed .'