## Goals
* Fine-tune T5 on a specific dataset (BillSum) for abstractive summarization
* Use the model for inference

In [None]:
# install the libraries
!pip install transformers datasets evaluate rouge_score

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub. The training cell sets push_to_hub=True and the
# trainer is pushed to the Hub afterwards, so a login is needed before then.
notebook_login()

In [None]:
# upgrade the `datasets` package to its latest release
!pip install -U datasets

In [None]:
# Load the dataset: only the "ca_test" split of BillSum is requested here;
# it is divided into training and test portions in the next cell.
from datasets import load_dataset

data = load_dataset("billsum", split="ca_test")

In [None]:
# Split the dataset into train and test chunks: 80% training, 20% test.
# A fixed seed keeps the split identical across notebook runs, so evaluation
# scores from different runs are measured on the same test examples.
data = data.train_test_split(test_size=0.2, seed=42)

In [None]:
print(data)  # Show the resulting train/test splits

In [None]:
# Print the first training example to inspect its fields
print(data['train'][0])

In [None]:
# Preprocessing
# Dataset fields used by the preprocessing step:
#   text    = the text of the bill, used as input to the model
#   summary = a condensed version of text, used as the target for the model
from transformers import AutoTokenizer

# Load the tokenizer belonging to the T5 checkpoint that will be fine-tuned
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [None]:
# Preprocessing: turn raw bills and their summaries into token ids.
# Every input is prefixed with a prompt so T5 knows this is a summarization task.
prefix = "summarize: "


def preprocess_function(examples):
  """Tokenize a batch of examples into model inputs and labels.

  Args:
    examples: Mapping with a "text" entry (iterable of documents) and a
      "summary" entry (the matching target summaries).

  Returns:
    The tokenizer output for the prefixed documents, with an added
    "labels" entry holding the token ids of the summaries.
  """
  # Inputs: prompt + document, truncated to at most 1024 tokens.
  encoded = tokenizer(
    [prefix + document for document in examples["text"]],
    max_length=1024,
    truncation=True,
  )

  # Targets: passed through the text_target keyword so they are tokenized
  # as labels, truncated to at most 128 tokens.
  target_encoding = tokenizer(
    text_target=examples["summary"],
    max_length=128,
    truncation=True,
  )
  encoded["labels"] = target_encoding["input_ids"]

  return encoded


In [None]:
# Use the map method to apply preprocess_function over the dataset;
# batched=True hands the function many examples per call instead of one at a time.
tokenized_data = data.map(preprocess_function, batched=True)

In [None]:
from transformers import DataCollatorForSeq2Seq
# Create the collator that assembles individual tokenized examples into batches.
# NOTE(review): `model` receives the checkpoint name (a string), not a loaded model
# object - confirm against the DataCollatorForSeq2Seq docs which one is expected.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
# Load the ROUGE metric so it can be reported during training;
# it is helpful for evaluating the model's summarization performance.

import evaluate
rouge = evaluate.load("rouge")

In [None]:
# Function that decodes the predictions and labels and passes them to `rouge.compute` to calculate the ROUGE metric
import numpy as np

def compute_metrics(eval_pred):
  """Compute ROUGE scores and the mean generated length for one evaluation pass.

  Args:
    eval_pred: A (predictions, labels) pair of token-id arrays. Positions
      holding -100 in either array are treated as padding.

  Returns:
    dict: The ROUGE results plus "gen_len" (mean number of non-pad tokens
    per prediction), every value rounded to 4 decimal places.
  """
  predictions, labels = eval_pred
  # Predictions may arrive as a tuple of outputs; use the first element
  # (assumed to be the generated token ids).
  if isinstance(predictions, tuple):
    predictions = predictions[0]

  pad_id = tokenizer.pad_token_id
  # -100 is a padding sentinel, not a vocabulary id, so it cannot be decoded.
  # Replace it with the pad token in BOTH arrays before decoding.
  predictions = np.where(predictions != -100, predictions, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

  # Length of each prediction, counting only non-pad tokens.
  prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
  result["gen_len"] = np.mean(prediction_lens)

  return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Training setup.
# First load the pretrained T5 checkpoint with AutoModelForSeq2SeqLM
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
from transformers import Seq2SeqTrainer

# Training hyperparameters, grouped by purpose.
training_args = Seq2SeqTrainingArguments(
    # Output and reporting
    output_dir="my_awesome_summarization_model",
    save_total_limit=3,
    push_to_hub=True,
    report_to="none",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
    per_device_train_batch_size=9,
    per_device_eval_batch_size=9,
    fp16=False,  # Change bf16=True for XPU
    # Evaluation
    eval_strategy="epoch",
    predict_with_generate=True,
)

# Wire the model, datasets, tokenizer, collator and metric function together.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
)

# Run the fine-tuning.
trainer.train()

In [None]:
# Upload the result of training to the Hugging Face Hub
trainer.push_to_hub()

### Inference

In [None]:
# The fine-tuned model can now be used for inference.
# First, define a text to be summarized (an excerpt from the Snow White story).
# For summarization the input must start with the "summarize: " prefix, as shown:
text = "summarize: Once upon a time, in a faraway kingdom, there was a kind and beautiful princess named Snow White. She had skin as white as snow, lips as red as roses, and hair as black as coal. But she lived with her stepmother, the Queen, who was beautiful on the outside but jealous and cruel on the inside."

In [None]:
from transformers import pipeline

# Build a summarization pipeline from the model stored on the Hub and run it on the text
summarizer = pipeline("summarization", model="CanerCoban/my_awesome_summarization_model")
summarizer(text)

In [None]:
# You can also manually replicate the results of the pipeline.
# Tokenize the text and return the input_ids as PyTorch tensors:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("CanerCoban/my_awesome_summarization_model")
inputs = tokenizer(text, return_tensors="pt").input_ids

In [None]:
# Use the generate method to create the summary.
# max_new_tokens=100 limits how many tokens are generated; do_sample=False turns sampling off.
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("CanerCoban/my_awesome_summarization_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [None]:
# Decode the first generated sequence of token ids back into text, dropping special tokens:
tokenizer.decode(outputs[0], skip_special_tokens=True)
