## Goals
* Finetune T5 on a specific dataset for abstractive summarization
* Use the model for inference

In [None]:
# install the libraries
!pip install transformers datasets evaluate rouge_score

In [24]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# update datasets
!pip install -U datasets

In [4]:
# Load the dataset
from datasets import load_dataset

data = load_dataset("billsum", split="ca_test")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

ca_test-00000-of-00001.parquet:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [5]:
# split the dataset into train and test chunks %80 training %20 test
data = data.train_test_split(test_size=0.2)

In [6]:
print(data) # Training-test distribution

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 989
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 248
    })
})


In [7]:
# Example of data
print(data['train'][0])

{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 131019.5 of the Health and Safety Code is amended to read:\n131019.5.\n(a) For purposes of this section, the following definitions shall apply:\n(1) “Determinants of equity” means social, economic, geographic, political, and physical environmental conditions that lead to the creation of a fair and just society.\n(2) “Health equity” means efforts to ensure that all people have full and equal access to opportunities that enable them to lead healthy lives.\n(3) “Health and mental health disparities” means differences in health and mental health status among distinct segments of the population, including differences that occur by gender, age, race or ethnicity, sexual orientation, gender identity, education or income, disability or functional impairment, or geographic location, or the combination of any of these factors.\n(4) “Health and mental health inequities” means disparities in health or men

In [None]:
# text = the text of the bill as input to the model
# summary = a condensed version of text as the target for the model
# Preprocessing
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [9]:
# preprocessing function:
# 1-) Prefix the input with a prompt so T5 knows this is a summarization task.
prefix = "summarize: "
# 2-) Use the keyword text_target argument when tokenizing labels
def preprocess_function(examples):
  inputs = [prefix + doc for doc in examples["text"]]
  model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
  # 3-) Truncate sequences to be no longer than the maximum length set by the max_length parameter
  labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

  model_inputs["labels"] = labels["input_ids"]

  return model_inputs


In [10]:
# use the map method to preprocess function over the dataset
tokenized_data = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [11]:
from transformers import DataCollatorForSeq2Seq
# Create a batch example using Datacollatorforseq2seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [12]:
# we will include a metric during training (rouge)
# so it will be helpful for evaluating the model's performance

import evaluate
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [17]:
# The function that passes your predictions and labels to compute to calculate the ROUGE metric
import numpy as np

def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

  prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
  result["gen_len"] = np.mean(prediction_lens)

  return {k: round(v, 4) for k, v in result.items()}

In [18]:
# Now we are training our model.
# First load T5 with AutoModelForSeq2SeqLM
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [25]:
# Defining the training hyperparameters in Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_summarization_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=9,
    per_device_eval_batch_size=9,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=False, # Change bf16=True for XPU
    push_to_hub=True,
    report_to = "none"
)
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.39188,0.2052,0.1025,0.1707,0.1708,20.0
2,No log,2.36044,0.2052,0.1031,0.1717,0.172,20.0
3,No log,2.343434,0.2053,0.1021,0.1714,0.1715,20.0
4,No log,2.336984,0.2059,0.1034,0.1724,0.1724,20.0


TrainOutput(global_step=440, training_loss=2.5136141690340907, metrics={'train_runtime': 311.5544, 'train_samples_per_second': 12.698, 'train_steps_per_second': 1.412, 'total_flos': 1070824333246464.0, 'train_loss': 2.5136141690340907, 'epoch': 4.0})

In [28]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/CanerCoban/my_awesome_summarization_model/commit/013d5d9755d7ab52ecffa1cf76c6440162f888c2', commit_message='End of training', commit_description='', oid='013d5d9755d7ab52ecffa1cf76c6440162f888c2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/CanerCoban/my_awesome_summarization_model', endpoint='https://huggingface.co', repo_type='model', repo_id='CanerCoban/my_awesome_summarization_model'), pr_revision=None, pr_num=None)

### Inference

In [29]:
# We can use this model for inference now.
# First have a text to be summarized. (snow white story)
# For summarization you should prefix your input as shown:
text = "summarize: Once upon a time, in a faraway kingdom, there was a kind and beautiful princess named Snow White. She had skin as white as snow, lips as red as roses, and hair as black as coal. But she lived with her stepmother, the Queen, who was beautiful on the outside but jealous and cruel on the inside."

In [31]:
from transformers import pipeline

summarizer = pipeline("summarization", model="CanerCoban/my_awesome_summarization_model")
summarizer(text)

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Device set to use cuda:0
Your max_length is set to 200, but your input_length is only 77. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)


[{'summary_text': 'Snow White had skin as white as snow, lips as red as roses, and hair as black as coal. But she lived with her stepmother, the Queen, who was jealous and cruel on the inside.'}]

In [32]:
# you can also manually replicate the results of the pipeline
# Tokenize the text anad return the input_ids as PyTorch tensors:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("CanerCoban/my_awesome_summarization_model")
inputs = tokenizer(text, return_tensors="pt").input_ids

In [33]:
# use the generate method to create the summarization.
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("CanerCoban/my_awesome_summarization_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [34]:
# Decode the generated token ids back into text:
tokenizer.decode(outputs[0], skip_special_tokens=True)


'Snow White was a kind and beautiful princess in a faraway kingdom. She had skin as white as snow, lips as red as roses, and hair as black as coal. But she lived with her stepmother, the Queen, who was jealous and cruel on the inside.'