# Text summarization

In this notebook, I will load a Transformer (LLM: T5 [Text-To-Text Transfer Transformer]) from Hugging Face to summarize text from a dataset (NLP area). In order to do it, we will apply a fine-tuning.

In [2]:
# Libraries
from datasets import Dataset, load_dataset
import evaluate
import torch
# T5Tokenizer -> to token data
# T5ForConditionalGeneration -> the model that will be load
# TrainingArguments, Trainer -> to train the model
from transformers import T5Tokenizer, DataCollatorForSeq2Seq, T5ForConditionalGeneration, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# We will work with CPU
print(torch.cuda.is_available()) 

False


## Data

In [4]:
dataset = load_dataset("xsum", trust_remote_code=True)

In [5]:
def preprocess_function(examples, tokenizer = T5Tokenizer.from_pretrained("t5-small"), max_input_length=512, max_target_length=128):
    inputs_doc = ["summarize: " + str(doc) for doc in examples["document"]]
    model_inputs = tokenizer(inputs_doc, padding="max_length", max_length=max_input_length, truncation=True)
    with tokenizer.as_target_tokenizer():
        inputs_summ = [str(s) for s in examples["summary"]]
        labels = tokenizer(inputs_summ, padding="max_length", max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
# To define the data
tam_train, tam_val = 1000, 200
small_train_dataset = dataset["train"].select(range(tam_train))
small_val_dataset = dataset["validation"].select(range(tam_val))

In [7]:
# To tokenize the data
tokenized_train = small_train_dataset.map(preprocess_function, batched=True)
tokenized_val = small_val_dataset.map(preprocess_function, batched=True)

## Transformer

In [8]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [10]:
# To process the data: inputs and labels
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
#fp16=True if GPU
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=2, # instead of >=3 due to time execution issues
    fp16=False
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator
    # compute_metrics=compute_metrics # for clear test results
)

  trainer = Trainer(


In [13]:
# To train the model
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,2.2008


TrainOutput(global_step=500, training_loss=2.20083349609375, metrics={'train_runtime': 1824.6489, 'train_samples_per_second': 1.096, 'train_steps_per_second': 0.274, 'total_flos': 270683602944000.0, 'train_loss': 2.20083349609375, 'epoch': 2.0})

In [14]:
# Testing
small_test_dataset = dataset["test"].select(range(200))
tokenized_test = small_test_dataset.map(preprocess_function, batched=True)
test_results = trainer.evaluate(eval_dataset=tokenized_test)



In [15]:
print(test_results)

{'eval_loss': 0.8083456158638, 'eval_runtime': 33.9669, 'eval_samples_per_second': 5.888, 'eval_steps_per_second': 1.472, 'epoch': 2.0}


In [None]:
# Results
for key in results:
    print(f"{key}: {results[key].mid.fmeasure:.4f}")

In [None]:
# Save model and tokenizer
model.save_pretrained("./results")
tokenizer.save_pretrained("./results")

('./results\\tokenizer_config.json',
 './results\\special_tokens_map.json',
 './results\\spiece.model',
 './results\\added_tokens.json')