# Summarization (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
#!apt install git-lfs

You will need to set up git; adapt your email and name in the following cell.

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub. The training arguments further down set
# push_to_hub=True and the trainer later calls push_to_hub().
notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV is read from /content/drive/MyDrive below.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os
import re
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# NOTE: this binds the name `datasets` to sklearn.datasets; a later cell runs
# `import datasets`, which rebinds the same name to a different module.
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

import nltk
# Download the NLTK "stopwords" corpus (backs the `stopwords` import above).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Lectura y formato de los datos

In [None]:
import pandas as pd

# Load the news/summary table from the mounted Drive.
data = pd.read_csv('/content/drive/MyDrive/NOTICIASFINAL.csv')

# Both splits hold out 10% with the same fixed seed: first the test set is
# carved out of the full table, then the validation set out of what remains.
split_kwargs = dict(test_size=0.1, random_state=2022)
train_val, test = train_test_split(data, **split_kwargs)
train, validation = train_test_split(train_val, **split_kwargs)

In [None]:
# Remove the 'Unnamed: 0' column from each of the three splits.
train, test, validation = (
    frame.drop(columns='Unnamed: 0') for frame in (train, test, validation)
)

In [None]:
from datasets import Dataset

# Convert each pandas split into a Dataset object.
train_dataset, test_dataset, validation_dataset = (
    Dataset.from_pandas(frame) for frame in (train, test, validation)
)

In [None]:
import datasets

# Group the three splits under the keys used by the rest of the notebook.
dd = datasets.DatasetDict(
    {
        "train": train_dataset,
        'validation': validation_dataset,
        "test": test_dataset,
    }
)

In [None]:
# Display the splits with their feature names and row counts.
dd

DatasetDict({
    train: Dataset({
        features: ['Noticia', 'Resumen Original', 'Resumen Procesado', '__index_level_0__'],
        num_rows: 48274
    })
    validation: Dataset({
        features: ['Noticia', 'Resumen Original', 'Resumen Procesado', '__index_level_0__'],
        num_rows: 5364
    })
    test: Dataset({
        features: ['Noticia', 'Resumen Original', 'Resumen Procesado', '__index_level_0__'],
        num_rows: 5960
    })
})

## Filtrado

In [None]:
# Keep only the examples whose 'Resumen Procesado' value is not None.
dd = dd.filter(
    lambda example: example['Resumen Procesado'] is not None
)

  0%|          | 0/49 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [None]:
# Display the splits again to see the row counts after filtering.
dd

DatasetDict({
    train: Dataset({
        features: ['Noticia', 'Resumen Original', 'Resumen Procesado', '__index_level_0__'],
        num_rows: 48269
    })
    validation: Dataset({
        features: ['Noticia', 'Resumen Original', 'Resumen Procesado', '__index_level_0__'],
        num_rows: 5364
    })
    test: Dataset({
        features: ['Noticia', 'Resumen Original', 'Resumen Procesado', '__index_level_0__'],
        num_rows: 5959
    })
})

## Carga del modelo

In [None]:
from transformers import AutoTokenizer

# Checkpoint name reused below to load the model and to derive the output directory.
model_checkpoint = "google/mt5-small"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Ejemplos de la tokenización

In [None]:
# Tokenize a short sample sentence and display the resulting encoding.
inputs = tokenizer("Muy buenas amigos")
inputs

{'input_ids': [259, 74978, 259, 72818, 22477, 1], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [None]:
# Map the ids of the sample sentence back to their token strings.
tokenizer.convert_ids_to_tokens(inputs.input_ids)

['▁', 'Muy', '▁', 'buenas', '▁amigos', '</s>']

In [None]:
# Tokenize the first news article of the training split to inspect its ids.
# The previous version referenced `books_dataset` / `review_body`, which this
# notebook never defines, so the cell raised NameError on a fresh kernel.
tokenizer(dd['train'][0]['Noticia'])

{'input_ids': [653, 1957, 1314, 261, 2757, 1280, 435, 259, 29166, 263, 269, 774, 5547, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Preprocesamiento de las noticias

In [None]:
max_input_length = 512
max_target_length = 150


def preprocess_function(examples):
    """Tokenize a batch of articles together with their reference summaries.

    Args:
        examples: a batch exposing the columns ``"Noticia"`` (model input) and
            ``'Resumen Procesado'`` (target summary).

    Returns:
        The tokenized inputs, truncated to ``max_input_length`` tokens, with two
        extra keys taken from the tokenized targets (truncated to
        ``max_target_length`` tokens): ``"labels"`` holds the target token ids
        and ``"labels_mask"`` holds the target attention mask.
    """
    encoded_articles = tokenizer(
        examples["Noticia"],
        max_length=max_input_length,
        truncation=True,
    )
    encoded_summaries = tokenizer(
        text_target=examples['Resumen Procesado'],
        max_length=max_target_length,
        truncation=True,
    )
    encoded_articles["labels"] = encoded_summaries["input_ids"]
    # The training log reports that `labels_mask` is ignored by the model's
    # forward; the key is still returned so the dataset columns are unchanged.
    encoded_articles["labels_mask"] = encoded_summaries["attention_mask"]
    return encoded_articles

In [None]:
# Apply preprocess_function to every split, passing examples in batches.
dd_tokenized=dd.map(preprocess_function, batched=True)

  0%|          | 0/49 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

## Métricas para evaluación

In [None]:
# Presumably required by the "rouge" metric loaded in the next cell.
!pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24955 sha256=3db42d0619c81df817d272065c43ca42fbb96198f1d08a134059266429179907
  Stored in directory: /root/.cache/pip/wheels/24/55/6f/ebfc4cb176d1c9665da4e306e1705496206d08215c1acd9dde
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import evaluate

# Metric object used by compute_metrics to score generated summaries.
rouge_score = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Make sure NLTK is installed for the sentence tokenizer imported in the next cell.
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
from nltk.tokenize import sent_tokenize
# Download the "punkt" tokenizer data; sent_tokenize is used in compute_metrics.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq model for the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

## Se fijan los hiperparámetros para el fine-tuning

In [None]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 3
# Number of optimization steps in one epoch, so the training loss is logged
# about once per epoch.
logging_steps = len(dd_tokenized["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-tuto-mt5-small-2",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    # Generate summaries during evaluation so compute_metrics receives text.
    predict_with_generate=True,
    # Reference summaries are tokenized up to max_target_length tokens. Without
    # this setting, evaluation generates with the model's default maximum
    # length, so ROUGE would be computed on truncated predictions.
    generation_max_length=max_target_length,
    logging_steps=logging_steps,
    push_to_hub=True,
)

## Función que calcula las métricas de evaluación

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores for a batch of generated summaries.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of token-id arrays as
            handed over by the trainer. ``predictions`` may also be a tuple
            whose first element holds the generated ids.

    Returns:
        A dict mapping each key returned by ``rouge_score.compute`` to its
        value rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Keep only the generated ids if extra outputs come along in a tuple.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 marks positions to ignore and cannot be decoded. Depending on the
    # transformers version, gathered predictions may contain it too, so both
    # arrays get it replaced with the pad token id.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode generated and reference summaries into text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence.
    # NOTE(review): sent_tokenize is called without a language argument --
    # confirm its default suits the Spanish summaries.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # The metric values are used as returned; only rounding is applied.
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import DataCollatorForSeq2Seq

# Put the batches into the correct format for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

## Fijamos los parámetros en nuestro trainer

In [None]:
from transformers import Seq2SeqTrainer

# Wire together the model, the training arguments, the tokenized train and
# validation splits, the collator, and the ROUGE metric function.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=dd_tokenized["train"],
    eval_dataset=dd_tokenized["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

## Entrenamiento

In [None]:
# Run the fine-tuning; the recorded run below took about 10,216 s for 3 epochs.
trainer.train()

The following columns in the training set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: labels_mask. If labels_mask are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 48269
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18102
  Number of trainable parameters = 300176768


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,2.1519,1.856385,0.4159,0.2906,0.3928,0.3929
2,2.1289,1.856385,0.4159,0.2906,0.3928,0.3929
3,2.1291,1.856385,0.4159,0.2906,0.3928,0.3929


Saving model checkpoint to mt5-small-tuto-mt5-small-2/checkpoint-500
Configuration saved in mt5-small-tuto-mt5-small-2/checkpoint-500/config.json
Model weights saved in mt5-small-tuto-mt5-small-2/checkpoint-500/pytorch_model.bin
tokenizer config file saved in mt5-small-tuto-mt5-small-2/checkpoint-500/tokenizer_config.json
Special tokens file saved in mt5-small-tuto-mt5-small-2/checkpoint-500/special_tokens_map.json
Copy vocab file to mt5-small-tuto-mt5-small-2/checkpoint-500/spiece.model
Deleting older checkpoint [mt5-small-tuto-mt5-small-2/checkpoint-5000] due to args.save_total_limit
Saving model checkpoint to mt5-small-tuto-mt5-small-2/checkpoint-1000
Configuration saved in mt5-small-tuto-mt5-small-2/checkpoint-1000/config.json
Model weights saved in mt5-small-tuto-mt5-small-2/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in mt5-small-tuto-mt5-small-2/checkpoint-1000/tokenizer_config.json
Special tokens file saved in mt5-small-tuto-mt5-small-2/checkpoint-1000/special

Epoch,Training Loss,Validation Loss


Saving model checkpoint to mt5-small-tuto-mt5-small-2/checkpoint-6500
Configuration saved in mt5-small-tuto-mt5-small-2/checkpoint-6500/config.json
Model weights saved in mt5-small-tuto-mt5-small-2/checkpoint-6500/pytorch_model.bin
tokenizer config file saved in mt5-small-tuto-mt5-small-2/checkpoint-6500/tokenizer_config.json
Special tokens file saved in mt5-small-tuto-mt5-small-2/checkpoint-6500/special_tokens_map.json
Copy vocab file to mt5-small-tuto-mt5-small-2/checkpoint-6500/spiece.model
Deleting older checkpoint [mt5-small-tuto-mt5-small-2/checkpoint-5000] due to args.save_total_limit
Saving model checkpoint to mt5-small-tuto-mt5-small-2/checkpoint-7000
Configuration saved in mt5-small-tuto-mt5-small-2/checkpoint-7000/config.json
Model weights saved in mt5-small-tuto-mt5-small-2/checkpoint-7000/pytorch_model.bin
tokenizer config file saved in mt5-small-tuto-mt5-small-2/checkpoint-7000/tokenizer_config.json
Special tokens file saved in mt5-small-tuto-mt5-small-2/checkpoint-7000/s

TrainOutput(global_step=18102, training_loss=2.136609254891195, metrics={'train_runtime': 10216.0837, 'train_samples_per_second': 14.174, 'train_steps_per_second': 1.772, 'total_flos': 7.287959711204352e+16, 'train_loss': 2.136609254891195, 'epoch': 3.0})

## Push a Hugging Face
De esa forma el modelo es accesible para todos.

In [None]:
# Upload the trained model to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()

# Ejemplo de uso

In [None]:
import torch
# BertTokenizerFast and EncoderDecoderModel are not used by this cell; the
# import is kept only so that any other cell relying on these names still works.
from transformers import BertTokenizerFast, EncoderDecoderModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForSeq2SeqLM.from_pretrained("EP9/mt5-small-tuto-mt5-small-2")
model.to(device)
# Load the tokenizer through AutoTokenizer so the class saved with the
# checkpoint is used. Training saved a SentencePiece vocabulary
# (spiece.model), which a BERT tokenizer class is not meant to load.
tokenizer = AutoTokenizer.from_pretrained("EP9/mt5-small-tuto-mt5-small-2")

def generate_summary(text, max_summary_length=150):
    """Generate a summary of ``text`` with the fine-tuned model.

    Args:
        text: the article to summarize, as a single string. It is truncated to
            512 tokens, the input length used during fine-tuning.
        max_summary_length: upper bound on the length of the generated
            sequence. Defaults to 150, the target length used during
            fine-tuning. Without an explicit bound, ``generate`` falls back to
            the model's default maximum length and may cut summaries short.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # A single input needs no padding, so it is not padded up to 512 tokens.
    inputs = tokenizer([text], truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_summary_length,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Placeholder input: replace "Your_Text" with the article to summarize.
text = "Your_Text"
generate_summary(text)