# Clasificación de texto con transformers

In [None]:
# Pin the versions this notebook was last executed with so a fresh runtime resolves the same packages.
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -q datasets==2.12.0 transformers==4.30.1 evaluate==0.4.0 accelerate==0.20.3 xformers==0.0.20

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers
  Downloading xformers-0.0.20-cp310-cp310-manylinux2014_x86_64

In [None]:
import evaluate # Métricas de rendimiento
import numpy as np
import torch
from datasets import load_dataset # Manejo de conjuntos de datos (como pandas)
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, BertModel, pipeline # Modelamiento con Transformers

Importamos un conjunto de datos de diagnósticos médicos en español etiquetados como dentales y no-dentales.

In [None]:
# Fetch the labelled Spanish diagnoses dataset (train and test splits) from the Hugging Face Hub.
spanish_diagnostics = load_dataset(path="fvillena/spanish_diagnostics")

Downloading builder script:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading and preparing dataset spanish_diagnostics/default to /root/.cache/huggingface/datasets/fvillena___spanish_diagnostics/default/0.0.0/45c176cea64580ea9631f78c2867a657ede368597681e5337e9f1c976e4e84ff...


Downloading data:   0%|          | 0.00/6.85M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset spanish_diagnostics downloaded and prepared to /root/.cache/huggingface/datasets/fvillena___spanish_diagnostics/default/0.0.0/45c176cea64580ea9631f78c2867a657ede368597681e5337e9f1c976e4e84ff. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Inspect the column schema of the train split (column names, dtypes and class-label names).
spanish_diagnostics["train"].features


{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_dental', 'dental'], id=None)}

Vamos a medir el rendimiento con accuracy porque nuestro conjunto de datos está balanceado.

In [None]:
# Accuracy metric object; used by compute_metrics during evaluation.
metric = evaluate.load(path="accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Segmentamos las cadenas de texto usando el tokenizador específico del modelo de Transformers que vamos a usar.

In [None]:
# Tokenizer belonging to the multilingual BERT checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the module-level ``tokenizer``.

    Args:
        examples: Mapping holding the raw texts under the ``"text"`` key.

    Returns:
        The value returned by ``tokenizer`` when called on those texts with
        ``truncation=True``.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds one row
            of per-class scores per example; ``labels`` holds the reference
            class ids.

    Returns:
        The result of ``metric.compute`` for the arg-max class ids versus ``labels``.
    """
    scores, references = eval_pred
    # Highest-scoring column of each row is the predicted class id.
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

Tokenizamos el texto

In [None]:
# Run the tokenizer over the dataset in batches.
tokenized_spanish_diagnostics = spanish_diagnostics.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [None]:
# Class-id <-> class-name lookups, in the same order as the dataset's ClassLabel names.
id2label = dict(enumerate(("not_dental", "dental")))
# Inverse mapping, derived from id2label so the two cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

Importamos un modelo basado en BERT que fue preentrenado con un conjunto de datos en múltiples idiomas.

In [None]:
# Multilingual BERT checkpoint loaded for sequence classification with the two labels defined above.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual

Configuramos el entrenador de nuestro modelo

In [None]:
# Short demo run: stop after 500 training steps and evaluate on a step-based schedule.
# Checkpoints and logs are written under ./results.
training_args = TrainingArguments(
    evaluation_strategy="steps",
    max_steps=500,
    output_dir="./results",
)

In [None]:
# Wire model, data and metric together. Evaluation uses a fixed 1,000-example
# sample of the test split (shuffled with seed 11) rather than the whole split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_spanish_diagnostics["train"],
    eval_dataset=(
        tokenized_spanish_diagnostics["test"]
        .shuffle(seed=11)
        .select(range(1000))
    ),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



Entrenamos el modelo

In [None]:
# Fine-tune with the configuration above; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
500,0.7026,0.693997,0.493


TrainOutput(global_step=500, training_loss=0.702611572265625, metrics={'train_runtime': 93.0268, 'train_samples_per_second': 42.998, 'train_steps_per_second': 5.375, 'total_flos': 152415301037760.0, 'train_loss': 0.702611572265625, 'epoch': 0.06})

Probamos el modelo con ejemplos inventados por nosotros.

In [None]:
# Inference pipeline around the fine-tuned multilingual model.
# Reuse the device the model already lives on instead of hard-coding GPU 0,
# so the cell also runs on a CPU-only runtime.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=model.device,
)

In [None]:
# Two made-up diagnoses: one non-dental (ankle fracture) and one dental (dentin caries).
# NOTE(review): in the recorded run both come back as 'dental' with score ~0.51, which matches
# the 0.493 accuracy logged during training: this model had not learned the task after 500 steps.
classifier(["fractura de tobillo","caries dentinaria"])

[{'label': 'dental', 'score': 0.5148761868476868},
 {'label': 'dental', 'score': 0.5148748755455017}]

## Actividad 1

Usted acaba de ajustar un predictor de la etiqueta dental utilizando un modelo de lenguaje multilingüe. Utilice ahora un modelo de lenguaje preentrenado específicamente para el idioma español y vea si el rendimiento del modelo mejora.

Acá puede explorar muchos modelos de lenguaje: https://huggingface.co/models

In [None]:
# Tokenizer of the Spanish RoBERTa checkpoint used for Activity 1.
tokenizer2 = AutoTokenizer.from_pretrained(
    "PlanTL-GOB-ES/roberta-base-bne",
)

In [None]:
def preprocess_function2(examples):
    """Tokenize the ``"text"`` entry of a batch with the module-level ``tokenizer2``.

    Args:
        examples: Mapping holding the raw texts under the ``"text"`` key.

    Returns:
        The value returned by ``tokenizer2`` when called on those texts with
        ``truncation=True``.
    """
    texts = examples["text"]
    return tokenizer2(texts, truncation=True)

In [None]:
# Re-tokenize the raw dataset with the Spanish model's tokenizer, in batches.
tokenized_spanish_diagnostics2 = spanish_diagnostics.map(
    function=preprocess_function2,
    batched=True,
)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [None]:
# Spanish RoBERTa checkpoint loaded for sequence classification with the same two labels.
model2 = AutoModelForSequenceClassification.from_pretrained(
    "PlanTL-GOB-ES/roberta-base-bne",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-bne were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [None]:
# Same schedule as the first run (500 steps, step-based evaluation) so both models are comparable;
# outputs go to a separate ./results2 directory.
training_args2 = TrainingArguments(
    evaluation_strategy="steps",
    max_steps=500,
    output_dir="./results2",
)

In [None]:
# Trainer for the Spanish model. The evaluation sample is drawn the same way as for the
# first model (test split, shuffle seed 11, first 1,000 examples).
trainer2 = Trainer(
    model=model2,
    args=training_args2,
    train_dataset=tokenized_spanish_diagnostics2["train"],
    eval_dataset=(
        tokenized_spanish_diagnostics2["test"]
        .shuffle(seed=11)
        .select(range(1000))
    ),
    tokenizer=tokenizer2,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the Spanish model; the returned TrainOutput is displayed as the cell result.
trainer2.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
500,0.338,0.239568,0.929


TrainOutput(global_step=500, training_loss=0.3380162353515625, metrics={'train_runtime': 75.6861, 'train_samples_per_second': 52.85, 'train_steps_per_second': 6.606, 'total_flos': 122198640773760.0, 'train_loss': 0.3380162353515625, 'epoch': 0.06})

In [None]:
# Inference pipeline around the fine-tuned Spanish model.
# Reuse the device the model already lives on instead of hard-coding GPU 0,
# so the cell also runs on a CPU-only runtime.
classifier2 = pipeline(
    "text-classification",
    model=model2,
    tokenizer=tokenizer2,
    device=model2.device,
)

In [None]:
# Score the same two made-up diagnoses (one non-dental, one dental) with the Spanish model.
classifier2(["fractura de tobillo","caries dentinaria"])

[{'label': 'not_dental', 'score': 0.973836362361908},
 {'label': 'dental', 'score': 0.9799413681030273}]

## Actividad 2:

Acabamos de realizar un ajuste fino de un modelo de lenguaje, lo que significa agregar capas río abajo en la arquitectura del modelo para poder resolver la tarea de clasificación de texto. Pero la salida del modelo de lenguaje no es una clasificación, sino una secuencia de embeddings contextualizados.

Cargue un modelo de lenguaje basado en BERT (clase `transformers.BertModel`) junto con su tokenizador.

1. Tokenice un texto y explore cuántos tokens se detectaron. ¿Por qué la cantidad de tokens puede ser inconsistente con la cantidad de palabras?
2. Pásele un texto tokenizado al modelo, explore la salida del modelo (atributo `last_hidden_state` de la clase `transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions`) y vea cuántas dimensiones tiene esa salida. ¿Por qué tiene esas dimensiones esa salida?

In [None]:
# Tokenizer of the Spanish BERT checkpoint explored in Activity 2.
tokenizer3 = AutoTokenizer.from_pretrained(
    "dccuchile/bert-base-spanish-wwm-cased",
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

In [None]:
# Plain BertModel (not a classification model), loaded to inspect its contextual embeddings.
model3 = BertModel.from_pretrained(
    "dccuchile/bert-base-spanish-wwm-cased",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bi

In [None]:
# Encode one example sentence to inspect how it is split into tokens.
tokenized_text = tokenizer3(
    "¡Hola tú!, este es el curso de Reconocimiento de Patrones"
)

In [None]:
# Token strings of the first (and only) encoded sequence.
tokenized_text[0].tokens

['[CLS]',
 '¡',
 'Hola',
 'tú',
 '!',
 ',',
 'este',
 'es',
 'el',
 'curso',
 'de',
 'Reconoci',
 '##miento',
 'de',
 'Patr',
 '##ones',
 '[SEP]']

Hay más tokens que palabras en la oración porque los símbolos también cuentan como tokens distintos, se agregan los tokens especiales `[CLS]` y `[SEP]` al inicio y al final, y además las palabras que no se encuentran en el vocabulario se dividen en piezas de palabra (por ejemplo, `Reconoci` + `##miento`).

In [None]:
# Encode the same sentence again, this time requesting PyTorch tensors ("pt") to feed the model.
inputs = tokenizer3(
    "¡Hola tú!, este es el curso de Reconocimiento de Patrones",
    return_tensors="pt",
)

In [None]:
# Inference only: disable autograd so the forward pass does not build a computation graph.
with torch.no_grad():
    outputs = model3(**inputs)

In [None]:
# Show the model's `last_hidden_state` tensor.
outputs.last_hidden_state

tensor([[[ 0.2717, -0.4312,  0.1304,  ..., -0.2825, -0.0417, -0.4201],
         [-0.3997,  0.4468, -0.0307,  ..., -0.4323,  0.0024,  0.7095],
         [ 0.5203, -0.9626, -0.1072,  ..., -0.6186, -0.4658, -0.1481],
         ...,
         [-0.5982, -0.1102, -0.0615,  ..., -0.4002,  0.2964,  0.4036],
         [-0.1925,  0.1401,  0.0076,  ..., -0.9047,  0.3816,  0.0733],
         [-0.8110,  0.0526, -0.3054,  ..., -1.3095,  0.3250, -0.7058]]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
# Shape of that tensor; its three dimensions are interpreted in the text that follows.
outputs.last_hidden_state.shape

torch.Size([1, 17, 768])

La primera dimensión es el tamaño del batch; en este caso solo le pasamos un ejemplo. La segunda dimensión es el largo de la secuencia, que en nuestro caso era de 17 tokens. La tercera dimensión es el tamaño del embedding que representa cada token (768).