In [1]:
!pip install transformers    # Primero instalamos la librería Transformer

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub<1.0,>=0.30.0->transformers)
  Downloading hf_xet-1.1.5-cp37-abi3-macosx_11_0_arm64.whl.metadata (879 bytes)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading huggingface_hub-0.33.0-py3-none-any.whl (514 kB)
Downloading hf_xet-1.1.5-cp37-abi3-macosx_11_0_arm64.whl (2.6 MB)
[2K   [90m━━━━━━━

In [2]:
# Importamos AutoTpkenizer, que nos permitirá recuperar el método de tokenización
# utilizado durante el proceso de entrenamiento de cada modelo pre-entrenado, para
# ahora poder aplicarlo en el nuevo texto de entrada:

from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Por ejemplo, el tokenizador utilizado durante el entrenamiento de gpt2:

tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [4]:
encoding = tokenizer("This is my first sentence in the Tranformer World!", "Hello World")
print(encoding)

{'input_ids': [1212, 318, 616, 717, 6827, 287, 262, 833, 272, 16354, 2159, 0, 15496, 2159], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


#**BERT**

In [5]:
# O bien, el tokenizer de una de las versiones de BERT ...

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [6]:
encoding = tokenizer("one two 3 4", "Altogether now")

print(encoding)

print(tokenizer.decode(encoding["input_ids"]))  # podemos recuperar el texto a través de los enteros de la tokenización

print(encoding["token_type_ids"])    # Segment Embeddings

print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))   # WordPiece Tokenizer

{'input_ids': [101, 1141, 1160, 124, 125, 102, 17762, 16609, 4679, 1208, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] one two 3 4 [SEP] Altogether now [SEP]
[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
['[CLS]', 'one', 'two', '3', '4', '[SEP]', 'Alto', '##get', '##her', 'now', '[SEP]']


In [7]:
encoding = tokenizer("Tranformers World!", "My name is BERT.")

print(encoding)

print(tokenizer.decode(encoding["input_ids"]))

print(encoding["token_type_ids"])

print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))   # WordPiece Tokenizer

{'input_ids': [101, 157, 4047, 23763, 1116, 1291, 106, 102, 1422, 1271, 1110, 139, 9637, 1942, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] Tranformers World! [SEP] My name is BERT. [SEP]
[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
['[CLS]', 'T', '##ran', '##former', '##s', 'World', '!', '[SEP]', 'My', 'name', 'is', 'B', '##ER', '##T', '.', '[SEP]']


Observa que tenemos un diccionario en el cual nos regresa los siguiente:

*   **input_ids:** Los enteros de la tokenización de los enunciados de entrada.

*   **token_type_ids (segment embeddings):** En este caso que tenemos dos oraciones, nos identifica cada una.

*   **attention_mask:** Nos idica cuáles tokens necesitarán el concepto de Attention.



Observa que en particular tenemos dos tokens especiales, CLS y SEP, llamados "classifier" y "separator".

El uso de estos tokens especiales varía de acuerdo al tipo de Transformer.

#**Padding**

Cuando tenemos por ejemplo varios enunciados/comentarios/twitters de entrada de diferente longitud cada uno, se requiere usar el concepto de "padding" para que todos queden iguales en longitud de tokenizers.

Observa que se hace el padding con respecto al enunciado más largo:*texto en cursiva*

In [8]:
txt = ["1 2 3", "Hello World", "token"]

encoding = tokenizer(txt, padding=True)

print(encoding["input_ids"])

print(encoding["attention_mask"])

[[101, 122, 123, 124, 102], [101, 8667, 1291, 102, 0], [101, 22559, 102, 0, 0]]
[[1, 1, 1, 1, 1], [1, 1, 1, 1, 0], [1, 1, 1, 0, 0]]


#**Truncation**

Puedes indicar la longitud máximo de un segmento (incluyendo los tokens especiales).

In [9]:
txt = ["1 2 3 4 5 6 7", "a b c"]

encoding = tokenizer(txt, padding=True,
                     truncation=True,
                     max_length=6
                     )

print(encoding['input_ids'])

[[101, 122, 123, 124, 125, 102], [101, 170, 171, 172, 102, 0]]


#**Modelos**

##**Método - 1: AutoModel - Sentiment Analysis**

In [10]:
model_name = "siebert/sentiment-roberta-large-english"

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
my_sentences = ["This is a wonderful world", "This product was awful", "It was a good experience"]

In [12]:
# Loading the tokenizer:

encoding = tokenizer(my_sentences,
                     padding=True,
                     truncation=True,
                     return_tensors='pt',   # probabilities
                     )
print(encoding)

{'input_ids': tensor([[    0,   713,    16,    10,  4613,   232,     2],
        [    0,   713,  1152,    21, 11522,     2,     1],
        [    0,   243,    21,    10,   205,   676,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1]])}


In [13]:
# loading the model:

from transformers import AutoModelForSequenceClassification

In [14]:
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)  # recuerda que cada vez que cargas algún modelo sube algo la RAM

In [15]:
# veamos las probabilidads del sentimiento asignado a cada frase:

pt_outputs = pt_model(**encoding)
logits = pt_outputs.logits
print(logits)

tensor([[-3.6391,  2.7987],
        [ 3.9579, -3.6338],
        [-3.8043,  2.9755]], grad_fn=<AddmmBackward0>)


In [16]:
# veamos la salida aplicando una softmax:

import torch
output = torch.softmax(logits, dim=1).tolist()

print(output)

[[0.001597340451553464, 0.9984026551246643], [0.9994956254959106, 0.0005043695564381778], [0.0011352391447871923, 0.9988647699356079]]


##**Método - 2: Pipeline**

Podemos hacer uso de la clase Pipeline para llegar a los resultados anteriores de manera directa:

In [17]:
from transformers import pipeline

In [18]:
!pip install xformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting xformers
  Downloading xformers-0.0.30.tar.gz (10.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m377.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: xformers
[33m  DEPRECATION: Building 'xformers' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'xformers'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for xformers (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit cod

In [19]:
# creamos una isntancia de pipeline con el modelo y tokenizador que seleccionaste:

roberta_pipe = pipeline("sentiment-analysis",
                        model= model_name,
                        tokenizer= model_name,
                        top_k=None   # para que regrese todos los scores
                        )


ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
# y de nuevo llegamos a la misma clasificación de cada enunciado:

roberta_pipe(my_sentences)

[[{'label': 'POSITIVE', 'score': 0.9984026551246643},
  {'label': 'NEGATIVE', 'score': 0.001597340451553464}],
 [{'label': 'NEGATIVE', 'score': 0.9994956254959106},
  {'label': 'POSITIVE', 'score': 0.0005043696146458387}],
 [{'label': 'POSITIVE', 'score': 0.9988647699356079},
  {'label': 'NEGATIVE', 'score': 0.0011352396104484797}]]