# 1. Utilisation des Transformers de Hugging Face 🤗

In [29]:
# commencez par installer la bibliothèque transformers si vous n'utilisez pas Colab ou Kaggle

!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




## Preprocessing avec un Tokenizer

In [30]:
from transformers import AutoTokenizer

# Checkpoint name reused by the tokenizer here and by the models loaded later on.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [31]:
# Two example sentences of different lengths.
raw_inputs = [
    "Burkina Faso is a West African country divided into 45 provinces.",
    "The official language is French, but other national languages such as Mooré, Peul, Dioula and Bissa are also spoken.",
]

# Tokenize both sentences in one call and get TensorFlow tensors back.
# With padding enabled the shorter sentence is filled with 0 ids so that both
# rows have the same length (see the attention_mask in the printed result).
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="tf",
)
print(inputs)

{'input_ids': <tf.Tensor: shape=(2, 29), dtype=int32, numpy=
array([[  101, 23089, 22773,  2003,  1037,  2225,  3060,  2406,  4055,
         2046,  3429,  6941,  1012,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  101,  1996,  2880,  2653,  2003,  2413,  1010,  2021,  2060,
         2120,  4155,  2107,  2004,  5405,  1010, 21877,  5313,  1010,
         4487,  7140,  2721,  1998, 20377,  3736,  2024,  2036,  5287,
         1012,   102]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 29), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1]], dtype=int32)>}


## Passage des entrées dans le modèle

In [32]:
from transformers import TFAutoModel

# Load the checkpoint through the generic TFAutoModel class. The load log of
# this run lists the classifier weights as unused, i.e. no classification
# head is attached to this model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = TFAutoModel.from_pretrained(checkpoint)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


### Vecteur de grande dimension (High-dimensional vector) ?

In [33]:
# Run the tokenized batch through the model and show the shape of the
# resulting hidden states: (2, 29, 768) in this run, where 2 and 29 match the
# (2, 29) shape of the input_ids tensor printed earlier.
outputs = model(inputs)
print(outputs.last_hidden_state.shape)

(2, 29, 768)


In [34]:
# Display the hidden states by name; this is the same tensor as outputs[0].
outputs.last_hidden_state

<tf.Tensor: shape=(2, 29, 768), dtype=float32, numpy=
array([[[-0.18444543,  0.18406054, -0.3317901 , ..., -0.33912918,
          0.6345618 , -0.21481766],
        [ 0.5576094 ,  0.5374135 , -0.26444602, ..., -0.6617011 ,
          0.73083746, -0.58173436],
        [ 0.33717203,  0.7582899 , -0.15661317, ..., -0.8102957 ,
          0.37502262, -0.1652882 ],
        ...,
        [-0.22423077,  0.4882026 , -0.7373865 , ..., -0.35429463,
          0.28613007, -0.8663359 ],
        [ 0.16610484,  0.35079545, -0.37180883, ..., -0.44478616,
          0.29429734, -0.6339572 ],
        [-0.1446706 ,  0.31071886, -0.62418133, ..., -0.31383926,
          0.15263262, -0.43699735]],

       [[ 0.20606035,  0.767488  , -0.36321968, ..., -0.25143635,
          0.46576267, -0.49673885],
        [-0.22202978,  0.4234435 , -0.4924144 , ..., -0.00735636,
          0.8536247 ,  0.3430001 ],
        [-0.41844034,  0.65331614, -0.05087002, ..., -0.33773685,
          0.93199396,  0.47552633],
        ...,


### Têtes de modèle (Heads) : Donner du sens aux chiffres

In [35]:
from transformers import TFAutoModelForSequenceClassification  # adds a head for sequence classification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
# Reuse the tokenized batch `inputs` built earlier in the notebook.
outputs = model(inputs)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [36]:
# One row of scores per input sentence: TensorShape([2, 2]) for the two sentences in `inputs`.
outputs.logits.shape

TensorShape([2, 2])

### Postprocessing outputs

In [38]:
# Raw logits as returned by the model; they are turned into probabilities with softmax in the next step.
print(outputs.logits)

tf.Tensor(
[[-1.6803675  1.6408637]
 [-1.7621164  1.7131441]], shape=(2, 2), dtype=float32)


In [39]:
import tensorflow as tf

# Softmax over the last axis turns each row of logits into values that sum to 1.
predictions = tf.math.softmax(outputs.logits, axis=-1)
print(predictions)

tf.Tensor(
[[0.03484998 0.96515   ]
 [0.0300244  0.9699756 ]], shape=(2, 2), dtype=float32)


# 2. Les Modèles sur 🤗

## Création d'un Transformer : le cas de BERT

In [40]:
from transformers import BertConfig, TFBertModel

# Build a configuration object with BertConfig's default values
config = BertConfig()

# Instantiate the model from that configuration (no checkpoint is loaded here)
model = TFBertModel(config)

In [41]:
# Display the configuration values (hidden size, number of layers, vocabulary size, ...)
print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.39.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



### Méthodes de chargement des modèles

In [42]:
from transformers import BertConfig, TFBertModel

# Same construction as in the previous section: default config, then model from config.
config = BertConfig()
model = TFBertModel(config)

# Here the model parameters are randomly initialized

In [43]:
from transformers import TFBertModel

# Here we load the pretrained base BERT model
model = TFBertModel.from_pretrained("bert-base-cased")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

### Enregistrer avec la méthode "save_pretrained"

In [49]:
# Write the model to the local directory "mon_repertoire"; the directory listing further down shows config.json and tf_model.h5.
model.save_pretrained("mon_repertoire")

In [52]:
# ls simply lists the contents of the mon_repertoire directory

!ls mon_repertoire

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


config.json  tf_model.h5


# 3. Les Tokenizers sur 🤗

## Loading and saving

In [53]:
from transformers import BertTokenizer

# Load the tokenizer of the bert-base-cased checkpoint with the BERT-specific class
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [54]:
from transformers import AutoTokenizer

# Same checkpoint, loaded through the generic AutoTokenizer class instead
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [55]:
# Calling the tokenizer directly returns input_ids, token_type_ids and attention_mask
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [56]:
# Save the tokenizer files to a local directory; the call returns the paths of the files
tokenizer.save_pretrained("directory_on_my_computer")


('directory_on_my_computer/tokenizer_config.json',
 'directory_on_my_computer/special_tokens_map.json',
 'directory_on_my_computer/vocab.txt',
 'directory_on_my_computer/added_tokens.json',
 'directory_on_my_computer/tokenizer.json')

### Tokenization

In [57]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformer network is simple"
# Split the text into tokens (no ids yet); "Transformer" comes out as 'Trans' + '##former' in this run
tokens = tokenizer.tokenize(sequence)

print(tokens)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


### From tokens to input IDs

In [58]:
# Map each token to its integer id
ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


### Decoding

In [59]:
# Decode the ids computed from `tokens` in the previous step, so that the
# decoded text corresponds to the sentence that was actually tokenized
# (the previously hard-coded id list did not match those ids).
decoded_string = tokenizer.decode(ids)

print(decoded_string)

Using a transformer network is simple


# 4. Gestion des séquences multiples 🤗

## Les modèles attendent un batch (lot) d'entrées par défaut

In [60]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

# Tokenize and convert to ids by hand, then wrap the flat list of ids in a
# tensor. The result is 1-D (shape (14,) in this run): no batch dimension.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = tf.constant(ids)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [61]:
# Plain Python list of token ids
ids

[1045,
 1005,
 2310,
 2042,
 3403,
 2005,
 1037,
 17662,
 12172,
 2607,
 2026,
 2878,
 2166,
 1012]

In [62]:
# The same ids as a 1-D int32 tensor
input_ids

<tf.Tensor: shape=(14,), dtype=int32, numpy=
array([ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
        2607,  2026,  2878,  2166,  1012], dtype=int32)>

In [64]:
# Call the model on the 1-D tensor (no batch dimension).
# NOTE: in the recorded run this call does not fail; it returns logits of shape (1, 2).

model(input_ids)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-2.7276218,  2.8789387]], dtype=float32)>, hidden_states=None, attentions=None)

In [65]:
# The tokenizer call returns a tensor of shape (1, 16) here: it has a leading
# batch dimension and the ids 101 / 102 surround the 14 ids computed by hand.
tokenized_inputs = tokenizer(sequence, return_tensors="tf")

print(tokenized_inputs["input_ids"])

tf.Tensor(
[[  101  1045  1005  2310  2042  3403  2005  1037 17662 12172  2607  2026
   2878  2166  1012   102]], shape=(1, 16), dtype=int32)


In [66]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Wrapping ids in a list adds the batch dimension: shape (1, 14) instead of (14,)
input_ids = tf.constant([ids])
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Input IDs: tf.Tensor(
[[ 1045  1005  2310  2042  3403  2005  1037 17662 12172  2607  2026  2878
   2166  1012]], shape=(1, 14), dtype=int32)
Logits: tf.Tensor([[-2.7276218  2.8789387]], shape=(1, 2), dtype=float32)


In [67]:
# A batch made of the same sequence of ids twice
batched_ids = [ids, ids]

In [68]:
# Both rows have the same length, so the nested list converts to a (2, 14) tensor
input_ids = tf.constant(batched_ids)
print("Input IDs:", input_ids)

# One row of logits per sequence in the batch
output = model(input_ids)
print("Logits:", output.logits)

Input IDs: tf.Tensor(
[[ 1045  1005  2310  2042  3403  2005  1037 17662 12172  2607  2026  2878
   2166  1012]
 [ 1045  1005  2310  2042  3403  2005  1037 17662 12172  2607  2026  2878
   2166  1012]], shape=(2, 14), dtype=int32)
Logits: tf.Tensor(
[[-2.7276225  2.8789396]
 [-2.727621   2.878938 ]], shape=(2, 2), dtype=float32)


# 5. Putting it all together 🤗

In [78]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [79]:
# Single example sentence.
sequence = "Burkina Faso is a West African country divided into 45 provinces."

# Batch of two sentences; the first one is the single example above.
sequences_multiples = [
    sequence,
    "The official language is French, but other national languages such as Mooré, Peul, Dioula and Bissa are also spoken.",
]

# Checkpoint used for the tokenizer in the following cells.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"

In [83]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize a single sequence (the result is what would be fed to the model)
model_inputs = tokenizer(sequence)
model_inputs

{'input_ids': [101, 23089, 22773, 2003, 1037, 2225, 3060, 2406, 4055, 2046, 3429, 6941, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [84]:
# Tokenize several sequences at once; without padding the two id lists keep different lengths
model_inputs = tokenizer(sequences_multiples)
model_inputs

{'input_ids': [[101, 23089, 22773, 2003, 1037, 2225, 3060, 2406, 4055, 2046, 3429, 6941, 1012, 102], [101, 1996, 2880, 2653, 2003, 2413, 1010, 2021, 2060, 2120, 4155, 2107, 2004, 5405, 1010, 21877, 5313, 1010, 4487, 7140, 2721, 1998, 20377, 3736, 2024, 2036, 5287, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [85]:
# Pad every sequence up to the length of the longest sequence in the batch.
model_inputs = tokenizer(sequences_multiples, padding="longest")

# Pad every sequence up to the model's maximum length
# (512 for BERT or DistilBERT).
model_inputs = tokenizer(sequences_multiples, padding="max_length")

# Pad every sequence up to the specified maximum length.
# NOTE: padding alone never shortens a sequence. Both example sentences are
# longer than 8 ids (14 and 29 in the output above), so add truncation=True
# if a hard cap of 8 is intended.
model_inputs = tokenizer(sequences_multiples, padding="max_length", max_length=8)

## Les Tokens spéciaux

In [86]:
# Full tokenizer call: in this run the ids start with 101 and end with 102
model_inputs = tokenizer(sequence)
print(model_inputs["input_ids"])

# Manual tokenize + convert: the same ids without the leading 101 and trailing 102
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[101, 23089, 22773, 2003, 1037, 2225, 3060, 2406, 4055, 2046, 3429, 6941, 1012, 102]
[23089, 22773, 2003, 1037, 2225, 3060, 2406, 4055, 2046, 3429, 6941, 1012]


In [None]:
# In summary
# NOTE(review): this recap stops at a 1-D tensor of ids (built by hand, so
# without the 101 / 102 ids the tokenizer call adds) and never calls the
# model — confirm whether the tokenizer(...) + model(...) pipeline was
# intended here instead.

import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = tf.constant(ids)

2024-04-05 22:52:47.197617: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered

2024-04-05 22:52:47.197888: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered

2024-04-05 22:52:47.370569: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.



All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.

If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
