In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt

2023-04-27 15:46:24.036269: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-27 15:46:24.246441: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-27 15:46:24.253358: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-04-27 15:46:24.253386: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

# Transformers

## Chargement des Données

In [2]:
# Données d'entrainement
train_data_complete = pd.read_csv("../data/allocine_genres_train.csv", sep=",")
train_data = train_data_complete[["titre", "synopsis", "genre"]]

# Données de test/validation
test_data_complete = pd.read_csv("../data/allocine_genres_test.csv", sep=",")
test_data = test_data_complete[["titre", "synopsis", "genre"]]

In [3]:
# Liste de classes et ajout d'un identifiant numérique pour chaque classe
genre_name = sorted(train_data.genre.unique().flatten())
print("Genres:", genre_name)
print("Nombre d'exemplaires:", len(train_data))

label2id = {genre_name[i]:i for i in range(len(genre_name))}
id2label = {i:genre_name[i] for i in range(len(genre_name))}

Genres: ['biopic', 'comédie', 'documentaire', 'drame', 'historique', 'horreur', 'policier', 'romance', 'science fiction']
Nombre d'exemplaires: 2875


In [4]:
# Proportion des données qui sera utilisée
scale = 0.2

In [5]:
from datasets import Features, Value, ClassLabel, Dataset, DatasetDict

data_df = pd.DataFrame()
# Chaque film est décrit par un titre et un synopsis
split_titre = train_data.titre
data_df["text"] = split_titre + " " + train_data.synopsis
# Chaque filme possède un attribut qui décrit son genre (auquel a été associé un identifiant numérique)
data_df["genre"] = train_data.genre.map(label2id)

# Transformation du DataFrame en objet de type Dataset utilisé par HuggingFace
province_features = Features({"text": Value('string'),
                              "genre": ClassLabel(names=genre_name)})
data = Dataset.from_pandas(data_df, features=province_features)
# Découpage en train et test
data = data.train_test_split(test_size=0.2, shuffle=True, seed=12)

In [6]:
data["train"].features

{'text': Value(dtype='string', id=None),
 'genre': ClassLabel(names=['biopic', 'comédie', 'documentaire', 'drame', 'historique', 'horreur', 'policier', 'romance', 'science fiction'], id=None)}

In [7]:
data["train"][0]

{'text': "True Romance Le jour de son anniversaire , Clarence Worley rencontre la splendide Alabama dans un cinéma miteux . Coup de foudre immédiat . Après une nuit d' amour , Alabama avoue a Clarence qu' elle a été en fait engagée par le patron de Clarence comme cadeau d' anniversaire . De là va commencer une folle aventure .",
 'genre': 7}

## Tokénisation

In [8]:
from transformers import AutoTokenizer

# model_ckpt = "camembert-base"
# model_ckpt = "google/flan-t5-large"
# model_ckpt = "xlm-roberta-base" # Trop gros
# model_ckpt = "t5-small"
model_ckpt = "distilbert-base-multilingual-cased"

# Chargement du tokéniseur pré-entraîné correspondant au modèle utilisé
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [9]:
def preprocess_function(examples):
    return tokenizer(examples["text"], padding=True, truncation=True)

In [10]:
# Tokénisation des 2 premières instances
preprocess_function(data["train"][:2])

{'input_ids': [[101, 24079, 34404, 10281, 16947, 10104, 10312, 77087, 117, 40653, 102204, 56900, 21702, 10109, 32650, 72384, 10253, 19866, 10260, 10119, 26552, 10221, 27249, 119, 13098, 14590, 10104, 12688, 16419, 10211, 18298, 92883, 119, 14214, 10231, 26642, 172, 112, 25205, 117, 19866, 34657, 12772, 169, 40653, 10608, 112, 11117, 169, 10845, 10110, 11329, 88601, 10112, 10248, 10141, 44979, 10104, 40653, 10986, 11135, 20042, 10138, 172, 112, 77087, 119, 10190, 10331, 10321, 22195, 10129, 10231, 95059, 71593, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 69783, 69783, 117, 10208, 11744, 117, 17217, 11245, 10248, 10974, 32539, 13479, 117, 28281, 254, 10312, 28276, 10110, 22272, 10104, 10126, 97184, 97607, 33519, 10104, 10312, 263, 19224, 12131, 10104, 208, 85695, 220, 119, 14180, 171, 37925, 10165, 10141, 10347, 10350, 10139, 80934

In [11]:
# Tokenisation de la totalité des données : chaque unité est remplacée par un identifiant numérique
tokenized_data = data.map(preprocess_function, batched=True, batch_size=None)

Map:   0%|          | 0/2300 [00:00<?, ? examples/s]

Map:   0%|          | 0/575 [00:00<?, ? examples/s]

In [12]:
tokenized_data["train"][0]

{'text': "True Romance Le jour de son anniversaire , Clarence Worley rencontre la splendide Alabama dans un cinéma miteux . Coup de foudre immédiat . Après une nuit d' amour , Alabama avoue a Clarence qu' elle a été en fait engagée par le patron de Clarence comme cadeau d' anniversaire . De là va commencer une folle aventure .",
 'genre': 7,
 'input_ids': [101,
  24079,
  34404,
  10281,
  16947,
  10104,
  10312,
  77087,
  117,
  40653,
  102204,
  56900,
  21702,
  10109,
  32650,
  72384,
  10253,
  19866,
  10260,
  10119,
  26552,
  10221,
  27249,
  119,
  13098,
  14590,
  10104,
  12688,
  16419,
  10211,
  18298,
  92883,
  119,
  14214,
  10231,
  26642,
  172,
  112,
  25205,
  117,
  19866,
  34657,
  12772,
  169,
  40653,
  10608,
  112,
  11117,
  169,
  10845,
  10110,
  11329,
  88601,
  10112,
  10248,
  10141,
  44979,
  10104,
  40653,
  10986,
  11135,
  20042,
  10138,
  172,
  112,
  77087,
  119,
  10190,
  10331,
  10321,
  22195,
  10129,
  10231,
  95059,
  

In [13]:
# Affichage des tokens. DistilBERT utilise l'algorithme WordPiece
tokens = tokenizer.convert_ids_to_tokens(tokenized_data["train"][0]['input_ids'])
print(tokenized_data["train"][0]["text"])
print(tokens)

True Romance Le jour de son anniversaire , Clarence Worley rencontre la splendide Alabama dans un cinéma miteux . Coup de foudre immédiat . Après une nuit d' amour , Alabama avoue a Clarence qu' elle a été en fait engagée par le patron de Clarence comme cadeau d' anniversaire . De là va commencer une folle aventure .
['[CLS]', 'True', 'Romance', 'Le', 'jour', 'de', 'son', 'anniversaire', ',', 'Clarence', 'Wo', '##rley', 'rencontre', 'la', 'sp', '##lendi', '##de', 'Alabama', 'dans', 'un', 'cinéma', 'mit', '##eux', '.', 'Co', '##up', 'de', 'fou', '##dre', 'im', '##mé', '##diat', '.', 'Après', 'une', 'nuit', 'd', "'", 'amour', ',', 'Alabama', 'avo', '##ue', 'a', 'Clarence', 'qu', "'", 'elle', 'a', 'été', 'en', 'fait', 'engagé', '##e', 'par', 'le', 'patron', 'de', 'Clarence', 'comme', 'ca', '##dea', '##u', 'd', "'", 'anniversaire', '.', 'De', 'là', 'va', 'commence', '##r', 'une', 'folle', 'aventure', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA

In [14]:
# Taille du vocabulaire
tokenizer.vocab_size

119547

In [15]:
# Taille de contexte maximum
tokenizer.model_max_length

512

## Préparation de l'Evaluation

In [16]:
import evaluate

accuracy = evaluate.load("accuracy")

In [17]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy.compute(predictions=predictions, references=labels)
    return acc

## Construction et Entraînement du Transformer (PyTorch -> NE FONCTIONNE PAS)

In [18]:
# from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
# import torch

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = AutoModelForSequenceClassification.from_pretrained(
#     model_ckpt, num_labels=len(genre_name), id2label=id2label, label2id=label2id
# ).to(device)

In [19]:
# training_args = TrainingArguments(
#     output_dir=f"{model_ckpt}-finetuned-genre",
#     learning_rate=2e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=6,
#     weight_decay=0.01,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
# )

In [20]:
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_data["train"],
#     eval_dataset=tokenized_data["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

In [21]:
# trainer.train()

## Construction et Entraînement du Transformer (Tensorflow)

In [22]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [23]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_data["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

2023-04-27 15:46:34.532244: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-04-27 15:46:34.532293: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-04-27 15:46:34.532331: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (fabien): /proc/driver/nvidia/version does not exist
2023-04-27 15:46:34.534329: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [24]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=9, id2label=id2label, label2id=label2id
)

Some layers from the model checkpoint at distilbert-base-multilingual-cased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier', 'dropout_19', 'classifier']
You should probably TRAIN this model on a down-stream ta

In [25]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_data["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_data["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)
print(type(tf_validation_set))

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


<class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>


In [26]:
import tensorflow as tf

model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [27]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set, label_cols=["genre"])

ValueError: Label genre is in label_cols but could not be found in the dataset inputs!

In [None]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[metric_callback])

TypeError: fit() got an unexpected keyword argument 'target'

## Analyse des Résultats

In [None]:
# Prédictions pour les données de test
preds_output = trainer.predict(tokenized_data['test'])

In [None]:
preds_output

In [None]:
preds_output.metrics

In [None]:
y_preds = np.argmax(preds_output.predictions, axis=1)
y_valid = tokenized_data['test']['label']
labels = tokenized_data['test'].features['label'].names

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

def plot_confusion_matrix(y_preds, y_true, labels):
    cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    labels_for_fig = [l[0:4]+'.' for l in labels]
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, 
                                  display_labels=labels_for_fig)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()
    
plot_confusion_matrix(y_preds, y_valid, labels)

In [None]:
from torch.nn.functional import cross_entropy

def forward_pass_with_label(batch):
    # Fonction qui retourne la perte (entropie croisée) et la classe prédite
    inputs = {k:v.to(device) for k,v in batch.items() 
              if k in tokenizer.model_input_names}

    with torch.no_grad():
        output = model(**inputs)
        pred_label = torch.argmax(output.logits, axis=-1)
        loss = cross_entropy(output.logits, batch["label"].to(device), 
                             reduction="none")
    return {"loss": loss.cpu().numpy(), 
            "predicted_label": pred_label.cpu().numpy()}

In [None]:
# Conversion des données au bon format
tokenized_data.set_format("torch", 
                            columns=["input_ids", "attention_mask", "label"])

In [None]:
# Calcul des valeurs de perte
tokenized_data["test"] = tokenized_data["test"].map(
    forward_pass_with_label, batched=True, batch_size=64)

In [None]:
# Création d'un DataFrame avec les textes, les pertes les classe (prédites et attendues)

def label_int2str(row):
    return tokenized_data["train"].features["label"].int2str(row)

tokenized_data.set_format("pandas")
cols = ["text", "label", "predicted_label", "loss"]
df_test = tokenized_data["test"][:][cols]
df_test["label"] = df_test["label"].apply(label_int2str)
df_test["predicted_label"] = (df_test["predicted_label"]
                              .apply(label_int2str))

In [None]:
# Pour éviter l'affichage tronqué des descriptions
pd.set_option('display.max_colwidth', -1)
# Affichage des 10 premières instances triées par perte décroissante
df_test.sort_values("loss", ascending=False).head(10)

In [None]:
# Affichage des 10 premières instances triées par perte croissante
# Cela permet de voir les instances pour lesquelles les prédictions sont les plus certaines
df_test.sort_values("loss", ascending=True).head(10)