In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

import matplotlib.pyplot as plt

import spacy

2023-05-02 19:33:32.556156: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-02 19:33:32.597979: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-02 19:33:32.805764: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-02 19:33:32.807172: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
!python3 -m spacy download fr_core_news_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting fr-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.5.0/fr_core_news_sm-3.5.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [3]:
nlp = spacy.load("fr_core_news_sm")
from nltk.corpus import stopwords
from nltk import download
download('stopwords')
stopWords = set(stopwords.words('french'))
print(stopWords)

{'étée', 'est', 'eux', 'et', 'serait', 'je', 'leur', 'serions', 'eurent', 'étés', 'ont', 'eusses', 'ayons', 'le', 'moi', 'dans', 'soit', 'avez', 'fusse', 'auraient', 'votre', 'avait', 'y', 'soient', 'auront', 'nous', 'ou', 'avais', 'ses', 'au', 'soyons', 'était', 'sa', 'serais', 'sont', 'ait', 'auras', 'fussions', 'aurais', 'mes', 'fussent', 'tes', 'avions', 'étaient', 'étante', 'du', 's', 'c', 'aient', 'eusse', 'êtes', 'serons', 'de', 'ayante', 'été', 'ayant', 'sera', 'pas', 'me', 'soyez', 'fûmes', 'se', 'n', 'serai', 'pour', 'ce', 'étants', 'lui', 'aurait', 'même', 'es', 'ta', 'étant', 'son', 'un', 'ayez', 'fûtes', 'aurons', 'eûtes', 'étantes', 'nos', 't', 'ai', 'aurions', 'eut', 'm', 'aie', 'serez', 'fussiez', 'étions', 'as', 'eûmes', 'aux', 'vos', 'l', 'seras', 'elle', 'aurai', 'j', 'étiez', 'on', 'fusses', 'des', 'auriez', 'sur', 'fus', 'notre', 'mon', 'furent', 'fût', 'eussions', 'ma', 'la', 'eue', 'une', 'eu', 'qui', 'ils', 'eût', 'ayants', 'que', 'les', 'sommes', 'avec', 'ayant

[nltk_data] Downloading package stopwords to /home/el0h1m/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Transformers

## Chargement des Données

In [4]:
# Données d'entrainement
train_data_complete = pd.read_csv("../data/allocine_genres_train.csv", sep=",")
train_data = train_data_complete[["titre", "synopsis", "genre"]]

# Données de test/validation
test_data_complete = pd.read_csv("../data/allocine_genres_test.csv", sep=",")
test_data = test_data_complete[["titre", "synopsis", "genre"]]

In [29]:
# Liste de classes et ajout d'un identifiant numérique pour chaque classe
genre_name = sorted(train_data.genre.unique().flatten())
print("Genres:", genre_name)
print("Nombre d'exemplaires:", len(train_data))

label2id = {genre_name[i]:i for i in range(len(genre_name))}
id2label = {i:genre_name[i] for i in range(len(genre_name))}

Genres: ['biopic', 'comédie', 'documentaire', 'drame', 'historique', 'horreur', 'policier', 'romance', 'science fiction']
Nombre d'exemplaires: 2875


In [30]:
print(label2id)
print(id2label)

{'biopic': 0, 'comédie': 1, 'documentaire': 2, 'drame': 3, 'historique': 4, 'horreur': 5, 'policier': 6, 'romance': 7, 'science fiction': 8}
{0: 'biopic', 1: 'comédie', 2: 'documentaire', 3: 'drame', 4: 'historique', 5: 'horreur', 6: 'policier', 7: 'romance', 8: 'science fiction'}


In [6]:
# Proportion des données qui sera utilisée
scale = 0.2

In [7]:
def preprocess(sentence) :
    list_w = nlp(sentence)
    list_w_clean = []
    res = []
    for token in list_w:
        if (token.text.lower() not in stopWords) and (token.text not in ["'",'-',',','.','…','...',':',';']):
            list_w_clean.append(token)
    for token in list_w_clean:
        res.append(token.lemma_.lower())
    print(res)
    return " ".join(res)

In [8]:
from datasets import Features, Value, ClassLabel, Dataset, DatasetDict

data_df = pd.DataFrame()
# Chaque film est décrit par un titre et un synopsis
split_titre = train_data.titre
data_df["text"] = split_titre + " " + train_data.synopsis
for i in range(len(data_df["text"])):
    data_df["text"][i] = preprocess(data_df["text"][i])
# Chaque filme possède un attribut qui décrit son genre (auquel a été associé un identifiant numérique)
data_df["genre"] = train_data.genre.map(label2id)

# Transformation du DataFrame en objet de type Dataset utilisé par HuggingFace
province_features = Features({"text": Value('string'),
                              "genre": ClassLabel(names=genre_name)})
data = Dataset.from_pandas(data_df, features=province_features)
# Découpage en train et test
data = data.train_test_split(test_size=0.2, shuffle=True, seed=12)

['crime', 'orient', 'express', 'visite', 'istanbul', 'célèbre', 'détectiv', 'belge', 'hercule', 'poirot', 'embarqu', '’', 'orient', 'express', 'luxueux', 'train', 'relier', 'capitale', 'turc', 'calais', 'faire', 'connaissance', 'autre', 'passager', 'volubile', 'américaine', 'mme', 'hubbard', 'princesse', 'russe', 'dragomiroff', 'servant', 'hildegarde', 'séduisant', 'comte', 'hongrois', 'andrenyi', 'femme', 'matin', 'alors', 'train', 'immobiliser', 'sou', 'neige', 'passager', 'découvrir', 'effroi', 'm.', 'ratchett', 'antipathique', 'homme', '’', 'affaire', 'américain', 'avoir', 'froidement', 'assassiner', 'cabine', 'durer', 'nuit', 'hercule', 'poirot', 'celer', 'faire', 'aucun', 'doute', 'meurtrier', 'trouver', 'parmi', 'treize', 'personne', 'bord']
['12', 'homme', 'colère', 'jeune', 'homme', 'origine', 'modeste', 'accuser', 'meurtre', 'père', 'risque', 'peine', 'mort', 'jury', 'composer', 'douze', 'homme', 'retirer', 'délibérer', 'procéder', 'immédiatement', 'vote', 'onze', 'voter', 'c

In [9]:
data["train"].features

{'text': Value(dtype='string', id=None),
 'genre': ClassLabel(names=['biopic', 'comédie', 'documentaire', 'drame', 'historique', 'horreur', 'policier', 'romance', 'science fiction'], id=None)}

In [10]:
data["train"][0]

{'text': 'true romance jour anniversaire clarence worley rencontre splendide alabama cinéma miteux coup foudre immédiat après nuit amour alabama avouer avoir clarence avoir fait engager patron clarence comme cadeau anniversaire là aller commencer fou aventure',
 'genre': 7}

## Tokénisation

In [11]:
from transformers import AutoTokenizer

model_ckpt = "baptiste-pasquier/distilcamembert-allocine"

# Chargement du tokéniseur pré-entraîné correspondant au modèle utilisé
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading (…)okenizer_config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/353 [00:00<?, ?B/s]

In [12]:
def preprocess_function(examples):
    return tokenizer(examples["text"], padding=True, truncation=True)

In [13]:
# Tokénisation des 2 premières instances
preprocess_function(data["train"][:2])

{'input_ids': [[5, 14268, 35, 17616, 209, 2575, 8548, 14296, 16306, 81, 2404, 162, 11072, 33, 10966, 4547, 1545, 4189, 914, 392, 13268, 8567, 182, 656, 1724, 33, 10966, 4547, 9675, 190, 8548, 14296, 190, 82, 11523, 3656, 8548, 14296, 79, 2275, 2575, 241, 632, 1348, 2331, 3371, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [5, 535, 2281, 55, 535, 2281, 55, 338, 674, 2489, 81, 27193, 4708, 2377, 1299, 8840, 17806, 64, 602, 94, 10672, 108, 12750, 1973, 1209, 85, 10242, 528, 2940, 1077, 246, 2635, 444, 438, 1010, 971, 4532, 2635, 21, 12, 21, 13371, 98, 2549, 10844, 40, 670, 2926, 3288, 10242, 25591, 535, 2281, 55, 3305, 8155, 392, 9060, 291, 3288, 907, 387, 3732, 993, 87, 18, 305, 3676, 87, 226, 4107, 535, 2281, 55, 632, 747, 433, 18391, 83, 6]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [14]:
# Tokenisation de la totalité des données : chaque unité est remplacée par un identifiant numérique
tokenized_data = data.map(preprocess_function, batched=True, batch_size=None)

Map:   0%|          | 0/2300 [00:00<?, ? examples/s]

Map:   0%|          | 0/575 [00:00<?, ? examples/s]

In [15]:
tokenized_data["train"][0]

{'text': 'true romance jour anniversaire clarence worley rencontre splendide alabama cinéma miteux coup foudre immédiat après nuit amour alabama avouer avoir clarence avoir fait engager patron clarence comme cadeau anniversaire là aller commencer fou aventure',
 'genre': 7,
 'input_ids': [5,
  14268,
  35,
  17616,
  209,
  2575,
  8548,
  14296,
  16306,
  81,
  2404,
  162,
  11072,
  33,
  10966,
  4547,
  1545,
  4189,
  914,
  392,
  13268,
  8567,
  182,
  656,
  1724,
  33,
  10966,
  4547,
  9675,
  190,
  8548,
  14296,
  190,
  82,
  11523,
  3656,
  8548,
  14296,
  79,
  2275,
  2575,
  241,
  632,
  1348,
  2331,
  3371,
  6,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1

In [16]:
# Affichage des tokens. DistilBERT utilise l'algorithme WordPiece
tokens = tokenizer.convert_ids_to_tokens(tokenized_data["train"][0]['input_ids'])
print(tokenized_data["train"][0]["text"])
print(tokens)

true romance jour anniversaire clarence worley rencontre splendide alabama cinéma miteux coup foudre immédiat après nuit amour alabama avouer avoir clarence avoir fait engager patron clarence comme cadeau anniversaire là aller commencer fou aventure
['<s>', '▁tru', 'e', '▁romance', '▁jour', '▁anniversaire', '▁cla', 'rence', '▁wo', 'r', 'ley', '▁rencontre', '▁splendide', '▁a', 'lab', 'ama', '▁cinéma', '▁mit', 'eux', '▁coup', '▁foudre', '▁immédiat', '▁après', '▁nuit', '▁amour', '▁a', 'lab', 'ama', '▁avouer', '▁avoir', '▁cla', 'rence', '▁avoir', '▁fait', '▁engager', '▁patron', '▁cla', 'rence', '▁comme', '▁cadeau', '▁anniversaire', '▁là', '▁aller', '▁commencer', '▁fou', '▁aventure', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'

In [17]:
# Taille du vocabulaire
tokenizer.vocab_size

32005

In [18]:
# Taille de contexte maximum
tokenizer.model_max_length

512

## Préparation de l'Evaluation

In [19]:
import evaluate

accuracy = evaluate.load("accuracy")

In [20]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy.compute(predictions=predictions, references=labels)
    return acc

## Construction et Entraînement du Transformer (PyTorch -> NE FONCTIONNE PAS)

In [21]:
# from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
# import torch

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = AutoModelForSequenceClassification.from_pretrained(
#     model_ckpt, num_labels=len(genre_name), id2label=id2label, label2id=label2id
# ).to(device)

In [22]:
# training_args = TrainingArguments(
#     output_dir=f"{model_ckpt}-finetuned-genre",
#     learning_rate=2e-5,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=6,
#     weight_decay=0.01,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
# )

In [23]:
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_data["train"],
#     eval_dataset=tokenized_data["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

In [24]:
# trainer.train()

## Construction et Entraînement du Transformer (Tensorflow)

In [25]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [26]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 64
num_epochs = 2
batches_per_epoch = len(tokenized_data["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [27]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=9, id2label=id2label, label2id=label2id
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/909 [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/273M [00:00<?, ?B/s]

ValueError: cannot reshape array of size 1536 into shape (768,9)

In [None]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_data["train"],
    shuffle=True,
    batch_size=64,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_data["test"],
    shuffle=False,
    batch_size=64,
    collate_fn=data_collator,
)
print(type(tf_validation_set))

In [None]:
import tensorflow as tf

model.compile(optimizer=optimizer)

In [None]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set, label_cols=["genre"])

In [None]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[metric_callback])

## Analyse des Résultats

In [None]:
# Prédictions pour les données de test
preds_output = trainer.predict(tokenized_data['test'])

In [None]:
preds_output

In [None]:
preds_output.metrics

In [None]:
y_preds = np.argmax(preds_output.predictions, axis=1)
y_valid = tokenized_data['test']['label']
labels = tokenized_data['test'].features['label'].names

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

def plot_confusion_matrix(y_preds, y_true, labels):
    cm = confusion_matrix(y_true, y_preds, normalize="true")
    fig, ax = plt.subplots(figsize=(6, 6))
    labels_for_fig = [l[0:4]+'.' for l in labels]
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, 
                                  display_labels=labels_for_fig)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()
    
plot_confusion_matrix(y_preds, y_valid, labels)

In [None]:
from torch.nn.functional import cross_entropy

def forward_pass_with_label(batch):
    # Fonction qui retourne la perte (entropie croisée) et la classe prédite
    inputs = {k:v.to(device) for k,v in batch.items() 
              if k in tokenizer.model_input_names}

    with torch.no_grad():
        output = model(**inputs)
        pred_label = torch.argmax(output.logits, axis=-1)
        loss = cross_entropy(output.logits, batch["label"].to(device), 
                             reduction="none")
    return {"loss": loss.cpu().numpy(), 
            "predicted_label": pred_label.cpu().numpy()}

In [None]:
# Conversion des données au bon format
tokenized_data.set_format("torch", 
                            columns=["input_ids", "attention_mask", "label"])

In [None]:
# Calcul des valeurs de perte
tokenized_data["test"] = tokenized_data["test"].map(
    forward_pass_with_label, batched=True, batch_size=64)

In [None]:
# Création d'un DataFrame avec les textes, les pertes les classe (prédites et attendues)

def label_int2str(row):
    return tokenized_data["train"].features["label"].int2str(row)

tokenized_data.set_format("pandas")
cols = ["text", "label", "predicted_label", "loss"]
df_test = tokenized_data["test"][:][cols]
df_test["label"] = df_test["label"].apply(label_int2str)
df_test["predicted_label"] = (df_test["predicted_label"]
                              .apply(label_int2str))

In [None]:
# Pour éviter l'affichage tronqué des descriptions
pd.set_option('display.max_colwidth', -1)
# Affichage des 10 premières instances triées par perte décroissante
df_test.sort_values("loss", ascending=False).head(10)

In [None]:
# Affichage des 10 premières instances triées par perte croissante
# Cela permet de voir les instances pour lesquelles les prédictions sont les plus certaines
df_test.sort_values("loss", ascending=True).head(10)