## 1. Préparation des données
On charge les textes et labels, puis on prépare les entrées pour le LSTM

In [7]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
from sklearn.metrics import roc_auc_score, accuracy_score
import keras_nlp

# Charger GoEmotions
dataset = load_dataset("google-research-datasets/go_emotions", "simplified")
train_hf = dataset["train"]
val_hf = dataset["validation"]

train_texts = list(train_hf["text"])
val_texts = list(val_hf["text"])

def convert_to_multilabel(example):
    label_vector = np.zeros(28, dtype=np.float32)
    for idx in example["labels"]:
        label_vector[idx] = 1.0
    return label_vector

train_labels = [convert_to_multilabel(ex) for ex in train_hf]
val_labels = [convert_to_multilabel(ex) for ex in val_hf]



## 2. Modèle LSTM simple
On entraîne un LSTM bidirectionnel sur les textes tokenisés.

In [3]:
max_words = 10000
max_len = 128

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_len)
X_val = pad_sequences(tokenizer.texts_to_sequences(val_texts), maxlen=max_len)

y_train = np.array(train_labels)
y_val = np.array(val_labels)

lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 64, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=False)),
    tf.keras.layers.Dense(28, activation="sigmoid")
])

lstm_model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[tf.keras.metrics.AUC(name="auc", multi_label=True), "accuracy"]
)

history_lstm = lstm_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=3,
    batch_size=32
)

val_auc_lstm = history_lstm.history['val_auc'][-1]
val_acc_lstm = history_lstm.history['val_accuracy'][-1]
print(f"LSTM - AUC validation : {val_auc_lstm:.4f} | Accuracy : {val_acc_lstm:.4f}")

Epoch 1/3




[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 40ms/step - accuracy: 0.3254 - auc: 0.5708 - loss: 0.1517 - val_accuracy: 0.4119 - val_auc: 0.6655 - val_loss: 0.1312
Epoch 2/3
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 42ms/step - accuracy: 0.4445 - auc: 0.7178 - loss: 0.1223 - val_accuracy: 0.4648 - val_auc: 0.7579 - val_loss: 0.1171
Epoch 3/3
[1m1357/1357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 42ms/step - accuracy: 0.4896 - auc: 0.8015 - loss: 0.1082 - val_accuracy: 0.4888 - val_auc: 0.7854 - val_loss: 0.1100
LSTM - AUC validation : 0.7854 | Accuracy : 0.4888


## Conclusion comparative : BERT Small vs LSTM

BERT Small surpasse donc légèrement le LSTM en AUC et offre de meilleures métriques de précision et de rappel, ce qui confirme la supériorité des modèles Transformers pour la classification multi-label d’émotions sur texte.