Notebook 4 : Modèle avancé


---


In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score,
)

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

import mlflow
import mlflow.tensorflow

# --- Project layout ---------------------------------------------------------
# Resolve the project root once; every other location (including the MLflow
# store) is derived from it so the paths cannot drift apart.
ROOT = Path("..").resolve()
DATA_PATH = ROOT / "data"
OUT_PATH = ROOT / "out"
SCRIPTS_PATH = ROOT / "scripts"
EMB_PATH = DATA_PATH / "embeddings"

# --- Reproducibility --------------------------------------------------------
# Seeds Python's `random`, NumPy and TensorFlow in one call so weight
# initialisation, dropout and batch shuffling repeat across "Restart & Run All"
# (as far as the hardware kernels are deterministic).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# --- MLflow -----------------------------------------------------------------
# Path.as_uri() produces a well-formed file:/// URI (forward slashes, escaped
# spaces) instead of a hand-built "file:<native path>" string.
mlflow.set_tracking_uri((ROOT / "mlruns").as_uri())
mlflow.set_experiment("sentiment_airparadis_modele_avance")
mlflow.tensorflow.autolog()

# --- Local helper scripts ---------------------------------------------------
# Guarded so that re-running this cell does not pile up duplicate entries.
if str(SCRIPTS_PATH) not in sys.path:
    sys.path.append(str(SCRIPTS_PATH))

from preprocessing import preprocess_advanced

# Registers Series.progress_apply, used by the preprocessing cell.
tqdm.pandas(desc="Preprocessing (advanced)")
pd.set_option("display.max_colwidth", 200)

print(tf.__version__)

  return FileStore(store_uri, store_uri)
2025/12/01 04:40:25 INFO mlflow.tracking.fluent: Experiment with name 'sentiment_airparadis_modele_avance' does not exist. Creating a new experiment.


2.20.0


In [None]:
# The raw CSV has no header row, so the column names are supplied explicitly.
col_names = ["target", "ids", "date", "flag", "user", "text"]

# Read the tweets, derive a binary label (1 when target == 4, else 0) and expose
# the positional index as an explicit "row_id" column used as the join key below.
df = (
    pd.read_csv(
        DATA_PATH / "training.1600000.processed.noemoticon.csv",
        names=col_names,
        header=None,
        encoding="latin-1",
    )
    .assign(label=lambda frame: frame["target"].eq(4).astype(int))
    .reset_index()
    .rename(columns={"index": "row_id"})
)

# Train/test assignment written to disk beforehand; its "ids" column is matched
# against row_id, and the inner join keeps only rows present in both tables.
split = pd.read_csv(OUT_PATH / "split.csv")
df = pd.merge(df, split, how="inner", left_on="row_id", right_on="ids")

df[["row_id", "split"]].head(), len(df)

(   row_id  split
 0       0  train
 1       1  train
 2       2  train
 3       3   test
 4       4  train,
 1527316)

In [None]:
# Clean every tweet with the project's advanced preprocessing function
# (progress_apply shows a progress bar while doing so).
df["text_adv"] = df["text"].progress_apply(preprocess_advanced)

# Partition rows according to the "split" column; .copy() makes each part an
# independent frame rather than a view on df.
df_train, df_test = (
    df.loc[df["split"] == part_name].copy() for part_name in ("train", "test")
)

# Inputs as plain lists of strings, targets as NumPy arrays taken from "label".
X_train_text, X_test_text = (
    part["text_adv"].astype(str).tolist() for part in (df_train, df_test)
)
y_train, y_test = (part["label"].values for part in (df_train, df_test))

len(X_train_text), len(X_test_text)

Preprocessing (advanced): 100%|██████████| 1527316/1527316 [01:46<00:00, 14316.56it/s]


(1221852, 305464)

In [None]:
# Vocabulary cap passed to the tokenizer; unknown words map to the "<unk>" token.
max_words = 50_000
tokenizer = Tokenizer(num_words=max_words, oov_token="<unk>")
# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(X_train_text)

X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train_text, X_test_text)
)

# Sequence length = 95th percentile of the training sequence lengths.
lengths = list(map(len, X_train_seq))
max_len = int(np.percentile(lengths, 95))
print("max_len choisi :", max_len)

# Same padding/truncation policy for both sets: pad and cut at the end.
pad_options = {"maxlen": max_len, "padding": "post", "truncating": "post"}
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

X_train_pad.shape, X_test_pad.shape

max_len choisi : 23


((1221852, 23), (305464, 23))

In [None]:
def load_embeddings_txt(path, embedding_dim, has_header=False):
    """Load word vectors from a space-separated text file into a dict.

    Each data line is expected to look like ``<token> v1 v2 ... v<embedding_dim>``.
    Lines that do not have exactly ``embedding_dim`` values after the token, or
    whose values cannot be parsed as floats, are skipped instead of aborting the
    whole load. If a token appears several times, the last occurrence wins.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the embeddings file (read as UTF-8).
    embedding_dim : int
        Number of coefficients expected per token.
    has_header : bool, default False
        When True, the first line of the file is discarded before parsing.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D ``float32`` NumPy array of length
        ``embedding_dim``.
    """
    embeddings_index = {}
    skipped = 0
    with open(path, encoding="utf8") as f:
        if has_header:
            # The default keeps an empty file from raising StopIteration.
            next(f, None)
        for line in f:
            values = line.rstrip().split(" ")
            # Validate the width BEFORE converting: a token containing spaces
            # shifts text into the numeric part and would make the float
            # conversion raise on a line that is going to be rejected anyway.
            if len(values) - 1 != embedding_dim:
                skipped += 1
                continue
            try:
                coefs = np.asarray(values[1:], dtype="float32")
            except ValueError:
                # Right number of fields but at least one is not a number.
                skipped += 1
                continue
            embeddings_index[values[0]] = coefs
    print(f"Embeddings chargés depuis {path} : {len(embeddings_index)} mots")
    if skipped:
        # Surface dropped lines so a wrong embedding_dim / header flag is visible.
        print(f"Lignes ignorées (format inattendu) : {skipped}")
    return embeddings_index


def build_embedding_matrix(tokenizer, embeddings_index, max_words, embedding_dim):
    """Assemble the initial weight matrix for an embedding layer.

    Row ``i`` receives the pretrained vector of the word whose index in
    ``tokenizer.word_index`` is ``i``. Rows for words absent from
    ``embeddings_index`` (and row 0, which no word maps to when indices start
    at 1) stay at zero. Words whose index is ``>= max_words`` are ignored.

    Parameters
    ----------
    tokenizer : object exposing a ``word_index`` mapping of word -> int index.
    embeddings_index : mapping of word -> vector of length ``embedding_dim``.
    max_words : int
        Upper bound on the number of rows in the matrix.
    embedding_dim : int
        Number of columns in the matrix.

    Returns
    -------
    tuple
        ``(embedding_matrix, num_words)`` where ``embedding_matrix`` is a
        ``float32`` array of shape ``(num_words, embedding_dim)`` and
        ``num_words = min(max_words, len(word_index) + 1)``.
    """
    vocabulary = tokenizer.word_index
    num_words = min(max_words, len(vocabulary) + 1)
    embedding_matrix = np.zeros((num_words, embedding_dim), dtype="float32")

    for token, row in vocabulary.items():
        if row < max_words:
            vector = embeddings_index.get(token)
            if vector is not None:
                embedding_matrix[row] = vector

    print("Matrice d'embedding shape :", embedding_matrix.shape)
    return embedding_matrix, num_words


def build_lstm_model(
    num_words,
    embedding_dim,
    embedding_matrix,
    max_len,
    lstm_units=128,
    dropout_rate=0.3,
    bidirectional=False,
    trainable=False,
    mask_zero=False,
):
    """Build and compile a binary classifier: Embedding -> (Bi)LSTM -> Dropout -> Dense.

    Parameters
    ----------
    num_words : int
        Vocabulary size, i.e. number of rows of the embedding layer.
    embedding_dim : int
        Size of each embedding vector.
    embedding_matrix : array-like of shape (num_words, embedding_dim)
        Initial weights of the embedding layer.
    max_len : int
        Length of the integer sequences fed to the model.
    lstm_units : int, default 128
        Number of units of the LSTM layer.
    dropout_rate : float, default 0.3
        Dropout rate applied to the LSTM output.
    bidirectional : bool, default False
        Wrap the LSTM in a ``Bidirectional`` layer when True.
    trainable : bool, default False
        Whether the embedding weights are updated during training.
    mask_zero : bool, default False
        Forwarded to ``Embedding``. When True, index 0 is treated as padding and
        masked for the recurrent layer, so padded positions do not influence the
        final LSTM state. The default keeps the previous behaviour.

    Returns
    -------
    A compiled ``Sequential`` model (binary cross-entropy loss, Adam optimizer,
    accuracy metric) with a single sigmoid output.
    """
    model = Sequential()
    # Declare the input shape explicitly: the `input_length` argument of
    # Embedding is deprecated (and ignored) in Keras 3.
    model.add(tf.keras.Input(shape=(max_len,), dtype="int32"))
    model.add(
        Embedding(
            input_dim=num_words,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            trainable=trainable,
            mask_zero=mask_zero,
        )
    )
    if bidirectional:
        model.add(Bidirectional(LSTM(lstm_units)))
    else:
        model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [None]:
# --- Pretrained GloVe (Twitter) vectors --------------------------------------
glove_path = EMB_PATH / "glove.twitter.27B.200d.txt"
embedding_dim_glove = 200

emb_index_glove = load_embeddings_txt(glove_path, embedding_dim_glove, has_header=False)
embedding_matrix_glove, num_words_glove = build_embedding_matrix(
    tokenizer, emb_index_glove, max_words, embedding_dim_glove
)

batch_size = 256
epochs = 5

# Keras carves `validation_split` out of the LAST rows, before any shuffling.
# The training arrays are still in CSV row order, and the Sentiment140 file
# lists the negative tweets before the positive ones, so an unshuffled split
# would validate (and early-stop) on essentially a single class. A fixed-seed
# permutation gives a representative validation slice that is identical from
# one run to the next.
shuffle_idx = np.random.default_rng(42).permutation(len(X_train_pad))
X_train_shuffled = X_train_pad[shuffle_idx]
y_train_shuffled = y_train[shuffle_idx]

# Stop after 2 epochs without val_loss improvement and roll back to the best weights.
es = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

with mlflow.start_run(run_name="lstm_glove"):

    # Derived from the dimension actually loaded so the label cannot drift
    # from the vectors in use (it previously said "100d" for 200d vectors).
    mlflow.log_param("embedding_type", f"glove_twitter_{embedding_dim_glove}d")
    mlflow.log_param("embedding_dim", embedding_dim_glove)
    mlflow.log_param("max_words", max_words)
    mlflow.log_param("max_len", max_len)
    mlflow.log_param("lstm_units", 128)
    mlflow.log_param("bidirectional", False)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("epochs", epochs)

    model_glove = build_lstm_model(
        num_words=num_words_glove,
        embedding_dim=embedding_dim_glove,
        embedding_matrix=embedding_matrix_glove,
        max_len=max_len,
        lstm_units=128,
        dropout_rate=0.3,
        bidirectional=False,
        trainable=False,
    )

    history = model_glove.fit(
        X_train_shuffled,
        y_train_shuffled,
        validation_split=0.1,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[es],
        verbose=1,
    )

    # Held-out evaluation: probabilities for ROC AUC, 0.5 threshold for the rest.
    y_proba = model_glove.predict(X_test_pad).ravel()
    y_pred = (y_proba >= 0.5).astype(int)

    acc = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary"
    )
    roc_auc = roc_auc_score(y_test, y_proba)

    mlflow.log_metric("test_accuracy", acc)
    mlflow.log_metric("test_precision", precision)
    mlflow.log_metric("test_recall", recall)
    mlflow.log_metric("test_f1", f1)
    mlflow.log_metric("test_roc_auc", roc_auc)

print("GloVe - accuracy :", acc, " | F1 :", f1, " | ROC AUC :", roc_auc)

Embeddings chargés depuis C:\Users\Gui\Desktop\AAA_doc\Openclassroom school\Python project\proj_proj\proj7\data\embeddings\glove.twitter.27B.200d.txt : 1193514 mots
Matrice d'embedding shape : (50000, 200)




Epoch 1/5
[1m4295/4296[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 30ms/step - accuracy: 0.7855 - loss: 0.4525



[1m4296/4296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 31ms/step - accuracy: 0.8041 - loss: 0.4249 - val_accuracy: 0.7744 - val_loss: 0.4765
Epoch 2/5
[1m4295/4296[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 30ms/step - accuracy: 0.8250 - loss: 0.3898



[1m4296/4296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 31ms/step - accuracy: 0.8262 - loss: 0.3868 - val_accuracy: 0.7940 - val_loss: 0.4484
Epoch 3/5
[1m4296/4296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 31ms/step - accuracy: 0.8350 - loss: 0.3702 - val_accuracy: 0.7574 - val_loss: 0.5216
Epoch 4/5
[1m4295/4296[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 30ms/step - accuracy: 0.8417 - loss: 0.3581



[1m4296/4296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 31ms/step - accuracy: 0.8412 - loss: 0.3582 - val_accuracy: 0.8050 - val_loss: 0.4208
Epoch 5/5
[1m4296/4296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 31ms/step - accuracy: 0.8466 - loss: 0.3482 - val_accuracy: 0.7838 - val_loss: 0.4653
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step




[1m9546/9546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 3ms/step
GloVe - accuracy : 0.8304055469711652  | F1 : 0.8265064517965566  | ROC AUC : 0.9116228564071932


In [None]:
# --- Pretrained fastText (wiki-news, subword) vectors ------------------------
fasttext_path = EMB_PATH / "wiki-news-300d-1M-subword.vec"
embedding_dim_ft = 300

# The file is read with has_header=True, i.e. its first line is skipped.
emb_index_ft = load_embeddings_txt(fasttext_path, embedding_dim_ft, has_header=True)
embedding_matrix_ft, num_words_ft = build_embedding_matrix(
    tokenizer, emb_index_ft, max_words, embedding_dim_ft
)

batch_size_ft = 256
epochs_ft = 5

# Keras carves `validation_split` out of the LAST rows, before any shuffling.
# The training arrays are still in CSV row order, and the Sentiment140 file
# lists the negative tweets before the positive ones, so an unshuffled split
# would validate (and early-stop) on essentially a single class. A fixed-seed
# permutation gives a representative validation slice that is identical from
# one run to the next.
shuffle_idx = np.random.default_rng(42).permutation(len(X_train_pad))
X_train_shuffled = X_train_pad[shuffle_idx]
y_train_shuffled = y_train[shuffle_idx]

# Stop after 2 epochs without val_loss improvement and roll back to the best weights.
es_ft = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

with mlflow.start_run(run_name="bilstm_fasttext"):

    # Label matches the file actually loaded (wiki-news subword vectors); it
    # previously said "cc", which does not correspond to this file.
    mlflow.log_param("embedding_type", f"fasttext_wiki_news_subword_{embedding_dim_ft}d")
    mlflow.log_param("embedding_dim", embedding_dim_ft)
    mlflow.log_param("max_words", max_words)
    mlflow.log_param("max_len", max_len)
    mlflow.log_param("lstm_units", 128)
    mlflow.log_param("bidirectional", True)
    mlflow.log_param("batch_size", batch_size_ft)
    mlflow.log_param("epochs", epochs_ft)

    model_ft = build_lstm_model(
        num_words=num_words_ft,
        embedding_dim=embedding_dim_ft,
        embedding_matrix=embedding_matrix_ft,
        max_len=max_len,
        lstm_units=128,
        dropout_rate=0.3,
        bidirectional=True,
        trainable=False,
    )

    history_ft = model_ft.fit(
        X_train_shuffled,
        y_train_shuffled,
        validation_split=0.1,
        epochs=epochs_ft,
        batch_size=batch_size_ft,
        callbacks=[es_ft],
        verbose=1,
    )

    # Held-out evaluation: probabilities for ROC AUC, 0.5 threshold for the rest.
    y_proba_ft = model_ft.predict(X_test_pad).ravel()
    y_pred_ft = (y_proba_ft >= 0.5).astype(int)

    acc_ft = accuracy_score(y_test, y_pred_ft)
    precision_ft, recall_ft, f1_ft, _ = precision_recall_fscore_support(
        y_test, y_pred_ft, average="binary"
    )
    roc_auc_ft = roc_auc_score(y_test, y_proba_ft)

    mlflow.log_metric("test_accuracy", acc_ft)
    mlflow.log_metric("test_precision", precision_ft)
    mlflow.log_metric("test_recall", recall_ft)
    mlflow.log_metric("test_f1", f1_ft)
    mlflow.log_metric("test_roc_auc", roc_auc_ft)

print("fastText - accuracy :", acc_ft, " | F1 :", f1_ft, " | ROC AUC :", roc_auc_ft)



Embeddings chargés depuis C:\Users\Gui\Desktop\AAA_doc\Openclassroom school\Python project\proj_proj\proj7\data\embeddings\wiki-news-300d-1M-subword.vec : 999994 mots
Matrice d'embedding shape : (50000, 300)


Epoch 1/5
[1m4295/4296[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 42ms/step - accuracy: 0.7481 - loss: 0.5083



[1m4296/4296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 44ms/step - accuracy: 0.7712 - loss: 0.4769 - val_accuracy: 0.7153 - val_loss: 0.5761
Epoch 2/5
[1m4295/4296[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 41ms/step - accuracy: 0.7994 - loss: 0.4332



[1m4296/4296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 44ms/step - accuracy: 0.8019 - loss: 0.4288 - val_accuracy: 0.7378 - val_loss: 0.5370
Epoch 3/5
[1m4296/4296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 44ms/step - accuracy: 0.8110 - loss: 0.4128 - val_accuracy: 0.7360 - val_loss: 0.5437
Epoch 4/5
[1m4295/4296[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 41ms/step - accuracy: 0.8164 - loss: 0.4031



[1m4296/4296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 44ms/step - accuracy: 0.8169 - loss: 0.4020 - val_accuracy: 0.7422 - val_loss: 0.5324
Epoch 5/5
[1m4296/4296[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 44ms/step - accuracy: 0.8222 - loss: 0.3931 - val_accuracy: 0.6882 - val_loss: 0.6277
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 227ms/step




[1m9546/9546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 4ms/step
fastText - accuracy : 0.8104784851897441  | F1 : 0.7973423322504761  | ROC AUC : 0.8973571987590773
