In [7]:
pip install pyyaml

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install transformers datasets 

Collecting datasets
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.30.0->transformers)
  Using cached fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading aiohttp-3.12.11-cp310-cp310-win_amd64.whl.metadata (7.9 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Using cached aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023

In [None]:
import os

import mlflow
import mlflow.lightgbm
import mlflow.sklearn
import mlflow.tensorflow
import yaml
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, log_loss, roc_auc_score

from models.bert_model import build_and_train_bert
from models.lstm_model import train_lstm
from models.use_model import train_use
from utils import load_data_tfidf, load_data_use

# Load the YAML configuration file
def load_config(config_path="config.yml"):
    """Read the training configuration from a YAML file.

    Args:
        config_path: Path of the YAML file to read. Defaults to
            ``"config.yml"``, resolved against the current working directory.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the
    # platform's locale encoding, which can mangle non-ASCII characters.
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

# Generic training routine for scikit-learn style classifiers
def train_model_sklearn(model, model_name, X_train, X_test, y_train, y_test):
    """Fit a classifier, evaluate it on the test split and log the run to MLflow.

    Everything happens inside a single MLflow run named ``model_name``.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        model_name: Used as the MLflow run name, as the ``model`` parameter
            value and as the second argument of ``mlflow.sklearn.log_model``.
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``predict`` / ``predict_proba``.
        y_train: Training labels passed to ``model.fit``.
        y_test: Test labels the predictions are scored against.

    Returns:
        dict: The logged metrics, keyed ``accuracy``, ``f1_score``,
        ``log_loss`` and ``roc_auc``.
    """
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)

        # Predictions
        y_pred = model.predict(X_test)
        # Column 1 of predict_proba is used as the score of the positive
        # class -- assumes a binary target; TODO confirm against the dataset.
        y_proba = model.predict_proba(X_test)[:, 1]

        # Metrics. Accuracy is derived from the predictions computed above
        # rather than through model.score(), which would run a second
        # prediction pass over X_test.
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        logloss = log_loss(y_test, y_proba)
        roc_auc = roc_auc_score(y_test, y_proba)
        metrics = {
            "accuracy": accuracy,
            "f1_score": f1,
            "log_loss": logloss,
            "roc_auc": roc_auc,
        }

        # Logging
        mlflow.log_param("model", model_name)
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(model, model_name)

        print(f"✅ {model_name} terminé avec précision={accuracy:.2f} | F1={f1:.2f} | AUC={roc_auc:.2f}")

    return metrics

def select_first_grid_values(grid):
    """Build estimator keyword arguments from a grid-search section.

    Each entry of ``grid`` maps a parameter name either to a list/tuple of
    candidate values, in which case the first candidate is kept, or to a
    single value, which is kept unchanged (so a plain string is not reduced
    to its first character).

    Args:
        grid: Mapping of parameter name to candidates or a single value.

    Returns:
        dict: One value per parameter name.

    Raises:
        ValueError: If a parameter has an empty list/tuple of candidates.
    """
    selected = {}
    for param_name, candidates in grid.items():
        if isinstance(candidates, (list, tuple)):
            if not candidates:
                raise ValueError(f"No candidate value configured for parameter '{param_name}'")
            selected[param_name] = candidates[0]
        else:
            selected[param_name] = candidates
    return selected


# Main
if __name__ == "__main__":
    config = load_config()

    # === 1. TF-IDF data for the classical models ===
    # The fifth value returned by load_data_tfidf is not used here.
    X_train_tfidf, X_test_tfidf, y_train, y_test, _ = load_data_tfidf()

    # === 2. Random Forest ===
    rf_params = select_first_grid_values(config["grid_search"]["random_forest"])
    rf_model = RandomForestClassifier(**rf_params)
    train_model_sklearn(rf_model, "RandomForest", X_train_tfidf, X_test_tfidf, y_train, y_test)

    # === 3. Logistic Regression ===
    logreg_params = select_first_grid_values(config["grid_search"]["logistic_regression"])
    logreg_model = LogisticRegression(**logreg_params)
    train_model_sklearn(logreg_model, "LogisticRegression", X_train_tfidf, X_test_tfidf, y_train, y_test)

    # === 4. LightGBM ===
    lgbm_params = select_first_grid_values(config["grid_search"]["lightgbm"])
    lgbm_model = LGBMClassifier(**lgbm_params)
    train_model_sklearn(lgbm_model, "LightGBM", X_train_tfidf, X_test_tfidf, y_train, y_test)

    # === 5. Universal Sentence Encoder ===
    X_train_use, X_test_use, y_train_use, y_test_use = load_data_use()
    train_use(config["use"], X_train_use, X_test_use, y_train_use, y_test_use)

    # === 6. LSTM ===
    train_lstm()

    # === 7. BERT ===
    # Kept under the __main__ guard like every other training step, so that
    # importing this code as a module does not start a BERT training run.
    print("📦 Modèle BERT")
    build_and_train_bert()

