# 4. MLflow

Ce notebook enregistre tous les mod√®les, m√©triques et le meilleur mod√®le avec MLflow

## Import des biblioth√®ques

In [25]:
import pandas as pd
import numpy as np
import pickle
import mlflow
import mlflow.sklearn
from datetime import datetime
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("Biblioth√®ques import√©es avec succ√®s")

Biblioth√®ques import√©es avec succ√®s


## Configuration de MLflow

In [26]:
import mlflow

# Fix : utiliser le dossier local au lieu de SQLite
mlflow.set_tracking_uri("mlruns")

# D√©finir le nom de l'exp√©rience
experiment_name = "Job_Classification_Pipeline"

# Cr√©er ou r√©cup√©rer l'exp√©rience
mlflow.set_experiment(experiment_name)

# Obtenir l'ID de l'exp√©rience
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id

print(f"Exp√©rience MLflow configur√©e: {experiment_name}")
print(f"Experiment ID: {experiment_id}")
print(f"Tracking URI: {mlflow.get_tracking_uri()}")

Exp√©rience MLflow configur√©e: Job_Classification_Pipeline
Experiment ID: 597620477807537161
Tracking URI: mlruns


## Chargement des r√©sultats du modeling

In [27]:
# Charger les r√©sultats du modeling
with open('../data/pkl/modeling_results_gridsearch.pkl', 'rb') as f:
    modeling_results = pickle.load(f)

all_results = modeling_results['all_results']
trained_models = modeling_results['trained_models']
best_model_key = modeling_results['best_model_key']
best_model = modeling_results['best_model']
best_metrics = modeling_results['best_metrics']
label_encoder = modeling_results['label_encoder']

print(f"R√©sultats charg√©s avec succ√®s")
print(f"Nombre de mod√®les entra√Æn√©s: {len(trained_models)}")
print(f"Meilleur mod√®le: {best_model_key}")

R√©sultats charg√©s avec succ√®s
Nombre de mod√®les entra√Æn√©s: 20
Meilleur mod√®le: Random_Forest_Count


## Enregistrement de tous les mod√®les dans MLflow

In [28]:
print("="*80)
print("ENREGISTREMENT DE TOUS LES MOD√àLES DANS MLFLOW")
print("="*80)

run_ids = {}
total_models = len(trained_models)
current_model = 0

for model_key, model_info in trained_models.items():
    current_model += 1
    print(f"\n[{current_model}/{total_models}] Enregistrement: {model_key}")
    
    # Extraire le nom du mod√®le et la configuration des features
    model_name, feature_config = model_key.rsplit('_', 1)
    
    # D√©marrer un run MLflow
    with mlflow.start_run(run_name=model_key) as run:
        # Enregistrer les param√®tres
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("feature_config", feature_config)
        mlflow.log_param("num_classes", len(label_encoder.classes_))
        
        # Enregistrer les hyperparam√®tres du mod√®le
        model_params = model_info['model'].get_params()
        for param_name, param_value in model_params.items():
            # Convertir les valeurs non-s√©rialisables
            if param_value is None or isinstance(param_value, (int, float, str, bool)):
                mlflow.log_param(f"model_{param_name}", param_value)
        
        # Enregistrer toutes les m√©triques
        metrics = model_info['metrics']
        mlflow.log_metric("accuracy", metrics['accuracy'])
        mlflow.log_metric("precision_weighted", metrics['precision_weighted'])
        mlflow.log_metric("recall_weighted", metrics['recall_weighted'])
        mlflow.log_metric("f1_weighted", metrics['f1_weighted'])
        mlflow.log_metric("precision_macro", metrics['precision_macro'])
        mlflow.log_metric("recall_macro", metrics['recall_macro'])
        mlflow.log_metric("f1_macro", metrics['f1_macro'])
        mlflow.log_metric("training_time", metrics['training_time'])
        mlflow.log_metric("prediction_time", metrics['prediction_time'])
        
        # Enregistrer le mod√®le
        mlflow.sklearn.log_model(
            sk_model=model_info['model'],
            artifact_path="model",
            registered_model_name=None  # Ne pas enregistrer dans le Model Registry pour l'instant
        )
        
        # Ajouter un tag pour identifier si c'est le meilleur mod√®le
        is_best = "yes" if model_key == best_model_key else "no"
        mlflow.set_tag("best_model", is_best)
        mlflow.set_tag("timestamp", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        
        # Stocker le run_id
        run_ids[model_key] = run.info.run_id
        
        print(f"   ‚úì Run ID: {run.info.run_id}")
        print(f"   ‚úì F1-Score: {metrics['f1_weighted']:.4f}")
        print(f"   ‚úì Meilleur mod√®le: {is_best}")

print(f"\n{'='*80}")
print(f"TOUS LES MOD√àLES ENREGISTR√âS DANS MLFLOW")
print(f"{'='*80}")



ENREGISTREMENT DE TOUS LES MOD√àLES DANS MLFLOW

[1/20] Enregistrement: Logistic_Regression_TF-IDF




   ‚úì Run ID: 73351afd520a48edabb38582cf566488
   ‚úì F1-Score: 0.7340
   ‚úì Meilleur mod√®le: no

[2/20] Enregistrement: Multinomial_NB_TF-IDF




   ‚úì Run ID: 71c674e9449a4b15afac568320e7eb7f
   ‚úì F1-Score: 0.6330
   ‚úì Meilleur mod√®le: no

[3/20] Enregistrement: Linear_SVC_TF-IDF




Exception: 'mlruns' does not exist.

## Enregistrement sp√©cial du meilleur mod√®le

In [None]:
print("\n" + "="*80)
print("ENREGISTREMENT DU MEILLEUR MOD√àLE DANS LE MODEL REGISTRY")
print("="*80)

# Cr√©er un run d√©di√© pour le meilleur mod√®le
with mlflow.start_run(run_name=f"BEST_MODEL_{best_model_key}") as run:
    # Extraire les informations
    model_name, feature_config = best_model_key.rsplit('_', 1)
    
    # Enregistrer les param√®tres
    mlflow.log_param("model_type", model_name)
    mlflow.log_param("feature_config", feature_config)
    mlflow.log_param("num_classes", len(label_encoder.classes_))
    mlflow.log_param("selection_criteria", "f1_weighted")
    
    # Enregistrer les hyperparam√®tres
    model_params = best_model.get_params()
    for param_name, param_value in model_params.items():
        if param_value is None or isinstance(param_value, (int, float, str, bool)):
            mlflow.log_param(f"model_{param_name}", param_value)
    
    # Enregistrer toutes les m√©triques
    mlflow.log_metric("accuracy", best_metrics['accuracy'])
    mlflow.log_metric("precision_weighted", best_metrics['precision_weighted'])
    mlflow.log_metric("recall_weighted", best_metrics['recall_weighted'])
    mlflow.log_metric("f1_weighted", best_metrics['f1_weighted'])
    mlflow.log_metric("precision_macro", best_metrics['precision_macro'])
    mlflow.log_metric("recall_macro", best_metrics['recall_macro'])
    mlflow.log_metric("f1_macro", best_metrics['f1_macro'])
    mlflow.log_metric("training_time", best_metrics['training_time'])
    mlflow.log_metric("prediction_time", best_metrics['prediction_time'])
    
    # Enregistrer le mod√®le dans le Model Registry
    model_uri = mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="best_model",
        registered_model_name="Job_Classification_Best_Model"
    )
    
    # Ajouter des tags
    mlflow.set_tag("best_model", "yes")
    mlflow.set_tag("model_version", "production_candidate")
    mlflow.set_tag("timestamp", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    mlflow.set_tag("pipeline_stage", "complete")
    
    best_run_id = run.info.run_id
    
    print(f"\n‚úì Meilleur mod√®le enregistr√© dans le Model Registry")
    print(f"‚úì Run ID: {best_run_id}")
    print(f"‚úì Model Name: Job_Classification_Best_Model")
    print(f"‚úì Model: {model_name}")
    print(f"‚úì Features: {feature_config}")
    print(f"‚úì F1-Score: {best_metrics['f1_weighted']:.4f}")
    print(f"‚úì Accuracy: {best_metrics['accuracy']:.4f}")


ENREGISTREMENT DU MEILLEUR MOD√àLE DANS LE MODEL REGISTRY


Successfully registered model 'Job_Classification_Best_Model'.
Created version '1' of model 'Job_Classification_Best_Model'.



‚úì Meilleur mod√®le enregistr√© dans le Model Registry
‚úì Run ID: 9ef967e6361e4825a6065f615d2b3acd
‚úì Model Name: Job_Classification_Best_Model
‚úì Model: Random_Forest
‚úì Features: Count
‚úì F1-Score: 0.7449
‚úì Accuracy: 0.7561


## Enregistrement du tableau comparatif

In [None]:
# Cr√©er un run pour les m√©tadonn√©es de l'exp√©rience
with mlflow.start_run(run_name="Experiment_Metadata") as run:
    # Enregistrer le tableau de comparaison comme artifact
    all_results.to_csv('../data/csv/mlflow_comparison.csv', index=False)
    mlflow.log_artifact('../data/csv/mlflow_comparison.csv', artifact_path="comparison")
    
    # Enregistrer des statistiques globales
    mlflow.log_metric("total_models_trained", len(trained_models))
    mlflow.log_metric("best_f1_score", best_metrics['f1_weighted'])
    mlflow.log_metric("best_accuracy", best_metrics['accuracy'])
    mlflow.log_metric("avg_f1_score", all_results['f1_weighted'].mean())
    mlflow.log_metric("std_f1_score", all_results['f1_weighted'].std())
    
    # Tags pour l'exp√©rience
    mlflow.set_tag("experiment_type", "model_comparison")
    mlflow.set_tag("timestamp", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    mlflow.set_tag("pipeline_complete", "yes")
    
    print("\n‚úì M√©tadonn√©es de l'exp√©rience enregistr√©es")
    print(f"‚úì Tableau comparatif enregistr√© comme artifact")


‚úì M√©tadonn√©es de l'exp√©rience enregistr√©es
‚úì Tableau comparatif enregistr√© comme artifact


## Visualisation des runs MLflow

In [None]:
# R√©cup√©rer tous les runs de l'exp√©rience
from mlflow.tracking import MlflowClient

client = MlflowClient()
runs = client.search_runs(
    experiment_ids=[experiment_id],
    order_by=["metrics.f1_weighted DESC"]
)

print("\n" + "="*80)
print("TOP 10 RUNS PAR F1-SCORE")
print("="*80)

for i, run in enumerate(runs[:10], 1):
    run_name = run.data.tags.get('mlflow.runName', 'N/A')
    f1_score = run.data.metrics.get('f1_weighted', 0)
    accuracy = run.data.metrics.get('accuracy', 0)
    is_best = run.data.tags.get('best_model', 'no')
    
    best_marker = " ‚≠ê MEILLEUR" if is_best == "yes" else ""
    
    print(f"{i}. {run_name}{best_marker}")
    print(f"   F1-Score: {f1_score:.4f} | Accuracy: {accuracy:.4f}")
    print(f"   Run ID: {run.info.run_id}")
    print()


TOP 10 RUNS PAR F1-SCORE
1. BEST_MODEL_Random_Forest_Count ‚≠ê MEILLEUR
   F1-Score: 0.7449 | Accuracy: 0.7561
   Run ID: 9ef967e6361e4825a6065f615d2b3acd

2. Random_Forest_Count ‚≠ê MEILLEUR
   F1-Score: 0.7449 | Accuracy: 0.7561
   Run ID: c4885dc25dae43f8ba60a3aaf51955fe

3. Logistic_Regression_Count
   F1-Score: 0.7379 | Accuracy: 0.7398
   Run ID: 12d9b88a69b24beea47456ca31c822e7

4. Logistic_Regression_TF-IDF
   F1-Score: 0.7340 | Accuracy: 0.7337
   Run ID: 3f6210b4dd4647ab9f2b30bb6aefd70d

5. Random_Forest_Combined
   F1-Score: 0.7336 | Accuracy: 0.7480
   Run ID: 7263ad738c974384b85b081a78ddb129

6. Logistic_Regression_SVD
   F1-Score: 0.7333 | Accuracy: 0.7337
   Run ID: 45f1c04dffe8410c9ac30ae25c8b1f59

7. Linear_SVC_Count
   F1-Score: 0.7271 | Accuracy: 0.7317
   Run ID: 12013371a12f400daf461ed56896ca24

8. Random_Forest_TF-IDF
   F1-Score: 0.7214 | Accuracy: 0.7378
   Run ID: cbfe326572c648c4b80d7f73d6245909

9. Linear_SVC_SVD
   F1-Score: 0.7204 | Accuracy: 0.7276
   Run

## Statistiques de l'exp√©rience

In [None]:
# Calculer des statistiques
f1_scores = [run.data.metrics.get('f1_weighted', 0) for run in runs]
accuracies = [run.data.metrics.get('accuracy', 0) for run in runs]

print("\n" + "="*80)
print("STATISTIQUES DE L'EXP√âRIENCE")
print("="*80)

print(f"\nNombre total de runs: {len(runs)}")
print(f"\nF1-Score (weighted):")
print(f"  Maximum: {max(f1_scores):.4f}")
print(f"  Minimum: {min(f1_scores):.4f}")
print(f"  Moyenne: {np.mean(f1_scores):.4f}")
print(f"  √âcart-type: {np.std(f1_scores):.4f}")

print(f"\nAccuracy:")
print(f"  Maximum: {max(accuracies):.4f}")
print(f"  Minimum: {min(accuracies):.4f}")
print(f"  Moyenne: {np.mean(accuracies):.4f}")
print(f"  √âcart-type: {np.std(accuracies):.4f}")


STATISTIQUES DE L'EXP√âRIENCE

Nombre total de runs: 22

F1-Score (weighted):
  Maximum: 0.7449
  Minimum: 0.0000
  Moyenne: 0.6442
  √âcart-type: 0.1588

Accuracy:
  Maximum: 0.7561
  Minimum: 0.0000
  Moyenne: 0.6516
  √âcart-type: 0.1600


## Informations sur le Model Registry

In [None]:
# R√©cup√©rer les informations du mod√®le enregistr√©
registered_model_name = "Job_Classification_Best_Model"

try:
    registered_model = client.get_registered_model(registered_model_name)
    latest_versions = client.get_latest_versions(registered_model_name)
    
    print("\n" + "="*80)
    print("INFORMATIONS DU MODEL REGISTRY")
    print("="*80)
    
    print(f"\nNom du mod√®le: {registered_model.name}")
    print(f"Description: {registered_model.description or 'N/A'}")
    print(f"\nVersions disponibles: {len(latest_versions)}")
    
    for version in latest_versions:
        print(f"\n  Version {version.version}:")
        print(f"    Stage: {version.current_stage}")
        print(f"    Run ID: {version.run_id}")
        print(f"    Status: {version.status}")
        
except Exception as e:
    print(f"Note: {str(e)}")


INFORMATIONS DU MODEL REGISTRY

Nom du mod√®le: Job_Classification_Best_Model
Description: N/A

Versions disponibles: 1

  Version 1:
    Stage: None
    Run ID: 9ef967e6361e4825a6065f615d2b3acd
    Status: READY


## Sauvegarde des informations MLflow

In [None]:
# Cr√©er un rapport r√©capitulatif
mlflow_info = {
    'experiment_name': experiment_name,
    'experiment_id': experiment_id,
    'total_runs': len(runs),
    'best_run_id': best_run_id,
    'best_model_key': best_model_key,
    'best_f1_score': best_metrics['f1_weighted'],
    'best_accuracy': best_metrics['accuracy'],
    'registered_model_name': registered_model_name,
    'all_run_ids': run_ids,
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'tracking_uri': mlflow.get_tracking_uri()
}

with open('../data/pkl/mlflow_info.pkl', 'wb') as f:
    pickle.dump(mlflow_info, f)

print("Informations MLflow sauvegard√©es: ../data/pkl/mlflow_info.pkl")

Informations MLflow sauvegard√©es: ../data/pkl/mlflow_info.pkl


## R√©sum√© MLflow

In [None]:
print("\n" + "="*80)
print("R√âSUM√â MLFLOW")
print("="*80)

print(f"\n1. EXP√âRIENCE MLFLOW:")
print(f"   - Nom: {experiment_name}")
print(f"   - ID: {experiment_id}")
print(f"   - Tracking URI: {mlflow.get_tracking_uri()}")

print(f"\n2. RUNS ENREGISTR√âS:")
print(f"   - Total: {len(runs)}")
print(f"   - Mod√®les standards: {len(trained_models)}")
print(f"   - Meilleur mod√®le: 1")
print(f"   - M√©tadonn√©es: 1")

print(f"\n3. MEILLEUR MOD√àLE:")
print(f"   - Mod√®le: {best_model_key}")
print(f"   - Run ID: {best_run_id}")
print(f"   - Registry Name: {registered_model_name}")
print(f"   - F1-Score: {best_metrics['f1_weighted']:.4f}")
print(f"   - Accuracy: {best_metrics['accuracy']:.4f}")

print(f"\n4. M√âTRIQUES ENREGISTR√âES POUR CHAQUE MOD√àLE:")
print(f"   - accuracy")
print(f"   - precision_weighted & precision_macro")
print(f"   - recall_weighted & recall_macro")
print(f"   - f1_weighted & f1_macro")
print(f"   - training_time")
print(f"   - prediction_time")

print(f"\n5. ARTIFACTS ENREGISTR√âS:")
print(f"   - Mod√®les sklearn pour tous les runs")
print(f"   - Tableau de comparaison (CSV)")
print(f"   - Meilleur mod√®le dans le Model Registry")

print(f"\n6. FICHIERS SAUVEGARD√âS LOCALEMENT:")
print(f"   - ../data/pkl/mlflow_info.pkl")
print(f"   - ../data/csv/mlflow_comparison.csv")

print(f"\n7. ACC√àS √Ä L'INTERFACE MLFLOW:")
print(f"   Commande: mlflow ui")
print(f"   URL: http://localhost:5000")

print("\n" + "="*80)
print("PIPELINE COMPLET TERMIN√â AVEC SUCC√àS")
print("="*80)
print("\nTous les mod√®les, m√©triques et le meilleur mod√®le ont √©t√© enregistr√©s dans MLflow.")
print("Vous pouvez maintenant visualiser les r√©sultats avec 'mlflow ui'.")


R√âSUM√â MLFLOW

1. EXP√âRIENCE MLFLOW:
   - Nom: Job_Classification_Pipeline
   - ID: 597620477807537161
   - Tracking URI: mlruns

2. RUNS ENREGISTR√âS:
   - Total: 22
   - Mod√®les standards: 20
   - Meilleur mod√®le: 1
   - M√©tadonn√©es: 1

3. MEILLEUR MOD√àLE:
   - Mod√®le: Random_Forest_Count
   - Run ID: 9ef967e6361e4825a6065f615d2b3acd
   - Registry Name: Job_Classification_Best_Model
   - F1-Score: 0.7449
   - Accuracy: 0.7561

4. M√âTRIQUES ENREGISTR√âES POUR CHAQUE MOD√àLE:
   - accuracy
   - precision_weighted & precision_macro
   - recall_weighted & recall_macro
   - f1_weighted & f1_macro
   - training_time
   - prediction_time

5. ARTIFACTS ENREGISTR√âS:
   - Mod√®les sklearn pour tous les runs
   - Tableau de comparaison (CSV)
   - Meilleur mod√®le dans le Model Registry

6. FICHIERS SAUVEGARD√âS LOCALEMENT:
   - ../data/pkl/mlflow_info.pkl
   - ../data/csv/mlflow_comparison.csv

7. ACC√àS √Ä L'INTERFACE MLFLOW:
   Commande: mlflow ui
   URL: http://localhost:5000

PI