# ⚙️ Initiez-vous au MLOps (partie 1/2)

## 🤖 Modélisation
### 🛠️ Préparez l'environnement de travail
#### 📦 Import des modules python

In [6]:
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))

import mlflow
os.environ['MLFLOW_TRACKING_URI'] = 'http://127.0.0.1:5010'
mlflow.set_experiment("lung-cancer-detection")
mlflow.sklearn.autolog(log_input_examples=True, log_model_signatures=True)
mlflow.xgboost.autolog(log_input_examples=True, log_model_signatures=True) #, log_feature_importance_plot=False)
mlflow.lightgbm.autolog(log_input_examples=True, log_model_signatures=True) #, log_feature_importance_plot=False)

import pyarrow
import pandas as pd
import numpy as np
from src.visualization.visu_text import print_title, print_end, print_col, quick_df_info
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV
)
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from src.models.modelization import (
    print_report,
    print_cross_validation_scores,
)

### Préparation du jeu de données

In [7]:
df = pd.read_parquet("../data/processed/survey_lung_cancer_features.parquet", engine='pyarrow')
# Conversion des colonnes booléennes en int pour compatibilité avec certains modèles
bool_cols = df.select_dtypes(include=['bool', 'boolean']).columns
df[bool_cols] = df[bool_cols].astype(int)
# Conversion des colonnes catégorielles en codes numériques
cat_cols = df.select_dtypes(include=['category']).columns
df[cat_cols] = df[cat_cols].apply(lambda x: x.cat.codes)

quick_df_info(df)


┌-------------------------------* Information *-------------------------------┐
├─------- Shape: (4455, 30) - Colonnes:
├─GENDER                    int64     
├─AGE                       int64     
├─SMOKING                   int64     
├─YELLOW_FINGERS            int64     
├─ANXIETY                   int64     
├─PEER_PRESSURE             int64     
├─CHRONIC DISEASE           int64     
├─FATIGUE                   int64     
├─ALLERGY                   int64     
├─WHEEZING                  int64     
├─ALCOHOL CONSUMING         int64     
├─COUGHING                  int64     
├─SHORTNESS OF BREATH       int64     
├─SWALLOWING DIFFICULTY     int64     
├─CHEST PAIN                int64     
├─LUNG_CANCER               int64     
├─SMOKING_x_AGE             int64     
├─SMOKING_x_ALCOHOL         int64     
├─RESPIRATORY_SYMPTOMS      int64     
├─TOTAL_SYMPTOMS            int64     
├─BEHAVIORAL_RISK_SCORE     int64     
├─SEVERE_SYMPTOMS           int64     
├─AGE_GROUP          

### ⛓️‍💥 Separation du jeu de données

In [8]:
# ================================
# SÉPARATION TRAIN / VALIDATION / TEST
# ================================

y = df["LUNG_CANCER"]
X = df.drop(columns=["LUNG_CANCER"])

print_title(f"Données originales: {X.shape[0]} échantillons")
print_col(f"Distribution: Classe 0: {(y == 0).sum()}, Classe 1: {(y == 1).sum()}")
print_col(f"Proportion classe 1: {(y == 1).mean()*100:.1f}%")
print_end()

# Étape 1: Séparer train+val (80%) / test (20%)
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,  # 20% pour test
    random_state=42,
    stratify=y,  # Garde la même proportion de classes
)

# Étape 2: Séparer train (60%) / validation (20%)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,  # 25% de 80% = 20% du total
    random_state=42,
    stratify=y_temp,
)

# ================================
# VÉRIFICATION DES PROPORTIONS
# ================================

total = len(X)
print_title("RÉPARTITION FINALE:")
print_col(f" Train:      {len(X_train)} échantillons ({len(X_train)/total*100:.1f}%)")
print_col(f" Validation: {len(X_val)} échantillons ({len(X_val)/total*100:.1f}%)")
print_col(f"Test:       {len(X_test)} échantillons ({len(X_test)/total*100:.1f}%)")
print_end()

print_title("VÉRIFICATION STRATIFICATION:")
print_col(f" Original    - Classe 1: {(y == 1).mean()*100:.1f}%")
print_col(f" Train       - Classe 1: {(y_train == 1).mean()*100:.1f}%")
print_col(f" Validation  - Classe 1: {(y_val == 1).mean()*100:.1f}%")
print_col(f" Test        - Classe 1: {(y_test == 1).mean()*100:.1f}%")
print_end()

quick_df_info(X_train)


┌------------------* Données originales: 4455 échantillons *------------------┐
├─Distribution: Classe 0: 429, Classe 1: 4026
├─Proportion classe 1: 90.4%
└------------------------------------------------------------------------------┘

┌---------------------------* RÉPARTITION FINALE: *---------------------------┐
├─ Train:      2673 échantillons (60.0%)
├─ Validation: 891 échantillons (20.0%)
├─Test:       891 échantillons (20.0%)
└------------------------------------------------------------------------------┘

┌-----------------------* VÉRIFICATION STRATIFICATION: *-----------------------┐
├─ Original    - Classe 1: 90.4%
├─ Train       - Classe 1: 90.4%
├─ Validation  - Classe 1: 90.3%
├─ Test        - Classe 1: 90.3%
└------------------------------------------------------------------------------┘

┌-------------------------------* Information *-------------------------------┐
├─------- Shape: (2673, 29) - Colonnes:
├─GENDER                    int64     
├─AGE                     

### 🤖 Dummy Classifier, Modèle naif

In [9]:
dummy_pipeline = ImbPipeline([ 
    ('oversampling', SMOTE(random_state=42)),
    ('scaling', StandardScaler()),
    ('model', DummyClassifier(random_state=42)) 
])

dummy =dummy_pipeline.fit(X_train, y_train)
y_pred = dummy.predict(X_test)
# Afficher le rapport
accuracy, precision, recall, f1, f2 = print_report(
    y_test, y_pred, target_names=["Non", "Oui"]
)
dummy_score = {
    "Model": "Dummy",
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "F2-score": f2,
}

2025/10/09 14:58:39 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '7aa3e2a3036c42d98010eef4763829cb', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
  "inputs": [
    [
      0.0,
      55.0,
     .... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: Model does not have the "python_function" flavor
2025/10/09 14:58:41 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '50865dfaac3440688be155eae44534fd', which will track hyperparameters, performance metrics, model artifacts, and lineage informat

🏃 View run flawless-bug-991 at: http://127.0.0.1:5010/#/experiments/3/runs/7aa3e2a3036c42d98010eef4763829cb
🧪 View experiment at: http://127.0.0.1:5010/#/experiments/3




🏃 View run grandiose-gnu-884 at: http://127.0.0.1:5010/#/experiments/3/runs/50865dfaac3440688be155eae44534fd
🧪 View experiment at: http://127.0.0.1:5010/#/experiments/3

┌--------------------------* PERFORMANCE DU MODÈLE *--------------------------┐
├─ Accuracy (Exactitude):       0.097 │ (TP+TN)/(TP+TN+FP+FN)
├─ Precision (Précision):       0.000 │ TP/(TP+FP) | Minimiser les faux positifs. 
├─ Recall (Sensibilité):        0.000 │ TP/(TP+FN) | Minimiser les faux négatif. 
├─ F1-score:                    0.000 │ 2*Precision*Recall/(Precision+Recall)
├─ F2-score:                    0.000 │ 5*Precision*Recall/(4*Precision+Recall) | Privilégie le rappel)
└------------------------------------------------------------------------------┘

┌---------------------------* MATRICE DE CONFUSION *---------------------------┐
├─ RÉALITÉ \ PRÉDICTION
|                      Non         Oui
|          Non          86           0
|          Oui         805           0
|
├─ Détail (classe positive = Oui):


### 🤖 Regression logistique

In [10]:
rl_pipeline = ImbPipeline([ 
    ('oversampling', SMOTE(random_state=42)),
    ('scaling', StandardScaler()),
    ('model', LogisticRegression(random_state=42)) 
])

rl =rl_pipeline.fit(X_train, y_train)
y_pred = rl.predict(X_test)
# Afficher le rapport
accuracy, precision, recall, f1, f2 = print_report(
    y_test, y_pred, target_names=["Non", "Oui"]
)

logisticRegression_score = {
    "Model": "LogisticRegression",
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "F2-score": f2,
}

2025/10/09 14:58:43 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'd94c96593c5e465f8785ed6224406813', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
  "inputs": [
    [
      0.0,
      55.0,
     .... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: Model does not have the "python_function" flavor
2025/10/09 14:58:44 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '02f547b6ffa7487e9c23a9fe3ddcef82', which will track hyperparameters, performance metrics, model artifacts, and lineage informat

🏃 View run bedecked-lynx-783 at: http://127.0.0.1:5010/#/experiments/3/runs/d94c96593c5e465f8785ed6224406813
🧪 View experiment at: http://127.0.0.1:5010/#/experiments/3




🏃 View run glamorous-sloth-551 at: http://127.0.0.1:5010/#/experiments/3/runs/02f547b6ffa7487e9c23a9fe3ddcef82
🧪 View experiment at: http://127.0.0.1:5010/#/experiments/3

┌--------------------------* PERFORMANCE DU MODÈLE *--------------------------┐
├─ Accuracy (Exactitude):       0.933 │ (TP+TN)/(TP+TN+FP+FN)
├─ Precision (Précision):       0.996 │ TP/(TP+FP) | Minimiser les faux positifs. 
├─ Recall (Sensibilité):        0.929 │ TP/(TP+FN) | Minimiser les faux négatif. 
├─ F1-score:                    0.961 │ 2*Precision*Recall/(Precision+Recall)
├─ F2-score:                    0.942 │ 5*Precision*Recall/(4*Precision+Recall) | Privilégie le rappel)
└------------------------------------------------------------------------------┘

┌---------------------------* MATRICE DE CONFUSION *---------------------------┐
├─ RÉALITÉ \ PRÉDICTION
|                      Non         Oui
|          Non          83           3
|          Oui          57         748
|
├─ Détail (classe positive = Oui)

### 🤖 Random Forest

In [11]:
rf_pipeline = ImbPipeline([ 
    ('oversampling', SMOTE(random_state=42)),
    ('model', RandomForestClassifier(random_state=42)) 
])

rf = rf_pipeline.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Afficher le rapport
accuracy, precision, recall, f1, f2 = print_report(
    y_test, y_pred, target_names=["Non", "Oui"]
)

randomForestClassifier_score = {
    "Model": "RandomForestClassifier",
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "F2-score": f2,
}

2025/10/09 14:58:46 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '23d32238194f43b58d1ca3a6946a8aa1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
  "inputs": [
    [
      0.0,
      55.0,
     .... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: Model does not have the "python_function" flavor
2025/10/09 14:58:47 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '06ad3df8429b4e98bda37236df80bd4a', which will track hyperparameters, performance metrics, model artifacts, and lineage informat

🏃 View run honorable-donkey-795 at: http://127.0.0.1:5010/#/experiments/3/runs/23d32238194f43b58d1ca3a6946a8aa1
🧪 View experiment at: http://127.0.0.1:5010/#/experiments/3




🏃 View run languid-squirrel-396 at: http://127.0.0.1:5010/#/experiments/3/runs/06ad3df8429b4e98bda37236df80bd4a
🧪 View experiment at: http://127.0.0.1:5010/#/experiments/3

┌--------------------------* PERFORMANCE DU MODÈLE *--------------------------┐
├─ Accuracy (Exactitude):       1.000 │ (TP+TN)/(TP+TN+FP+FN)
├─ Precision (Précision):       1.000 │ TP/(TP+FP) | Minimiser les faux positifs. 
├─ Recall (Sensibilité):        1.000 │ TP/(TP+FN) | Minimiser les faux négatif. 
├─ F1-score:                    1.000 │ 2*Precision*Recall/(Precision+Recall)
├─ F2-score:                    1.000 │ 5*Precision*Recall/(4*Precision+Recall) | Privilégie le rappel)
└------------------------------------------------------------------------------┘

┌---------------------------* MATRICE DE CONFUSION *---------------------------┐
├─ RÉALITÉ \ PRÉDICTION
|                      Non         Oui
|          Non          86           0
|          Oui           0         805
|
├─ Détail (classe positive = Oui

### 🤖 XGBoost

In [12]:
# Pipeline imblearn (supporte SMOTE)
xgb_pipeline = ImbPipeline([
    ('oversampling', SMOTE(random_state=42)),
    ('model', xgb.XGBClassifier(random_state=42))
])

# Pas besoin de StandardScaler avec XGBoost
xgb_model = xgb_pipeline.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

accuracy, precision, recall, f1, f2 = print_report(
    y_test, y_pred, target_names=["Non", "Oui"]
)

xgb_score = {
    "Model": "XGBClassifier",
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "F2-score": f2,
}


2025/10/09 14:58:49 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '37b76b2fb0d048ac86a5e1d0293274da', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
  "inputs": [
    [
      0.0,
      55.0,
     .... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: Model does not have the "python_function" flavor
2025/10/09 14:58:51 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2939dc0f672f4aecb69fb3d4ddc427e9', which will track hyperparameters, performance metrics, model artifacts, and lineage informat

🏃 View run adaptable-stork-170 at: http://127.0.0.1:5010/#/experiments/3/runs/37b76b2fb0d048ac86a5e1d0293274da
🧪 View experiment at: http://127.0.0.1:5010/#/experiments/3


2025/10/09 14:58:51 ERROR mlflow.xgboost: Failed to log feature importance plot. XGBoost autologging will ignore the failure and continue. Exception: 
Traceback (most recent call last):
  File "/Users/francoishellebuyck/Documents/projects/openclassrooms/project6/.venv/lib/python3.13/site-packages/mlflow/xgboost/__init__.py", line 806, in train_impl
    log_feature_importance_plot(features, importance, imp_type)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/francoishellebuyck/Documents/projects/openclassrooms/project6/.venv/lib/python3.13/site-packages/mlflow/xgboost/__init__.py", line 656, in log_feature_importance_plot
    mlflow.log_artifact(filepath)
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^
  File "/Users/francoishellebuyck/Documents/projects/openclassrooms/project6/.venv/lib/python3.13/site-packages/mlflow/tracking/fluent.py", line 1429, in log_artifact
    MlflowClient().log_artifact(run_id, local_path, artifact_path)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^

🏃 View run stately-vole-344 at: http://127.0.0.1:5010/#/experiments/3/runs/2939dc0f672f4aecb69fb3d4ddc427e9
🧪 View experiment at: http://127.0.0.1:5010/#/experiments/3

┌--------------------------* PERFORMANCE DU MODÈLE *--------------------------┐
├─ Accuracy (Exactitude):       1.000 │ (TP+TN)/(TP+TN+FP+FN)
├─ Precision (Précision):       1.000 │ TP/(TP+FP) | Minimiser les faux positifs. 
├─ Recall (Sensibilité):        1.000 │ TP/(TP+FN) | Minimiser les faux négatif. 
├─ F1-score:                    1.000 │ 2*Precision*Recall/(Precision+Recall)
├─ F2-score:                    1.000 │ 5*Precision*Recall/(4*Precision+Recall) | Privilégie le rappel)
└------------------------------------------------------------------------------┘

┌---------------------------* MATRICE DE CONFUSION *---------------------------┐
├─ RÉALITÉ \ PRÉDICTION
|                      Non         Oui
|          Non          86           0
|          Oui           0         805
|
├─ Détail (classe positive = Oui):
├

### 🤖 Lightgbm

In [13]:
import lightgbm as lgb
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Créer le pipeline avec SMOTE et LightGBM
lgb_pipeline = ImbPipeline([
    ('oversampling', SMOTE(random_state=42)),
    ('model', lgb.LGBMClassifier(
        random_state=42,
        verbose=-1  # Désactiver les warnings
    ))
])

# Entraîner le modèle
lgb_pipeline.fit(X_train, y_train)

# Prédire
y_pred = lgb_pipeline.predict(X_test)

# Évaluation
accuracy, precision, recall, f1, f2 = print_report(
    y_test, y_pred, target_names=["Non", "Oui"]
)

lgb_score = {
    "Model": "LightGBM",
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-score": f1,
    "F2-score": f2,
}


2025/10/09 14:58:53 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ba8f8204c89541ab9c7a61140922e379', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
  "inputs": [
    [
      0.0,
      55.0,
     .... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: Model does not have the "python_function" flavor
2025/10/09 14:58:54 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '277fe93cc69b4cf1b1decf2db9d0d319', which will track hyperparameters, performance metrics, model artifacts, and lineage informat

🏃 View run adorable-frog-360 at: http://127.0.0.1:5010/#/experiments/3/runs/ba8f8204c89541ab9c7a61140922e379
🧪 View experiment at: http://127.0.0.1:5010/#/experiments/3


2025/10/09 14:58:55 ERROR mlflow.lightgbm: Failed to log feature importance plot. LightGBM autologging will ignore the failure and continue. Exception: 
Traceback (most recent call last):
  File "/Users/francoishellebuyck/Documents/projects/openclassrooms/project6/.venv/lib/python3.13/site-packages/mlflow/lightgbm/__init__.py", line 851, in train_impl
    log_feature_importance_plot(features, importance, imp_type)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/francoishellebuyck/Documents/projects/openclassrooms/project6/.venv/lib/python3.13/site-packages/mlflow/lightgbm/__init__.py", line 732, in log_feature_importance_plot
    mlflow.log_artifact(filepath)
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^
  File "/Users/francoishellebuyck/Documents/projects/openclassrooms/project6/.venv/lib/python3.13/site-packages/mlflow/tracking/fluent.py", line 1429, in log_artifact
    MlflowClient().log_artifact(run_id, local_path, artifact_path)
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~^^

🏃 View run bemused-bass-11 at: http://127.0.0.1:5010/#/experiments/3/runs/277fe93cc69b4cf1b1decf2db9d0d319
🧪 View experiment at: http://127.0.0.1:5010/#/experiments/3

┌--------------------------* PERFORMANCE DU MODÈLE *--------------------------┐
├─ Accuracy (Exactitude):       1.000 │ (TP+TN)/(TP+TN+FP+FN)
├─ Precision (Précision):       1.000 │ TP/(TP+FP) | Minimiser les faux positifs. 
├─ Recall (Sensibilité):        1.000 │ TP/(TP+FN) | Minimiser les faux négatif. 
├─ F1-score:                    1.000 │ 2*Precision*Recall/(Precision+Recall)
├─ F2-score:                    1.000 │ 5*Precision*Recall/(4*Precision+Recall) | Privilégie le rappel)
└------------------------------------------------------------------------------┘

┌---------------------------* MATRICE DE CONFUSION *---------------------------┐
├─ RÉALITÉ \ PRÉDICTION
|                      Non         Oui
|          Non          86           0
|          Oui           0         805
|
├─ Détail (classe positive = Oui):
├─