In [34]:
# Importation de la base de données
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score

In [35]:
# ====== Importer le jeu de données ======
data = pd.read_excel(r"D:\Projects\IT\Data Science & IA\Prediction_des_Maladies_et_Proposition_de_Traitement\datasets\clinic_data.xlsx")
print("Jeu de données importé ✅✅")

Jeu de données importé ✅✅


In [36]:
# Séparer des features/Target
x = data.drop(columns=['Diagnostique','Traitement'])
y = data["Diagnostique"]

# Encoder du Target 
label = LabelEncoder()
y = label.fit_transform(y)

# === Diviser les données en entraînement et de test ===
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=42, test_size=0.2)

In [37]:
# === Détection automatique des types de données ===
num_col = data.select_dtypes(include=['int64','float64']).columns.tolist()
cat_col = data.select_dtypes(include=["object"]).columns.difference(['Traitement','Diagnostique']).tolist()

In [38]:
# ====== Préprocesseurs ======
num_transformer =Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('oneencoder', OneHotEncoder(sparse_output= False, handle_unknown='ignore'))
])

# ====== ColumnTransformer général =======
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_col),
        ('cat',cat_transformer,cat_col),
    ]
)

In [39]:
# ======= Modèles à entraîner ======
models = {
    'logistic': LogisticRegression(max_iter=1000, solver='liblinear',class_weight="balanced"),
    'random_forest': RandomForestClassifier(class_weight="balanced"),
    'svm': SVC(class_weight="balanced"),
    'xgboost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

# Grilles d'hyperparamètres
param_dist = {
    "logistic": {
        "classifier__C": [0.1, 1, 10],
        "classifier__penalty": ["l1", "l2"]
    },
    "random_forest": {
        "classifier__n_estimators": [100, 200],
        "classifier__max_depth": [10, 20, None]
    },
    "svm": {
        "classifier__C": [0.1, 1, 10],
        "classifier__kernel": ["linear", "rbf"]
    },
    "xgboost": {
        "classifier__learning_rate": [0.01, 0.05, 0.1, 0.2],
        "classifier__n_estimators": [100, 200, 300],
        "classifier__max_depth": [3, 6, 10],
        "classifier__subsample": [0.7, 0.8, 0.9],
        "classifier__colsample_bytree": [0.7, 0.8, 1],
        "classifier__gamma": [0, 0.1, 0.2]
    }
}

# ====== Entraînement avec RandomizedSearchCV ======
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_models = {}

for name, model in models.items():
    print(f"\n🔍 Entraînement du modèle : {name}")
    
    pipe = Pipeline([
        ('preprocessing', preprocessor),
        ('classifier', model)
    ])
    
    random_search = RandomizedSearchCV(
        pipe,
        param_distributions=param_dist[name],
        n_iter=20,  # tu peux ajuster selon ton temps de calcul
        cv=cv,
        n_jobs=-1,
        scoring='accuracy',
        random_state=42
    )
    
    random_search.fit(x_train, y_train)
    best_model = random_search.best_estimator_
    best_models[name] = best_model
    
    # ====== Évaluation ======
    y_pred = best_model.predict(x_test)
    
    acc = accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    cm = confusion_matrix(y_test, y_pred)
    
    # ====== Afficher les résultats ======
    print(f"✅ {name.upper()} - Accuracy : {acc:.3f} | Recall : {rec:.3f} | F1-score : {f1:.3f}")
    print(f"Meilleurs hyperparamètres : {random_search.best_params_}")
    print(f"Confusion Matrix : \n{cm}")



🔍 Entraînement du modèle : logistic




✅ LOGISTIC - Accuracy : 0.195 | Recall : 0.188 | F1-score : 0.121
Meilleurs hyperparamètres : {'classifier__penalty': 'l2', 'classifier__C': 0.1}
Confusion Matrix : 
[[29  0  1  5  7]
 [30  0  0  5  7]
 [16  0  0  8 12]
 [30  0  0  4  3]
 [25  0  0 12  6]]

🔍 Entraînement du modèle : random_forest




✅ RANDOM_FOREST - Accuracy : 0.180 | Recall : 0.180 | F1-score : 0.173
Meilleurs hyperparamètres : {'classifier__n_estimators': 200, 'classifier__max_depth': 20}
Confusion Matrix : 
[[14  3  4  8 13]
 [15  4  2 12  9]
 [ 7  1  5  9 14]
 [13  4  8  8  4]
 [23  3  4  8  5]]

🔍 Entraînement du modèle : svm




✅ SVM - Accuracy : 0.185 | Recall : 0.182 | F1-score : 0.138
Meilleurs hyperparamètres : {'classifier__kernel': 'linear', 'classifier__C': 1}
Confusion Matrix : 
[[21  1  0 15  5]
 [21  2  0 11  8]
 [15  0  0 10 11]
 [21  1  0 10  5]
 [20  1  0 18  4]]

🔍 Entraînement du modèle : xgboost


Parameters: { "use_label_encoder" } are not used.



✅ XGBOOST - Accuracy : 0.190 | Recall : 0.189 | F1-score : 0.179
Meilleurs hyperparamètres : {'classifier__subsample': 0.7, 'classifier__n_estimators': 300, 'classifier__max_depth': 3, 'classifier__learning_rate': 0.2, 'classifier__gamma': 0, 'classifier__colsample_bytree': 0.8}
Confusion Matrix : 
[[15  2 10  8  7]
 [13  3  5 11 10]
 [ 8  2  5  8 13]
 [11  3  9  8  6]
 [10  6  7 13  7]]


In [41]:
import mlflow
import mlflow.sklearn

# Entraînement du modèle Logistic Regression
with mlflow.start_run():
    model = LogisticRegression(max_iter=1000, C=0.1, penalty='l2')
    model.fit(x_train, y_train)
    
    # Prédictions et évaluation
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = f1_score(y_test, y_pred)
    
    # Log des performances
    mlflow.log_param("penalty", 'l2')
    mlflow.log_param("C", 0.1)
    mlflow.log_metric("accuracy", accuracy)
    
    print(report)
    
    # Enregistrement du modèle dans le Model Registry
    mlflow.sklearn.log_model(model, "logistic_regression_model")




ValueError: could not convert string to float: 'Absent'