In [14]:
import pandas as pd
import joblib
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset from the working directory.
df = pd.read_csv("diabetes.csv")

# The "Outcome" column is the target; every remaining column is a feature.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# In these columns a zero is treated as an invalid (missing) reading, so it is
# turned into NaN here and left for the pipeline's imputer to fill in.
invalid_zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
X[invalid_zero_cols] = X[invalid_zero_cols].replace(to_replace=0, value=np.nan)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
# NOTE(review): the split is not stratified on y — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing and classifier are chained in a single pipeline so that the
# imputer and scaler are fitted on the training rows only, and so that the
# saved artifact carries its own preprocessing.
# NOTE(review): the scaler is probably unnecessary in front of a random
# forest — kept because removing it would change the saved pipeline.
forest = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42,
)
pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("model", forest),
    ]
)

# Fit every step on the training split.
pipeline.fit(X_train, y_train)

# Score the held-out split: overall accuracy plus the per-class report.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy :", accuracy)
print(classification_report(y_test, y_pred))

# Persist the whole fitted pipeline (preprocessing + model) to disk.
joblib.dump(pipeline, "diabetes_model.pkl")
print("✅ Modèle sauvegardé")


Accuracy : 0.7532467532467533
              precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154

✅ Modèle sauvegardé
