In [1]:
# Import
import pandas as pd
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
# Path checks: make sure the output folder exists, then report the working
# directory and whether a saved preprocessor file is already present.
os.makedirs("data/processed", exist_ok=True)
print(
    os.getcwd(),
    os.path.exists("data/processed/preprocessor.pkl"),
    sep="\n",
)

c:\Users\Emma\Documents\Certifications\Projet 1 Doctolib\Mini-projet (entrainement)\Grand projet Hemato
True


## Chargement du dataset

In [3]:
# Load the cleaned dataset from the CSV sitting in the working directory.
df = pd.read_csv(filepath_or_buffer="df_clean.csv")

In [4]:
# Encode the target variable: "Dead" -> 1, "Alive" -> 0.
# Rows whose status is missing or not one of these two labels are dropped.
target_col = "Patient's Vital Status"
status_codes = {"Dead": 1, "Alive": 0}

# Guard against re-running this cell: mapping an already-encoded column would
# turn every 0/1 into NaN, and the dropna below would then empty the frame.
already_encoded = (
    df[target_col].dropna().isin(list(status_codes.values())).all()
)
if not already_encoded:
    df[target_col] = df[target_col].map(status_codes)

df = df.dropna(subset=[target_col])

In [5]:
# Stratified split: separate the target from the features, then hold out 20%
# of the rows while keeping the class ratio of y in both partitions.
y = df["Patient's Vital Status"]
X = df.drop(columns=["Patient's Vital Status"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Cast boolean features to str so the one-hot encoder receives them as labels.
# The cast is done with a single astype per frame and the result is rebound,
# instead of writing column by column into the frames returned by
# train_test_split (which can raise SettingWithCopyWarning on older pandas).
bool_cols = X_train.select_dtypes(include="bool").columns
bool_as_str = {col: str for col in bool_cols}

X_train = X_train.astype(bool_as_str)
X_test = X_test.astype(bool_as_str)

## Preprocessing

In [7]:
# Column groups are inferred from the dtypes of the training frame:
# everything numeric on one side, everything else on the other.
cat_cols = list(X_train.select_dtypes(exclude="number").columns)
num_cols = list(X_train.select_dtypes(include="number").columns)

# Numeric branch: fill gaps with the median, then standardize.
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent label, then one-hot
# encode into a dense array; categories unseen at fit time are ignored.
categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols),
])

## Définition des modèles

In [8]:
# Candidate classifiers, keyed by the display name used in the results table.
# All of them use random_state=42 and n_jobs=-1.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=300,
        class_weight="balanced_subsample",
        n_jobs=-1,
        random_state=42
    ),
    # `use_label_encoder=False` was removed: the installed XGBoost does not
    # accept it and only emits
    # 'Parameters: { "use_label_encoder" } are not used.' during training.
    "XGBoost": XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        eval_metric="logloss",
        n_jobs=-1,
        random_state=42
    )
}

## Entrainement et évaluation

In [9]:
# One record per model: its name plus the four test-set metrics.
results = []

for name, model in models.items():
    # Chain the shared preprocessor with the current estimator and train it
    # on the training partition.
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)

    # Hard labels and positive-class probabilities on the held-out partition.
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1]

    # AUC is computed from probabilities, the other metrics from hard labels.
    auc = roc_auc_score(y_test, y_proba)
    precision, recall, f1 = (
        scorer(y_test, y_pred)
        for scorer in (precision_score, recall_score, f1_score)
    )

    record = {"Model": name}
    record["AUC"] = auc
    record["Precision"] = precision
    record["Recall"] = recall
    record["F1"] = f1
    results.append(record)

    print(f"{name} - AUC: {auc:.3f}, Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")



Logistic Regression - AUC: 0.843, Precision: 0.850, Recall: 0.680, F1: 0.756
Random Forest - AUC: 0.782, Precision: 0.710, Recall: 0.880, F1: 0.786


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost - AUC: 0.828, Precision: 0.759, Recall: 0.880, F1: 0.815


## Récapitulatif des modèles

In [10]:
# Rank the models by test-set AUC, best first, and show the table.
results_df = pd.DataFrame(results).sort_values(by="AUC", ascending=False)
print("\nClassement des modèles :")
display(results_df)

# %%
# Persist the best model: rebuild the pipeline for the top-ranked estimator,
# fit it again on the training partition and dump it to disk.
# NOTE(review): the winner is picked on the same test set used for reporting,
# so the reported score of the saved model is optimistic; confirm this is
# acceptable for the project.
best_model_name = results_df["Model"].iloc[0]
best_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", models[best_model_name]),
]).fit(X_train, y_train)
joblib.dump(best_pipeline, "data/processed/best_model.pkl")

print(f"Meilleur modèle sauvegardé : {best_model_name}")


Classement des modèles :


Unnamed: 0,Model,AUC,Precision,Recall,F1
0,Logistic Regression,0.843077,0.85,0.68,0.755556
2,XGBoost,0.827692,0.758621,0.88,0.814815
1,Random Forest,0.781538,0.709677,0.88,0.785714


Meilleur modèle sauvegardé : Logistic Regression


