In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib
import os



In [2]:
# Make sure the output directory for saved artifacts exists
# (exist_ok=True: no error when it is already there).
os.makedirs(name="data/processed", exist_ok=True)

In [3]:
# Load the cleaned dataset.
df = pd.read_csv("df_clean.csv")

In [4]:
# Encode the target variable: "Dead" -> 1, "Alive" -> 0.
# The mapping is skipped when the column is already numeric, which makes this
# cell safe to re-run: Series.map sends every value that is not a key of the
# dict to NaN, so mapping the already-encoded 0/1 values a second time would
# turn the whole column into NaN and the dropna below would empty `df`.
if not pd.api.types.is_numeric_dtype(df["Patient's Vital Status"]):
    df["Patient's Vital Status"] = df["Patient's Vital Status"].map({"Dead":1, "Alive":0})
# Rows whose label is missing (or was neither "Dead" nor "Alive", and hence
# became NaN in the mapping) are removed.
df = df.dropna(subset=["Patient's Vital Status"])

In [5]:
# Stratified split: 20 % of the rows go to the test set, stratified on the
# target, with a fixed seed for reproducibility.
y = df["Patient's Vital Status"]
X = df.drop(columns=[y.name])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [6]:
# Boolean-to-string conversion and preprocessing pipeline
def bool_to_str(X):
    """Cast every value of ``X`` to ``str`` while keeping missing values missing.

    Used as the first step of the categorical pipeline so that boolean and
    text columns share a single string representation before encoding.

    A plain ``X.astype(str)`` can turn missing entries into the literal
    strings ``"nan"`` / ``"None"``. The "most_frequent" imputer that runs next
    would then find nothing to impute, and the one-hot encoder would learn a
    spurious ``"nan"`` category. Missing entries are therefore restored as NaN
    after the cast.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        Columns to convert.

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame of the same shape: a string wherever a value was
        present, NaN wherever it was missing.
    """
    frame = pd.DataFrame(X)
    # Remember where values are missing *before* the cast destroys that info.
    missing = frame.isna()
    # Object dtype lets NaN sit next to the strings.
    return frame.astype(str).astype(object).mask(missing, float("nan"))

# Column groups are derived from the training frame only.
# "number" covers every numeric dtype (int32, float32, ...), not just
# int64/float64: ColumnTransformer drops any column that belongs to neither
# group (its default is remainder="drop"), so a narrower filter could silently
# lose features.
num_cols = X_train.select_dtypes(include="number").columns
# "string" is listed next to "object" so that text columns keep being selected
# under pandas 3, where strings get a dedicated dtype instead of object.
cat_cols = X_train.select_dtypes(include=["object", "string", "bool", "category"]).columns

# Numeric features: median imputation, then standardisation.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical features: cast to string, impute with the most frequent value,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time (handle_unknown="ignore") instead of raising.
categorical_pipeline = Pipeline([
    ("bool_to_str", FunctionTransformer(bool_to_str, validate=False)),
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Numeric block first, categorical block second: the output columns follow
# this order.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols)
])

# Fit on the training set only, to avoid data leakage
preprocessor.fit(X_train)

# Apply the fitted transformation to both splits
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  cat_cols = X_train.select_dtypes(include=["object", "bool", "category"]).columns


In [7]:
# Persist the fitted preprocessor to disk.
# NOTE(review): the pipeline wraps the notebook-level function bool_to_str in
# a FunctionTransformer; pickle-based serialisation stores functions by
# reference, so bool_to_str presumably has to be defined/importable wherever
# this file is loaded — confirm before using it outside this notebook.
joblib.dump(value=preprocessor, filename="data/processed/preprocessor.pkl")

['data/processed/preprocessor.pkl']

In [8]:
# Collect the names of the transformed features (used for SHAP).
# Numeric names come first, then the one-hot encoded categorical names.
cat_pipeline_fitted = preprocessor.named_transformers_["cat"]
onehot_fitted = cat_pipeline_fitted.named_steps["onehot"]
cat_feature_names = onehot_fitted.get_feature_names_out(cat_cols)
num_feature_names = num_cols.tolist()
feature_names = [*num_feature_names, *cat_feature_names]

### Prévention du data leakage
Le preprocessing est ajusté uniquement sur le jeu d’entraînement.
Le jeu de test n’est jamais utilisé lors du fit afin d’éviter toute fuite d’information.
