Le preprocessing a été externalisé dans un module Python dédié (src/preprocessing.py) afin de garantir la réutilisabilité du code, la lisibilité du projet et la prévention du data leakage. Ce module est importé dans la phase de modélisation.

In [None]:
%%writefile src/preprocessing.py


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


def load_data(url, sep="\t"):
    """Load a delimited text file into a pandas DataFrame.

    Args:
        url: Anything ``pandas.read_csv`` accepts as its first argument
            (URL, local path or file-like object).
        sep: Field delimiter. Defaults to a tab, which is what this function
            always used before the parameter existed.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(url, sep=sep)


def build_preprocessor(num_features, cat_features):
    """Assemble the unfitted column-wise preprocessing transformer.

    Numeric columns go through median imputation followed by standard
    scaling. Categorical columns go through most-frequent imputation followed
    by one-hot encoding configured with ``handle_unknown="ignore"`` and dense
    output (``sparse_output=False``).

    Args:
        num_features: Column selection handed to the "num" branch.
        cat_features: Column selection handed to the "cat" branch.

    Returns:
        An unfitted ``ColumnTransformer`` with a "num" and a "cat" branch.
    """
    median_imputer = SimpleImputer(strategy="median")
    mode_imputer = SimpleImputer(strategy="most_frequent")
    one_hot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

    num_branch = Pipeline([
        ("imputer", median_imputer),
        ("scaler", StandardScaler()),
    ])
    cat_branch = Pipeline([
        ("imputer", mode_imputer),
        ("encoder", one_hot),
    ])

    return ColumnTransformer([
        ("num", num_branch, num_features),
        ("cat", cat_branch, cat_features),
    ])


def prepare_train_test(
    df,
    target_col,
    num_features,
    cat_features,
    test_size=0.2,
    random_state=42
):
    """Split ``df`` into train/test sets and preprocess both without leakage.

    The split is stratified on the target column. The preprocessor is fitted
    on the training features only and then applied, already fitted, to the
    test features, so no test-set statistics influence the transformation.

    Args:
        df: DataFrame containing the feature columns and ``target_col``.
        target_col: Name of the target column.
        num_features: Numeric column name(s); a single name or any iterable
            of names (list, tuple, pandas Index, ...).
        cat_features: Categorical column name(s); same accepted forms as
            ``num_features``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        A tuple ``(X_train_processed, X_test_processed, y_train, y_test,
        preprocessor)`` where ``preprocessor`` is the fitted transformer.
    """
    # Normalise both selections to plain lists before concatenating them:
    # ``+`` raises for tuple + list, is element-wise on NumPy arrays and
    # pandas Index objects, and glues two bare strings into one bogus name.
    if isinstance(num_features, str):
        num_features = [num_features]
    else:
        num_features = list(num_features)
    if isinstance(cat_features, str):
        cat_features = [cat_features]
    else:
        cat_features = list(cat_features)

    X = df[num_features + cat_features]
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state
    )

    preprocessor = build_preprocessor(num_features, cat_features)

    # Fit on the training split only; the test split is merely transformed.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    return X_train_processed, X_test_processed, y_train, y_test, preprocessor
