# Pipeline de prétraitement

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib


# All locations are derived from the project root, taken to be the parent of
# the kernel's current working directory, so no absolute path is hardcoded.
PROJECT_DIR = Path.cwd().parent.resolve()

# Input: the CSV file read by the loading cell.
DATA_DIR = PROJECT_DIR.joinpath("data")
DATA_PATH = DATA_DIR.joinpath("03_DONNEES.csv")

# Output: where the serialized preprocessing pipeline is written.
OUTPUT_DIR = PROJECT_DIR.joinpath("output")
PREPROCESSOR_PATH = OUTPUT_DIR.joinpath("preprocessor.pkl")

## Chargement des données

In [2]:
# Read the full table from the CSV configured in DATA_PATH.
df = pd.read_csv(DATA_PATH)

# Target: the "Churn" column, copied so it is independent of `df`.
y = df["Churn"].copy()

# Features: every column except "customerID" and the target itself.
X = df.drop(columns=["customerID", "Churn"])

## Création de la pipeline de prétraitement

In [3]:
# Split the feature columns by dtype: numeric columns go through the numeric
# branch of the preprocessor, object/str columns through the categorical one.
num_features = X.select_dtypes(include=np.number).columns.tolist()
cat_features = X.select_dtypes(include=["object", "str"]).columns.tolist()

# Show both lists so the split can be checked by eye.
for label, features in (
    ("Features numériques : ", num_features),
    ("Features catégorielles : ", cat_features),
):
    print(label, features)

Features numériques :  ['SeniorCitizen', 'tenure', 'InternetCharges', 'MonthlyCharges', 'TotalCharges']
Features catégorielles :  ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract']


In [4]:
# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode into a dense array. handle_unknown="ignore" keeps
# transform from raising on a category that was not present at fit time.
cat_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
    ]
)

# Numeric branch: replace missing values with the column median, then
# standardize with StandardScaler.
num_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Route each column list to its branch. The order (categorical first, numeric
# second) determines the column order of the transformed output.
preprocessor = ColumnTransformer(
    [
        ("cat", cat_pipeline, cat_features),
        ("num", num_pipeline, num_features),
    ]
)

## Sauvegarde de la pipeline de prétraitement

In [5]:
# Make sure the destination folder exists first: joblib.dump does not create
# missing parent directories, so a fresh checkout without it would fail here.
PREPROCESSOR_PATH.parent.mkdir(parents=True, exist_ok=True)

# Serialize the preprocessing pipeline to disk.
# NOTE(review): no cell shown in this notebook calls fit, so the object is
# presumably saved unfitted and must be fitted after loading — confirm.
joblib.dump(preprocessor, PREPROCESSOR_PATH.as_posix())

# The label names what is actually saved (the preprocessor, not a model).
print("Preprocessor pipeline path: ", PREPROCESSOR_PATH.as_posix())

RandomForestRegressor Pipeline path:  C:/Users/Administrateur/Documents/DESSAUX_Damien_ECF3/output/preprocessor.pkl
