# Préprocessing (CSV)

Ce notebook effectue :
- un nettoyage basique (doublons, valeurs manquantes)
- la séparation features (`X`) / cible (`y`)
- un découpage train / execution / test
- un preprocessing avec scikit-learn :
  - imputation
  - standardisation **ou** normalisation min-max
  - encodage One-Hot des variables catégorielles

Exécute les cellules dans l’ordre. Modifie `TARGET` dans la section **Paramètres**.

In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

In [2]:
# Parameters (the "Paramètres" section mentioned in the introduction)
# CSV_PATH is the file read by pd.read_csv; TARGET is the column used as the target `y`.
CSV_PATH = "data.csv"
TARGET = "Job Title"

# Scaling choice: "standard" (standardization), "minmax" (min-max normalization), "none" (no scaling)
SCALE = "standard"

# Imputation strategies
IMPUTE_NUM = "median"           # mean | median | most_frequent | constant
IMPUTE_CAT = "most_frequent"    # most_frequent | constant

# Train / execution / test split
TEST_SIZE = 0.20
EXEC_SIZE = 0.20  # fraction of the "remainder" (what is left once the test set has been removed)
RANDOM_STATE = 42

In [3]:
# Load the CSV and print a quick diagnostic

df_raw = pd.read_csv(CSV_PATH)
print("Colonnes:", list(df_raw.columns))
print("Shape brut:", df_raw.shape)

# Duplicates (fully identical rows)
print("Doublons:", int(df_raw.duplicated().sum()))

# Missing values per column, largest counts first; only columns with at least one NA are shown
na_counts = df_raw.isna().sum().sort_values(ascending=False)
print("\nTop NA (si >0):")
print(na_counts[na_counts > 0].head(20))

df_raw.head()

Colonnes: ['Index', 'User Id', 'First Name', 'Last Name', 'Sex', 'Email', 'Phone', 'Date of birth', 'Job Title']
Shape brut: (1000, 9)
Doublons: 0

Top NA (si >0):
Series([], dtype: int64)


Unnamed: 0,Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title
0,1,8717bbf45cCDbEe,Shelia,Mahoney,Male,pwarner@example.org,857.139.8239,2014-01-27,Probation officer
1,2,3d5AD30A4cD38ed,Jo,Rivers,Female,fergusonkatherine@example.net,+1-950-759-8687,1931-07-26,Dancer
2,3,810Ce0F276Badec,Sheryl,Lowery,Female,fhoward@example.org,(599)782-0605,2013-11-25,Copy
3,4,BF2a889C00f0cE1,Whitney,Hooper,Male,zjohnston@example.com,+1-939-130-6258,2012-11-17,Counselling psychologist
4,5,9afFEafAe1CBBB9,Lindsey,Rice,Female,elin@example.net,(390)417-1635x3010,1923-04-15,Biomedical engineer


In [4]:
def basic_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Empty or whitespace-only strings are replaced by ``pd.NA``.
      2. Exact duplicate rows are dropped.
      3. Columns named "Unnamed: 0" or "Index" are dropped when they are
         numeric and hold unique values (i.e. they look like a row index).
      4. Duplicates are dropped a second time when step 3 removed a column:
         rows that differed only by such an index column could not be
         detected in step 2.

    Args:
        df: Raw input frame.

    Returns:
        A new DataFrame. The original row labels are kept (no index reset).
    """
    cleaned = df.copy()

    # Turn "blank" cells ("", "   ") into NA
    cleaned = cleaned.replace(r"^\s*$", pd.NA, regex=True)

    # Exact duplicates. Done before the uniqueness test below so that a file
    # containing repeated rows can still have its index column recognised.
    cleaned = cleaned.drop_duplicates()

    # Typical index columns: only dropped when numeric AND unique
    index_like = [
        col
        for col in ("Unnamed: 0", "Index")
        if col in cleaned.columns
        and cleaned[col].is_unique
        and pd.api.types.is_numeric_dtype(cleaned[col])
    ]
    if index_like:
        # Without the index column, previously distinct rows may now be identical
        cleaned = cleaned.drop(columns=index_like).drop_duplicates()

    return cleaned


# Clean the raw frame, then report the resulting shape and the remaining duplicate count
df = basic_clean(df_raw)
print("Shape après clean:", df.shape)
print("Doublons après clean:", int(df.duplicated().sum()))

df.head()

Shape après clean: (1000, 8)
Doublons après clean: 0


Unnamed: 0,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title
0,8717bbf45cCDbEe,Shelia,Mahoney,Male,pwarner@example.org,857.139.8239,2014-01-27,Probation officer
1,3d5AD30A4cD38ed,Jo,Rivers,Female,fergusonkatherine@example.net,+1-950-759-8687,1931-07-26,Dancer
2,810Ce0F276Badec,Sheryl,Lowery,Female,fhoward@example.org,(599)782-0605,2013-11-25,Copy
3,BF2a889C00f0cE1,Whitney,Hooper,Male,zjohnston@example.com,+1-939-130-6258,2012-11-17,Counselling psychologist
4,9afFEafAe1CBBB9,Lindsey,Rice,Female,elin@example.net,(390)417-1635x3010,1923-04-15,Biomedical engineer


In [5]:
def split_features_target(df: pd.DataFrame, target: str) -> tuple[pd.DataFrame, pd.Series]:
    """Separate a frame into its feature columns and its target column.

    Args:
        df: Frame holding both the features and the target.
        target: Name of the target column.

    Returns:
        ``(X, y)`` where ``X`` is ``df`` without the ``target`` column and
        ``y`` is ``df[target]``.

    Raises:
        ValueError: If ``target`` is not one of the columns of ``df``.
    """
    if target in df.columns:
        features = df.drop(columns=[target])
        labels = df[target]
        return features, labels

    available = list(df.columns)
    raise ValueError(f"TARGET='{target}' introuvable. Colonnes: {available}")


def infer_column_types(X: pd.DataFrame) -> tuple[list[str], list[str]]:
    """Partition the columns of ``X`` into numeric and categorical names.

    Columns whose dtype is numeric or boolean are reported as numeric; every
    other column is reported as categorical. Both lists follow the column
    order of ``X``.

    Returns:
        ``(numeric_cols, categorical_cols)``.
    """
    numeric_frame = X.select_dtypes(include=["number", "bool"])
    numeric_cols = list(numeric_frame.columns)

    categorical_cols = []
    for name in X.columns:
        if name in numeric_cols:
            continue
        categorical_cols.append(name)

    return numeric_cols, categorical_cols


# Separate features / target, then list which feature columns are numeric vs. categorical
X, y = split_features_target(df, TARGET)
num_cols, cat_cols = infer_column_types(X)

print("X:", X.shape)
print("y:", y.shape)
print("Num cols:", num_cols)
print("Cat cols:", cat_cols)

X: (1000, 7)
y: (1000,)
Num cols: []
Cat cols: ['User Id', 'First Name', 'Last Name', 'Sex', 'Email', 'Phone', 'Date of birth']


## Split: train / execution / test

On découpe en 3 ensembles:
- **train**: pour *fit* (apprendre) l’imputation, le scaling, et le one-hot
- **execution** (validation): pour tester/ajuster sans toucher au test final
- **test**: uniquement pour la mesure finale (pas de fit dessus)

In [6]:
# Three-way split: train / execution / test

# Both fractions must lie strictly between 0 and 1
if not (0 < TEST_SIZE < 1):
    raise ValueError("TEST_SIZE doit être entre 0 et 1")
if not (0 < EXEC_SIZE < 1):
    raise ValueError("EXEC_SIZE doit être entre 0 et 1")

# 1) Hold out the test set
X_tmp, X_test, y_tmp, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# 2) Split the remainder into train + execution
#    (EXEC_SIZE is a fraction of X_tmp, not of the full data set)
X_train, X_exec, y_train, y_exec = train_test_split(
    X_tmp,
    y_tmp,
    test_size=EXEC_SIZE,
    random_state=RANDOM_STATE,
)

print("train:", X_train.shape, y_train.shape)
print("execution:", X_exec.shape, y_exec.shape)
print("test:", X_test.shape, y_test.shape)

train: (640, 7) (640,)
execution: (160, 7) (160,)
test: (200, 7) (200,)


In [7]:
def build_preprocessor(
    X: pd.DataFrame,
    *,
    scale: str,
    impute_num: str,
    impute_cat: str,
) -> ColumnTransformer:
    """Build an (unfitted) ColumnTransformer for the columns of ``X``.

    Columns are assigned to the numeric or categorical branch by
    ``infer_column_types(X)``:
      - numeric branch: ``SimpleImputer(strategy=impute_num)`` followed by an
        optional scaler (``StandardScaler`` or ``MinMaxScaler``);
      - categorical branch: ``SimpleImputer(strategy=impute_cat)`` followed by
        ``OneHotEncoder(handle_unknown="ignore")``.

    Args:
        X: Frame used only to decide which columns go to which branch;
            nothing is fitted here.
        scale: "standard", "minmax" or "none".
        impute_num: "mean", "median", "most_frequent" or "constant".
        impute_cat: "most_frequent" or "constant". With "constant", missing
            categories are filled with the string "missing".

    Returns:
        A ColumnTransformer that still has to be fitted by the caller.

    Raises:
        ValueError: If ``scale``, ``impute_num`` or ``impute_cat`` is not one
            of the values listed above.
    """
    if scale not in {"standard", "minmax", "none"}:
        raise ValueError("SCALE doit être: 'standard', 'minmax' ou 'none'")
    # Fail fast on the imputation strategies as well, instead of waiting for
    # scikit-learn to reject them when the transformer is fitted.
    if impute_num not in {"mean", "median", "most_frequent", "constant"}:
        raise ValueError(
            "IMPUTE_NUM doit être: 'mean', 'median', 'most_frequent' ou 'constant'"
        )
    if impute_cat not in {"most_frequent", "constant"}:
        raise ValueError("IMPUTE_CAT doit être: 'most_frequent' ou 'constant'")

    numeric_cols, categorical_cols = infer_column_types(X)

    # Numeric: imputation + (optional) scaling
    num_steps = [("imputer", SimpleImputer(strategy=impute_num))]
    if scale == "standard":
        num_steps.append(("scaler", StandardScaler()))
    elif scale == "minmax":
        num_steps.append(("scaler", MinMaxScaler()))

    numeric_pipe = Pipeline(steps=num_steps)

    # Categorical: imputation + one-hot.
    # An explicit fill value is only passed for the "constant" strategy.
    cat_imputer_kwargs = {"fill_value": "missing"} if impute_cat == "constant" else {}
    categorical_pipe = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy=impute_cat, **cat_imputer_kwargs)),
            ("onehot", OneHotEncoder(handle_unknown="ignore")),
        ]
    )

    return ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, numeric_cols),
            ("cat", categorical_pipe, categorical_cols),
        ],
        remainder="drop",
    )


# 1) TRAIN: fit the preprocessing ONLY on the training set (avoids data leakage)
preprocessor = build_preprocessor(
    X_train, scale=SCALE, impute_num=IMPUTE_NUM, impute_cat=IMPUTE_CAT
)
X_train_prepared = preprocessor.fit_transform(X_train)


# 2) EXECUTION + TEST: transform only, reusing what was fitted on train
X_exec_prepared = preprocessor.transform(X_exec)
X_test_prepared = preprocessor.transform(X_test)


print("X train brut:", X_train.shape, "->", X_train_prepared.shape)
print("X exec brut:", X_exec.shape, "->", X_exec_prepared.shape)
print("X test brut:", X_test.shape, "->", X_test_prepared.shape)


# Optional: feature names after one-hot encoding (if scikit-learn supports it).
# Still best effort, but the reason for a failure is reported instead of
# being silently swallowed.
try:
    feature_names = preprocessor.get_feature_names_out()
except Exception as exc:
    print("Noms des features indisponibles:", repr(exc))
else:
    print("Nb features (noms):", len(feature_names))

X train brut: (640, 7) -> (640, 3457)
X exec brut: (160, 7) -> (160, 3457)
X test brut: (200, 7) -> (200, 3457)
Nb features (noms): 3457


## Variantes de preprocessing

Modifie `SCALE` dans la cellule **Paramètres** (en haut du notebook), ré-exécute cette cellule, puis relance les cellules de preprocessing.

- `SCALE = "standard"` → Standardisation
- `SCALE = "minmax"` → Normalisation min-max
- `SCALE = "none"` → Pas de scaling

L’imputation est contrôlée par `IMPUTE_NUM` et `IMPUTE_CAT`.