In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# ========= 1. Chargement données =========
df = pd.read_csv("income_cleaned.csv")

# ========= 2. Cible = income =========
TARGET_COL = "income"

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# ========= 3. Séparer num / cat =========
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

print("Numériques :", numeric_cols)
print("Catégorielles :", cat_cols)

# ========= 4. Prétraitement =========
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# ========= 5. Modèle =========
model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

# ========= 6. Pipeline complet =========
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", model)
])

# ========= 7. Split train/test =========
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ========= 8. Entraînement =========
print("\nEntraînement...")
clf.fit(X_train, y_train)

# ========= 9. Évaluation =========
y_pred = clf.predict(X_test)

print("\nAccuracy :", accuracy_score(y_test, y_pred))
print("\nClassification report :\n")
print(classification_report(y_test, y_pred))

# ========= 10. Exemple prédiction =========
print("\nExemple de prédictions :")
sample = X_test.iloc[:3]
print(sample)
print("\nPrédictions :", clf.predict(sample))


Numériques : ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Catégorielles : ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

Entraînement...

Accuracy : 0.8435599778883361

Classification report :

              precision    recall  f1-score   support

           0       0.88      0.92      0.90      6803
           1       0.71      0.62      0.66      2242

    accuracy                           0.84      9045
   macro avg       0.80      0.77      0.78      9045
weighted avg       0.84      0.84      0.84      9045


Exemple de prédictions :
       age  workclass     education  education-num      marital-status  \
36919   27    Private  Some-college             10       Never-married   
17947   43  Local-gov   Prof-school             15  Married-civ-spouse   
3173    49  Local-gov     Bachelors             13            Divorced   

           occupation relationship   race     sex  capital-gain 

test avec la donnée sans le travail des features supplémentaires

In [6]:
# ========= 1. Chargement données =========
df = pd.read_csv("income_boosted.csv")

# ========= 2. Cible = income =========
TARGET_COL = "income"

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# ========= 3. Séparer num / cat =========
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

print("Numériques :", numeric_cols)
print("Catégorielles :", cat_cols)

# ========= 4. Prétraitement =========
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# ========= 5. Modèle =========
model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

# ========= 6. Pipeline complet =========
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", model)
])

# ========= 7. Split train/test =========
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ========= 8. Entraînement =========
print("\nEntraînement...")
clf.fit(X_train, y_train)

# ========= 9. Évaluation =========
y_pred = clf.predict(X_test)

print("\nAccuracy :", accuracy_score(y_test, y_pred))
print("\nClassification report :\n")
print(classification_report(y_test, y_pred))

# ========= 10. Exemple prédiction =========
print("\nExemple de prédictions :")
sample = X_test.iloc[:3]
print(sample)
print("\nPrédictions :", clf.predict(sample))


Numériques : ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'estimation_carriere_age', 'is_married', 'maried_education_experience']
Catégorielles : ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

Entraînement...

Accuracy : 0.8465450525152017

Classification report :

              precision    recall  f1-score   support

           0       0.88      0.92      0.90      6803
           1       0.72      0.62      0.67      2242

    accuracy                           0.85      9045
   macro avg       0.80      0.77      0.78      9045
weighted avg       0.84      0.85      0.84      9045


Exemple de prédictions :
       age  workclass     education  education-num      marital-status  \
36919   27    Private  Some-college             10       Never-married   
17947   43  Local-gov   Prof-school             15  Married-civ-spouse   
3173    49  Local-gov     Bachelors             13            Divor

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from catboost import CatBoostClassifier

# ========= Chargement =========
df = pd.read_csv("income_boosted.csv")

TARGET_COL = "income"
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# ========= Colonnes catégorielles =========
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# ========= Train/test =========
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ========= Modèle CatBoost =========
model = CatBoostClassifier(
    iterations=600,
    depth=6,
    learning_rate=0.05,
    loss_function="Logloss",
    eval_metric="F1",
    random_seed=42,
    verbose=50
)

# ========= Entraînement =========
model.fit(
    X_train, y_train,
    cat_features=cat_cols,
    eval_set=(X_test, y_test),
    verbose=50
)

# ========= Prédictions =========
y_pred = model.predict(X_test)

# ========= Scores =========
acc = accuracy_score(y_test, y_pred)
f1_w = f1_score(y_test, y_pred, average="weighted")

print("\nAccuracy :", acc)
print("F1-score global (weighted) :", f1_w)

print("\nRapport complet :\n")
print(classification_report(y_test, y_pred))

# ========= Exemple prédiction =========
print("\nExemples de prédiction :", model.predict(X_test.iloc[:5]))


0:	learn: 0.6698623	test: 0.6628114	best: 0.6628114 (0)	total: 43.5ms	remaining: 26s
50:	learn: 0.6826996	test: 0.6675114	best: 0.6683544 (49)	total: 833ms	remaining: 8.97s
100:	learn: 0.7029133	test: 0.6813297	best: 0.6813297 (100)	total: 1.62s	remaining: 7.99s
150:	learn: 0.7123473	test: 0.6901513	best: 0.6901513 (150)	total: 2.51s	remaining: 7.47s
200:	learn: 0.7177162	test: 0.6959309	best: 0.6959309 (198)	total: 3.33s	remaining: 6.62s
250:	learn: 0.7240537	test: 0.7005650	best: 0.7016941 (234)	total: 4.06s	remaining: 5.65s
300:	learn: 0.7279937	test: 0.7026497	best: 0.7031672 (289)	total: 4.76s	remaining: 4.73s
350:	learn: 0.7308513	test: 0.7059687	best: 0.7061414 (346)	total: 5.49s	remaining: 3.89s
400:	learn: 0.7337516	test: 0.7063996	best: 0.7069176 (394)	total: 6.18s	remaining: 3.07s
450:	learn: 0.7359260	test: 0.7068587	best: 0.7070905 (403)	total: 6.89s	remaining: 2.28s
500:	learn: 0.7385720	test: 0.7072577	best: 0.7074300 (494)	total: 7.71s	remaining: 1.52s
550:	learn: 0.739

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from lightgbm import LGBMClassifier

# ========= 2. Séparer num / cat =========
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# ========= 3. Encodage & scaling =========
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

# ========= 4. Modèle LightGBM =========
model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    objective="binary"
)

# ========= 5. Pipeline complet =========
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", model)
])

# ========= 6. Split =========
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ========= 7. Entraînement =========
print("Entraînement LightGBM...")
clf.fit(X_train, y_train)

# ========= 8. Prédictions =========
y_pred = clf.predict(X_test)

# ========= 9. Scores =========
acc = accuracy_score(y_test, y_pred)
f1_w = f1_score(y_test, y_pred, average="weighted")

print("\nAccuracy :", acc)
print("F1-score global (weighted) :", f1_w)

print("\nClassification Report :\n")
print(classification_report(y_test, y_pred))

# ========= 10. Exemple =========
print("\nExemples de prédictions :", clf.predict(X_test.iloc[:5]))


Entraînement LightGBM...
[LightGBM] [Info] Number of positive: 8966, number of negative: 27211
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003870 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 868
[LightGBM] [Info] Number of data points in the train set: 36177, number of used features: 97
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247837 -> initscore=-1.110182
[LightGBM] [Info] Start training from score -1.110182

Accuracy : 0.8667772249861803
F1-score global (weighted) : 0.8627181973399053

Classification Report :

              precision    recall  f1-score   support

           0       0.89      0.94      0.91      6803
           1       0.78      0.65      0.71      2242

    accuracy                           0.87      9045
   macro avg       0.83      0.79      0.81      9045
weighted avg       0.86      0.87      0.8



In [10]:
from xgboost import XGBClassifier

# ========= Variables numériques / catégorielles =========
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

# ========= Préprocessing =========
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

# ========= Modèle XGBoost =========
model = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

# ========= Pipeline =========
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", model)
])

# ========= Split train/test =========
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# ========= Entraînement =========
print("Entraînement du modèle XGBoost…")
clf.fit(X_train, y_train)

# ========= Prédictions =========
y_pred = clf.predict(X_test)

# ========= Scores =========
acc = accuracy_score(y_test, y_pred)
f1_global = f1_score(y_test, y_pred, average="weighted")

print("\nAccuracy :", acc)
print("F1-score global (weighted) :", f1_global)

print("\n=== Classification Report ===\n")
print(classification_report(y_test, y_pred))

# ========= Exemple de prédiction =========
print("\nExemples de prédictions :", clf.predict(X_test.iloc[:5]))


Entraînement du modèle XGBoost…

Accuracy : 0.865118850193477
F1-score global (weighted) : 0.8606801689993266

=== Classification Report ===

              precision    recall  f1-score   support

           0       0.89      0.94      0.91      6803
           1       0.77      0.64      0.70      2242

    accuracy                           0.87      9045
   macro avg       0.83      0.79      0.81      9045
weighted avg       0.86      0.87      0.86      9045


Exemples de prédictions : [0 1 0 1 0]


In [7]:
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


random.seed(42)
np.random.seed(42)


df = pd.read_csv("income_cleaned.csv")

TARGET_COL = "income"
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

num_cols = X.select_dtypes(include=["int", "float"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()



# 2. Feature engineering (simple mais malin)

X_fe = X.copy()

# 1) capital-gain / hours-per-week
X_fe["gain_per_hour"] = X["capital-gain"] / (X["hours-per-week"] + 1)

# 2) capital-loss / hours-per-week
X_fe["loss_per_hour"] = X["capital-loss"] / (X["hours-per-week"] + 1)

# 3) education-num / age
X_fe["edu_age_ratio"] = X["education-num"] / (X["age"] + 1)

# 4) log(capital-gain)
X_fe["log_gain"] = np.log1p(X["capital-gain"])

# 5) binning sur l'âge
X_fe["age_bin"] = pd.cut(X["age"], bins=4, labels=False)

# Recalcul des colonnes numériques / catégorielles
num_cols = X_fe.select_dtypes(include=["int", "float"]).columns.tolist()
cat_cols = X_fe.select_dtypes(include=["object"]).columns.tolist()



# 3. Split train / test

X_train, X_test, y_train, y_test = train_test_split(
    X_fe,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)



# 4. Préprocesseur (StandardScaler + OneHotEncoder)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop"
)



# 5. Wrapper CatBoost pour éviter l'erreur "buffer read-only"

class CatBoostWrapper(CatBoostClassifier):
    def fit(self, X, y, **kwargs):
        if hasattr(X, "toarray"):
            X = X.toarray().copy()
        else:
            X = np.array(X).copy()
        y = np.array(y).copy()
        return super().fit(X, y, **kwargs)

    def predict(self, X):
        if hasattr(X, "toarray"):
            X = X.toarray().copy()
        else:
            X = np.array(X).copy()
        return super().predict(X)

    def predict_proba(self, X):
        if hasattr(X, "toarray"):
            X = X.toarray().copy()
        else:
            X = np.array(X).copy()
        return super().predict_proba(X)


# 6. Modèles de base 


xgb = XGBClassifier(
    n_estimators=350,
    max_depth=6,
    learning_rate=0.07,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
    tree_method="hist"
)

lgbm = LGBMClassifier(
    n_estimators=400,
    learning_rate=0.06,
    max_depth=-1,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    n_jobs=-1
)

cat = CatBoostWrapper(
    iterations=500,
    depth=6,
    learning_rate=0.06,
    silent=True,
    random_seed=42
)


# 7. Stacking final (XGB + LGBM + CatBoost)

stack = StackingClassifier(
    estimators=[
        ("xgb", xgb),
        ("lgbm", lgbm),
        ("cat", cat),
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    n_jobs=-1
)

model = Pipeline(steps=[
    ("prep", preprocessor),
    ("stack", stack)
])



# 8. Cross-validation (CV=5) avec F1 & ROC AUC

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

f1_scores = cross_val_score(
    model,
    X_fe,
    y,
    scoring="f1_weighted",
    cv=skf,
    n_jobs=-1
)

auc_scores = cross_val_score(
    model,
    X_fe,
    y,
    scoring="roc_auc",
    cv=skf,
    n_jobs=-1
)

print("===== RESULTATS CV (5-fold) =====")
print(f"F1 moyen (CV5)      : {f1_scores.mean():.4f} ± {f1_scores.std():.4f}")
print(f"ROC AUC moyen (CV5) : {auc_scores.mean():.4f} ± {auc_scores.std():.4f}")


# 9. Entraînement final + scores sur hold-out

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

f1_holdout = f1_score(y_test, y_pred, average="weighted")
auc_holdout = roc_auc_score(y_test, y_proba)

print("\n===== HOLD-OUT (20% test) =====")
print(f"F1 (hold-out)      : {f1_holdout:.4f}")
print(f"ROC AUC (hold-out) : {auc_holdout:.4f}")


[LightGBM] [Info] [LightGBM] [Info] Number of positive: 8967, number of negative: 27211
Number of positive: 8966, number of negative: 27211
[LightGBM] [Info] Number of positive: 8966, number of negative: 27211
[LightGBM] [Info] Number of positive: 8967, number of negative: 27211
[LightGBM] [Info] Number of positive: 8966, number of negative: 27212
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.161158 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1397
[LightGBM] [Info] Number of data points in the train set: 36178, number of used features: 97
[LightGBM] [Info] Number of data poin



[LightGBM] [Info] Number of positive: 7172, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.089902 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1367
[LightGBM] [Info] Number of data points in the train set: 28941, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247815 -> initscore=-1.110302
[LightGBM] [Info] Start training from score -1.110302




[LightGBM] [Info] Number of positive: 7172, number of negative: 21769
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1371
[LightGBM] [Info] Number of data points in the train set: 28941, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247815 -> initscore=-1.110302
[LightGBM] [Info] Start training from score -1.110302
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1368
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 96
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.087172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1374
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030051 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1372
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040040 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1381
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040462 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1371
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 95
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009113 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1372
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 94
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163
[LightGBM] [Info] Number of positive: 7172, number of negative: 21770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008237 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1372
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247806 -> initscore=-1.110348
[Lig



[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] [LightGBM] [Info] Number of positive: 7173, number of negative: 21769
Auto-choosing row-wise multi-threading, the overhead of testing was 0.023651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1367
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247866 -> initscore=-1.110024
[LightGBM] [Info] Start training from score -1.110024
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1369




[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1376
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7173, number of negative: 21770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028661 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1373
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247832 -> initscore=-1.110209
[LightGBM] [Info] Start training from score -1.110209




[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.056950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1369
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247866 -> initscore=-1.110024
[LightGBM] [Info] Start training from score -1.110024




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005380 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1361
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163
[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009645 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1373
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 96
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003774 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1367
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247832 -> initscore=-1.110209
[LightGBM] [Info] Start training from score -1.110209




[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1370
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247866 -> initscore=-1.110024
[LightGBM] [Info] Start training from score -1.110024




[LightGBM] [Info] Number of positive: 8967, number of negative: 27211
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043754 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1394
[LightGBM] [Info] Number of data points in the train set: 36178, number of used features: 98
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247858 -> initscore=-1.110070
[LightGBM] [Info] Start training from score -1.110070
[LightGBM] [Info] Number of positive: 8966, number of negative: 27211
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040927 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1401
[LightGBM] [Info] Number of data points in the train set: 36177, number of used features: 99
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1374
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Number of positive: 7172, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.157667 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1371
[LightGBM] [Info] Number of data points in the train set: 28941, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247815 -> initscore=-1.110302
[LightGBM] [Info] Start training from score -1.110302
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.280105 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1368
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 96
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7172, number of negative: 21769
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.170159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1367
[LightGBM] [Info] Number of data points in the train set: 28941, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247815 -> initscore=-1.110302
[LightGBM] [Info] Start training from score -1.110302




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1379
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7172, number of negative: 21770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1372
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247806 -> initscore=-1.110348
[LightGBM] [Info] Start training from score -1.110348




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016109 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1381
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1372
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 94
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010220 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1372
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006697 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1371
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 95
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036122 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1373
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247832 -> initscore=-1.110209
[LightGBM] [Info] Start training from score -1.110209




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034370 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1376
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1369
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247866 -> initscore=-1.110024
[LightGBM] [Info] Start training from score -1.110024




[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.098977 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1367
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247866 -> initscore=-1.110024
[LightGBM] [Info] Start training from score -1.110024
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1369
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 95
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018025 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1367
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 96
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247832 -> initscore=-1.110209
[LightGBM] [Info] Start training from score -1.110209




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022225 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1364
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021809 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1370
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247866 -> initscore=-1.110024
[LightGBM] [Info] Start training from score -1.110024
[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1373
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 96
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.205702 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1361
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




===== RESULTATS CV (5-fold) =====
F1 moyen (CV5)      : 0.8662 ± 0.0052
ROC AUC moyen (CV5) : 0.9290 ± 0.0043
[LightGBM] [Info] Number of positive: 8966, number of negative: 27211
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006313 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1401
[LightGBM] [Info] Number of data points in the train set: 36177, number of used features: 99
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247837 -> initscore=-1.110182
[LightGBM] [Info] Start training from score -1.110182
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Number of positive: 7172, number of negative: 21769
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Number of positive: 7173, number of




===== HOLD-OUT (20% test) =====
F1 (hold-out)      : 0.8637
ROC AUC (hold-out) : 0.9264


In [10]:
random.seed(42)
np.random.seed(42)



df = pd.read_csv("income_boosted.csv")

TARGET_COL = "income"
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

num_cols = X_fe.select_dtypes(include=["int", "float"]).columns.tolist()
cat_cols = X_fe.select_dtypes(include=["object"]).columns.tolist()



X_train, X_test, y_train, y_test = train_test_split(
    X_fe,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)



preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop"
)


class CatBoostWrapper(CatBoostClassifier):
    def fit(self, X, y, **kwargs):
        if hasattr(X, "toarray"):
            X = X.toarray().copy()
        else:
            X = np.array(X).copy()
        y = np.array(y).copy()
        return super().fit(X, y, **kwargs)

    def predict(self, X):
        if hasattr(X, "toarray"):
            X = X.toarray().copy()
        else:
            X = np.array(X).copy()
        return super().predict(X)

    def predict_proba(self, X):
        if hasattr(X, "toarray"):
            X = X.toarray().copy()
        else:
            X = np.array(X).copy()
        return super().predict_proba(X)




xgb = XGBClassifier(
    n_estimators=650,
    max_depth=5,
    learning_rate=0.045,
    subsample=0.8,
    colsample_bytree=0.85,
    min_child_weight=3,
    gamma=0.5,
    reg_alpha=0.1,
    reg_lambda=1.2,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
    tree_method="hist"
)


lgbm = LGBMClassifier(
    n_estimators=700,
    learning_rate=0.045,
    max_depth=-1,
    num_leaves=32,
    min_child_samples=30,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=0.2,
    reg_lambda=1.5,
    random_state=42,
    n_jobs=-1
)

cat = CatBoostWrapper(
    iterations=900,
    depth=7,
    learning_rate=0.035,
    l2_leaf_reg=6,
    random_seed=42,
    silent=True,
    border_count=150
)




stack = StackingClassifier(
    estimators=[
        ("xgb", xgb),
        ("lgbm", lgbm),
        ("cat", cat),
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    n_jobs=-1
)

model = Pipeline(steps=[
    ("prep", preprocessor),
    ("stack", stack)
])



skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

f1_scores = cross_val_score(
    model,
    X_fe,
    y,
    scoring="f1_weighted",
    cv=skf,
    n_jobs=-1
)

auc_scores = cross_val_score(
    model,
    X_fe,
    y,
    scoring="roc_auc",
    cv=skf,
    n_jobs=-1
)

print("===== RESULTATS CV (5-fold) =====")
print(f"F1 moyen (CV5)      : {f1_scores.mean():.4f} ± {f1_scores.std():.4f}")
print(f"ROC AUC moyen (CV5) : {auc_scores.mean():.4f} ± {auc_scores.std():.4f}")



model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

f1_holdout = f1_score(y_test, y_pred, average="weighted")
auc_holdout = roc_auc_score(y_test, y_proba)

print("\n===== HOLD-OUT (20% test) =====")
print(f"F1 (hold-out)      : {f1_holdout:.4f}")
print(f"ROC AUC (hold-out) : {auc_holdout:.4f}")


[LightGBM] [Info] Number of positive: 8967, number of negative: 27211
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046580 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1382
[LightGBM] [Info] Number of data points in the train set: 36178, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247858 -> initscore=-1.110070
[LightGBM] [Info] Start training from score -1.110070
[LightGBM] [Info] Number of positive: 8967, number of negative: 27211
[LightGBM] [Info] Number of positive: 8966, number of negative: 27212
[LightGBM] [Info] Number of positive: 8966, number of negative: 27211
[LightGBM] [Info] Number of positive: 8966, number of negative: 27211
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.084550 seconds.
You can set `force_row_wise=true` to remove the overhead.
A



[LightGBM] [Info] Number of positive: 7172, number of negative: 21769
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005845 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1354
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 89
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011789 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1357
[LightGBM] [Info] Number of data points in the train set: 28941, number of used features: 91
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7172, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019704 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1359
[LightGBM] [Info] Number of data points in the train set: 28941, number of used features: 90
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247815 -> initscore=-1.110302
[LightGBM] [Info] Start training from score -1.110302




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012073 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1360
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1361
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 90
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035762 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1373
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163
[LightGBM] [Info] Number of positive: 7172, number of negative: 21770
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008777 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1368
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247806 -> initscore=-1.110348
[LightGBM] [Info] Start training from score -1.110348




[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010289 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1357
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 89
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247866 -> initscore=-1.110024
[LightGBM] [Info] Start training from score -1.110024
[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017652 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1357
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 91
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1361
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026656 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1370
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163
[LightGBM] [Info] Number of positive: 7173, number of negative: 21770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1363
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 90
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032543 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1356
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247866 -> initscore=-1.110024
[LightGBM] [Info] Start training from score -1.110024




[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1361
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 90
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247866 -> initscore=-1.110024
[LightGBM] [Info] Start training from score -1.110024




[LightGBM] [Info] Number of positive: 7173, number of negative: 21770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007749 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1353
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 89
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247832 -> initscore=-1.110209
[LightGBM] [Info] Start training from score -1.110209
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024089 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1351
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 90
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1354
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 90
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 8966, number of negative: 27211
[LightGBM] [Info] Number of positive: 8967, number of negative: 27211
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022819 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1389
[LightGBM] [Info] Number of data points in the train set: 36177, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247837 -> initscore=-1.110182
[LightGBM] [Info] Start training from score -1.110182
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018915 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1384
[LightGBM] [Info] Number of data points in the train set: 36178, number of used features: 92
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7172, number of negative: 21769
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1359
[LightGBM] [Info] Number of data points in the train set: 28941, number of used features: 90
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247815 -> initscore=-1.110302
[LightGBM] [Info] Start training from score -1.110302




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1362
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 90
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.085263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1354
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 89
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013620 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1367
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 90
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7172, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1357
[LightGBM] [Info] Number of data points in the train set: 28941, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247815 -> initscore=-1.110302
[LightGBM] [Info] Start training from score -1.110302




[LightGBM] [Info] Number of positive: 7172, number of negative: 21770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012587 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1368
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247806 -> initscore=-1.110348
[LightGBM] [Info] Start training from score -1.110348




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.119315 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1373
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1360
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023205 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1361
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 90
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070344 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1366
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024931 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1357
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 89
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247866 -> initscore=-1.110024
[LightGBM] [Info] Start training from score -1.110024
[LightGBM] [Info] Number of positive: 7173, number of negative: 21770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018083 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1363
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 90
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.124897 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1370
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.152334 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1357
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247866 -> initscore=-1.110024
[LightGBM] [Info] Start training from score -1.110024




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020663 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1361
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 91
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




[LightGBM] [Info] Number of positive: 7173, number of negative: 21770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011007 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1353
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 89
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247832 -> initscore=-1.110209
[LightGBM] [Info] Start training from score -1.110209
[LightGBM] [Info] Number of positive: 7174, number of negative: 21769




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.088359 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1356
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247866 -> initscore=-1.110024
[LightGBM] [Info] Start training from score -1.110024




[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008655 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1354
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 90
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163
[LightGBM] [Info] Number of positive: 7174, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.184493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1361
[LightGBM] [Info] Number of data points in the train set: 28943, number of used features: 90
[LightGBM] [Info] [b



[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070804 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1351
[LightGBM] [Info] Number of data points in the train set: 28942, number of used features: 90
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247841 -> initscore=-1.110163
[LightGBM] [Info] Start training from score -1.110163




===== RESULTATS CV (5-fold) =====
F1 moyen (CV5)      : 0.8660 ± 0.0052
ROC AUC moyen (CV5) : 0.9288 ± 0.0043
[LightGBM] [Info] Number of positive: 8966, number of negative: 27211
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005445 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1387
[LightGBM] [Info] Number of data points in the train set: 36177, number of used features: 92
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247837 -> initscore=-1.110182
[LightGBM] [Info] Start training from score -1.110182
[LightGBM] [Info] Number of positive: 7173, number of negative: 21768
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Number of positive: 7172, number of negative: 21769
[LightGBM] [Info] Number of positive: 7173, number of negative: 21769
[LightGBM] [Info] Number of positive: 7173, number of




===== HOLD-OUT (20% test) =====
F1 (hold-out)      : 0.8634
ROC AUC (hold-out) : 0.9263


In [15]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score


df = pd.read_csv("income_boosted.csv")
TARGET_COL = "income"

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

cat_features = [i for i, col in enumerate(X.columns) if X[col].dtype == "object"]


# 2. Split train/test

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)


# 3. AUTO-CATBOOST 

model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
)

model.randomized_search(
    {
        "iterations": [400, 600, 800, 1000],
        "depth": [4, 5, 6, 7, 8],
        "learning_rate": [0.01, 0.03, 0.05, 0.07],
        "l2_leaf_reg": [1, 3, 5, 7, 9],
        "bagging_temperature": [0.0, 0.5, 1.0],
        "border_count": [32, 64, 128, 254],
    },
    train_pool,
    n_iter=30,     
    verbose=False
)


# 4. Train final

model.fit(train_pool, verbose=False)


# 5. Scores

y_pred = model.predict(test_pool)
y_proba = model.predict_proba(test_pool)[:, 1]

f1 = f1_score(y_test, y_pred, average="weighted")
auc = roc_auc_score(y_test, y_proba)

print("\n======= AUTO CATBOOST RESULTS =======")
print("F1 score :", f1)
print("ROC AUC  :", auc)
print("\nBest params found:")
print(model.get_params())


0:	test: 0.8623816	best: 0.8623816 (0)	total: 77.4ms	remaining: 1m 1s
1:	test: 0.8791110	best: 0.8791110 (1)	total: 94.7ms	remaining: 37.8s
2:	test: 0.8809857	best: 0.8809857 (2)	total: 108ms	remaining: 28.8s
3:	test: 0.8818394	best: 0.8818394 (3)	total: 124ms	remaining: 24.7s
4:	test: 0.8805830	best: 0.8818394 (3)	total: 139ms	remaining: 22.1s
5:	test: 0.8818208	best: 0.8818394 (3)	total: 155ms	remaining: 20.5s
6:	test: 0.8815586	best: 0.8818394 (3)	total: 170ms	remaining: 19.3s
7:	test: 0.8855408	best: 0.8855408 (7)	total: 187ms	remaining: 18.5s
8:	test: 0.8897373	best: 0.8897373 (8)	total: 200ms	remaining: 17.6s
9:	test: 0.8913154	best: 0.8913154 (9)	total: 216ms	remaining: 17.1s
10:	test: 0.8916264	best: 0.8916264 (10)	total: 231ms	remaining: 16.5s
11:	test: 0.8917534	best: 0.8917534 (11)	total: 244ms	remaining: 16.1s
12:	test: 0.8915832	best: 0.8917534 (11)	total: 262ms	remaining: 15.9s
13:	test: 0.8925548	best: 0.8925548 (13)	total: 279ms	remaining: 15.7s
14:	test: 0.8932626	best

In [18]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    balanced_accuracy_score
)
import numpy as np

# Prédictions
y_pred = model.predict(test_pool)
y_proba = model.predict_proba(test_pool)[:, 1]

print("\n========== FULL METRICS ==========")

# --- Scores globaux ---
print("Accuracy             :", accuracy_score(y_test, y_pred))
print("Precision (weighted) :", precision_score(y_test, y_pred, average="weighted"))
print("Recall (weighted)    :", recall_score(y_test, y_pred, average="weighted"))
print("F1 (weighted)        :", f1_score(y_test, y_pred, average="weighted"))
print("Balanced Accuracy    :", balanced_accuracy_score(y_test, y_pred))
print("ROC AUC              :", roc_auc_score(y_test, y_proba))



# --- Specificity / Sensitivity ---
print("\n----- Specificity Sensitivity -----")

tn, fp, fn, tp = cm.ravel()

specificity = tn / (tn + fp)
sensitivity = tp / (tp + fn)

print("Specificity :", specificity)
print("Sensitivity :", sensitivity)



Accuracy             : 0.8676616915422886
Precision (weighted) : 0.8631896514530768
Recall (weighted)    : 0.8676616915422886
F1 (weighted)        : 0.8637317404689422
Balanced Accuracy    : 0.7964465551024809
ROC AUC              : 0.9256459637697226

----- Specificity Sensitivity -----
Specificity : 0.937674555343231
Sensitivity : 0.6552185548617306


🔹 Spécificité

La spécificité mesure la capacité du modèle à bien reconnaître les personnes gagnant 50K$ ou moins (vrais négatifs). Elle indique la proportion de négatifs correctement identifiés parmi tous les négatifs réels.

🔹 Sensibilité

La sensibilité mesure la capacité du modèle à identifier correctement les personnes gagnant plus de 50K$ (vrais positifs). Elle indique la proportion de cas positifs correctement détectés parmi tous les positifs réels.

In [1]:
# ============================================================
#  COMPARISON TABLE: F1_weighted & ROC_AUC FOR ALL MAIN MODELS
# ============================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool

TARGET_COL = "income"

# ---------- Helper to store results ----------

results = []

def add_result(model_name, dataset_name, y_true, y_pred, y_proba):
    """
    Compute F1 (weighted) and ROC-AUC and append to a global list.
    y_proba must be the probability for the positive class (income = 1).
    """
    f1_w = f1_score(y_true, y_pred, average="weighted")
    roc = roc_auc_score(y_true, y_proba)

    results.append({
        "Model": model_name,
        "Dataset": dataset_name,
        "F1_weighted": f1_w,
        "ROC_AUC": roc,
    })


# ============================================================
# 1) RandomForest on income_cleaned.csv
# ============================================================

df_clean = pd.read_csv("income_cleaned.csv")
X_clean = df_clean.drop(columns=[TARGET_COL])
y_clean = df_clean[TARGET_COL]

num_cols_clean = X_clean.select_dtypes(include=["int", "float"]).columns.tolist()
cat_cols_clean = X_clean.select_dtypes(include=["object"]).columns.tolist()

preproc_clean = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols_clean),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols_clean),
    ]
)

rf_clean = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

rf_clean_pipe = Pipeline(
    steps=[
        ("preprocess", preproc_clean),
        ("model", rf_clean),
    ]
)

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_clean, y_clean,
    test_size=0.2,
    random_state=42,
    stratify=y_clean
)

print("\n=== Training RandomForest on income_cleaned.csv ===")
rf_clean_pipe.fit(X_train_c, y_train_c)

y_pred_c = rf_clean_pipe.predict(X_test_c)
y_proba_c = rf_clean_pipe.predict_proba(X_test_c)[:, 1]

add_result(
    model_name="RandomForest",
    dataset_name="income_cleaned",
    y_true=y_test_c,
    y_pred=y_pred_c,
    y_proba=y_proba_c,
)


# ============================================================
# 2) Prepare single train/test split for models on income_boosted.csv
# ============================================================

df_boost = pd.read_csv("income_boosted.csv")
X_boost = df_boost.drop(columns=[TARGET_COL])
y_boost = df_boost[TARGET_COL]

X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
    X_boost, y_boost,
    test_size=0.2,
    random_state=42,
    stratify=y_boost
)

num_cols_boost = X_boost.select_dtypes(include=["int", "float"]).columns.tolist()
cat_cols_boost = X_boost.select_dtypes(include=["object"]).columns.tolist()

preproc_boost = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols_boost),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols_boost),
    ]
)


# ============================================================
# 3) RandomForest on income_boosted.csv
# ============================================================

rf_boost = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

rf_boost_pipe = Pipeline(
    steps=[
        ("preprocess", preproc_boost),
        ("model", rf_boost),
    ]
)

print("\n=== Training RandomForest on income_boosted.csv ===")
rf_boost_pipe.fit(X_train_b, y_train_b)

y_pred_b_rf = rf_boost_pipe.predict(X_test_b)
y_proba_b_rf = rf_boost_pipe.predict_proba(X_test_b)[:, 1]

add_result(
    model_name="RandomForest",
    dataset_name="income_boosted",
    y_true=y_test_b,
    y_pred=y_pred_b_rf,
    y_proba=y_proba_b_rf,
)


# ============================================================
# 4) LightGBM on income_boosted.csv
# ============================================================

lgbm = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    objective="binary"
)

lgbm_pipe = Pipeline(
    steps=[
        ("preprocess", preproc_boost),
        ("model", lgbm),
    ]
)

print("\n=== Training LightGBM on income_boosted.csv ===")
lgbm_pipe.fit(X_train_b, y_train_b)

y_pred_b_lgbm = lgbm_pipe.predict(X_test_b)
y_proba_b_lgbm = lgbm_pipe.predict_proba(X_test_b)[:, 1]

add_result(
    model_name="LightGBM",
    dataset_name="income_boosted",
    y_true=y_test_b,
    y_pred=y_pred_b_lgbm,
    y_proba=y_proba_b_lgbm,
)


# ============================================================
# 5) XGBoost on income_boosted.csv
# ============================================================

xgb = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

xgb_pipe = Pipeline(
    steps=[
        ("preprocess", preproc_boost),
        ("model", xgb),
    ]
)

print("\n=== Training XGBoost on income_boosted.csv ===")
xgb_pipe.fit(X_train_b, y_train_b)

y_pred_b_xgb = xgb_pipe.predict(X_test_b)
y_proba_b_xgb = xgb_pipe.predict_proba(X_test_b)[:, 1]

add_result(
    model_name="XGBoost",
    dataset_name="income_boosted",
    y_true=y_test_b,
    y_pred=y_pred_b_xgb,
    y_proba=y_proba_b_xgb,
)


# ============================================================
# 6) CatBoost (final-style) on income_boosted.csv
#    -> you can replace hyperparameters with those from randomized_search
# ============================================================

cat_features_idx = [
    i for i, col in enumerate(X_boost.columns)
    if X_boost[col].dtype == "object"
]

cat_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    # 👉 Replace with the best hyperparameters you got from randomized_search
    iterations=600,
    depth=6,
    learning_rate=0.05,
    verbose=False,
)

train_pool = Pool(X_train_b, y_train_b, cat_features=cat_features_idx)
test_pool = Pool(X_test_b, y_test_b, cat_features=cat_features_idx)

print("\n=== Training CatBoost (final) on income_boosted.csv ===")
cat_model.fit(train_pool)

y_pred_b_cat = cat_model.predict(test_pool)
y_proba_b_cat = cat_model.predict_proba(test_pool)[:, 1]

add_result(
    model_name="CatBoost (final)",
    dataset_name="income_boosted",
    y_true=y_test_b,
    y_pred=y_pred_b_cat,
    y_proba=y_proba_b_cat,
)


# ============================================================
# 7) Build and display comparison table
# ============================================================

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="F1_weighted", ascending=False).reset_index(drop=True)

print("\n========== COMPARISON TABLE ==========")
display(results_df)



=== Training RandomForest on income_cleaned.csv ===

=== Training RandomForest on income_boosted.csv ===

=== Training LightGBM on income_boosted.csv ===
[LightGBM] [Info] Number of positive: 8966, number of negative: 27211
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003156 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 868
[LightGBM] [Info] Number of data points in the train set: 36177, number of used features: 97
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247837 -> initscore=-1.110182
[LightGBM] [Info] Start training from score -1.110182





=== Training XGBoost on income_boosted.csv ===

=== Training CatBoost (final) on income_boosted.csv ===



Unnamed: 0,Model,Dataset,F1_weighted,ROC_AUC
0,LightGBM,income_boosted,0.862718,0.925775
1,CatBoost (final),income_boosted,0.862182,0.925202
2,XGBoost,income_boosted,0.862182,0.925732
3,RandomForest,income_boosted,0.841999,0.893518
4,RandomForest,income_cleaned,0.840374,0.891597
