# Income Prediction – Model Benchmarking with MLflow

This notebook compares several models to predict whether an individual's income is above 50k, using:

- Two datasets: `income_cleaned.csv` and `income_boosted.csv`
- Models: RandomForest, XGBoost, LightGBM, CatBoost
- Evaluation metrics: **F1-score (weighted)** and **ROC-AUC**

We also use **MLflow** to track experiments: parameters, metrics and models.


In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    balanced_accuracy_score,
)

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool

import mlflow
import mlflow.sklearn

RANDOM_STATE = 42
TARGET_COL = "income"

mlflow.set_experiment("Income Prediction – Notebook")


  return FileStore(store_uri, store_uri)
2025/11/18 18:41:57 INFO mlflow.tracking.fluent: Experiment with name 'Income Prediction – Notebook' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/moutt/Documents/Albert%20School%20x%20Mines%20de%20Paris%20-%20PSL/Master%201/Courses/Bootcamp%20-%20Intermediate%20data%20Level/ML%20for%20Business%20II/Project/supervised-learning/mlruns/525794916860285511', creation_time=1763487717889, experiment_id='525794916860285511', last_update_time=1763487717889, lifecycle_stage='active', name='Income Prediction – Notebook', tags={}>

In [2]:
def load_data(path: str):
    df = pd.read_csv(path)
    X = df.drop(columns=[TARGET_COL])
    y = df[TARGET_COL]
    return X, y


def build_preprocessor(X: pd.DataFrame):
    """
    StandardScaler for numerical features,
    OneHotEncoder for categorical features.
    """
    numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ]
    )

    return preprocessor, numeric_cols, cat_cols


def train_test_split_stratified(X, y, test_size=0.2):
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=RANDOM_STATE,
        stratify=y
    )


In [3]:
def compute_metrics(y_true, y_pred, y_proba):
    acc = accuracy_score(y_true, y_pred)
    bal_acc = balanced_accuracy_score(y_true, y_pred)
    prec_w = precision_score(y_true, y_pred, average="weighted")
    rec_w = recall_score(y_true, y_pred, average="weighted")
    f1_w = f1_score(y_true, y_pred, average="weighted")
    roc = roc_auc_score(y_true, y_proba)

    return {
        "accuracy": acc,
        "balanced_accuracy": bal_acc,
        "precision_weighted": prec_w,
        "recall_weighted": rec_w,
        "f1_weighted": f1_w,
        "roc_auc": roc,
    }


def log_metrics_to_mlflow(metrics: dict):
    for name, value in metrics.items():
        mlflow.log_metric(name, float(value))


In [4]:
def run_sklearn_model_with_mlflow(
    model,
    model_name: str,
    dataset_name: str,
    csv_path: str,
):
    print(f"\n=== {model_name} on {dataset_name} ===")

    X, y = load_data(csv_path)
    preprocessor, numeric_cols, cat_cols = build_preprocessor(X)
    X_train, X_test, y_train, y_test = train_test_split_stratified(X, y)

    # Full pipeline
    clf = Pipeline(
        steps=[
            ("preprocess", preprocessor),
            ("model", model),
        ]
    )

    with mlflow.start_run(run_name=f"{model_name} - {dataset_name}"):
        # Log high-level params
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("dataset", dataset_name)
        mlflow.log_param("csv_path", csv_path)

        # Log model hyperparameters
        for key, value in model.get_params().items():
            mlflow.log_param(f"{model_name.lower()}_{key}", value)

        # Fit
        clf.fit(X_train, y_train)

        # Predict
        y_pred = clf.predict(X_test)
        y_proba = clf.predict_proba(X_test)[:, 1]

        # Metrics
        metrics = compute_metrics(y_test, y_pred, y_proba)
        log_metrics_to_mlflow(metrics)

        # Log model artifact
        mlflow.sklearn.log_model(clf, f"{model_name}_pipeline")

    print("Metrics:", metrics)
    return metrics


In [5]:
def run_catboost_with_mlflow(
    model_name: str,
    dataset_name: str,
    csv_path: str,
    cat_params: dict,
):
    print(f"\n=== {model_name} on {dataset_name} ===")

    X, y = load_data(csv_path)
    # categorical features are those with dtype object
    cat_features_idx = [
        i for i, col in enumerate(X.columns)
        if X[col].dtype == "object"
    ]

    X_train, X_test, y_train, y_test = train_test_split_stratified(X, y)

    train_pool = Pool(X_train, y_train, cat_features=cat_features_idx)
    test_pool = Pool(X_test, y_test, cat_features=cat_features_idx)

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="AUC",
        random_seed=RANDOM_STATE,
        verbose=False,
        **cat_params,
    )

    with mlflow.start_run(run_name=f"{model_name} - {dataset_name}"):
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("dataset", dataset_name)
        mlflow.log_param("csv_path", csv_path)

        # Log CatBoost params
        for key, value in model.get_params().items():
            mlflow.log_param(f"catboost_{key}", value)

        # Fit
        model.fit(train_pool)

        # Predict
        y_pred = model.predict(test_pool)
        y_proba = model.predict_proba(test_pool)[:, 1]

        # Metrics
        metrics = compute_metrics(y_test, y_pred, y_proba)
        log_metrics_to_mlflow(metrics)

        # (Optional) log model as artifact
        # mlflow.catboost.log_model(model, "catboost_model")  # only if mlflow-catboost plugin installed

    print("Metrics:", metrics)
    return metrics


In [6]:
rf_clean = RandomForestClassifier(
    n_estimators=300,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

metrics_rf_clean = run_sklearn_model_with_mlflow(
    model=rf_clean,
    model_name="RandomForest",
    dataset_name="income_cleaned",
    csv_path="income_cleaned.csv",
)

metrics_rf_clean



=== RandomForest on income_cleaned ===




Metrics: {'accuracy': 0.8443338861249309, 'balanced_accuracy': 0.7685286821170751, 'precision_weighted': 0.8388089477998856, 'recall_weighted': 0.8443338861249309, 'f1_weighted': 0.8403743245590086, 'roc_auc': 0.8915973865232097}


{'accuracy': 0.8443338861249309,
 'balanced_accuracy': 0.7685286821170751,
 'precision_weighted': 0.8388089477998856,
 'recall_weighted': 0.8443338861249309,
 'f1_weighted': 0.8403743245590086,
 'roc_auc': 0.8915973865232097}

In [7]:
results = []

# 1) RandomForest on boosted
rf_boost = RandomForestClassifier(
    n_estimators=300,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

metrics_rf_boost = run_sklearn_model_with_mlflow(
    model=rf_boost,
    model_name="RandomForest",
    dataset_name="income_boosted",
    csv_path="income_boosted.csv",
)
results.append(("RandomForest", "income_boosted", metrics_rf_boost))


# 2) LightGBM (your final model)
lgbm = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    objective="binary"
)

metrics_lgbm = run_sklearn_model_with_mlflow(
    model=lgbm,
    model_name="LightGBM",
    dataset_name="income_boosted",
    csv_path="income_boosted.csv",
)
results.append(("LightGBM", "income_boosted", metrics_lgbm))


# 3) XGBoost
xgb = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=-1
)

metrics_xgb = run_sklearn_model_with_mlflow(
    model=xgb,
    model_name="XGBoost",
    dataset_name="income_boosted",
    csv_path="income_boosted.csv",
)
results.append(("XGBoost", "income_boosted", metrics_xgb))



=== RandomForest on income_boosted ===




Metrics: {'accuracy': 0.8462133775566612, 'balanced_accuracy': 0.7694790945328601, 'precision_weighted': 0.8405481287701642, 'recall_weighted': 0.8462133775566612, 'f1_weighted': 0.8419985076956406, 'roc_auc': 0.8935177821402454}

=== LightGBM on income_boosted ===
[LightGBM] [Info] Number of positive: 8966, number of negative: 27211
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002979 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 868
[LightGBM] [Info] Number of data points in the train set: 36177, number of used features: 97
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.247837 -> initscore=-1.110182
[LightGBM] [Info] Start training from score -1.110182




Metrics: {'accuracy': 0.8667772249861803, 'balanced_accuracy': 0.794662433782231, 'precision_weighted': 0.8622044881733293, 'recall_weighted': 0.8667772249861803, 'f1_weighted': 0.8627181973399053, 'roc_auc': 0.9257753210887308}

=== XGBoost on income_boosted ===




Metrics: {'accuracy': 0.8664455500276396, 'balanced_accuracy': 0.7930962792166913, 'precision_weighted': 0.8617806460852351, 'recall_weighted': 0.8664455500276396, 'f1_weighted': 0.8621815786939088, 'roc_auc': 0.9257323440372307}


In [8]:
cat_params = {
    "iterations": 600,
    "depth": 6,
    "learning_rate": 0.05,
    "l2_leaf_reg": 3,
    "bagging_temperature": 0.5,
    "border_count": 128,
}

metrics_cat = run_catboost_with_mlflow(
    model_name="CatBoost",
    dataset_name="income_boosted",
    csv_path="income_boosted.csv",
    cat_params=cat_params,
)

results.append(("CatBoost", "income_boosted", metrics_cat))



=== CatBoost on income_boosted ===
Metrics: {'accuracy': 0.866334991708126, 'balanced_accuracy': 0.7934713367652908, 'precision_weighted': 0.8616929614738434, 'recall_weighted': 0.866334991708126, 'f1_weighted': 0.8621587491282503, 'roc_auc': 0.9254250794272296}


In [9]:
rows = []

# RandomForest on cleaned (baseline)
rows.append({
    "Model": "RandomForest",
    "Dataset": "income_cleaned",
    "F1_weighted": metrics_rf_clean["f1_weighted"],
    "ROC_AUC": metrics_rf_clean["roc_auc"],
})

# Others (boosted)
for model_name, dataset_name, m in results:
    rows.append({
        "Model": model_name,
        "Dataset": dataset_name,
        "F1_weighted": m["f1_weighted"],
        "ROC_AUC": m["roc_auc"],
    })

results_df = pd.DataFrame(rows).sort_values(by="F1_weighted", ascending=False).reset_index(drop=True)
results_df


Unnamed: 0,Model,Dataset,F1_weighted,ROC_AUC
0,LightGBM,income_boosted,0.862718,0.925775
1,XGBoost,income_boosted,0.862182,0.925732
2,CatBoost,income_boosted,0.862159,0.925425
3,RandomForest,income_boosted,0.841999,0.893518
4,RandomForest,income_cleaned,0.840374,0.891597
