In [1]:
# Colab-only: attach Google Drive so the project folder is reachable from the VM.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install xgboost lightgbm "mlflow<3"



In [3]:
# Project root on the mounted Drive; later cells build the data, .env and model paths from it.
base_folder = "/content/drive/MyDrive/Colab Notebooks/customer_churn"
# Make the project root the working directory (quoted because the path contains a space).
%cd "{base_folder}"

/content/drive/MyDrive/Colab Notebooks/customer_churn


In [4]:
import sqlite3
import pandas as pd

# Load the customer table from the project's SQLite database: customer_fact is
# joined to its gender and geography lookup tables, ordered by customerId.
conn = sqlite3.connect(f"{base_folder}/data/customer_churn.db")
try:
    customer = pd.read_sql_query(
        """
    SELECT
        c.customerId,
        c.surname,
        c.CreditScore,
        c.age,
        c.tenure,
        c.balance,
        c.numofproducts,
        c.hascrcard,
        c.IsActiveMember,
        g.gender,
        geo.geography,
        c.estimatedSalary,
        c.exited
    FROM customer_fact AS c
    JOIN gender AS g
        ON g.gender_id = c.gender_id
    JOIN geography AS geo
        ON geo.geography_id = c.geography_id
    ORDER BY c.customerId
    """,
        conn,
    )
finally:
    # Release the database handle even when the query raises.
    conn.close()

# NOTE(review): the preview below includes customer surnames; consider clearing
# this output before sharing the notebook.
customer.head()

Unnamed: 0,customerId,surname,creditScore,age,tenure,balance,numofProducts,hasCrCard,isActiveMember,gender,geography,estimatedSalary,exited
0,15565701,Ferri,698,39,9,161993.89,1,0,0,Female,Spain,90212.38,0
1,15565706,Akobundu,612,35,1,0.0,1,1,1,Male,Spain,83256.26,1
2,15565714,Cattaneo,601,47,1,64430.06,2,0,1,Male,France,96517.97,0
3,15565779,Kent,627,30,6,57809.32,1,1,0,Female,Germany,188258.49,0
4,15565796,Docherty,745,48,10,96048.55,1,1,0,Male,Germany,74510.65,0


In [5]:
# =============================================================================
# FULL PIPELINE:
# - Build preprocessing
# - Stratified train/test split
# - Train & log 4 models WITHOUT PCA (Ridge, HGB, XGBoost, LightGBM)
# - Train & log 4 models WITH PCA (preprocessing + PCA(0.95) + model)
# - Pick GLOBAL best among 8 models by Test F1
# - Save, load, and compare the global best model
# =============================================================================

import os
import sys
import time

import joblib
import mlflow
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from mlflow.models import infer_signature
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline

# Put the directory above the current working directory at the front of the
# import path (once), ahead of the project-local import that follows.
# NOTE(review): presumably this is where customer_pipeline lives — confirm.
parent_dir = os.path.abspath(os.pardir)
if sys.path.count(parent_dir) == 0:
    sys.path.insert(0, parent_dir)

# Import shared components
from customer_pipeline import (
    build_preprocessing,
    make_estimator_for_name,
)

# Monotonic reference point for the elapsed-time report at the end of the cell.
start_time = time.monotonic()

# =============================================================================
# STEP 1: Build Full ML Preprocessing Pipeline
# =============================================================================

# Preprocessing transformer used as the first step of every model pipeline below.
preprocessing = build_preprocessing()
print("✓ STEP 1: Preprocessing pipeline created.")

# =============================================================================
# STEP 2: Split Data into Stratified Train and Test Sets
# =============================================================================
# Drop the identifier columns and add the isZeroBalance flag (1 when balance == 0).
# Written without in-place mutation and with errors="ignore" so re-running this
# cell does not raise KeyError once the identifier columns are already gone.
customer = (
    customer
    .drop(columns=["customerId", "surname"], errors="ignore")
    .assign(isZeroBalance=lambda frame: (frame["balance"] == 0).astype(int))
)

# Stratify jointly on geography and the target. The key is kept as a separate
# Series so it never has to be added to (and later removed from) the feature frame.
stratify_key = customer["geography"] + "_" + customer["exited"].astype(str)

train_set, test_set = train_test_split(
    customer.drop(columns=["combined_geo_exited"], errors="ignore"),
    test_size=0.20,
    stratify=stratify_key,
    random_state=42,
)

# Separate features from the `exited` target for both partitions.
X_train = train_set.drop(columns=["exited"])
y_train = train_set["exited"].copy()

X_test = test_set.drop(columns=["exited"])
y_test = test_set["exited"].copy()

print(f"✓ STEP 2: Stratified split done. Train size: {len(X_train)}, Test size: {len(X_test)}")

# =============================================================================
# STEP 3: Define 4 Model Pipelines (WITHOUT PCA)
# =============================================================================

# Each pipeline receives its own unfitted copy of the preprocessing transformer,
# so fitting one pipeline can never overwrite the fitted state held by another.
models = {}
for name in ["ridge", "histgradientboosting", "xgboost", "lightgbm"]:
    est = make_estimator_for_name(name)
    models[name] = make_pipeline(clone(preprocessing), est)

print("✓ STEP 3: 4 baseline model pipelines defined.")


# =============================================================================
# STEP 4: Configure MLflow (e.g., Dagshub) via .env
# =============================================================================

# Read the tracking settings from the project's .env file; override=True lets
# values from the file replace variables already present in the environment.
load_dotenv(
    dotenv_path=f"{base_folder}/notebooks/.env",
    override=True
)

MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
MLFLOW_TRACKING_USERNAME = os.getenv("MLFLOW_TRACKING_USERNAME")
MLFLOW_TRACKING_PASSWORD = os.getenv("MLFLOW_TRACKING_PASSWORD")

# Re-export the credentials through the environment (only when they are set).
if MLFLOW_TRACKING_USERNAME:
    os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_TRACKING_USERNAME
if MLFLOW_TRACKING_PASSWORD:
    os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_TRACKING_PASSWORD

# Only point MLflow at a tracking server when one is actually configured;
# make a missing setting visible instead of silently passing None along.
if MLFLOW_TRACKING_URI:
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
else:
    print("WARNING: MLFLOW_TRACKING_URI is not set; MLflow will use its default tracking location.")
mlflow.set_experiment("customer_churn_multi_model")

print("✓ STEP 4: MLflow configured.")



# =============================================================================
# STEP 5: Train, Evaluate, and Log 4 Baseline Models (NO PCA)
# =============================================================================

# name -> {"pipeline": fitted pipeline, "test_f1": float, "cv_f1": float}
results = {}

for name, pipeline in models.items():
    print(f"\n{'=' * 80}")
    print(f"Training baseline model: {name}")
    print(f"{'=' * 80}")

    # Compute CV f1 before fitting on full training set
    cv_scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1",
        n_jobs=-1
    )
    cv_f1 = cv_scores.mean()
    print(f"{name} (no PCA) CV F1: {cv_f1:.4f}")

    # Fit on full training set
    pipeline.fit(X_train, y_train)

    # Evaluate on test set
    y_pred = pipeline.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    print(f"{name} (no PCA) Test F1: {test_f1:.4f}")

    results[name] = {"pipeline": pipeline, "test_f1": test_f1, "cv_f1": cv_f1}

    with mlflow.start_run(run_name=f"{name}_baseline"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", False)

        # Log the hyper-parameters of the final (estimator) step, prefixed with its step name.
        est_step_name = list(pipeline.named_steps.keys())[-1]
        est = pipeline.named_steps[est_step_name]
        est_params = {f"{est_step_name}__{k}": v for k, v in est.get_params().items()}
        mlflow.log_params(est_params)

        mlflow.log_metric("cv_f1", cv_f1)
        mlflow.log_metric("test_f1", test_f1)

        signature = infer_signature(X_train, pipeline.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="customer_churn",
            signature=signature,
            # A handful of rows is enough to document the input format; passing
            # the whole training frame would store it alongside every model.
            input_example=X_train.head(5),
            registered_model_name=f"{name}_pipeline",
        )

print("\n✓ STEP 5: All 4 baseline models trained and logged.")


# =============================================================================
# STEP 6: Train, Evaluate, and Log PCA Versions of ALL 4 Models
# =============================================================================

# "<name>_with_pca" -> {"pipeline": fitted pipeline, "test_f1": float, "cv_f1": float}
pca_results = {}

for name in models.keys():
    print("\n" + "=" * 80)
    print(f"Training PCA-augmented model: {name}")
    print("=" * 80)

    est = make_estimator_for_name(name)

    # Fresh copy of the preprocessing step so this pipeline does not share
    # fitted state with the baseline pipelines; PCA keeps 95% of the variance.
    pca_pipeline = make_pipeline(
        clone(preprocessing),
        PCA(n_components=0.95),
        est,
    )

    # Compute CV F1 before fitting on full training set
    cv_scores_pca = cross_val_score(
        pca_pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    cv_f1_pca = cv_scores_pca.mean()
    print(f"{name}_with_pca CV F1: {cv_f1_pca:.4f}")

    # Fit on full training set
    pca_pipeline.fit(X_train, y_train)

    # Evaluate on test set
    y_pred_pca = pca_pipeline.predict(X_test)
    test_f1_pca = f1_score(y_test, y_pred_pca)

    model_key = f"{name}_with_pca"
    pca_results[model_key] = {
        "pipeline": pca_pipeline,
        "test_f1": test_f1_pca,
        "cv_f1": cv_f1_pca,
    }

    print(f"{model_key} Test F1: {test_f1_pca:.4f}")

    with mlflow.start_run(run_name=model_key):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", True)

        # Log the hyper-parameters of the final (estimator) step, prefixed with its step name.
        est_step_name = list(pca_pipeline.named_steps.keys())[-1]
        est_step = pca_pipeline.named_steps[est_step_name]
        est_params = {f"{est_step_name}__{k}": v for k, v in est_step.get_params().items()}
        mlflow.log_params(est_params)

        pca_step = pca_pipeline.named_steps["pca"]
        mlflow.log_param("pca__n_components", pca_step.n_components)

        mlflow.log_metric("cv_f1", cv_f1_pca)
        mlflow.log_metric("test_f1", test_f1_pca)

        signature_pca = infer_signature(X_train, pca_pipeline.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=pca_pipeline,
            artifact_path="customer_churn_with_pca",
            signature=signature_pca,
            # A handful of rows is enough to document the input format; passing
            # the whole training frame would store it alongside every model.
            input_example=X_train.head(5),
            registered_model_name=f"{name}_pipeline_with_pca",
        )

print("\n✓ STEP 6: All 4 PCA models trained and logged.")


# =============================================================================
# STEP 7: Choose GLOBAL Best Model (with or without PCA)
# =============================================================================

# Merge the baseline and PCA candidates into one lookup keyed by model name.
all_results = {}
all_results.update(results)
all_results.update(pca_results)

# NOTE(review): the winner is picked by test-set F1, so the reported test score
# is an optimistic estimate for the selected model. Ranking on "cv_f1" and
# reporting "test_f1" afterwards would avoid that; criterion left unchanged here.
global_best_name = max(all_results, key=lambda k: all_results[k]["test_f1"])
global_best_f1 = all_results[global_best_name]["test_f1"]
global_best_cv_f1 = all_results[global_best_name]["cv_f1"]
global_best_pipeline = all_results[global_best_name]["pipeline"]

# PCA candidates are stored under keys ending in "_with_pca".
uses_pca = "with_pca" in global_best_name

print("\n" + "=" * 80)
print("GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)")
print("=" * 80)
print(f"Global best model key: {global_best_name}")
print(f"Global best CV F1:     {global_best_cv_f1:.4f}")
print(f"Global best Test F1:   {global_best_f1:.4f}")
print(f"Uses PCA:            {uses_pca}")


# =============================================================================
# STEP 8: Save, Load, and Compare the GLOBAL Best Model
# =============================================================================

def save_model(model, filename="global_best_model.pkl"):
    """Serialize ``model`` to ``filename`` with joblib and report the path.

    Args:
        model: Object to persist (here, a fitted pipeline).
        filename: Destination path. Its parent directory is created when it
            does not exist yet, so a fresh checkout does not fail on save.
    """
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    joblib.dump(model, filename)
    print(f"✓ Model saved to {filename}")

print("\n" + "-" * 80)
print("Saving and reloading GLOBAL best model...")
print("-" * 80)

model_path = f"{base_folder}/models/global_best_model.pkl"
save_model(global_best_pipeline, filename=model_path)

# Round-trip check: reload the file that was just written (a trusted artifact
# produced by this cell) and confirm it predicts exactly like the in-memory model.
reloaded_pipeline = joblib.load(model_path)
reloaded_pred = reloaded_pipeline.predict(X_test)
reloaded_f1 = f1_score(y_test, reloaded_pred)
predictions_match = np.array_equal(reloaded_pred, global_best_pipeline.predict(X_test))
print(f"✓ Model reloaded from {model_path}")
print(f"Reloaded Test F1: {reloaded_f1:.4f} (predictions match in-memory model: {predictions_match})")
assert predictions_match, "Reloaded model predictions differ from the in-memory model"

print("\nDone:")
print(f"- GLOBAL best model key: {global_best_name}")
print(f"- GLOBAL best CV F1:     {global_best_cv_f1:.4f}")
print(f"- GLOBAL best Test F1:   {global_best_f1:.4f}")

# Report wall-clock time since start_time as whole minutes plus remaining seconds.
end_time = time.monotonic()
elapsed_time = end_time - start_time
minutes = int(elapsed_time // 60)
seconds = elapsed_time % 60
print(f"Elapsed time: {minutes} minutes and {seconds:.2f} seconds")

✓ STEP 1: Preprocessing pipeline created.
✓ STEP 2: Stratified split done. Train size: 8000, Test size: 2000
✓ STEP 3: 4 baseline model pipelines defined.
✓ STEP 4: MLflow configured.

Training baseline model: ridge
ridge (no PCA) CV F1: $0.2114
ridge (no PCA) Test F1: $0.1954


Registered model 'ridge_pipeline' already exists. Creating a new version of this model...
2025/12/18 19:53:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_pipeline, version 2
Created version '2' of model 'ridge_pipeline'.


🏃 View run ridge_baseline at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0/runs/aa9d65f711ed469d8a5b1f0559702831
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0

Training baseline model: histgradientboosting
histgradientboosting (no PCA) CV F1: $0.6014
histgradientboosting (no PCA) Test F1: $0.5929


Registered model 'histgradientboosting_pipeline' already exists. Creating a new version of this model...
2025/12/18 19:53:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: histgradientboosting_pipeline, version 2
Created version '2' of model 'histgradientboosting_pipeline'.


🏃 View run histgradientboosting_baseline at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0/runs/79d2e99c972342b390ee234ee50c5b0f
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0

Training baseline model: xgboost
xgboost (no PCA) CV F1: $0.5777
xgboost (no PCA) Test F1: $0.5897


Registered model 'xgboost_pipeline' already exists. Creating a new version of this model...
2025/12/18 19:53:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_pipeline, version 2
Created version '2' of model 'xgboost_pipeline'.


🏃 View run xgboost_baseline at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0/runs/01728d0b35234699b55d6791d2a4cffd
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0

Training baseline model: lightgbm
lightgbm (no PCA) CV F1: $0.5841
[LightGBM] [Info] Number of positive: 1630, number of negative: 6370
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000515 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 870
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.203750 -> initscore=-1.363019
[LightGBM] [Info] Start training from score -1.363019




lightgbm (no PCA) Test F1: $0.5986


Registered model 'lightgbm_pipeline' already exists. Creating a new version of this model...
2025/12/18 19:54:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lightgbm_pipeline, version 2
Created version '2' of model 'lightgbm_pipeline'.


🏃 View run lightgbm_baseline at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0/runs/1206bbd793ea4ed3af1404485eb5b2c4
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0

✓ STEP 5: All 4 baseline models trained and logged.

Training PCA-augmented model: ridge
ridge_with_pca CV F1: $0.1642
ridge_with_pca Test F1: $0.1857


Registered model 'ridge_pipeline_with_pca' already exists. Creating a new version of this model...
2025/12/18 19:54:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_pipeline_with_pca, version 2
Created version '2' of model 'ridge_pipeline_with_pca'.


🏃 View run ridge_with_pca at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0/runs/8da77a3028274484b058afdc97942b73
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0

Training PCA-augmented model: histgradientboosting
histgradientboosting_with_pca CV F1: $0.5643
histgradientboosting_with_pca Test F1: $0.5876


Registered model 'histgradientboosting_pipeline_with_pca' already exists. Creating a new version of this model...
2025/12/18 19:54:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: histgradientboosting_pipeline_with_pca, version 2
Created version '2' of model 'histgradientboosting_pipeline_with_pca'.


🏃 View run histgradientboosting_with_pca at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0/runs/1658182c366b4a4991a33e2c272f8be3
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0

Training PCA-augmented model: xgboost
xgboost_with_pca CV F1: $0.5523
xgboost_with_pca Test F1: $0.5665


Registered model 'xgboost_pipeline_with_pca' already exists. Creating a new version of this model...
2025/12/18 19:55:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_pipeline_with_pca, version 2
Created version '2' of model 'xgboost_pipeline_with_pca'.


🏃 View run xgboost_with_pca at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0/runs/355065bf9c204b9cbd8b18d282299cd9
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0

Training PCA-augmented model: lightgbm
lightgbm_with_pca CV F1: $0.5645
[LightGBM] [Info] Number of positive: 1630, number of negative: 6370
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000862 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.203750 -> initscore=-1.363019
[LightGBM] [Info] Start training from score -1.363019




lightgbm_with_pca Test F1: $0.5832


Registered model 'lightgbm_pipeline_with_pca' already exists. Creating a new version of this model...
2025/12/18 19:55:37 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lightgbm_pipeline_with_pca, version 2
Created version '2' of model 'lightgbm_pipeline_with_pca'.


🏃 View run lightgbm_with_pca at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0/runs/31d296a9c6a24228990a688a1e416bd9
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/0

✓ STEP 6: All 4 PCA models trained and logged.

GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)
Global best model key: lightgbm
Global best CV F1:    $0.5841
Global best Test F1:  $0.5986
Uses PCA:            False

--------------------------------------------------------------------------------
Saving and reloading GLOBAL best model...
--------------------------------------------------------------------------------
✓ Model saved to /content/drive/MyDrive/Colab Notebooks/customer_churn/models/global_best_model.pkl

Done:
- GLOBAL best model key: lightgbm
- GLOBAL best CV F1:    $0.5841
- GLOBAL best Test F1:  $0.5986
Elapsed time: 2 minutes and 55.32 seconds
