In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
!pip install optuna xgboost lightgbm "mlflow<3"



In [4]:
base_folder = "/content/drive/MyDrive/Colab Notebooks/customer_churn"
%cd "{base_folder}"

/content/drive/MyDrive/Colab Notebooks/customer_churn


In [5]:
import sqlite3
import pandas as pd
conn = sqlite3.connect(f"{base_folder}/data/customer_churn.db")
customer = pd.read_sql_query(
    """
    SELECT
        c.customerId,
        c.surname,
        c.CreditScore,
        c.age,
        c.tenure,
        c.balance,
        c.numofproducts,
        c.hascrcard,
        c.IsActiveMember,
        g.gender,
        geo.geography,
        c.estimatedSalary,
        c.exited
    FROM customer_fact AS c
    JOIN gender AS g
        ON g.gender_id = c.gender_id
    JOIN geography AS geo
        ON geo.geography_id = c.geography_id
    ORDER BY c.customerId
    """,
    conn,
)
conn.close()

customer.head()

Unnamed: 0,customerId,surname,creditScore,age,tenure,balance,numofProducts,hasCrCard,isActiveMember,gender,geography,estimatedSalary,exited
0,15565701,Ferri,698,39,9,161993.89,1,0,0,Female,Spain,90212.38,0
1,15565706,Akobundu,612,35,1,0.0,1,1,1,Male,Spain,83256.26,1
2,15565714,Cattaneo,601,47,1,64430.06,2,0,1,Male,France,96517.97,0
3,15565779,Kent,627,30,6,57809.32,1,1,0,Female,Germany,188258.49,0
4,15565796,Docherty,745,48,10,96048.55,1,1,0,Male,Germany,74510.65,0


In [6]:
# =============================================================================
# FULL PIPELINE with OPTUNA
# - Build preprocessing
# - Stratified train/test split
# - Train & log 4 models WITHOUT PCA (RidgeClassifier, HGB, XGBoost, LightGBM)
# - Train & log 4 models WITH PCA (preprocessing + PCA(0.95) + model)
# - Pick GLOBAL best among 8 models by Test F1
# - Save, load, and compare the global best model
# =============================================================================

import time
import os
import numpy as np
import pandas as pd

from dotenv import load_dotenv

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import mlflow
from mlflow.models import infer_signature
import joblib

import optuna
from optuna.samplers import TPESampler

from sklearn.base import clone

import sys
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

# Shared components
from customer_pipeline import (
    build_preprocessing,
    make_estimator_for_name
)

start_time = time.monotonic()
optuna.logging.set_verbosity(optuna.logging.WARNING)


# =============================================================================
# STEP 1: Build Full ML Preprocessing Pipeline
# =============================================================================

preprocessing = build_preprocessing()
print("✓ STEP 1: Preprocessing pipeline created.")


# =============================================================================
# STEP 2: Split Data into Stratified Train and Test Sets
# =============================================================================

# Drop customerId and surname
customer.drop(["customerId", "surname"], axis=1, inplace=True)

# Create isZeroBalance feature to add zero balance information on prediction
customer["isZeroBalance"] = (customer["balance"] == 0).astype(int)

# Stratification prep
customer["combined_geo_exited"] = customer["geography"] + "_" + customer["exited"].astype(str)

train_set, test_set = train_test_split(
    customer,
    test_size=0.20,
    stratify=customer["combined_geo_exited"],
    random_state=42,
)

for df in (train_set, test_set):
    df.drop("combined_geo_exited", axis=1, inplace=True)

X_train = train_set.drop(["exited"], axis=1).copy()
y_train = train_set["exited"].copy()

X_test = test_set.drop(["exited"], axis=1).copy()
y_test = test_set["exited"].copy()

print(f"✓ STEP 2: Stratified split done. Train size: {len(X_train)}, Test size: {len(X_test)}")


# =============================================================================
# STEP 3: Configure MLflow
# =============================================================================

load_dotenv(
    dotenv_path=f"{base_folder}/notebooks/.env",
    override=True
)

MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")
MLFLOW_TRACKING_USERNAME = os.getenv("MLFLOW_TRACKING_USERNAME")
MLFLOW_TRACKING_PASSWORD = os.getenv("MLFLOW_TRACKING_PASSWORD")

if MLFLOW_TRACKING_USERNAME:
    os.environ["MLFLOW_TRACKING_USERNAME"] = MLFLOW_TRACKING_USERNAME
if MLFLOW_TRACKING_PASSWORD:
    os.environ["MLFLOW_TRACKING_PASSWORD"] = MLFLOW_TRACKING_PASSWORD

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("customer_churn_multi_model_optuna")

print("✓ STEP 3: MLflow configured.")


# =============================================================================
# STEP 4: Define Optuna Objective Functions (NO PCA)
# =============================================================================

def objective_ridge(trial, preprocessing, X_train, y_train):
    alpha = trial.suggest_float("ridge__alpha", 0.1, 100.0, log=True)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(preprocessing_clone, RidgeClassifier(alpha=alpha))
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_hgb(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("hgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("hgb__max_depth", 3, 8)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        HistGradientBoostingClassifier(
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=42
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_xgb(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("xgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("xgb__max_depth", 3, 8)
    n_estimators = trial.suggest_int("xgb__n_estimators", 100, 300, step=50)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        XGBClassifier(
            objective="reg:squarederror",
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            tree_method="hist",
            n_jobs=-1,
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_lgbm(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("lgbm__learning_rate", 0.05, 0.2)
    num_leaves = trial.suggest_int("lgbm__num_leaves", 20, 80)
    n_estimators = trial.suggest_int("lgbm__n_estimators", 100, 300, step=50)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        LGBMClassifier(
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            n_jobs=-1,
            verbose=-1,
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


# =============================================================================
# STEP 5: Run Optuna Studies for Each Model (NO PCA)
# =============================================================================

model_names = ["ridge", "histgradientboosting", "xgboost", "lightgbm"]
objective_functions = {
    "ridge": objective_ridge,
    "histgradientboosting": objective_hgb,
    "xgboost": objective_xgb,
    "lightgbm": objective_lgbm,
}

results = {}

for name in model_names:
    print(f"\n{'='*80}")
    print(f"Optimizing {name.upper()} (NO PCA) - 10 trials")
    print(f"{'='*80}")

    study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(seed=42),
        study_name=f"{name}_study"
    )

    study.optimize(
        lambda trial: objective_functions[name](trial, preprocessing, X_train, y_train),
        n_trials=10,
        show_progress_bar=True
    )

    cv_f1 = study.best_value
    print(f"\nBest {name.upper()} CV F1: ${cv_f1:.4f}")
    print(f"Best params: {study.best_params}")

    best_params = study.best_params
    preprocessing_clone = clone(preprocessing)

    if name == "ridge":
        final_model = make_pipeline(
            preprocessing_clone,
            RidgeClassifier(alpha=best_params["ridge__alpha"])
        )
    elif name == "histgradientboosting":
        final_model = make_pipeline(
            preprocessing_clone,
            HistGradientBoostingClassifier(
                learning_rate=best_params["hgb__learning_rate"],
                max_depth=best_params["hgb__max_depth"],
                random_state=42
            )
        )
    elif name == "xgboost":
        final_model = make_pipeline(
            preprocessing_clone,
            XGBClassifier(
                objective="reg:squarederror",
                random_state=42,
                n_estimators=best_params["xgb__n_estimators"],
                learning_rate=best_params["xgb__learning_rate"],
                max_depth=best_params["xgb__max_depth"],
                tree_method="hist",
                n_jobs=-1,
            )
        )
    elif name == "lightgbm":
        final_model = make_pipeline(
            preprocessing_clone,
            LGBMClassifier(
                random_state=42,
                n_estimators=best_params["lgbm__n_estimators"],
                learning_rate=best_params["lgbm__learning_rate"],
                num_leaves=best_params["lgbm__num_leaves"],
                n_jobs=-1,
                verbose=-1,
            )
        )

    final_model.fit(X_train, y_train)

    y_pred = final_model.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    print(f"{name} (no PCA) Test F1: ${test_f1:.4f}")

    results[name] = {"pipeline": final_model, "test_f1": test_f1, "cv_f1": cv_f1}

    with mlflow.start_run(run_name=f"{name}_baseline_optuna"):
        mlflow.log_param("model_family", name)
        mlflow.log_param("uses_pca", False)
        mlflow.log_params(best_params)
        mlflow.log_metric("cv_f1", cv_f1)
        mlflow.log_metric("test_f1", test_f1)

        signature = infer_signature(X_train, final_model.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=final_model,
            artifact_path="customer_churn",
            signature=signature,
            input_example=X_train,
            registered_model_name=f"{name}_pipeline_optuna",
        )

print("\n✓ STEP 5: All 4 baseline models optimized and logged.")


# =============================================================================
# STEP 6: PCA Optuna Objectives
# =============================================================================

def objective_ridge_pca(trial, preprocessing, X_train, y_train):
    alpha = trial.suggest_float("ridge__alpha", 0.1, 100.0, log=True)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(preprocessing_clone, PCA(n_components=pca_components), RidgeClassifier(alpha=alpha))
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_hgb_pca(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("hgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("hgb__max_depth", 3, 8)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        PCA(n_components=pca_components),
        HistGradientBoostingClassifier(
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=42
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_xgb_pca(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("xgb__learning_rate", 0.05, 0.2)
    max_depth = trial.suggest_int("xgb__max_depth", 3, 8)
    n_estimators = trial.suggest_int("xgb__n_estimators", 100, 300, step=50)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        PCA(n_components=pca_components),
        XGBClassifier(
            objective="reg:squarederror",
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            tree_method="hist",
            n_jobs=-1,
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


def objective_lgbm_pca(trial, preprocessing, X_train, y_train):
    learning_rate = trial.suggest_float("lgbm__learning_rate", 0.05, 0.2)
    num_leaves = trial.suggest_int("lgbm__num_leaves", 20, 80)
    n_estimators = trial.suggest_int("lgbm__n_estimators", 100, 300, step=50)
    pca_components = trial.suggest_float("pca__n_components", 0.90, 0.99)
    preprocessing_clone = clone(preprocessing)
    pipeline = make_pipeline(
        preprocessing_clone,
        PCA(n_components=pca_components),
        LGBMClassifier(
            random_state=42,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            num_leaves=num_leaves,
            n_jobs=-1,
            verbose=-1,
        )
    )
    scores = cross_val_score(
        pipeline, X_train, y_train,
        cv=3, scoring="f1", n_jobs=-1
    )
    return scores.mean()


# =============================================================================
# STEP 7: Run Optuna Studies for PCA Models
# =============================================================================

pca_model_names = ["ridge_with_pca", "histgradientboosting_with_pca", "xgboost_with_pca", "lightgbm_with_pca"]
pca_objective_functions = {
    "ridge_with_pca": objective_ridge_pca,
    "histgradientboosting_with_pca": objective_hgb_pca,
    "xgboost_with_pca": objective_xgb_pca,
    "lightgbm_with_pca": objective_lgbm_pca,
}

pca_results = {}

for name in pca_model_names:
    base_name = name.replace("_with_pca", "")
    print(f"\n{'='*80}")
    print(f"Optimizing {name.upper()} - 10 trials")
    print(f"{'='*80}")

    study = optuna.create_study(
        direction="maximize",
        sampler=TPESampler(seed=42),
        study_name=f"{name}_study"
    )

    study.optimize(
        lambda trial: pca_objective_functions[name](trial, preprocessing, X_train, y_train),
        n_trials=10,
        show_progress_bar=True
    )

    cv_f1_pca = study.best_value
    print(f"\nBest {name.upper()} CV F1: ${cv_f1_pca:.4f}")
    print(f"Best params: {study.best_params}")

    best_params = study.best_params
    preprocessing_clone = clone(preprocessing)

    if base_name == "ridge":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_params["pca__n_components"]),
            RidgeClassifier(alpha=best_params["ridge__alpha"])
        )
    elif base_name == "histgradientboosting":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_params["pca__n_components"]),
            HistGradientBoostingClassifier(
                learning_rate=best_params["hgb__learning_rate"],
                max_depth=best_params["hgb__max_depth"],
                random_state=42
            )
        )
    elif base_name == "xgboost":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_params["pca__n_components"]),
            XGBClassifier(
                objective="reg:squarederror",
                random_state=42,
                n_estimators=best_params["xgb__n_estimators"],
                learning_rate=best_params["xgb__learning_rate"],
                max_depth=best_params["xgb__max_depth"],
                tree_method="hist",
                n_jobs=-1,
            )
        )
    elif base_name == "lightgbm":
        final_model = make_pipeline(
            preprocessing_clone,
            PCA(n_components=best_params["pca__n_components"]),
            LGBMClassifier(
                random_state=42,
                n_estimators=best_params["lgbm__n_estimators"],
                learning_rate=best_params["lgbm__learning_rate"],
                num_leaves=best_params["lgbm__num_leaves"],
                n_jobs=-1,
                verbose=-1,
            )
        )

    final_model.fit(X_train, y_train)

    y_pred = final_model.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    print(f"{name} Test F1: ${test_f1:.4f}")

    pca_results[name] = {"pipeline": final_model, "test_f1": test_f1, "cv_f1": cv_f1_pca}

    with mlflow.start_run(run_name=f"{name}_optuna"):
        mlflow.log_param("model_family", base_name)
        mlflow.log_param("uses_pca", True)
        mlflow.log_params(best_params)
        mlflow.log_metric("cv_f1", cv_f1_pca)
        mlflow.log_metric("test_f1", test_f1)

        signature = infer_signature(X_train, final_model.predict(X_train))
        mlflow.sklearn.log_model(
            sk_model=final_model,
            artifact_path="customer_churn_with_pca",
            signature=signature,
            input_example=X_train,
            registered_model_name=f"{base_name}_pipeline_with_pca_optuna",
        )

print("\n✓ STEP 7: All 4 PCA models optimized and logged.")


# =============================================================================
# STEP 8: Choose GLOBAL Best Model
# =============================================================================

all_results = {}
all_results.update(results)
all_results.update(pca_results)

global_best_name = max(all_results, key=lambda k: all_results[k]["test_f1"])
global_best_f1 = all_results[global_best_name]["test_f1"]
global_best_cv_f1 = all_results[global_best_name]["cv_f1"]
global_best_pipeline = all_results[global_best_name]["pipeline"]

uses_pca = "with_pca" in global_best_name

print("\n" + "=" * 80)
print("GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)")
print("=" * 80)
print(f"Global best model key: {global_best_name}")
print(f"Global best CV F1:    ${global_best_cv_f1:.4f}")
print(f"Global best Test F1:  ${global_best_f1:.4f}")
print(f"Uses PCA:               {uses_pca}")


# =============================================================================
# STEP 9: Save, Load, and Compare the GLOBAL Best Model
# =============================================================================

def save_model(model, filename="global_best_model_optuna.pkl"):
    joblib.dump(model, filename)
    print(f"✓ Model saved to {filename}")

print("\n" + "-" * 80)
print("Saving and reloading GLOBAL best model...")
print("-" * 80)

save_model(global_best_pipeline, filename=f"{base_folder}/models/global_best_model_optuna.pkl")

print("\nDone:")
print(f"- GLOBAL best model key: {global_best_name}")
print(f"- GLOBAL best CV F1:    ${global_best_cv_f1:.4f}")
print(f"- GLOBAL best Test F1:  ${global_best_f1:.4f}")

end_time = time.monotonic()
elapsed_time = end_time - start_time
minutes = int(elapsed_time // 60)
seconds = elapsed_time % 60
print(f"Elapsed time: {minutes} minutes and {seconds:.2f} seconds")

✓ STEP 1: Preprocessing pipeline created.
✓ STEP 2: Stratified split done. Train size: 8000, Test size: 2000
✓ STEP 3: MLflow configured.

Optimizing RIDGE (NO PCA) - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


Best RIDGE CV F1: $0.2114
Best params: {'ridge__alpha': 1.3292918943162166}
ridge (no PCA) Test F1: $0.1954


Registered model 'ridge_pipeline_optuna' already exists. Creating a new version of this model...
2025/12/18 20:03:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_pipeline_optuna, version 3
Created version '3' of model 'ridge_pipeline_optuna'.


🏃 View run ridge_baseline_optuna at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1/runs/e9f707338c5d45a9993b6cc7d0e1b485
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1

Optimizing HISTGRADIENTBOOSTING (NO PCA) - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


Best HISTGRADIENTBOOSTING CV F1: $0.5998
Best params: {'hgb__learning_rate': 0.17486639612006327, 'hgb__max_depth': 4}
histgradientboosting (no PCA) Test F1: $0.6014


Registered model 'histgradientboosting_pipeline_optuna' already exists. Creating a new version of this model...
2025/12/18 20:03:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: histgradientboosting_pipeline_optuna, version 3
Created version '3' of model 'histgradientboosting_pipeline_optuna'.


🏃 View run histgradientboosting_baseline_optuna at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1/runs/da7a578a633d4e2c816260708cff8641
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1

Optimizing XGBOOST (NO PCA) - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


Best XGBOOST CV F1: $0.5773
Best params: {'xgb__learning_rate': 0.12713516576204176, 'xgb__max_depth': 6, 'xgb__n_estimators': 100}
xgboost (no PCA) Test F1: $0.5740


Registered model 'xgboost_pipeline_optuna' already exists. Creating a new version of this model...
2025/12/18 20:03:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_pipeline_optuna, version 3
Created version '3' of model 'xgboost_pipeline_optuna'.


🏃 View run xgboost_baseline_optuna at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1/runs/0ec70ebc02564f9bbff165729d9f4d34
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1

Optimizing LIGHTGBM (NO PCA) - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


Best LIGHTGBM CV F1: $0.5890
Best params: {'lgbm__learning_rate': 0.1397987726295555, 'lgbm__num_leaves': 29, 'lgbm__n_estimators': 100}




lightgbm (no PCA) Test F1: $0.5847


Registered model 'lightgbm_pipeline_optuna' already exists. Creating a new version of this model...
2025/12/18 20:04:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lightgbm_pipeline_optuna, version 3
Created version '3' of model 'lightgbm_pipeline_optuna'.


🏃 View run lightgbm_baseline_optuna at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1/runs/e542147c2cc14801b59391cc11b1c0e3
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1

✓ STEP 5: All 4 baseline models optimized and logged.

Optimizing RIDGE_WITH_PCA - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


Best RIDGE_WITH_PCA CV F1: $0.2106
Best params: {'ridge__alpha': 1.3292918943162166, 'pca__n_components': 0.9855642875768924}
ridge_with_pca Test F1: $0.1958


Registered model 'ridge_pipeline_with_pca_optuna' already exists. Creating a new version of this model...
2025/12/18 20:04:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ridge_pipeline_with_pca_optuna, version 3
Created version '3' of model 'ridge_pipeline_with_pca_optuna'.


🏃 View run ridge_with_pca_optuna at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1/runs/8e5f9b9882704a64806ae38c95203ca5
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1

Optimizing HISTGRADIENTBOOSTING_WITH_PCA - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


Best HISTGRADIENTBOOSTING_WITH_PCA CV F1: $0.5811
Best params: {'hgb__learning_rate': 0.15621088666940686, 'hgb__max_depth': 3, 'pca__n_components': 0.9872918866945795}
histgradientboosting_with_pca Test F1: $0.5574


Registered model 'histgradientboosting_pipeline_with_pca_optuna' already exists. Creating a new version of this model...
2025/12/18 20:05:12 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: histgradientboosting_pipeline_with_pca_optuna, version 3
Created version '3' of model 'histgradientboosting_pipeline_with_pca_optuna'.


🏃 View run histgradientboosting_with_pca_optuna at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1/runs/deaee29f45834a2b81c51d814ca994b7
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1

Optimizing XGBOOST_WITH_PCA - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


Best XGBOOST_WITH_PCA CV F1: $0.5625
Best params: {'xgb__learning_rate': 0.05975773894779193, 'xgb__max_depth': 8, 'xgb__n_estimators': 300, 'pca__n_components': 0.9727557613304815}
xgboost_with_pca Test F1: $0.5673


Registered model 'xgboost_pipeline_with_pca_optuna' already exists. Creating a new version of this model...
2025/12/18 20:05:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgboost_pipeline_with_pca_optuna, version 3
Created version '3' of model 'xgboost_pipeline_with_pca_optuna'.


🏃 View run xgboost_with_pca_optuna at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1/runs/91a9ba6e6ee143c9bc055df38cc0b4d1
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1

Optimizing LIGHTGBM_WITH_PCA - 10 trials


  0%|          | 0/10 [00:00<?, ?it/s]


Best LIGHTGBM_WITH_PCA CV F1: $0.5715
Best params: {'lgbm__learning_rate': 0.09569206537600561, 'lgbm__num_leaves': 25, 'lgbm__n_estimators': 250, 'pca__n_components': 0.9396137244365641}




lightgbm_with_pca Test F1: $0.5710


Registered model 'lightgbm_pipeline_with_pca_optuna' already exists. Creating a new version of this model...
2025/12/18 20:06:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: lightgbm_pipeline_with_pca_optuna, version 3
Created version '3' of model 'lightgbm_pipeline_with_pca_optuna'.


🏃 View run lightgbm_with_pca_optuna at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1/runs/2f96b1738b4547b296e2f083f9b9b10d
🧪 View experiment at: https://dagshub.com/Avasarala-lusi/customer_churn.mlflow/#/experiments/1

✓ STEP 7: All 4 PCA models optimized and logged.

GLOBAL BEST MODEL (ACROSS 8 CANDIDATES)
Global best model key: histgradientboosting
Global best CV F1:    $0.5998
Global best Test F1:  $0.6014
Uses PCA:               False

--------------------------------------------------------------------------------
Saving and reloading GLOBAL best model...
--------------------------------------------------------------------------------
✓ Model saved to /content/drive/MyDrive/Colab Notebooks/customer_churn/models/global_best_model_optuna.pkl

Done:
- GLOBAL best model key: histgradientboosting
- GLOBAL best CV F1:    $0.5998
- GLOBAL best Test F1:  $0.6014
Elapsed time: 4 minutes and 8.01 seconds
