In [20]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK

In [21]:
df = pd.read_csv("../data/processed/customer_data_new_features.csv")

X = df.drop("churn", axis=1)
y = df["churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [22]:
def get_models():
    return {
        "LogisticRegression": LogisticRegression(max_iter=500),
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
    }

In [23]:
def train_and_log(model, model_name, params=None):
    with mlflow.start_run(run_name=model_name):
        if params:
            model.set_params(**params)

        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)

      
        mlflow.log_metric("accuracy", acc)
        mlflow.log_param("model", model_name)
        if params:
            mlflow.log_params(params)

        mlflow.sklearn.log_model(model, model_name)

        print(f"{model_name} -> Accuracy: {acc:.4f}")
        return acc

In [24]:
def hyperopt_tune(model_name, max_evals=20):
    if model_name == "RandomForest":
        space = {
            "n_estimators": hp.choice("n_estimators", [50, 100, 200]),
            "max_depth": hp.choice("max_depth", [5, 10, 20, None]),
            "min_samples_split": hp.uniform("min_samples_split", 0.01, 0.2)
        }
        base_model = RandomForestClassifier(random_state=42)

    elif model_name == "XGBoost":
        space = {
            "n_estimators": hp.choice("n_estimators", [50, 100, 200]),
            "max_depth": hp.choice("max_depth", [3, 5, 7]),
            "learning_rate": hp.uniform("learning_rate", 0.01, 0.3)
        }
        base_model = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

    else:
        print(f"No hyperopt space defined for {model_name}")
        return

    def objective(params):
        acc = train_and_log(base_model, f"{model_name}_tuned", params)
        return {"loss": -acc, "status": STATUS_OK}

    trials = Trials()
    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    print(f"Best params for {model_name}: {best}")
    return best

In [28]:
mlflow.set_experiment("demo_churn")

models = get_models()
for name, model in models.items():
    train_and_log(model, name)

hyperopt_tune("RandomForest", max_evals=8)
hyperopt_tune("XGBoost", max_evals=8)

2025/09/28 18:26:29 INFO mlflow.tracking.fluent: Experiment with name 'demo_churn' does not exist. Creating a new experiment.


LogisticRegression -> Accuracy: 0.8254




RandomForest -> Accuracy: 0.8217




XGBoost -> Accuracy: 0.8343
  0%|          | 0/8 [00:00<?, ?trial/s, best loss=?]





RandomForest_tuned -> Accuracy: 0.8366               
 12%|█▎        | 1/8 [00:09<01:04,  9.20s/trial, best loss: -0.8366366366366367]





RandomForest_tuned -> Accuracy: 0.8359                                          
 25%|██▌       | 2/8 [00:16<00:49,  8.26s/trial, best loss: -0.8366366366366367]





RandomForest_tuned -> Accuracy: 0.8357                                          
 38%|███▊      | 3/8 [00:28<00:48,  9.78s/trial, best loss: -0.8366366366366367]





RandomForest_tuned -> Accuracy: 0.8315                                          
 50%|█████     | 4/8 [00:35<00:34,  8.60s/trial, best loss: -0.8366366366366367]





RandomForest_tuned -> Accuracy: 0.8315                                          
 62%|██████▎   | 5/8 [00:41<00:22,  7.63s/trial, best loss: -0.8366366366366367]





RandomForest_tuned -> Accuracy: 0.8371                                          
 75%|███████▌  | 6/8 [00:48<00:15,  7.69s/trial, best loss: -0.8371371371371371]





RandomForest_tuned -> Accuracy: 0.8336                                          
 88%|████████▊ | 7/8 [00:54<00:07,  7.03s/trial, best loss: -0.8371371371371371]





RandomForest_tuned -> Accuracy: 0.8371                                          
100%|██████████| 8/8 [01:00<00:00,  7.51s/trial, best loss: -0.8371371371371371]
Best params for RandomForest: {'max_depth': np.int64(0), 'min_samples_split': np.float64(0.17570957228003786), 'n_estimators': np.int64(2)}
  0%|          | 0/8 [00:00<?, ?trial/s, best loss=?]





XGBoost_tuned -> Accuracy: 0.8373                    
 12%|█▎        | 1/8 [00:05<00:35,  5.06s/trial, best loss: -0.8373373373373373]





XGBoost_tuned -> Accuracy: 0.8351                                               
 25%|██▌       | 2/8 [00:09<00:29,  4.84s/trial, best loss: -0.8373373373373373]





XGBoost_tuned -> Accuracy: 0.8371                                               
 38%|███▊      | 3/8 [00:14<00:23,  4.75s/trial, best loss: -0.8373373373373373]





XGBoost_tuned -> Accuracy: 0.8365                                               
 50%|█████     | 4/8 [00:18<00:18,  4.68s/trial, best loss: -0.8373373373373373]





XGBoost_tuned -> Accuracy: 0.8355                                               
 62%|██████▎   | 5/8 [00:24<00:14,  4.85s/trial, best loss: -0.8373373373373373]





XGBoost_tuned -> Accuracy: 0.8366                                               
 75%|███████▌  | 6/8 [00:28<00:09,  4.79s/trial, best loss: -0.8373373373373373]





XGBoost_tuned -> Accuracy: 0.8300                                               
 88%|████████▊ | 7/8 [00:33<00:04,  4.89s/trial, best loss: -0.8373373373373373]





XGBoost_tuned -> Accuracy: 0.8355                                               
100%|██████████| 8/8 [00:38<00:00,  4.82s/trial, best loss: -0.8373373373373373]
Best params for XGBoost: {'learning_rate': np.float64(0.1706869478158034), 'max_depth': np.int64(1), 'n_estimators': np.int64(0)}


{'learning_rate': np.float64(0.1706869478158034),
 'max_depth': np.int64(1),
 'n_estimators': np.int64(0)}

In [29]:
exp_name = "demo_churn"
experiment = mlflow.get_experiment_by_name(exp_name)
runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

best_run = runs_df.loc[runs_df["metrics.accuracy"].idxmax()]
best_run_id = best_run.run_id

print("🔥 Best Run Details 🔥")
print(f"Run ID     : {best_run_id}")
print(f"Model Name : {best_run['params.model']}")
print(f"Accuracy   : {best_run['metrics.accuracy']:.4f}")

client = mlflow.tracking.MlflowClient()
params = client.get_run(best_run_id).data.params

print("\nHyperparameters:")
for k, v in params.items():
    if k != "model":  
        print(f"{k}: {v}")


🔥 Best Run Details 🔥
Run ID     : b92465da1fe942bcb1ee716e2267f081
Model Name : XGBoost_tuned
Accuracy   : 0.8373

Hyperparameters:
learning_rate: 0.1706869478158034
max_depth: 5
n_estimators: 50
