In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import mlflow
import mlflow.sklearn
from hyperopt import hp, tpe, fmin, Trials, STATUS_OK



In [24]:
def load_data(path="../data/processed/customer_data_new_features.csv"):
    df = pd.read_csv(path)
    if "customer_id" in df.columns:
        df = df.drop("customer_id", axis=1)

    X = df.drop("churn", axis=1)
    y = df["churn"]

    return train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_test, y_train, y_test = load_data()


In [25]:
def objective_rf(params):
    model = RandomForestClassifier(
        n_estimators=int(params["n_estimators"]),
        max_depth=int(params["max_depth"]),
        min_samples_split=int(params["min_samples_split"]),
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)

    return {"loss": -auc, "status": STATUS_OK, "metrics": (acc, f1, auc), "model": model}


In [20]:

def objective_xgb(params):
    """Objective function for XGBoost"""
    model = XGBClassifier(
        n_estimators=int(params["n_estimators"]),
        max_depth=int(params["max_depth"]),
        learning_rate=params["learning_rate"],
        subsample=params["subsample"],
        colsample_bytree=params["colsample_bytree"],
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)

    return {"loss": -auc, "status": STATUS_OK, "metrics": (acc, f1, auc), "model": model}



In [26]:
space_rf = {
    "n_estimators": hp.quniform("n_estimators", 50, 500, 50),
    "max_depth": hp.quniform("max_depth", 3, 20, 1),
    "min_samples_split": hp.quniform("min_samples_split", 2, 10, 1),
}

space_xgb = {
    "n_estimators": hp.quniform("n_estimators", 50, 500, 50),
    "max_depth": hp.quniform("max_depth", 3, 15, 1),
    "learning_rate": hp.uniform("learning_rate", 0.01, 0.3),
    "subsample": hp.uniform("subsample", 0.5, 1.0),
    "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1.0),
}




In [27]:
def run_experiment(model_name, objective_fn, space, max_evals=20):
    mlflow.set_experiment("churn_prediction_experiments")

    trials = Trials()
    best = fmin(
        fn=objective_fn,
        space=space,
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials
    )

    best_trial = min(trials.results, key=lambda x: x["loss"])
    acc, f1, auc = best_trial["metrics"]
    best_model = best_trial["model"]


    with mlflow.start_run(run_name=model_name):
        mlflow.log_params(best)
        mlflow.log_metrics({"accuracy": acc, "f1_score": f1, "roc_auc": auc})
        mlflow.sklearn.log_model(best_model, artifact_path="model")

    print(f"{model_name} -> Acc: {acc:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")
    return best_model


In [None]:
best_rf = run_experiment("RandomForest", objective_rf, space_rf, max_evals=20)
best_xgb = run_experiment("XGBoost", objective_xgb, space_xgb, max_evals=20)

In [34]:
from mlflow.tracking import MlflowClient

def collect_best_run(experiment_name="churn_prediction_experiments", metric="roc_auc"):
    client = MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    runs = client.search_runs(experiment.experiment_id, order_by=[f"metrics.{metric} DESC"], max_results=1)
    
    if not runs:
        raise ValueError("No runs found in experiment")
    
    best_run = runs[0]
    best_metrics = best_run.data.metrics
    run_id = best_run.info.run_id
    
    print(f"🏆 Best run ID: {run_id}, {metric}={best_metrics[metric]:.4f}")
    
    return run_id, best_metrics


In [35]:
def register_best_run(run_id, model_name="churn_model"):
    client = MlflowClient()
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri, model_name)
    print(f"✅ Registered best model (run {run_id}) as {model_name}")


In [36]:
best_run_id, best_metrics = collect_best_run(metric="roc_auc")
register_best_run(best_run_id, model_name="churn_model")


Traceback (most recent call last):
  File "c:\Anurag\loylty_rewardz\churn_prediction\venv\Lib\site-packages\mlflow\store\tracking\file_store.py", line 367, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Anurag\loylty_rewardz\churn_prediction\venv\Lib\site-packages\mlflow\store\tracking\file_store.py", line 465, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Anurag\loylty_rewardz\churn_prediction\venv\Lib\site-packages\mlflow\store\tracking\file_store.py", line 1635, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Anurag\loylty_rewardz\churn_prediction\venv\Lib\site-packages\mlflow\store\tracking\file_store.py", line 1628, in _read_helper
    result =

🏆 Best run ID: d376ead2fc864c45a5a91c6da8bb1a11, roc_auc=0.8689
✅ Registered best model (run d376ead2fc864c45a5a91c6da8bb1a11) as churn_model


Created version '1' of model 'churn_model'.
