In [1]:
import os

# Data Manipulation
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Model Tunnning
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# MLflow
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("churn-prediction")

2022/08/21 23:18:22 INFO mlflow.tracking.fluent: Experiment with name 'churn-prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='churn-prediction', tags={}>

# Global Variables

In [2]:
RANDOM_STATE = 12354

# Read Data

In [5]:
# Paths
root = os.getcwd()
data_folder_path = os.path.join("data")
data_train_file_path = os.path.join(data_folder_path, "train_set.csv")
data_test_file_path = os.path.join(data_folder_path, "test_set.csv")

In [6]:
# Read Data
data_train = pd.read_csv(data_train_file_path)
data_test = pd.read_csv(data_test_file_path)
data_train["TotalCharges"] = data_train["TotalCharges"].replace(" ", "0").astype(float)
data_test["TotalCharges"] = data_test["TotalCharges"].replace(" ", "0").astype(float)
#data["Churn"] = data["Churn"].replace({"No": 0, "Yes": 1})

# Preproccesing

In [8]:
# Selecting Features and Target
y_target = "Churn"
X_features = [c for c in data_train.columns if c != y_target]

y_train = data_train[y_target].copy()
X_train = data_train[X_features].copy()
y_test = data_test[y_target].copy()
X_test = data_test[X_features].copy()

In [9]:
# Pipelines For Column Tranformation
numeric_columns = ['TotalCharges', 'tenure', 'MonthlyCharges']

categorical_columns = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
 'PaymentMethod', 'SeniorCitizen']

numeric_pipeline = "passthrough"

categorical_pipeline = Pipeline(steps =
                                [("encoder", OrdinalEncoder(handle_unknown = "use_encoded_value",
                                                            unknown_value = np.nan))])

column_trans = ColumnTransformer(
    [("num_columns", numeric_pipeline, numeric_columns),
     ("cat_columns", categorical_pipeline, categorical_columns)])

# Model Trainning

**Objectives:**

- We want to detect as churn

**Metric to be optimized:**

- **accuracy**


<kbd>
  <img src="images/confusion_matrix.ppm">
</kbd>

In [10]:
# Adding SMOTE and Random forest to the pipeline
rf_pipeline = Pipeline(steps = [
        ("transformations", column_trans),
        ("rf_estimator", RandomForestClassifier(n_estimators = 500, n_jobs = -1, random_state = RANDOM_STATE))
])

## Tracking a Single Experiment Run

In [11]:
with mlflow.start_run():
    mlflow.set_tag("developer", "Chris")
    mlflow.set_tag("model_type", "RandomForest")
    
    rf_params = {"max_features": 3,
                "max_depth": 4,
                "ccp_alpha": 0.1}
    mlflow.log_params(rf_params)
    rf_pipeline.named_steps["rf_estimator"].set_params(**rf_params)
    
    skfold = StratifiedKFold(n_splits = 5, random_state = RANDOM_STATE, shuffle = True)
    new_scores = cross_val_score(rf_pipeline, X_train, y_train, cv = skfold, scoring = "accuracy", n_jobs = -1)
    accuracy_mean = new_scores.mean()
    
    mlflow.log_metric("accuracy_mean", accuracy_mean)

## Tracking a Multiples Experiment Runs

In [23]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("developer", "Chris")
        mlflow.set_tag("model_type", "RandomForest")
        mlflow.set_tag("tunning_type", "tpe")
        
        mlflow.log_params(params)
        rf_pipeline.named_steps["rf_estimator"].set_params(**params)
        
        skfold = StratifiedKFold(n_splits = 5, random_state = RANDOM_STATE, shuffle = True)
        new_scores = cross_val_score(rf_pipeline, X_train, y_train, cv = skfold, scoring = "accuracy", n_jobs = -1)
        accuracy_mean = new_scores.mean()
        
        mlflow.log_metric("accuracy_mean", accuracy_mean)
        
    return {'loss': -accuracy_mean, 'status': STATUS_OK}

search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 10, 1)),
    "max_features": scope.int(hp.quniform('max_features', 1, 5, 1)),
    "ccp_alpha": hp.uniform('ccp_alpha', 0.00000001, 0.001),
}

best_result = fmin(
    fn = objective,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 20,
    trials = Trials(),
    rstate = np.random.default_rng(RANDOM_STATE)
)

100%|████████████████████████████████████| 20/20 [01:47<00:00,  5.37s/trial, best loss: -0.8015873613141826]


In [22]:
best_result

{'ccp_alpha': 0.0003595063999887283, 'max_depth': 8.0, 'max_features': 4.0}

## Best Model

In [25]:
with mlflow.start_run() as run:
    mlflow.set_tag("developer", "Chris")
    mlflow.set_tag("model_type", "RandomForest")
    
    best_params = {'ccp_alpha': best_result["ccp_alpha"], 
                   'max_depth': int(best_result["max_depth"]), 
                   'max_features': int(best_result["max_features"])}
    mlflow.log_params(rf_params)
    rf_pipeline.named_steps["rf_estimator"].set_params(**best_params)
    
    rf_pipeline.fit(X_train, y_train)
    
    y_pred = rf_pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precission = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precission", precission)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1", f1)
    
    mlflow.sklearn.log_model(rf_pipeline, artifact_path = "models_mlflow")
    BEST_RUN_ID = run.info.run_id

## Model Registry

**First we need to have the model id to be registered.**

In [27]:
# MLflow Client Instance
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

client = MlflowClient(tracking_uri = MLFLOW_TRACKING_URI)

In [28]:
# Lists Experiments
client.list_experiments()

[<Experiment: artifact_location='./mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='churn-prediction', tags={}>]

In [29]:
# See Some Experiments Accuracy
runs = client.search_runs(
    experiment_ids = '1',    # Experiment ID we want
    filter_string = "metrics.accuracy > 0.75",
    run_view_type = ViewType.ACTIVE_ONLY,
    max_results = 10,
    order_by = ["metrics.accuracy DESC"]
)

for run in runs:
    print(f"run id: {run.info.run_id}, accuracy: {run.data.metrics['accuracy']:.4f}")

run id: 66ea5b26e7e643e98aed14a30ce90c3d, accuracy: 0.8109
run id: a69ec5457f69406ca7b8b30cec1d9b17, accuracy: 0.8081
run id: d752185177cf4d489bb771fb3a85a325, accuracy: 0.8012


**Now we can register the model.**

In [30]:
BEST_RUN_ID

'a69ec5457f69406ca7b8b30cec1d9b17'

In [31]:
# Register the Model
model_uri = f"runs:/{BEST_RUN_ID}/models"
mlflow.register_model(model_uri = model_uri, name = "churn-classifier")

Successfully registered model 'churn-classifier'.
2022/08/21 23:37:31 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: churn-classifier, version 1
Created version '1' of model 'churn-classifier'.


<ModelVersion: creation_timestamp=1661125051759, current_stage='None', description=None, last_updated_timestamp=1661125051759, name='churn-classifier', run_id='a69ec5457f69406ca7b8b30cec1d9b17', run_link=None, source='./mlruns/1/a69ec5457f69406ca7b8b30cec1d9b17/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=1>

**Promote the model to staging.**

In [32]:
# Promote a model to staging
model_version = 1
new_stage = "Staging"
model_name = "churn-classifier"
client.transition_model_version_stage(
    name = model_name,
    version = model_version,
    stage = new_stage,
    archive_existing_versions = False
)

<ModelVersion: creation_timestamp=1661125051759, current_stage='Staging', description=None, last_updated_timestamp=1661125057284, name='churn-classifier', run_id='a69ec5457f69406ca7b8b30cec1d9b17', run_link=None, source='./mlruns/1/a69ec5457f69406ca7b8b30cec1d9b17/artifacts/models', status='READY', status_message=None, tags={}, user_id=None, version=1>