In [1]:
# Pickled input DataFrame, read with pd.read_pickle in the "Read data" section
DATA_PATH = "../data/processed/02_cleaned.pkl"

# MLflow tracking store and the experiment every run in this notebook is logged under
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_EXPERIMENT_NAME = "churn_rate_prediction"

# Staging directory passed to mlflow.log_artifacts, and the pickle file names
# written into it for each model run
LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

In [2]:
# Load packages
import pandas as pd

from pathlib import Path
import os
import pickle

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier 


import mlflow
from mlflow.tracking import MlflowClient

___
## Functions

In [3]:
def calculate_quality(ground_truth, predictions, metric_function, model_name):
    """
    Score a model's predictions with a single metric.

    Input:
        ground_truth: observed target values
        predictions: values predicted by the model
        metric_function: callable invoked as metric_function(ground_truth, predictions)
        model_name: label used as the index of the returned Series
    Output:
        A one-element pandas Series holding the metric value rounded to
        3 decimals, indexed by model_name
    """
    rounded_score = round(metric_function(ground_truth, predictions), 3)
    return pd.Series([rounded_score], index=[model_name])

___
## Read data

In [4]:
# Load the dataset stored at DATA_PATH (pickle: only load files you trust)
df = pd.read_pickle(DATA_PATH)
# Random preview; no seed is passed, so the rows shown differ between runs
df.sample(5)

Unnamed: 0,seniorcitizen,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
1556,0,0.0,1,1.0,1.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,0.0,0.0,0.0,91.7,91.7,1.0
123,0,1.0,7,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,50.7,350.35,0.0
5810,0,1.0,17,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,0.0,85.45,1451.6,1.0
1921,0,1.0,70,1.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,85.95,5931.75,0.0
90,0,1.0,30,1.0,1.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,0.0,0.0,2.0,82.05,2570.2,0.0


___
## Split data for modeling

In [5]:
# Hold out 20% of the rows for testing; the target is kept as a one-column
# DataFrame so later cells can reuse y_train.columns
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='churn'),
    df[['churn']],
    test_size=0.2,
    random_state=42,
)

In [6]:
# Quick check of the split data: feature matrix shapes plus a random preview
print("X_train shape", X_train.shape)
print("X_test shape", X_test.shape)
X_train.sample(5)

X_train shape (5600, 17)
X_test shape (1400, 17)


Unnamed: 0,seniorcitizen,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges
212,0,1.0,61,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,19.75,1124.2
86,0,0.0,35,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,62.15,2215.45
5362,0,1.0,65,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,3.0,24.75,1715.1
6084,0,0.0,36,1.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,0.0,1.0,2.0,95.0,3440.25
3671,0,0.0,1,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,75.1,75.1


In [7]:
# Quick check of the split data: class counts of the target in each split
print("y_train classes", y_train.value_counts(), "\n")
print("y_test classes", y_test.value_counts())

y_train classes churn
0.0      4134
1.0      1466
dtype: int64 

y_test classes churn
0.0      1020
1.0       380
dtype: int64


### Balancing data to test:

In [8]:
# Oversample the training split with SMOTE so both classes have equal counts;
# the test split is left untouched.
# NOTE(review): none of the model cells visible below are fitted on
# X_train_balance / y_train_balance; they all use X_train / y_train.
sm = SMOTE(random_state=42)
X_train_balance, y_train_balance = sm.fit_resample(X_train, y_train)

In [9]:
# Quick check of the SMOTE-balanced training target: class counts should match
print("y_train classes", y_train_balance.value_counts())

y_train classes churn
0.0      4134
1.0      4134
dtype: int64


___
## Initializing MLflow

In [10]:
# Point MLflow at the local tracking store, then create the client
# (created after set_tracking_uri so it uses that store)
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

In [11]:
# Create the experiment only when it is missing. Checking first, instead of a
# bare `except:`, keeps real failures (bad tracking URI, permission errors,
# KeyboardInterrupt) from being reported as "experiment exists".
if client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME) is None:
    mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME)
else:
    print(f'Experiment "{MLFLOW_EXPERIMENT_NAME}" exists at "{mlflow.get_tracking_uri()}"')

Experiment "churn_rate_prediction" exists at "../models/mlruns"


In [12]:
# Fetch the experiment record; exp.experiment_id is used by every start_run below
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
print(exp)

<Experiment: artifact_location='../models/mlruns/0', experiment_id='0', lifecycle_stage='active', name='churn_rate_prediction', tags={}>


___
## 1. Support Vector Machines (SVC)

In [13]:
# Min-max scale the features, then fit a default-parameter SVC
svc = make_pipeline(MinMaxScaler(), SVC())

# The target is a one-column DataFrame; flatten it to the 1-D array sklearn expects
svc.fit(X_train.values, y_train.values.ravel())

Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('svc', SVC())])

### Evaluate on training data:

In [14]:
# Score the fitted SVC pipeline on the rows it was trained on
predictions = pd.DataFrame(svc.predict(X_train.values), columns=y_train.columns)

# One single-row Series per metric, joined side by side into a one-row table
train_scores = pd.concat(
    {
        metric.__name__: calculate_quality(y_train, predictions, metric, "SVC")
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
SVC,0.815,0.7,0.512,0.592


In [15]:
# Column-wise mean of the score table; with a single model row this simply
# flattens the table into a Series indexed by metric name
mean_train_score = train_scores.mean()
mean_train_score

accuracy_score     0.815
precision_score    0.700
recall_score       0.512
f1_score           0.592
dtype: float64

### Evaluate on testing data:

In [16]:
# Score the fitted SVC pipeline on the held-out test set
predictions = pd.DataFrame(svc.predict(X_test.values), columns=y_test.columns)

# One single-row Series per metric, joined side by side into a one-row table
test_scores = pd.concat(
    {
        metric.__name__: calculate_quality(y_test, predictions, metric, "SVC")
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
test_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
SVC,0.798,0.693,0.458,0.552


In [17]:
# Column-wise mean of the test score table (one row, so the values are unchanged);
# this Series is what gets logged to MLflow as run metrics
mean_test_score = test_scores.mean()
mean_test_score

accuracy_score     0.798
precision_score    0.693
recall_score       0.458
f1_score           0.552
dtype: float64

### Log run

In [18]:
# Data details: source path, row indices of each split, and column names
data_details = {"data_path": DATA_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices": X_test.index.tolist(),
                "feature_names": X_train.columns.tolist(),
                "target_names": y_train.columns.tolist()}

# Make sure the staging directory exists so a fresh checkout does not fail in open()
os.makedirs(LOG_PATH, exist_ok=True)
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [19]:
# Model details: a description (also used as the MLflow run name below),
# the pipeline's text repr, and the fitted pipeline object itself
model = {"model_description": "Support Vector Classifier SVC",
         "model_details": str(svc),
         "model_object": svc}

# Overwrites any model pickle left in LOG_PATH by a previous run
with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [20]:
# Performance details: the train and test score tables computed above.
# NOTE(review): despite the variable name, these are classification metrics
# (accuracy / precision / recall / F1), not regression metrics.
regression_metrics = {"train_scores": train_scores,
                      "test_scores": test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(regression_metrics, output_file)

### Logging

In [21]:
# Start a new MLflow run named after the model description
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Upload everything currently in LOG_PATH (the data/model/metrics pickles written above)
    mlflow.log_artifacts(LOG_PATH)
    
    # Track only the test-set scores as run metrics
    for metric, score in mean_test_score.items():
        mlflow.log_metric(metric, score)

___
## 2. Random Forest Classifier:

In [49]:
# Scale features, then fit a 1000-tree forest capped at 30 leaves per tree.
# max_features='sqrt' is what the old 'auto' alias meant for classifiers;
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3, where it raises.
rf = make_pipeline(MinMaxScaler(),
                   RandomForestClassifier(n_estimators=1000,
                                          oob_score=True,
                                          n_jobs=1, random_state=42,
                                          max_features='sqrt', max_leaf_nodes=30))

# The target is a one-column DataFrame; flatten it to the 1-D array sklearn expects
rf.fit(X_train.values, (y_train.values).reshape(-1,))

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_leaf_nodes=30, n_estimators=1000,
                                        n_jobs=1, oob_score=True,
                                        random_state=42))])

### Evaluate on training data:

In [50]:
# Score the fitted random forest pipeline on the rows it was trained on
predictions = pd.DataFrame(rf.predict(X_train.values), columns=y_train.columns)

# One single-row Series per metric, joined side by side into a one-row table
train_scores = pd.concat(
    {
        metric.__name__: calculate_quality(y_train, predictions, metric, "RandomForestClassifier")
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
RandomForestClassifier,0.822,0.714,0.534,0.611


In [53]:
# Column-wise mean of the score table; with a single model row this simply
# flattens the table into a Series indexed by metric name
mean_train_score = train_scores.mean()
mean_train_score

accuracy_score     0.822
precision_score    0.714
recall_score       0.534
f1_score           0.611
dtype: float64

### Evaluate on testing data:

In [51]:
# Score the fitted random forest pipeline on the held-out test set
predictions = pd.DataFrame(rf.predict(X_test.values), columns=y_test.columns)

# One single-row Series per metric, joined side by side into a one-row table
test_scores = pd.concat(
    {
        metric.__name__: calculate_quality(y_test, predictions, metric, "RandomForestClassifier")
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
test_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
RandomForestClassifier,0.794,0.683,0.447,0.541


In [52]:
# Column-wise mean of the test score table (one row, so the values are unchanged);
# this Series is what gets logged to MLflow as run metrics
mean_test_score = test_scores.mean()
mean_test_score

accuracy_score     0.794
precision_score    0.683
recall_score       0.447
f1_score           0.541
dtype: float64

### Log run

In [54]:
# Data details: source path, row indices of each split, and column names
data_details = {"data_path": DATA_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices": X_test.index.tolist(),
                "feature_names": X_train.columns.tolist(),
                "target_names": y_train.columns.tolist()}

# Make sure the staging directory exists so a fresh checkout does not fail in open()
os.makedirs(LOG_PATH, exist_ok=True)
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [55]:
# Model details: a description (also used as the MLflow run name below),
# the pipeline's text repr, and the fitted pipeline object itself.
# NOTE(review): "Forrest" is misspelled; left as-is because it is the logged run name.
model = {"model_description": "Random Forrest Classifier",
         "model_details": str(rf),
         "model_object": rf}

# Overwrites the model pickle written for the previous model
with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [56]:
# Performance details: the train and test score tables computed above.
# NOTE(review): despite the variable name, these are classification metrics
# (accuracy / precision / recall / F1), not regression metrics.
regression_metrics = {"train_scores": train_scores,
                      "test_scores": test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(regression_metrics, output_file)

### Logging

In [57]:
# Start a new MLflow run named after the model description
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Upload everything currently in LOG_PATH (the data/model/metrics pickles written above)
    mlflow.log_artifacts(LOG_PATH)
    
    # Track only the test-set scores as run metrics
    for metric, score in mean_test_score.items():
        mlflow.log_metric(metric, score)

___
## 3. AdaBoost Classifier

In [31]:
# Scale features, then fit AdaBoost with its default base estimator.
# random_state is fixed (same seed as the split and the random forest) so the
# logged scores are reproducible across reruns.
adaboost = make_pipeline(MinMaxScaler(),
                         AdaBoostClassifier(random_state=42))

# The target is a one-column DataFrame; flatten it to the 1-D array sklearn expects
adaboost.fit(X_train.values, (y_train.values).reshape(-1,))

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('adaboostclassifier', AdaBoostClassifier())])

### Evaluate on training data:

In [32]:
# Score the fitted AdaBoost pipeline on the rows it was trained on
predictions = pd.DataFrame(adaboost.predict(X_train.values), columns=y_train.columns)

# One single-row Series per metric, joined side by side into a one-row table
train_scores = pd.concat(
    {
        metric.__name__: calculate_quality(y_train, predictions, metric, "AdaBoost")
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
AdaBoost,0.808,0.669,0.529,0.59


In [33]:
# Column-wise mean of the score table; with a single model row this simply
# flattens the table into a Series indexed by metric name
mean_train_score = train_scores.mean()
mean_train_score

accuracy_score     0.808
precision_score    0.669
recall_score       0.529
f1_score           0.590
dtype: float64

### Evaluate on testing data:

In [34]:
# Score the fitted AdaBoost pipeline on the held-out test set
predictions = pd.DataFrame(adaboost.predict(X_test.values), columns=y_test.columns)

# One single-row Series per metric, joined side by side into a one-row table
test_scores = pd.concat(
    {
        metric.__name__: calculate_quality(y_test, predictions, metric, "AdaBoost")
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
test_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
AdaBoost,0.799,0.687,0.474,0.561


In [35]:
# Column-wise mean of the test score table (one row, so the values are unchanged);
# this Series is what gets logged to MLflow as run metrics
mean_test_score = test_scores.mean()
mean_test_score

accuracy_score     0.799
precision_score    0.687
recall_score       0.474
f1_score           0.561
dtype: float64

### Log run

In [36]:
# Data details: source path, row indices of each split, and column names
data_details = {"data_path": DATA_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices": X_test.index.tolist(),
                "feature_names": X_train.columns.tolist(),
                "target_names": y_train.columns.tolist()}

# Make sure the staging directory exists so a fresh checkout does not fail in open()
os.makedirs(LOG_PATH, exist_ok=True)
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [37]:
# Model details: a description (also used as the MLflow run name below),
# the pipeline's text repr, and the fitted pipeline object itself
model = {"model_description": "Ada-Boost Classifier",
         "model_details": str(adaboost),
         "model_object": adaboost}

# Overwrites the model pickle written for the previous model
with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [38]:
# Performance details: the train and test score tables computed above.
# NOTE(review): despite the variable name, these are classification metrics
# (accuracy / precision / recall / F1), not regression metrics.
regression_metrics = {"train_scores": train_scores,
                      "test_scores": test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(regression_metrics, output_file)

### Logging

In [39]:
# Start a new MLflow run named after the model description
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Upload everything currently in LOG_PATH (the data/model/metrics pickles written above)
    mlflow.log_artifacts(LOG_PATH)
    
    # Track only the test-set scores as run metrics
    for metric, score in mean_test_score.items():
        mlflow.log_metric(metric, score)

___
## 4. XGBoost Classifier

In [40]:
# Min-max scale the features, then fit a default-parameter XGBoost classifier
xgboost = make_pipeline(MinMaxScaler(), XGBClassifier())

# The target is a one-column DataFrame; flatten it to the 1-D array the estimator expects
xgboost.fit(X_train.values, y_train.values.ravel())

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('xgbclassifier',
                 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               gamma=0, gpu_id=-1, grow_policy='depthwise',
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_bin=256,
                               max_cat_to_onehot=4, max_delta_step=0,
                               max_depth=6, max_leaves=0, min_child_weight=1,
                               missing=nan, monotone_constraints='()',
                               n_estimators=100, n_jobs=0, num_parallel_tree=1,
                               predictor='auto', random_state=0, reg_alpha=0,
    

### Evaluate on training data:

In [41]:
# Score the fitted XGBoost pipeline on the rows it was trained on
predictions = pd.DataFrame(xgboost.predict(X_train.values), columns=y_train.columns)

# One single-row Series per metric, joined side by side into a one-row table
train_scores = pd.concat(
    {
        metric.__name__: calculate_quality(y_train, predictions, metric, "XGBoost")
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
XGBoost,0.932,0.904,0.829,0.865


In [42]:
# Column-wise mean of the score table; with a single model row this simply
# flattens the table into a Series indexed by metric name
mean_train_score = train_scores.mean()
mean_train_score

accuracy_score     0.932
precision_score    0.904
recall_score       0.829
f1_score           0.865
dtype: float64

### Evaluate on testing data:

In [43]:
# Score the fitted XGBoost pipeline on the held-out test set
predictions = pd.DataFrame(xgboost.predict(X_test.values), columns=y_test.columns)

# One single-row Series per metric, joined side by side into a one-row table
test_scores = pd.concat(
    {
        metric.__name__: calculate_quality(y_test, predictions, metric, "XGBoost")
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
test_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
XGBoost,0.776,0.612,0.474,0.534


In [44]:
# Column-wise mean of the test score table (one row, so the values are unchanged);
# this Series is what gets logged to MLflow as run metrics
mean_test_score = test_scores.mean()
mean_test_score

accuracy_score     0.776
precision_score    0.612
recall_score       0.474
f1_score           0.534
dtype: float64

### Log run

In [45]:
# Data details: source path, row indices of each split, and column names
data_details = {"data_path": DATA_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices": X_test.index.tolist(),
                "feature_names": X_train.columns.tolist(),
                "target_names": y_train.columns.tolist()}

# Make sure the staging directory exists so a fresh checkout does not fail in open()
os.makedirs(LOG_PATH, exist_ok=True)
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [46]:
# Model details: a description (also used as the MLflow run name below),
# the pipeline's text repr, and the fitted pipeline object itself
model = {"model_description": "XGBoost Classifier",
         "model_details": str(xgboost),
         "model_object": xgboost}

# Overwrites the model pickle written for the previous model
with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [47]:
# Performance details: the train and test score tables computed above.
# NOTE(review): despite the variable name, these are classification metrics
# (accuracy / precision / recall / F1), not regression metrics.
regression_metrics = {"train_scores": train_scores,
                      "test_scores": test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(regression_metrics, output_file)

### Logging

In [48]:
# Start a new MLflow run named after the model description
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Upload everything currently in LOG_PATH (the data/model/metrics pickles written above)
    mlflow.log_artifacts(LOG_PATH)
    
    # Track only the test-set scores as run metrics
    for metric, score in mean_test_score.items():
        mlflow.log_metric(metric, score)