In [15]:
# Pickled DataFrame that this notebook reads as its input.
DATA_PATH = "../data/processed/02_cleaned.pkl"

# Location of the MLflow tracking store and the experiment name used in it.
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_EXPERIMENT_NAME = "churn_rate_prediction"

# NOTE(review): the LOG_* names are not referenced in the cells visible here;
# presumably a scratch directory and file names for artifacts that get logged
# to MLflow -- confirm against the later cells.
LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

# NOTE(review): not referenced in the cells visible here; presumably where the
# balanced dataset is written at the end of the notebook -- confirm.
EXPORT_PATH = "../data/processed/03_balanced_data.pkl"

In [17]:
# Load packages
import pandas as pd

from pathlib import Path
import os
import pickle

from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import mlflow
from mlflow.tracking import MlflowClient

___
## Functions

In [31]:
def calculate_quality(ground_truth, predictions, metric_function, model_name):
    """
    Score one set of predictions with one metric.

    Input:
        ground_truth: the observed (true) target values
        predictions: the values predicted by the model
        metric_function: the metric score function, called as
            metric_function(ground_truth, predictions)
        model_name: label used as the index of the returned Series
    Output:
        A one-element pandas Series, indexed by model_name, holding the
        metric value rounded to 3 decimal places
    """
    score = round(metric_function(ground_truth, predictions), 3)

    return pd.Series([score], index=[model_name])

___
## Read data

In [19]:
# Load the pickled DataFrame produced by the previous processing step.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
df = pd.read_pickle(filepath_or_buffer=DATA_PATH)
# Preview a handful of random rows (unseeded, so the rows differ between runs).
df.sample(n=5)

Unnamed: 0,seniorcitizen,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
4787,0,0.0,4,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,54.7,235.05,0.0
6451,1,1.0,66,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,116.25,7862.25,0.0
1623,0,0.0,54,1.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,2.0,97.2,5129.45,0.0
6397,1,0.0,54,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,0.0,2.0,65.25,3529.95,0.0
3850,0,0.0,51,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,83.25,4089.45,0.0


___
## Split data for modeling

In [40]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# The target is selected with a list (df[['churn']]) so y_train / y_test stay
# one-column DataFrames rather than Series.
# NOTE(review): the split is not stratified on churn -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='churn'),
    df[['churn']],
    test_size=0.2,
    random_state=42,
)

In [23]:
# Report the size of the training split, then preview a few random rows of it.
print(X_train.shape)
X_train.sample(n=5)

(5600, 17)


Unnamed: 0,seniorcitizen,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges
5359,1,0.0,10,0.0,0.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,1.0,0.0,29.65,291.4
1286,0,0.0,1,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,0.0,3.0,74.1,74.1
5912,1,0.0,47,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,0.0,1.0,2.0,85.2,3969.35
3964,0,1.0,68,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.0,3.0,111.75,7511.3
4121,1,0.0,27,1.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,2.0,0.0,1.0,0.0,95.55,2510.2


___
## Initializing MLflow

In [24]:
# Point MLflow at the tracking store configured in MLFLOW_TRACKING_URI.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Built after set_tracking_uri and without an explicit URI -- presumably it
# picks up the URI set on the line above, so keep this order (TODO confirm).
client = MlflowClient()

In [25]:
# Create the experiment only when it is not already in the tracking store.
# An explicit existence check replaces the former bare `except:`, which reported
# every failure (I/O errors, interrupts, typos) as "experiment exists".
# Any real error raised by create_experiment now surfaces instead of being hidden.
if mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME) is None:
    mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME)
else:
    print(f'Experiment "{MLFLOW_EXPERIMENT_NAME}" exists at "{mlflow.get_tracking_uri()}"')

In [26]:
# Look the experiment up by name through the client and print it as a sanity
# check that the lookup succeeded.
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
print(exp)

<Experiment: artifact_location='../models/mlruns/0', experiment_id='0', lifecycle_stage='active', name='churn_rate_prediction', tags={}>


___
## Base model: Logistic Regression
**1. without balancing**

In [30]:
# Baseline classifier: min-max scale the features, then fit a logistic
# regression with default settings on the unbalanced training data.
clf = make_pipeline(MinMaxScaler(),
                    LogisticRegression())

# y_train is a one-column DataFrame, so y_train.values is a column vector of
# shape (n_samples, 1). The estimator expects a 1-D target; flattening it here
# avoids the data-conversion warning that fit() otherwise emits.
clf.fit(X_train.values, y_train.values.ravel())

  y = column_or_1d(y, warn=True)


Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('logisticregression', LogisticRegression())])

### Evaluate on training data:

In [32]:
# Score the fitted pipeline on the same rows it was trained on.
predictions = pd.DataFrame(clf.predict(X_train.values), columns=y_train.columns)

# One single-element Series per metric, joined side by side into a one-row
# table whose columns are the metric function names.
train_scores = pd.concat(
    {
        metric.__name__: calculate_quality(y_train, predictions, metric, "LogisticRegression")
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
LogisticRegression,0.807,0.657,0.547,0.597


In [33]:
# Column-wise mean of the score table. With a single model row, as here, this
# simply turns that row into a Series keyed by metric name.
mean_train_score = train_scores.mean(axis=0)
mean_train_score

accuracy_score     0.807
precision_score    0.657
recall_score       0.547
f1_score           0.597
dtype: float64

### Evaluate on testing data:

In [34]:
# Score the fitted pipeline on the held-out test rows.
predictions = pd.DataFrame(clf.predict(X_test.values), columns=y_test.columns)

# One single-element Series per metric, joined side by side into a one-row
# table whose columns are the metric function names.
test_scores = pd.concat(
    {
        metric.__name__: calculate_quality(y_test, predictions, metric, "LogisticRegression")
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)
test_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
LogisticRegression,0.797,0.671,0.495,0.57


In [35]:
# Column-wise mean of the score table. With a single model row, as here, this
# simply turns that row into a Series keyed by metric name.
mean_test_score = test_scores.mean(axis=0)
mean_test_score

accuracy_score     0.797
precision_score    0.671
recall_score       0.495
f1_score           0.570
dtype: float64