# Customer Churn Analysis

In [None]:
# Shell escape: list the packages (and versions) installed in the environment
# that `pip` resolves to, so the run's dependencies are recorded in the output.
!pip freeze

In [None]:
%%capture
# Notebook setup cell: enables autoreload, imports every dependency, quiets
# warnings/logging and initialises Plotly. %%capture (which must stay on the
# first line) hides whatever this cell prints.
%load_ext autoreload
%autoreload 2

import sys 
# Add the `model_management` directory (relative to the working directory)
# to the module search path.
sys.path.append('model_management')

from model_management.sklearn_model import SklearnModel
import numpy as np
import pandas as pd
import h2o
from h2o.automl import H2OAutoML
from __future__ import print_function
import pandas_profiling

# Suppress unwanted warnings
import warnings
warnings.filterwarnings('ignore')
import logging
# Only WARNING and above is emitted by the "requests" logger
logging.getLogger("requests").setLevel(logging.WARNING)

# Load our favorite visualization library
import os
import plotly
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import cufflinks as cf
# Enable Plotly's offline notebook rendering
plotly.offline.init_notebook_mode(connected=True)

# Sign into Plotly. The API key is read from an environment variable so it
# never appears in the notebook; os.environ[...] raises KeyError if it is unset.

myPlotlyKey = os.environ['SECRET_ENV_BRETTS_PLOTLY_KEY']
py.sign_in(username='bretto777',api_key=myPlotlyKey)

### Load The Dataset

In [None]:
# Read the comma-separated churn dataset straight from S3 and preview the
# first five rows.
CHURN_CSV_URL = 'https://trifactapro.s3.amazonaws.com/churn.csv'
churnDF = pd.read_csv(CHURN_CSV_URL, sep=',')
churnDF.head(n=5)

In [None]:
#%%capture
#pandas_profiling.ProfileReport(churnDF)

In [None]:
# Recode the target and the two plan flags as 0/1.
churnDF["Churn"] = churnDF["Churn"].replace([True, False],[1,0])
churnDF["Int'l Plan"] = churnDF["Int'l Plan"].replace(["no","yes"],[0,1])
churnDF["VMail Plan"] = churnDF["VMail Plan"].replace(["no","yes"],[0,1])

# Remove the columns that are not used downstream. The frame is rebound
# (instead of drop(..., inplace=True)) and errors="ignore" is passed so the
# cell is idempotent: running it a second time no longer raises KeyError
# because the columns have already been dropped.
churnDF = churnDF.drop(["State", "Area Code", "Phone"], axis=1, errors="ignore")


## Train a Model

In [None]:
%%capture

# Split data into training and testing frames


h2o.init(nthreads=1, max_mem_size="768m")
from sklearn import cross_validation
from sklearn.model_selection import train_test_split

training, testing = train_test_split(churnDF, train_size=0.8, stratify=churnDF["Churn"], random_state=9)
x_train = training.drop(["Churn"], axis = 1)
y_train = training["Churn"]
x_test = testing.drop(["Churn"], axis = 1)
y_test = testing["Churn"]
# fit into H2O
train = h2o.H2OFrame(python_obj=training)
test = h2o.H2OFrame(python_obj=testing)
train["Churn"] = train["Churn"].asfactor()
test["Churn"] = test["Churn"].asfactor()
# Set predictor and response variables
y = "Churn"
x = train.columns
x.remove(y)

In [None]:
# Convert the pandas feature/target containers into NumPy arrays.
# np.asarray is used instead of `.values` so the cell can be re-run safely:
# an input that is already an ndarray is passed through unchanged, whereas
# `ndarray.values` raised AttributeError on the second execution.
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
x_test = np.asarray(x_test)
y_test = np.asarray(y_test)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Hyperparameters of the gradient-boosting baseline, kept in one place
gbm_params = dict(
    n_estimators=200,
    learning_rate=.07,
    max_depth=4,
    random_state=0,
)

# fit() returns the estimator itself, so `gbm` is the fitted model
gbm = GradientBoostingClassifier(**gbm_params).fit(x_train, y_train)


In [None]:

# Wrap the fitted GBM and the held-out test data in the project's SklearnModel.
# The description is generated from the estimator's own hyperparameters; the
# former hard-coded text ("275 trees, learning rate .04, max depth 5") did not
# match the model that is actually trained in this notebook.
model_gbm = SklearnModel(model=gbm,
                     problem_class='binary_classification',
                     description='GBM, {} trees, learning rate {}, max depth {}'.format(
                         gbm.n_estimators, gbm.learning_rate, gbm.max_depth),
                     name="GBM-4",
                     y_test = y_test,
                     x_test = x_test)

In [None]:
# Display the metrics reported by the SklearnModel wrapper for the GBM
# (presumably evaluated on the x_test / y_test it was built with — confirm
# in model_management.sklearn_model).
model_gbm.metrics()

In [None]:
# Persist the wrapped GBM through SklearnModel.save().
# NOTE(review): destination and format are defined in model_management and
# are not visible from this notebook.
model_gbm.save()

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with scikit-learn's default settings;
# fit() hands back the fitted estimator, which is what `lr` ends up bound to.
lr = LogisticRegression()
lr = lr.fit(x_train, y_train)

In [None]:
# Bundle the fitted logistic regression with the held-out test data in the
# project's SklearnModel wrapper.
lr_wrapper_kwargs = dict(
    model=lr,
    problem_class='binary_classification',
    description='sklearn logistic regression, default settings',
    name="LR-2",
    y_test=y_test,
    x_test=x_test,
)
model_LR = SklearnModel(**lr_wrapper_kwargs)

In [None]:
# Display the metrics reported by the SklearnModel wrapper for the logistic
# regression (presumably evaluated on the x_test / y_test it was built with —
# confirm in model_management.sklearn_model).
model_LR.metrics()

In [None]:
# Persist the wrapped logistic regression through SklearnModel.save().
# NOTE(review): destination and format are defined in model_management and
# are not visible from this notebook.
model_LR.save()

# Automatic Machine Learning

The Automatic Machine Learning (AutoML) function automates the supervised machine learning model training process. The current version of AutoML trains and cross-validates a Random Forest, an Extremely-Randomized Forest, a random grid of Gradient Boosting Machines (GBMs), a random grid of Deep Neural Nets, and a Stacked Ensemble of all the models.

In [None]:
%%capture
# Run AutoML until 11 models are built
autoModel = H2OAutoML(max_models = 9)
autoModel.train(x = x, y = y,
          training_frame = train,
          validation_frame = test, 
          leaderboard_frame = test)


## Leaderboard

In [None]:
# Keep a handle on the AutoML leaderboard (later cells index it by row) and
# display it by ending the cell with the bare name.
leaders = autoModel.leaderboard
leaders

# Variable Importances
Below we plot variable importances as reported by the best performing algo in the ensemble.

In [None]:
# Pick the best-ranked *base* learner for the importances. The previous
# hard-coded `leaders[2, 0]` assumed that exactly two Stacked Ensembles lead
# the leaderboard; ensembles typically do not report variable importances, so
# any other ordering either failed or charted a model that was not the best.
leader_ids = leaders[:, 0].as_data_frame(use_pandas=True).iloc[:, 0].astype(str).tolist()
base_ids = [model_id for model_id in leader_ids if 'StackedEnsemble' not in model_id]
# Fall back to the original row if no base learner can be identified
best_base_id = base_ids[0] if base_ids else leaders[2, 0]

importances = h2o.get_model(best_base_id).varimp(use_pandas=True)
# One row per variable holding its (mean) relative importance
importances = importances.loc[:,['variable','relative_importance']].groupby('variable').mean()
# Bar chart, most important variable first
importances.sort_values(by="relative_importance", ascending=False).iplot(kind='bar', colors='#5AC4F2', theme='white')

In [None]:
import matplotlib.pyplot as plt
plt.figure()

# Use the best-ranked base learner (first leaderboard entry that is not a
# Stacked Ensemble) instead of assuming it always sits in row 2.
leader_ids = leaders[:, 0].as_data_frame(use_pandas=True).iloc[:, 0].astype(str).tolist()
base_ids = [model_id for model_id in leader_ids if 'StackedEnsemble' not in model_id]
bestModel = h2o.get_model(base_ids[0] if base_ids else leaders[2, 0])

# Partial-dependence plots for three features on the test frame. The result
# gets its own name: it used to be assigned to `plt`, which replaced the
# matplotlib.pyplot module with this return value for every later cell.
partialPlots = bestModel.partial_plot(data=test, cols=["Day Mins","CustServ Calls","Day Charge"])


# Best Model vs the Base Learners
This plot shows the ROC curves for the best models

In [None]:
# Overlay the cross-validated ROC curves of the top leaderboard models.
# This replaces ten copy-pasted Model<N>/Model<N>Trace lines, which
#   * left Model 7 out of the figure,
#   * gave Model 1 a malformed colour string ('rgb(156, 190, 241))'), and
#   * raised when the leaderboard had fewer than ten rows.
N_ROC_MODELS = 10
rocModelCount = min(N_ROC_MODELS, leaders.nrows)

# Each entry is a 2-row array: [false positive rates, true positive rates]
rocCurves = [np.array(h2o.get_model(leaders[rank, 0]).roc(xval=True))
             for rank in range(rocModelCount)]

layout = go.Layout(autosize=False, width=725, height=575,  xaxis=dict(title='False Positive Rate', titlefont=dict(family='Arial, sans-serif', size=15, color='grey')),
                                                           yaxis=dict(title='True Positive Rate', titlefont=dict(family='Arial, sans-serif', size=15, color='grey')))

# Diagonal reference line: a classifier no better than chance
traceChanceLine = go.Scatter(x = [0,1], y = [0,1], mode = 'lines+markers', name = 'chance', line = dict(color = ('rgb(136, 140, 150)'), width = 4, dash = 'dash'))

# The leaderboard winner is drawn dark and thick, the others light and thin
LEADER_LINE = dict(color = 'rgb(26, 58, 126)', width = 3)
OTHER_LINE = dict(color = 'rgb(156, 190, 241)', width = 1)
modelTraces = [
    go.Scatter(x = curve[0], y = curve[1], mode = 'lines',
               name = 'Model {}'.format(rank),
               line = dict(LEADER_LINE if rank == 0 else OTHER_LINE))
    for rank, curve in enumerate(rocCurves)
]

fig = go.Figure(data=modelTraces + [traceChanceLine], layout=layout)

py.iplot(fig)

# Confusion Matrix

In [None]:
# Cross-validated confusion matrix of the model in leaderboard row 1,
# converted to a pandas DataFrame (`cm` is reused by the business-impact cell).
# NOTE(review): other cells use rows 0 and 2 as "the best model" — confirm
# that row 1 is the intended model here.
cm_table = h2o.get_model(leaders[1, 0]).confusion_matrix(xval=True).table
cm = cm_table.as_data_frame()

# Render the matrix as a Plotly table with a fixed size and font
confusionMatrix = ff.create_table(cm)
confusionMatrix.layout.font.size=17
confusionMatrix.layout.width=800
confusionMatrix.layout.height=300
py.iplot(confusionMatrix)

# Business Impact Matrix

Weighting Predictions With a Dollar Value
-   Correctly predicting retain: `+$5`
-   Correctly predicting churn: `+$75`
-   Incorrectly predicting retain: `-$150`
-   Incorrectly predicting churn: `-$1.5`

    

In [None]:
# Weight each confusion-matrix cell with a dollar value.
# Layout of `cm` (H2O "Act/Pred" table): the row is the ACTUAL class, the
# '0' / '1' column is the PREDICTED class, with 0 = retain and 1 = churn.
#
# Fixes compared with the previous version:
#   * cm4 multiplied IncorrectPredictRetain instead of CorrectPredictRetain;
#   * the two "incorrect" look-ups were swapped, so the -150 penalty was
#     applied to false alarms and the small penalty to missed churners.

# Actual churn, predicted churn
CorrectPredictChurn = cm.loc[1,'1']
CorrectPredictChurnImpact = 75
cm1 = CorrectPredictChurn*CorrectPredictChurnImpact

# Actual retain, predicted churn (false alarm)
# NOTE(review): the markdown above this cell lists -$1.5 for this case while
# the code uses -5; the code value is kept — confirm which one is intended.
IncorrectPredictChurn = cm.loc[0,'1']
IncorrectPredictChurnImpact = -5
cm2 = IncorrectPredictChurn*IncorrectPredictChurnImpact

# Actual churn, predicted retain (missed churner)
IncorrectPredictRetain = cm.loc[1,'0']
IncorrectPredictRetainImpact = -150
cm3 = IncorrectPredictRetain*IncorrectPredictRetainImpact

# Actual retain, predicted retain
CorrectPredictRetain = cm.loc[0,'0']
CorrectPredictRetainImpact = 5
cm4 = CorrectPredictRetain*CorrectPredictRetainImpact


# Rows are the actual outcome, columns the predicted outcome
data_matrix = [['Business Impact', '($) Predicted Churn', '($) Predicted Retain', '($) Total'],
               ['($) Actual Churn', cm1, cm3, '' ],
               ['($) Actual Retain', cm2, cm4, ''],
               ['($) Total', cm1+cm2, cm3+cm4, cm1+cm2+cm3+cm4]]

impactMatrix = ff.create_table(data_matrix, height_constant=20, hoverinfo='weight')
impactMatrix.layout.height=300
impactMatrix.layout.width=800
impactMatrix.layout.font.size=17
py.iplot(impactMatrix)

In [None]:
# Count the evaluated customers from the confusion matrix itself (sum of its
# four actual/predicted cells) instead of printing a hard-coded 2132 that goes
# stale whenever the data, the split or the model changes.
totalCustomers = int(cm.loc[[0, 1], ['0', '1']].astype(float).values.sum())
print("Total customers evaluated: " + str(totalCustomers))

In [None]:
# Net dollar impact: the sum of the four weighted confusion-matrix cells
totalValue = cm1 + cm2 + cm3 + cm4
print("Total value created by the model: $" + str(totalValue))

In [None]:
# Divide by the number of customers actually present in the confusion matrix
# (sum of its four actual/predicted cells) rather than a hard-coded 2132.
totalCustomers = int(cm.loc[[0, 1], ['0', '1']].astype(float).values.sum())
print("Total value per customer: $" +str(round(((cm1+cm2+cm3+cm4)/totalCustomers),3)))

In [None]:
#%%capture
# Save the best model

#path = h2o.save_model(model=h2o.get_model(leaders[0, 0]), force=True)
#os.rename(h2o.get_model(leaders[0, 0]).model_id, "AutoML-leader")    

In [None]:
#%%capture
#LoadedEnsemble = h2o.load_model(path="AutoML-leader")
#print(LoadedEnsemble)