### Predicting Customer Churn

### Environment Setup

In [None]:
# Run once to install the wget package in your runtime environment (comment out the line below afterwards)
!pip install wget

In [None]:
# Run once to install the pandas_profiling package in your runtime environment (comment out the line below afterwards)
!pip install pandas_profiling

In [None]:
# Run once to install the sklearn-pandas package in your runtime environment (comment out the line below afterwards)
!pip install sklearn-pandas

In [None]:
# Run once to install or upgrade the watson-machine-learning-client package in your runtime environment (comment out the line below afterwards)
!pip install watson-machine-learning-client --upgrade


In [None]:
import wget
import pandas as pd
import numpy as np
import pandas_profiling
import sklearn.pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, LabelBinarizer, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_curve, roc_auc_score
from sklearn_pandas import DataFrameMapper
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import json
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline


### Step 1: Load data 

#### 1.1: Download the data files

In [None]:
# download data from GitHub repository

url_churn='https://raw.githubusercontent.com/SidneyPhoon/Data/master/churn.csv'

url_customer='https://raw.githubusercontent.com/SidneyPhoon/Data/master/customer-profile.csv'

#remove existing files before downloading
!rm -f churn.csv
!rm -f customer-profile.csv

churnFilename=wget.download(url_churn)
customerFilename=wget.download(url_customer)

#list existing files
!ls -l churn.csv
!ls -l customer-profile.csv

In [None]:
# Load each downloaded CSV file into its own DataFrame
churn_csv, customer_csv = 'churn.csv', 'customer-profile.csv'
customer_churn = pd.read_csv(churn_csv)
customer = pd.read_csv(customer_csv)

### Step 2: Merge Files

In [None]:
# Join the customer profiles (left) with their churn records (right) on the shared ID column
data = customer.merge(customer_churn, on='ID')

### Step 3: Rename some columns
This step removes spaces from column names. It is an example of the data preparation you may want to do before creating a model.

In [None]:
data.columns

In [None]:
# Replace the column names that contain spaces with space-free equivalents
space_free_names = {'Est Income': 'EstIncome', 'Car Owner': 'CarOwner'}
data = data.rename(columns=space_free_names)

In [None]:
data.head()

In [None]:
data.shape

### Step 4: Data understanding

In [None]:
data.describe()

In [None]:
#pandas_profiling.ProfileReport(data)

### Step 5: Build the sklearn pipeline and the Random Forest model


In [None]:
# Model inputs: every column except the record identifier and the target
X = data.drop(columns=['ID', 'CHURN'])

In [None]:
# Encode the CHURN target as integers between 0 and n_classes-1 (that is, T/F become 1/0)
le = LabelEncoder()
le.fit(data['CHURN'])
y = le.transform(data['CHURN'])

In [None]:
# Show which original label each encoded integer stands for.
# The classes learned by the encoder are listed in code order, so iterating
# over them works for any number of classes instead of assuming exactly two.
label_mapping = le.classes_
for code, label in enumerate(label_mapping):
    print('{}: '.format(code), label)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)

#### Use the DataFrameMapper class to declare transformations and variable imputations.

* LabelBinarizer - Converts a categorical variable into a dummy variable (aka binary variable)
* StandardScaler - Standardize features by removing the mean and scaling to unit variance, z = (x - u) / s

See docs: 
* https://github.com/scikit-learn-contrib/sklearn-pandas
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler
* https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html#sklearn.preprocessing.LabelBinarizer
* https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

In [None]:

# Columns that are binarized (categorical) and columns that are standardized (numeric)
categorical_columns = ['Gender', 'Status', 'CarOwner', 'Paymethod', 'MembershipPlan']
numeric_columns = ['Children', 'EstIncome', 'Age', 'AvgMonthlySpend', 'CustomerSupportCalls']

# Each column gets its own transformer instance; default=False drops any column not listed here
binarized = [([column], LabelBinarizer()) for column in categorical_columns]
standardized = [([column], StandardScaler()) for column in numeric_columns]
mapper_good = DataFrameMapper(binarized + standardized, default=False)


In [None]:
# Classifier with a fixed seed
random_forest = RandomForestClassifier(random_state=5)

# The pipeline applies the mapper transformations first and then the classifier
steps = [
    ('mapper', mapper_good),
    ('RandonForestClassifier', random_forest),
]
pipeline = sklearn.pipeline.Pipeline(steps=steps)

# Train in place; fit() returns the pipeline itself, so `model` and `pipeline` are the same object
pipeline.fit(X_train, y_train)
model = pipeline

model

In [None]:
# Display the label mapping to assist with interpretation of the model.
# The classes learned by the encoder are listed in code order, so iterating
# over them works for any number of classes instead of assuming exactly two.
label_mapping = le.classes_
for code, label in enumerate(label_mapping):
    print('{}: '.format(code), label)

In [None]:
# Predict churn for the held-out test set with the trained pipeline
y_prediction = pipeline.predict(X_test)

# Build the per-class precision/recall/F1 summary with sklearn.metrics.classification_report() and show it
report = sklearn.metrics.classification_report(y_true=y_test, y_pred=y_prediction)
print(report)

###  Step 6:  Tune the model to find the best model

In [None]:
# List keys to the model param to tune
#model.get_params().keys()

In [None]:
# Grid of hyperparameters to search. Each key is "<pipeline step name>__<parameter>",
# so the prefix must match the classifier's step name in the pipeline exactly.
parameters = dict(
    RandonForestClassifier__max_depth=[5, 8, 10],
    RandonForestClassifier__n_estimators=[150, 180, 200],
)

In [None]:
# Search every combination in `parameters` with 3-fold cross-validation.
# No `scoring` argument is passed, so candidates are ranked by the pipeline's own score method.
grid_obj = GridSearchCV(estimator=model, param_grid=parameters,  cv=3)

In [None]:
# Fit the grid search object to the training data to find the optimal parameters
grid_fit = grid_obj.fit(X_train,y_train)


In [None]:
# Get the best estimator found by the grid search (a pipeline, since a pipeline was searched)
best_clf = grid_fit.best_estimator_

In [None]:
# Predict on the held-out test set with the tuned estimator
best_predictions = best_clf.predict(X_test)

In [None]:
# Classification report for the tuned estimator, for comparison with `report` of the default model
best_predictions_report = sklearn.metrics.classification_report( y_test, best_predictions )

In [None]:
print('Results of best fitted model: \n\n',best_predictions_report)

In [None]:
print('Results of default model: \n\n',report)

In [None]:
# Take the 'mapper' step of the default pipeline (not of the tuned best_clf)
m_step=pipeline.named_steps['mapper']

In [None]:
m_step.transformed_names_

In [None]:
# Names of the columns produced by the mapper; used as the feature labels in the importance plot
features = m_step.transformed_names_

In [None]:
# Get the feature importances of the whole forest.
# Indexing the classifier (e.g. `[1]`) returns a single decision tree of the
# ensemble, so `feature_importances_` is read from the RandomForestClassifier itself.
importances = pipeline.named_steps['RandonForestClassifier'].feature_importances_
# Positions that sort the importances in ascending order (used to order the bars of the plot)
indices = np.argsort(importances)

In [None]:
# Horizontal bar chart of the importances, ordered by `indices`
bar_positions = range(len(indices))
feature_labels = np.array(features)[indices]

plt.figure(1)
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, feature_labels)
plt.title('Feature Importances')
plt.xlabel('Relative Importance')

### Step 7: Save Model in the Project and WML Deployment Space


Watson Machine Learning provides deployment spaces where the user can save, configure and deploy their models. We can also save the model in the project and then promote the model to the deployment space.  We will perform both operations in the code cells below.

First, we will check if an existing deployment space is already associated with this project and set the associated deployment space as the default space.  If this project is not yet associated with a deployment space, we will create a deployment space.

The steps involved for saving and deploying the model into the deployment space are as follows:

1. If a deployment space is already associated with this project, retrieve the SPACE_ID and space details, otherwise, create a new deployment space. 
2. Set the deployment space as the default space.
3. Store the model pipeline in the deployment space. Enter the name for the model in the cell below. Specify a tag for the model in the cell below.
4. Deploy the saved model. Enter the deployment name in the cell below. Specify a tag for the deployment. Similarly, this tag will be used in the future to identify this deployment.
5. Retrieve the scoring endpoint to score the model with a payload
6. We will use the watson_machine_learning_client package to complete these steps. 


In [None]:
import os
from watson_machine_learning_client import WatsonMachineLearningAPIClient

# The access token and the service URL are read from the notebook's runtime environment
token = os.environ['USER_ACCESS_TOKEN']
wml_credentials = dict(
    token=token,
    instance_id="wml_local",
    url=os.environ['RUNTIME_ENV_APSX_URL'],
    version="2.5.0",
)

# Client used for all repository, space and deployment calls below
client = WatsonMachineLearningAPIClient(wml_credentials)

In [None]:
# Name and tag under which the model is saved. The same values are used for the
# copy stored in the project and for the copy stored in the deployment space.

model_name = 'customer_churn_model_1211'
model_tag = 'customer_churn_model_tag_1211'


### Store the model in the project

In [None]:
# Read the project ID from the PROJECT_ID environment variable (a KeyError is raised
# if it is not set) and make that project the client's default, so that the model
# is saved to the project
project_id = os.environ['PROJECT_ID']
client.set.default_project(project_id)

In [None]:
# Describe the model for the repository: display name, framework type, runtime and tag
model_meta = client.repository.ModelMetaNames
metadata = {
    model_meta.NAME: model_name,
    model_meta.TYPE: "scikit-learn_0.20",
    model_meta.RUNTIME_UID: "scikit-learn_0.20-py3",
    model_meta.TAGS: [{'value': model_tag}],
}

# Save the default pipeline together with the data it was trained on
stored_model_details = client.repository.store_model(
    pipeline,
    meta_props=metadata,
    training_data=X_train,
    training_target=y_train,
)

### Deployment Space

Use an existing deployment space that is already associated with this project, or create a new deployment space if there is no associated deployment space

In [None]:
# Look up the deployment space that is already associated with the project, if any.
# A missing variable (None) and the literal text 'None' both count as "no space".
space_id = os.getenv('SPACE_ID')
if space_id not in (None, 'None'):
    associated_space = client.spaces.get_details(space_id)
    space_name = associated_space['entity']['name']

In [None]:
space_id

In [None]:
# Obtain the UId of your space
#def guid_from_space_name(client, space_name):
#    instance_details = client.service_instance.get_details()
#    space = client.spaces.get_details()
#    return(next(item for item in space['resources'] if item['entity']["name"] == space_name)['metadata']['guid'])

In [None]:
# If your project is already associated with a space but the code cell above that displays space_id does not return anything,
# uncomment the code below and enter your deployment space name. I have seen this problem before: a newly created project
# with a space associated with it does not always have the environment variable 'SPACE_ID'.


# Enter the name of your deployment space here:
#space_uid = guid_from_space_name(client, 'YOUR DEPLOYMENT SPACE')
#print("Space UID = " + space_uid)

#### <font color='red'>Action required:</font> If this project is not already associated with a "_Deployment Space_", specify values for the space_name and space_tag in the code cell below

In [None]:
if space_id not in (None, 'None'):
    # A space is already associated with the project: retrieve its details
    stored_space_details = client.spaces.get_details(space_id)
else:
    # No associated space: create one from the name and tag entered here
    space_name = 'XXXXX '  # e.g deployment-space-sidneyp-sandbox 
    space_tag =  'XXXXX'   # e.g deployment-space-tag-sidneyp-sandbox

    space_meta_names = client.spaces.ConfigurationMetaNames
    space_meta_data = {
        space_meta_names.NAME: space_name,
        space_meta_names.TAGS: [{'value': space_tag}],
    }
    stored_space_details = client.spaces.store(space_meta_data)

# In both cases, make the space the default one for the calls that follow
space_uid = stored_space_details['metadata']['guid']
client.set.default_space(space_uid)
    


### Store the model in the deployment space

In [None]:
# run this line if you do not know the version of scikit-learn that was used to build the model
!pip list | grep scikit-learn

In [None]:
# Same model description as before, plus the deployment space that receives the model
model_meta = client.repository.ModelMetaNames
metadata = {
    model_meta.NAME: model_name,
    model_meta.TYPE: "scikit-learn_0.20",
    model_meta.RUNTIME_UID: "scikit-learn_0.20-py3",
    model_meta.TAGS: [{'value': model_tag}],
    model_meta.SPACE_UID: space_uid,
}

# Save the default pipeline together with the data it was trained on
stored_model_details = client.repository.store_model(
    pipeline,
    meta_props=metadata,
    training_data=X_train,
    training_target=y_train,
)


In [None]:
stored_model_details

### Create a deployment for the stored model

In [None]:
# Name and tag given to the deployment that is created in the next cell

deployment_name = 'customer_churn_model-deployment_1211'
deployment_tag = 'customer_churn_deployment_tag_1211'

In [None]:
# Configuration of an online deployment carrying the chosen name and tag
deployment_meta = client.deployments.ConfigurationMetaNames
meta_props = {
    deployment_meta.NAME: deployment_name,
    deployment_meta.TAGS: [{'value': deployment_tag}],
    deployment_meta.ONLINE: {},
}

# Deploy the model referenced by the most recent store_model() result
model_uid = stored_model_details["metadata"]["guid"]
deployment_details = client.deployments.create(
    artifact_uid=model_uid,
    meta_props=meta_props,
)

### Score the model

In [None]:
# Retrieve the scoring endpoint (URL) of the deployment created above and display it
scoring_endpoint = client.deployments.get_scoring_href(deployment_details)

print('Scoring Endpoint:   ',scoring_endpoint)

In [None]:
# Extract the deployment's ID (needed for scoring below) and display the deployment details
scoring_deployment_id = client.deployments.get_uid(deployment_details)
client.deployments.get_details(scoring_deployment_id)

In [None]:

# Alternative payload format that also names the fields (kept for reference):
# payload_scoring = {"input_data": [{"fields": ["Gender", "Status", "Children", "EstIncome", "CarOwner", "Age", "AvgMonthlySpend", "CustomerSupportCalls", "Paymethod", "MembershipPlan"], "values": [["M","S",2.0,25000,"Y",25,10,1,"CC",1], ["S","S",2.0,25000,"Y",25,10,1,"CC",1]]}]}

# One record to score, given as bare values without field names.
# NOTE(review): the values presumably follow the field order listed above - confirm against the training columns.
payload_scoring = [{"values": [ ["M","S",2.0,25000,"Y",25,10,1,"CC",1]]}]


In [None]:
# Wrap the payload under the INPUT_DATA key of the scoring request
payload_metadata = {client.deployments.ScoringMetaNames.INPUT_DATA: payload_scoring}
# Send the request to the online deployment and display the returned predictions
predictions = client.deployments.score(scoring_deployment_id, payload_metadata)
predictions

In [None]:
# Take the first value of the first scored row and map the encoded label back to its original class
first_result = predictions.get('predictions')[0]
first_row = first_result.get('values')[0]
predicted_value = first_row[0]
le.inverse_transform([predicted_value])

#### <font color='red'>Action required:</font> 
To see your deployed models, go to your project **Settings** -> **Associated deployment space**.  If this project is not yet associated with a deployment space, associate it with the newly created deployment space.  Click into the associated deployment space to see the deployed model.

#### Write test data into csv file for batch scoring

In [None]:
# Write the test data to a .csv file so that it can be used later for batch scoring.
# The comma separator is the to_csv default; the index is left out of the file.
batch_scoring_path = '/project_data/data_asset/new_customers.csv'
X_test.to_csv(batch_scoring_path, index=False)

**Author:**  Sidney Phoon <br/>
**Date:**  Dec 5th, 2019