# Introduction - Customer Churn Prediction notebook
In this notebook, we illustrate how you can train a model for Churn Prediction using scikit learn. After training the model, you step through the instructions to deploy the model using Watson Machine Learning.

This notebook is a variation of the original notebook reference in this github repo: https://github.com/elenalowery/cpd4_demo/blob/master/assets/jupyterlab/Predict_Customer_Churn_CPD4.ipynb


In [62]:
# Install required Python modules
!pip install sklearn-pandas > /dev/null


## Step 1: Review Use Case
The analytics use case implemented in this notebook is telco churn prediction. It is a simple use case which illustrates typical process for model development and deployment using Cloud Pak for Data.

In [63]:
import subprocess
CURRENT_BRANCH = subprocess.run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'], stdout=subprocess.PIPE)\
    .stdout.strip().decode("utf-8")

if CURRENT_BRANCH in ['prd','uat']:
    CURRENT_ENV=CURRENT_BRANCH
else:
    CURRENT_ENV='dev'
    
print('Current branch     : {}'.format(CURRENT_BRANCH))
print('Current environment: {}'.format(CURRENT_ENV))

Current branch     : optimize-churn-model
Current environment: dev


In [64]:
import pandas as pd
customer_data_df=pd.read_csv('/userfs/assets/data_asset/CUSTOMER_DATA_ready-'+CURRENT_ENV+'.csv')
customer_data_df.head(10)

Unnamed: 0,ID,LONGDISTANCE,INTERNATIONAL,LOCAL,DROPPED,PAYMETHOD,LOCALBILLTYPE,LONGDISTANCEBILLTYPE,USAGE,RATEPLAN,GENDER,STATUS,CHILDREN,ESTINCOME,CAROWNER,AGE,CREDITCARD,DOB,ADDRESS_1,CITY,STATE,ZIP,ZIP4,LONGITUDE,LATITUDE,CHURN
0,1,23,0,206,0,CC,Budget,Intnl_discount,229,3,F,S,1,38000.0,N,24.393333,1814139000000000.0,32dad3590f2243b8709201348e1ae897,159 HUTTON ST BSMT A,ABSECON,NJ,8201,0,,,T
1,1004,28,0,60,0,Auto,FreeLocal,Standard,89,4,F,M,1,8073.11,N,46.0,6494422000000000.0,c643e317495168f62085716c81ec164d,1724 WHITEHAVEN,GLYNDON,MN,56547,0,,,F
2,1005,24,0,5,0,CH,Budget,Standard,29,4,M,M,0,95448.6,Y,53.68,3218720000000000.0,80c40ce517ca57e0919e238e0e29e75c,95 W 25TH ST APT 1,WAPPINGERS FALLS,NY,12590,1723,,,F
3,1006,28,0,97,0,CC,FreeLocal,Standard,125,1,M,S,1,24141.5,Y,17.006667,3016220000000000.0,df7b078f544b61f867ad0dc1fa51c046,66 KULLA DR,RICHLAND,NE,68601,0,-97.377539,41.441233,T
4,1008,0,0,4,2,CC,Budget,Standard,4,2,M,S,1,31952.0,N,34.266667,7070216000000000.0,273a525adc7bb0bd49252e47dab190e9,5621 MCCARTY RD,EVERETT,WA,98205,0,,,F
5,1009,29,0,9,0,CC,Budget,Intnl_discount,38,2,M,S,2,72084.7,N,55.64,4919386000000000.0,efb18ce1ef44f169687df57e9b9fdf53,2000 CALLE 4,CAROLINA,PR,979,0,,,F
6,1010,13,0,40,0,CC,Budget,Standard,53,4,F,S,0,42760.5,N,47.0,9402648000000000.0,227f74a0e2d7b254a9c73ec61528ee94,3801 YOSEMITE BLVD STE F,HOUSTON,TX,77024,7776,,,F
7,1016,16,0,114,0,CH,Budget,Standard,130,1,M,M,1,71472.9,N,41.913333,8522563000000000.0,92e4302092a290acd3bc1fb75ada5267,843 EUCLID ST APT 101S,KIRKLAND,WA,98034,0,-122.209175,47.709619,T
8,1017,7,0,6,0,CC,Budget,Standard,13,3,F,M,0,95405.7,N,48.0,2981966000000000.0,32bd821d9a01040a89f9a7d3766017ce,3801 MAC CV,NEW YORK,NY,10019,0,-73.990852,40.768196,F
9,1018,21,0,87,0,CC,Budget,Standard,108,1,F,S,0,95786.8,Y,52.646667,3074091000000000.0,e78d37c276f03bdfa0eef28dc18f9c3a,390 W BROADWAY ST,BUTLER,NJ,7405,0,,,F


In [65]:
# COPY the dataFrame into a new dataFrame called *data*
data=customer_data_df.copy()

In [66]:
# List all the columns
print(data.columns)

Index(['ID', 'LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED', 'PAYMETHOD',
       'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'USAGE', 'RATEPLAN', 'GENDER',
       'STATUS', 'CHILDREN', 'ESTINCOME', 'CAROWNER', 'AGE', 'CREDITCARD',
       'DOB', 'ADDRESS_1', 'CITY', 'STATE', 'ZIP', 'ZIP4', 'LONGITUDE',
       'LATITUDE', 'CHURN'],
      dtype='object')


In [67]:
# Keep only the columns that are relevant for churn prediction
data = data[['LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED', 'PAYMETHOD', 'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'USAGE', 'RATEPLAN', 'GENDER','STATUS', 'CHILDREN', 'ESTINCOME', 'CAROWNER', 'AGE', 'CHURN']]
data.head()


Unnamed: 0,LONGDISTANCE,INTERNATIONAL,LOCAL,DROPPED,PAYMETHOD,LOCALBILLTYPE,LONGDISTANCEBILLTYPE,USAGE,RATEPLAN,GENDER,STATUS,CHILDREN,ESTINCOME,CAROWNER,AGE,CHURN
0,23,0,206,0,CC,Budget,Intnl_discount,229,3,F,S,1,38000.0,N,24.393333,T
1,28,0,60,0,Auto,FreeLocal,Standard,89,4,F,M,1,8073.11,N,46.0,F
2,24,0,5,0,CH,Budget,Standard,29,4,M,M,0,95448.6,Y,53.68,F
3,28,0,97,0,CC,FreeLocal,Standard,125,1,M,S,1,24141.5,Y,17.006667,T
4,0,0,4,2,CC,Budget,Standard,4,2,M,S,1,31952.0,N,34.266667,F


## Step 2: Try the Random Forest model

In [68]:
import pandas as pd
import sklearn
pd.options.display.max_columns = 999

import warnings
warnings.filterwarnings('ignore')

from scipy.stats import chi2_contingency,ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_curve, roc_auc_score

import numpy as np

import urllib3, requests, json

In [69]:
#convert CHURN to 1/0
le = LabelEncoder()
data.loc[:,'CHURN']= le.fit_transform(data.loc[:,'CHURN'])
data.head()

Unnamed: 0,LONGDISTANCE,INTERNATIONAL,LOCAL,DROPPED,PAYMETHOD,LOCALBILLTYPE,LONGDISTANCEBILLTYPE,USAGE,RATEPLAN,GENDER,STATUS,CHILDREN,ESTINCOME,CAROWNER,AGE,CHURN
0,23,0,206,0,CC,Budget,Intnl_discount,229,3,F,S,1,38000.0,N,24.393333,1
1,28,0,60,0,Auto,FreeLocal,Standard,89,4,F,M,1,8073.11,N,46.0,0
2,24,0,5,0,CH,Budget,Standard,29,4,M,M,0,95448.6,Y,53.68,0
3,28,0,97,0,CC,FreeLocal,Standard,125,1,M,S,1,24141.5,Y,17.006667,1
4,0,0,4,2,CC,Budget,Standard,4,2,M,S,1,31952.0,N,34.266667,0


In [70]:
# define the label and features
y = np.float32(data.CHURN)
x = data.drop(['CHURN','RATEPLAN','GENDER','ESTINCOME','STATUS','AGE','USAGE'], axis = 1)

In [71]:
x.columns

Index(['LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED', 'PAYMETHOD',
       'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'CHILDREN', 'CAROWNER'],
      dtype='object')

In [72]:
# Apply the LabelEncoder to encode the input features in numeric form where applicable
from sklearn_pandas import DataFrameMapper

'''
mapper = DataFrameMapper(
    [('GENDER', LabelEncoder()),
     ('STATUS', LabelEncoder()),
     ('CHILDREN', None),
     ('ESTINCOME',None),
     ('CAROWNER', LabelEncoder()),
     ('AGE',None),
     ('LONGDISTANCE',None),
     ('INTERNATIONAL',None),
     ('LOCAL',None),
     ('DROPPED',None),
     ('PAYMETHOD',LabelEncoder()),
     ('LOCALBILLTYPE',LabelEncoder()),
     ('LONGDISTANCEBILLTYPE',LabelEncoder()),
     ('USAGE',None),
     ('RATEPLAN',None)
    ]
)
'''

mapper = DataFrameMapper(
    [
     ('CHILDREN', None),
     ('CAROWNER', LabelEncoder()),
     ('LONGDISTANCE',None),
     ('INTERNATIONAL',None),
     ('LOCAL',None),
     ('DROPPED',None),
     ('PAYMETHOD',LabelEncoder()),
     ('LOCALBILLTYPE',LabelEncoder()),
     ('LONGDISTANCEBILLTYPE',LabelEncoder())
    ]
)

In [73]:
# split the data to training and testing set
X_train, X_test, y_train, y_test = train_test_split(x, y, 
    test_size=0.2, 
    random_state=42, stratify=y)

In [74]:
# fit the model

import sklearn.pipeline
from sklearn.preprocessing import OneHotEncoder

random_forest = RandomForestClassifier()
steps = [('mapper', mapper),('RandonForestClassifier', random_forest)]
pipeline = sklearn.pipeline.Pipeline(steps)
model=pipeline.fit( X_train, y_train )
model

Pipeline(steps=[('mapper',
                 DataFrameMapper(drop_cols=[],
                                 features=[('CHILDREN', None),
                                           ('CAROWNER', LabelEncoder()),
                                           ('LONGDISTANCE', None),
                                           ('INTERNATIONAL', None),
                                           ('LOCAL', None), ('DROPPED', None),
                                           ('PAYMETHOD', LabelEncoder()),
                                           ('LOCALBILLTYPE', LabelEncoder()),
                                           ('LONGDISTANCEBILLTYPE',
                                            LabelEncoder())])),
                ('RandonForestClassifier', RandomForestClassifier())])

In [75]:
### call pipeline.predict() on your X_test data to make a set of test predictions
y_prediction = pipeline.predict( X_test )

### test your predictions using sklearn.classification_report()

report = sklearn.metrics.classification_report( y_test, y_prediction )
### and print the report
print(report)

              precision    recall  f1-score   support

         0.0       0.98      0.95      0.97       168
         1.0       0.93      0.97      0.95       115

    accuracy                           0.96       283
   macro avg       0.96      0.96      0.96       283
weighted avg       0.96      0.96      0.96       283



### Evaluate

Accuracy of the trained model is very good so we can now decide to deploy this model to be used by the applications.

## Step 3 - WML Deployment
In the next set of cells, we deploy the trained model using Watson Machine Learning into the space associated with the current environment.

In [76]:
import os
cpdtoken=os.environ['USER_ACCESS_TOKEN']
wml_credentials = {
"token": cpdtoken,
"instance_id" : "openshift",
"url": os.environ['RUNTIME_ENV_APSX_URL'],
"version": "4.0"
}

from ibm_watson_machine_learning import APIClient
client = APIClient(wml_credentials)

In [77]:
# Associate WML client with current project
project_id = os.environ['PROJECT_ID']
client.set.default_project(project_id)

'SUCCESS'

In [78]:
# Specify a name for the space being created, the saved model and the model deployment
space_name = 'churn-' + CURRENT_ENV
model_name = 'churn_pipeline'
deployment_name = 'churn_pipeline_deployment'

use_existing_space=True

In [79]:
from ibm_watson_machine_learning import APIClient
import os
import time

token = os.environ['USER_ACCESS_TOKEN']

wml_credentials = {
   "token": token,
   "instance_id" : "openshift",
   "url": os.environ['RUNTIME_ENV_APSX_URL'],
   "version": "4.0"
}

client = APIClient(wml_credentials)

### Create the deployment space if it doesn't exist yet

In [80]:
space_uid = ""
for space in client.spaces.get_details()['resources']:

    if space['entity']['name'] ==space_name:
        print("Deployment space with name",space_name,"already exists . .")
        space_uid=space['metadata']['id']
        client.set.default_space(space_uid)
        if(use_existing_space==False):

            for deployment in client.deployments.get_details()['resources']:
                print("Deleting deployment",deployment['entity']['name'], "in the space",)
                deployment_id=deployment['metadata']['id']
                client.deployments.delete(deployment_id)
            print("Deleting Space ",space_name,)
            client.spaces.delete(space_uid)
            time.sleep(5)
        else:
            print("Using the existing space")

if (space_uid == "" or use_existing_space == False):
    print("\nCreating a new deployment space -",space_name)
    # create the space and set it as default
    space_meta_data = {
        client.spaces.ConfigurationMetaNames.NAME : space_name

        }

    stored_space_details = client.spaces.store(space_meta_data)

    space_uid = stored_space_details['metadata']['id']

    client.set.default_space(space_uid)

Deployment space with name churn-dev already exists . .
Using the existing space


In [81]:
client.software_specifications.list()

-----------------------------  ------------------------------------  ----
NAME                           ASSET_ID                              TYPE
default_py3.6                  0062b8c9-8b7d-44a0-a9b9-46c416adcbd9  base
pytorch-onnx_1.3-py3.7-edt     069ea134-3346-5748-b513-49120e15d288  base
scikit-learn_0.20-py3.6        09c5a1d0-9c1e-4473-a344-eb7b665ff687  base
spark-mllib_3.0-scala_2.12     09f4cff0-90a7-5899-b9ed-1ef348aebdee  base
pytorch-onnx_rt22.1-py3.9      0b848dd4-e681-5599-be41-b5f6fccc6471  base
ai-function_0.1-py3.6          0cdb0f1e-5376-4f4d-92dd-da3b69aa9bda  base
shiny-r3.6                     0e6e79df-875e-4f24-8ae9-62dcc2148306  base
pytorch_1.1-py3.6              10ac12d6-6b30-4ccd-8392-3e922c096a92  base
runtime-22.1-py3.9             12b83a17-24d8-5082-900f-0ab31fbfd3cb  base
scikit-learn_0.22-py3.6        154010fa-5b3b-4ac1-82af-4d5ee5abbc85  base
pytorch-onnx_1.7-py3.8-edt     1b199910-c7d5-5af4-b8f1-e86b760f9779  base
default_r3.6                   1b70aec

In [82]:
software_spec_uid = client.software_specifications.get_uid_by_name('default_py3.7_opence')

metadata = {
    client.repository.ModelMetaNames.NAME: model_name,
    client.repository.ModelMetaNames.SOFTWARE_SPEC_UID: software_spec_uid,
    client.repository.ModelMetaNames.TYPE: "scikit-learn_0.23"
}

stored_model_details = client.repository.store_model(pipeline,
                                               meta_props=metadata,
                                               training_data=X_train,
                                               training_target=y_train)



In [83]:
# Confirm the model is stored in WML repository
client.repository.list_models()

------------------------------------  --------------  ------------------------  -----------------
ID                                    NAME            CREATED                   TYPE
5b2c7a73-71a7-48d9-adf8-5149d4284346  churn_pipeline  2022-05-16T05:08:35.002Z  scikit-learn_0.23
e8f77ad2-0680-480d-88d2-7ff3b544a7e9  churn_pipeline  2022-05-16T04:59:58.002Z  scikit-learn_0.23
------------------------------------  --------------  ------------------------  -----------------


In [84]:
stored_model_details

{'entity': {'hybrid_pipeline_software_specs': [],
  'label_column': 'l1',
  'software_spec': {'id': 'c2057dd4-f42c-5f77-a02f-72bdbd3282c9',
   'name': 'default_py3.7_opence'},
  'training_data_references': [{'connection': {'access_key_id': 'not_applicable',
     'endpoint_url': 'not_applicable',
     'secret_access_key': 'not_applicable'},
    'id': '1',
    'location': {},
    'schema': {'fields': [{'name': 'LONGDISTANCE', 'type': 'int64'},
      {'name': 'INTERNATIONAL', 'type': 'int64'},
      {'name': 'LOCAL', 'type': 'int64'},
      {'name': 'DROPPED', 'type': 'int64'},
      {'name': 'PAYMETHOD', 'type': 'object'},
      {'name': 'LOCALBILLTYPE', 'type': 'object'},
      {'name': 'LONGDISTANCEBILLTYPE', 'type': 'object'},
      {'name': 'CHILDREN', 'type': 'int64'},
      {'name': 'CAROWNER', 'type': 'object'}],
     'id': '1',
     'type': 'DataFrame'},
    'type': 'fs'}],
  'type': 'scikit-learn_0.23'},
 'metadata': {'created_at': '2022-05-16T05:08:35.735Z',
  'id': '5b2c7a73-7

In [85]:
# Deploy the model
deploy_metadata = {
    client.deployments.ConfigurationMetaNames.NAME: deployment_name,
    client.deployments.ConfigurationMetaNames.ONLINE: {}
}

published_model_uid = client.repository.get_model_uid(stored_model_details)
created_deployment = client.deployments.create(published_model_uid, meta_props=deploy_metadata)


This method is deprecated, please use get_model_id()


#######################################################################################

Synchronous deployment creation for uid: '5b2c7a73-71a7-48d9-adf8-5149d4284346' started

#######################################################################################


initializing
Note: Software specification default_py3.7_opence is deprecated. Use runtime-22.1-py3.9 software specification instead when saving a model. For details, see https://www.ibm.com/docs/en/cloud-paks/cp-data/4.0?topic=specifications-software-hardware-deployments.

ready


------------------------------------------------------------------------------------------------
Successfully finished deployment creation, deployment_uid='0f4e418a-b91e-4209-9c85-65414367ef74'
------------------------------------------------------------------------------------------------




In [86]:
deployment_uid = client.deployments.get_uid(created_deployment)
scoring_endpoint = client.deployments.get_scoring_href(created_deployment)
print(scoring_endpoint)

https://internal-nginx-svc.cpd-instance.svc.cluster.local:12443/ml/v4/deployments/0f4e418a-b91e-4209-9c85-65414367ef74/predictions


### Delete the old deployments of the previous model(s)

In [87]:
for deployment in client.deployments.get_details()['resources']:
    if (deployment['metadata']['name']==deployment_name) and (deployment['metadata']['id'] != deployment_uid):
        print('Deployment '+ deployment['metadata']['id'] + ' will be deleted')
        client.deployments.delete(deployment['metadata']['id'])

Deployment b70238e8-d813-4407-aedf-59a3a7b450a3 will be deleted


### Delete previous models

In [88]:
for model in client.repository.get_model_details()['resources']:
    if (model['metadata']['name']==model_name) and (model['metadata']['id'] != stored_model_details['metadata']['id']):
        print('Model '+ model['metadata']['id'] + ' will be deleted')
        client.repository.delete(model['metadata']['id'])

Model e8f77ad2-0680-480d-88d2-7ff3b544a7e9 will be deleted


### List remaining models and deployments

In [89]:
client.repository.list_models()
client.deployments.list()

------------------------------------  --------------  ------------------------  -----------------
ID                                    NAME            CREATED                   TYPE
5b2c7a73-71a7-48d9-adf8-5149d4284346  churn_pipeline  2022-05-16T05:08:35.002Z  scikit-learn_0.23
------------------------------------  --------------  ------------------------  -----------------
------------------------------------  -------------------------  -----  ------------------------
GUID                                  NAME                       STATE  CREATED
0f4e418a-b91e-4209-9c85-65414367ef74  churn_pipeline_deployment  ready  2022-05-16T05:08:38.868Z
------------------------------------  -------------------------  -----  ------------------------


### Run a test score of the newly deployed model

In [90]:
# Score the model on a test dataset
scoring_payload = {
    "input_data": [{
        'fields': ['LONGDISTANCE', 'INTERNATIONAL', 'LOCAL', 'DROPPED', 'PAYMETHOD', 
                   'LOCALBILLTYPE', 'LONGDISTANCEBILLTYPE', 'CHILDREN', 'CAROWNER'],
        'values': [[28,0,60,0,"Auto","FreeLocal","Standard",1,"N"]]}]
}


In [91]:
predictions = client.deployments.score(deployment_uid, scoring_payload)
print(json.dumps(predictions, indent=2))

ApiRequestFailure: Failure during scoring. (POST https://internal-nginx-svc.cpd-instance.svc.cluster.local:12443/ml/v4/deployments/0f4e418a-b91e-4209-9c85-65414367ef74/predictions?version=2021-06-24)
Status code: 400, body: {"trace": "f30432d678d847cfcda27cc325f51f4f", "errors": [{"code": "score_processing_failure", "message": "'DecisionTreeClassifier' object has no attribute 'n_features_'"}], "status_code": 400}
