# Prediction on Customer Enrollment with Amazon SageMaker XGBoost 

Business problem: Predict whether a customer will enroll for a certificate of deposit product.

Labeled Data: Customer demographics (age, employment, type of job, education etc.), responses to marketing events (including past response), external factors (month, day of the week etc.) and whether the customer is enrolled.

---

## Step 1: Import libraries and define environment variables

In [16]:
# import libraries
import boto3, re, sys, math, json, os, sagemaker
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role
role = get_execution_role()
# Key prefix under which this notebook stores its objects in the S3 bucket.
prefix = 'sagemaker/DEMO-xgboost-dm'
# each region has its XGBoost container
containers = {
    'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
    'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
    'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
    'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest',
}
my_region = boto3.session.Session().region_name # set the region of the instance
# Fail early with an actionable message instead of an opaque KeyError (unlisted
# region) or TypeError (no region configured, i.e. region_name is None).
if my_region not in containers:
    raise ValueError(
        "Region {!r} has no XGBoost container configured in `containers`; "
        "supported regions: {}".format(my_region, ', '.join(sorted(containers)))
    )
print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


---

## Step 2: Create an S3 bucket to store data

Please note the bucket_name needs to be unique globally for AWS S3, so we suggest you use 'sagemaker-844-firstnamelastname'.

In [17]:
bucket_name = 'sagemaker-844-demo' # <-- Change to a globally unique bucket_name 
s3 = boto3.resource('s3')
try:
    # Every region except us-east-1 is created with an explicit LocationConstraint.
    create_args = {'Bucket': bucket_name}
    if my_region != 'us-east-1':
        create_args['CreateBucketConfiguration'] = {'LocationConstraint': my_region}
    s3.create_bucket(**create_args)
    print('S3 bucket created successfully')
except Exception as e:
    # Best-effort: report the failure and let the notebook continue.
    print('S3 error: ',e)

S3 bucket created successfully



Download raw data from external URL to your SageMaker instance.

In [3]:
# Download the bank-additional.zip archive from the UCI repository.
# -N (timestamping) skips the download when the local copy is already up to date.
!wget -N https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
# Extract the archive; -o overwrites existing files without prompting.
!unzip -o bank-additional.zip

--2020-10-26 19:38:17--  https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 444572 (434K) [application/x-httpd-php]
Saving to: ‘bank-additional.zip’


2020-10-26 19:38:18 (1.32 MB/s) - ‘bank-additional.zip’ saved [444572/444572]

Archive:  bank-additional.zip
   creating: bank-additional/
  inflating: bank-additional/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/bank-additional/
  inflating: __MACOSX/bank-additional/._.DS_Store  
  inflating: bank-additional/.Rhistory  
  inflating: bank-additional/bank-additional-full.csv  
  inflating: bank-additional/bank-additional-names.txt  
  inflating: bank-additional/bank-additional.csv  
  inflating: __MACOSX/._bank-additional  


* Read the data into a DataFrame and check its shape (number of rows and columns).

In [6]:
# The file is semicolon-delimited, so override the default comma separator.
data = pd.read_csv('./bank-additional/bank-additional-full.csv', sep=";")
# (number of rows, number of columns)
data.shape

(41188, 21)


View data information

In [7]:
# Preview the first rows of the raw data.
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [9]:
# Column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

---

## Step 3: Prepare the raw data for model training

Data contains 20 features for each customer - 

Demographics:
* age: Customer's age (numeric)
* job: Type of job (categorical: 'admin.', 'services', ...)
* marital: Marital status (categorical: 'married', 'single', ...)
* education: Level of education (categorical: 'basic.4y', 'high.school', ...)

Past customer events:
* default: Has credit in default? (categorical: 'no', 'unknown', ...)
* housing: Has housing loan? (categorical: 'no', 'yes', ...)
* loan: Has personal loan? (categorical: 'no', 'yes', ...)

Past direct marketing contacts:
* contact: Contact communication type (categorical: 'cellular', 'telephone', ...)
* month: Last contact month of year (categorical: 'may', 'nov', ...)
* day_of_week: Last contact day of the week (categorical: 'mon', 'fri', ...)
* duration: Last contact duration, in seconds (numeric). Important note: If duration = 0 then y = 'no'.

Campaign information:
* campaign: Number of contacts performed during this campaign and for this client (numeric, includes last contact)
* pdays: Number of days that passed by after the client was last contacted from a previous campaign (numeric)
* previous: Number of contacts performed before this campaign and for this client (numeric)
* poutcome: Outcome of the previous marketing campaign (categorical: 'nonexistent','success', ...)

External environment factors:
* emp.var.rate: Employment variation rate - quarterly indicator (numeric)
* cons.price.idx: Consumer price index - monthly indicator (numeric)
* cons.conf.idx: Consumer confidence index - monthly indicator (numeric)
* euribor3m: Euribor 3 month rate - daily indicator (numeric)
* nr.employed: Number of employees - quarterly indicator (numeric)

Target variable:
* y: Has the client subscribed to a term deposit? (binary: 'yes','no')

Many records have "999" for pdays, which is the number of days that passed by after a client was last contacted. It is very likely to be a magic number to represent that no contact was made before. Therefore, we create a new column called "no_previous_contact", then make it "1" when pdays is 999 and "0" otherwise.

In the "job" column, several categories mean the customer is not working, e.g., "student", "retired", and "unemployed". Since whether or not a customer is working is highly likely to affect his/her decision to enroll in the certificate of deposit, we create a new column, based on the "job" column, that shows whether the customer is working.

Finally, we convert categorical data to numeric using pd.get_dummies(data), and view the transformed data.

In [10]:
# Indicator variable to capture when pdays takes a value of 999
data['no_previous_contact'] = np.where(data['pdays'] == 999, 1, 0)
# Indicator for individuals not actively employed.
# Series.isin replaces np.in1d, which NumPy has deprecated.
data['not_working'] = np.where(data['job'].isin(['student', 'retired', 'unemployed']), 1, 0)
# Convert categorical variables to sets of indicators.
# The dtype is pinned because newer pandas versions default to bool indicators,
# which would be written to the training CSV as True/False instead of 1/0.
model_data = pd.get_dummies(data, dtype=np.uint8)
model_data.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,1,0,0,0,0,1,0,1,0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,1,0,0,0,0,1,0,1,0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,1,0,0,0,0,1,0,1,0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,1,0,0,0,0,1,0,1,0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0,1,0,0,0,0,1,0,1,0



Certain economic features in the data won't be available at the time of predicting a customer's enrollment behaviour, or they can be as difficult to forecast as the business problem, with data being only available for defined time periods and on a lag.

So we remove the economic features and duration from the data as they would need to be forecasted with high precision to use as inputs in future predictions.

In [11]:
# Call duration and the economic indicators are dropped before training.
columns_to_drop = [
    'duration',
    'emp.var.rate',
    'cons.price.idx',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed',
]
model_data = model_data.drop(columns=columns_to_drop)


View model_data. Now the dataset is cleaned and ready to be split into training and test sets.

In [12]:
# Preview the first rows of the cleaned data.
model_data.head()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0


In [13]:
# Preview the last rows of the cleaned data.
model_data.tail()

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
41183,73,1,999,0,1,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
41184,46,1,999,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
41185,56,2,999,0,1,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
41186,44,1,999,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
41187,74,3,999,1,1,1,0,0,0,0,...,1,0,0,0,0,1,0,0,1,0


In [14]:
# (number of rows, number of columns) of the cleaned data.
model_data.shape

(41188, 61)



Shuffle the data and split it into training and test sets. In this example, 70% of the customers are selected as training data.

The remaining 30% of the customer data is used to evaluate model performance.

In [18]:
# Shuffle all rows reproducibly (fixed seed), then cut at the 70% mark.
# Plain positional slicing is used instead of np.split, which relies on the
# deprecated DataFrame.swapaxes when handed a DataFrame.
shuffled_data = model_data.sample(frac=1, random_state=1729)
split_point = int(0.7 * len(model_data))
train_data = shuffled_data.iloc[:split_point]
test_data = shuffled_data.iloc[split_point:]
print(train_data.shape, test_data.shape)

(28831, 61) (12357, 61)


---

## Step 4: Train a model on the training data using the SageMaker pre-built XGBoost algorithm

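XGBoost is an implementation of gradient-boosted decision trees: it builds an ensemble of trees iteratively, fitting each new tree with the gradient of the loss function so that it corrects the errors of the trees built so far. Gradient-based optimization finds model parameter values that minimize the model error, using the gradient of the model loss function.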

Reformat the training data (target in the first column, no header row), upload it to S3, and point SageMaker at that S3 location. (Disregard the deprecation message about SDK v2.)

In [19]:
# Write the training CSV with the label (y_yes) in the first column and no
# header or index, as expected by the XGBoost container for CSV input.
train_features = train_data.drop(['y_no', 'y_yes'], axis=1)
pd.concat([train_data['y_yes'], train_features], axis=1).to_csv('train.csv', index=False, header=False)
# S3 keys always use '/', so build the key explicitly rather than with
# os.path.join, which uses the operating system's path separator.
train_key = '{}/train/train.csv'.format(prefix)
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')
# Training input pointing at the S3 prefix that holds train.csv.
s3_input_train = sagemaker.s3_input(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.



Set up the SageMaker session, create an estimator (an instance) of the XGBoost model, and define the model's hyperparameters. (Disregard the deprecation message about SDK v2.)

In [20]:
sess = sagemaker.Session()
# Estimator backed by the region-specific XGBoost container; model artifacts
# are written under the notebook's S3 prefix.
xgb = sagemaker.estimator.Estimator(
    containers[my_region],
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket_name, prefix),
    sagemaker_session=sess,
)
# Binary classifier trained for 100 boosting rounds.
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.



Train the model

In [21]:
# Start the training job; only a 'train' channel is supplied (no validation channel).
xgb.fit({'train': s3_input_train})

2020-10-26 20:46:21 Starting - Starting the training job...
2020-10-26 20:46:23 Starting - Launching requested ML instances.........
2020-10-26 20:48:07 Starting - Preparing the instances for training......
2020-10-26 20:49:17 Downloading - Downloading input data
2020-10-26 20:49:17 Training - Downloading the training image...
2020-10-26 20:49:50 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2020-10-26:20:49:45:INFO] Running standalone xgboost training.[0m
[34m[2020-10-26:20:49:45:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2020-10-26:20:49:45:INFO] File size need to be processed in the node: 3.38mb. Available memory size in the node: 8495.79mb[0m
[34m[2020-10-26:20:49:45:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:49:45] S3DistributionType set as FullyReplicated[0m
[34m[20:49:45] 28831x59 matrix with 1701029 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[20:49:45]

---

## Step 5: Deploy the Model

Deploy the model on a server and create an endpoint

In [22]:
# Create an endpoint serving the trained model on a single ml.m4.xlarge instance.
xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-------------!


Run the model to create predictions on whether customers in the test data enrolled for the certificate of deposit product

In [23]:
test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values #load the data into an array
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict!
# Parse the full response: slicing off the first character would corrupt the
# first score whenever it is not written as "0.xxx" (e.g. scientific notation).
# NOTE(review): assumes the endpoint returns bare comma-separated scores with
# no leading delimiter - confirm against the container's response format.
predictions_array = np.fromstring(predictions, sep=',') # and turn the prediction into an array
print(predictions_array.shape)

(12357,)


---

## Step 6: Evaluate model performance

Compare actual vs. predictions in a confusion matrix. Data shows that accuracy is 89.5%, with a precision of 65% (278/429) for enrolled and 90% (10,785/11,928) for customers who didn't enroll.

Precision = True Positive / (True Positive + False Positive) = True Positive / Total Predicted Positive

Recall = True Positive / (True Positive + False Negative) = True Positive / Total Actual Positive = 278/(1143+278) = 0.1956

F1 = 2 * (Precision * Recall) / (Precision + Recall) = 2 * (0.6480 * 0.1956)/(0.6480+0.1956) = 0.3005, using the unrounded precision 278/429 = 0.6480

In [24]:
# Confusion matrix: rows are observed classes, columns are predictions rounded to 0/1.
cm = pd.crosstab(
    index=test_data['y_yes'],
    columns=np.round(predictions_array),
    rownames=['Observed'],
    colnames=['Predicted'],
)
tn, fp = cm.iloc[0,0], cm.iloc[0,1]
fn, tp = cm.iloc[1,0], cm.iloc[1,1]
# Overall accuracy as a percentage.
p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
# Percentages are normalised per predicted class (i.e. per column).
predicted_no = tn+fn
predicted_yes = tp+fp
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/predicted_no*100,tn, fp/predicted_yes*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/predicted_no*100,fn, tp/predicted_yes*100, tp))


Overall Classification Rate: 89.5%

Predicted      No Purchase    Purchase
Observed
No Purchase    90% (10785)    35% (151)
Purchase        10% (1143)     65% (278) 



---

## Step 7: Terminate resources

Terminating resources that are not actively being used reduces costs and is a best practice. Delete the endpoint and all objects in the S3 bucket.

In [25]:
# Delete the endpoint created for the deployed model.
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
# Deletes every object in the bucket; the bucket itself is not removed here.
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': '55A74B6F9347A6EB',
   'HostId': 'JJmCGQszqSHsBh1IguAIHOkKfTNEZfAgsjA51RdVwNhOYnoyw2Mz+Jmsv0bE5jfCzlqqBjqZZZE=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'JJmCGQszqSHsBh1IguAIHOkKfTNEZfAgsjA51RdVwNhOYnoyw2Mz+Jmsv0bE5jfCzlqqBjqZZZE=',
    'x-amz-request-id': '55A74B6F9347A6EB',
    'date': 'Mon, 26 Oct 2020 21:09:02 GMT',
    'connection': 'close',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sagemaker/DEMO-xgboost-dm/train/train.csv'},
   {'Key': 'sagemaker/DEMO-xgboost-dm/output/xgboost-2020-10-26-20-46-20-218/output/model.tar.gz'}]}]