## Steps to build and deploy the cardiac arrest predictor

1. Loading and pre-processing the data
2. Transferring the pre-processed data on Amazon bucket
3. Defining and training the model
4. Deploying the model

### 1. Loading and pre-processing the data

In [1]:
# import libraries
import numpy as np
import pandas as pd

In [2]:
# reading the data
data = pd.read_csv('./Cardiac_arrest_data.csv')

# first 5 rows of the data
data.head()

Unnamed: 0,Gender,Height,Weight,Smoke,Alcohol,Cardio
0,Male,168,62.0,No,No,No
1,Female,156,85.0,No,No,Yes
2,Female,165,64.0,No,No,Yes
3,Male,169,82.0,No,No,Yes
4,Female,156,56.0,No,No,No


In [3]:
# converting categories into numbers
data['Gender']= data['Gender'].map({'Male':0, 'Female':1})
data['Smoke']= data['Smoke'].map({'No':0, 'Yes':1})
data['Alcohol']= data['Alcohol'].map({'No':0, 'Yes':1})
data['Cardio']= data['Cardio'].map({'No':0, 'Yes':1})

In [4]:
# creating train and test data
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size = 0.3, random_state = 10)
print(train.shape, test.shape)

(49000, 6) (21000, 6)


### 2. Transferring the pre-processed data on Amazon S3 bucket

In [5]:
# specify bucket name
bucket_name = 'cardiac-arrest-predictor'

NOTE: target variable has to be the first column in the training and validation datasets

In [6]:
# creating the training set for sagemaker
pd.concat([train['Cardio'], train.drop(['Cardio'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)

In [7]:
# importing libraries for sagemaker
import boto3, sagemaker

In [8]:
# storing the training set in s3 bucket
boto3.Session().resource('s3').Bucket(bucket_name).Object('train/train.csv').upload_file('train.csv')

### 3. Defining and training the model

In [9]:
# defining the training set for sagemaker
s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/train'.format(bucket_name), content_type='csv')

In [10]:
# set the region of the instance
my_region = boto3.session.Session().region_name
my_region

'ap-south-1'

In [11]:
# defining the xgboost container
from sagemaker import image_uris
xgboost_container = image_uris.retrieve(framework = 'xgboost', region = my_region, version = '1.2-1')

In [12]:
# defining the hyperparameters for xgboost algorithm
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":"50"}

In [13]:
# Define IAM role (Identity and Access Management)
from sagemaker import get_execution_role
role = get_execution_role()

In [14]:
# defining the machine learning model
estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container, 
                                          hyperparameters=hyperparameters,
                                          role=role,
                                          instance_count=1, 
                                          instance_type='ml.m4.xlarge', 
                                          volume_size=1,
                                          output_path='s3://{}/output'.format(bucket_name))

In [15]:
# training the model
estimator.fit({'train': s3_input_train})

2021-03-08 14:13:26 Starting - Starting the training job...
2021-03-08 14:13:29 Starting - Launching requested ML instancesProfilerReport-1615212806: InProgress
......
2021-03-08 14:14:40 Starting - Preparing the instances for training.........
2021-03-08 14:16:21 Downloading - Downloading input data
2021-03-08 14:16:21 Training - Downloading the training image.....[34m[2021-03-08 14:17:04.102 ip-10-0-88-156.ap-south-1.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined de

### 4. Deploying the model

In [16]:
# defining serializer
from sagemaker.serializers import CSVSerializer

In [17]:
# deploying the model on sagemaker
xgb_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge', serializer=CSVSerializer())

---------------!

In [18]:
# converting the data into an array
test_data_array = test.drop(['Cardio'], axis=1).values

# generating predictions
predictions = xgb_predictor.predict(test_data_array)

predictions

b'0.6073262095451355,0.5173147320747375,0.3424641788005829,0.45691725611686707,0.5105768442153931,0.41609349846839905,0.7107326984405518,0.4658510684967041,0.35433053970336914,0.5108885169029236,0.5740941762924194,0.43859389424324036,0.44286635518074036,0.44286635518074036,0.3700127899646759,0.45618247985839844,0.5083429217338562,0.39742618799209595,0.5818092823028564,0.45762142539024353,0.4340724050998688,0.6839481592178345,0.5108885169029236,0.44286635518074036,0.5527456402778625,0.5059058666229248,0.5999979972839355,0.40653273463249207,0.6083126068115234,0.659843921661377,0.2358853667974472,0.46616652607917786,0.42941024899482727,0.513866662979126,0.7049894332885742,0.4671017825603485,0.5343353152275085,0.7101582288742065,0.5428138971328735,0.6308749318122864,0.6886477470397949,0.45116832852363586,0.5533838272094727,0.5737374424934387,0.48070836067199707,0.5170656442642212,0.38194456696510315,0.537693202495575,0.5257443189620972,0.45437097549438477,0.3975895345211029,0.6474557518959

In [19]:
# turn the prediction into an array
predictions_array = np.fromstring(predictions[1:], sep=',') 
predictions_array

array([0.60732621, 0.51731473, 0.34246418, ..., 0.51223242, 0.43407241,
       0.52876359])