In [41]:
import boto3
s3_client = boto3.client("s3")
# s3_client.upload_file("./cardio_train.csv",
#                       "davidcw-first-bucket","data/cardio_train.csv")
s3_client.list_buckets(bucket="davidcw-first-bucket")

[i["Name"] for i in s3_client.list_buckets()["Buckets"]]

In [42]:
s3_client.list_objects(Bucket="davidcw-first-bucket")['Contents']

# TASK #1 : UNDERSTAND THE PROBLEM STATEMENT


Aim of the problem is to detect the presence or absence of cardiovascular disease in person based on the given features.
Features available are:


- Age | Objective Feature | age | int (days)
- Height | Objective Feature | height | int (cm) |
- Weight | Objective Feature | weight | float (kg) |
- Gender | Objective Feature | gender | categorical code |
- Systolic blood pressure | Examination Feature | ap_hi | int |
- Diastolic blood pressure | Examination Feature | ap_lo | int |
- Cholesterol | Examination Feature | cholesterol | 1: normal, 2: above normal, 3: well above normal |
- Glucose | Examination Feature | gluc | 1: normal, 2: above normal, 3: well above normal |
- Smoking | Subjective Feature | smoke | binary |
- Alcohol intake | Subjective Feature | alco | binary |
- Physical activity | Subjective Feature | active | binary |
- Presence or absence of cardiovascular disease | Target Variable | cardio | binary |

Note that:
- Objective: factual information;
- Examination: results of medical examination;
- Subjective: information given by the patient.

Data Source:https://www.kaggle.com/sulianova/cardiovascular-disease-dataset

# TASK #2: IMPORT LIBRARIES AND DATASETS

In [2]:
# import the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
# read the csv file 
cardio_df = pd.read_csv("cardio_train.csv", sep=";")

In [24]:
cardio_df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50.391781,2,168,62.0,110,80,1,1,0,0,1,0
1,55.419178,1,156,85.0,140,90,3,1,0,0,1,1
2,51.663014,1,165,64.0,130,70,3,1,0,0,0,1
3,48.282192,2,169,82.0,150,100,1,1,0,0,1,1
4,47.873973,1,156,56.0,100,60,1,1,0,0,0,0


In [46]:
cardio_df.groupby("gender")["height"].mean()

gender
1    161.355612
2    169.947895
Name: height, dtype: float64

# TASK #3: PERFORM EXPLORATORY DATA ANALYSIS

In [47]:
cardio_df.shape, cardio_df["id"].nunique()

((70000, 13), 70000)

In [4]:
# Drop id

cardio_df = cardio_df.drop(columns = 'id')

In [5]:
# since the age is given in days, we convert it into years

cardio_df['age'] = cardio_df['age']/365

In [50]:
cardio_df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50.391781,2,168,62.0,110,80,1,1,0,0,1,0
1,55.419178,1,156,85.0,140,90,3,1,0,0,1,1
2,51.663014,1,165,64.0,130,70,3,1,0,0,0,1
3,48.282192,2,169,82.0,150,100,1,1,0,0,1,1
4,47.873973,1,156,56.0,100,60,1,1,0,0,0,0


In [62]:
# checking the null values
cardio_df.isnull().sum()

age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [63]:
# Checking the dataframe information

# cardio_df.info()


In [9]:
# Statistical summary of the dataframe
cardio_df.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,53.339358,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,6.759594,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,29.583562,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,48.394521,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,53.980822,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,58.430137,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,64.967123,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


MINI CHALLENGE
- Obtain the features of the individuals who are older than 64.8 years old 

In [64]:
cardio_df[cardio_df["age"]>64.8]

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
395,64.824658,2,160,78.0,140,90,1,1,0,0,1,0
643,64.80274,1,154,79.0,130,80,1,1,0,0,1,0
2936,64.808219,2,156,60.0,140,90,1,1,0,0,1,1
4295,64.849315,1,169,75.0,120,80,1,1,0,0,1,0
9787,64.838356,2,165,70.0,120,80,1,1,0,0,0,0
10422,64.816438,2,177,102.0,130,80,1,2,0,0,1,0
16439,64.821918,1,160,65.0,130,80,3,3,0,0,0,1
18062,64.863014,1,140,48.0,170,100,2,1,0,0,1,1
20931,64.90411,1,165,63.0,150,90,2,1,0,0,1,1
22229,64.838356,2,170,91.0,130,90,1,1,0,0,1,0


# TASK #4: VISUALIZE DATASET

MINI CHALLENGE
- plot the histogram for all features (use 20 bins) 
- plot the correlation matrix and indicate if there exists any correlations between features

In [None]:
plt.hist(cardio_df,  bin=20);

In [6]:
sns.pairplot(cardio_df)

# TASK #5: CREATE TRAINING AND TESTING DATASET

In [7]:
# split the dataframe into target and features

df_target = cardio_df['cardio']
df_final = cardio_df.drop(columns =['cardio'])

In [12]:
cardio_df.columns

Index(['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol',
       'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [13]:
df_final.shape

(70000, 11)

In [14]:
df_target.shape

(70000,)

In [8]:
#spliting the data in to test and train sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_final, df_target, test_size = 0.2)



In [16]:
X_train.shape

(56000, 11)

In [17]:
y_train.shape

(56000,)

In [18]:
X_test.shape

(14000, 11)

In [25]:
X_test.shape

(14000, 11)

# TASK #6: TRAIN AND TEST XGBOOST MODEL IN LOCAL MODE (NOTE THAT SAGEMAKER BUILT-IN ALGORITHMS ARE NOT USED HERE)

In [10]:
# install xgboost

!pip install xgboost

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting xgboost
  Using cached xgboost-1.5.2-py3-none-manylinux2014_x86_64.whl (173.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.2
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m


In [31]:
# use xgboost model in local mode

# note that we have not performed any normalization or scaling since XGBoost is not sensitive to this.
# XGboost is a type of ensemble algorithms and works by selecting thresholds or cut points on features to split a node. 
# It doesn't really matter if the features are scaled or not.


from xgboost import XGBClassifier

# model = XGBClassifier(learning_rate=0.01, n_estimators=100, objective='binary:logistic')
# model = XGBClassifier()
model = XGBClassifier(learning_rate=0.01, n_estimators=500, max_depth=20)



model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=20, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=500, n_jobs=2,
              num_parallel_tree=1, objective='binary:logistic',
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              use_label_encoder=True, validate_parameters=1, verbosity=None)

In [32]:
# make predictions on test data

predict = model.predict(X_test)

In [23]:
predict

array([1, 0, 0, ..., 1, 0, 0])

In [33]:
# Assess trained model performance on training dataset
predict_train = model.predict(X_train)
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_train, predict_train))
print(classification_report(y_train, predict_train))
# plt.figure()
# sns.heatmap(cm, annot=True)

[[26222  1876]
 [ 2823 25079]]
              precision    recall  f1-score   support

           0       0.90      0.93      0.92     28098
           1       0.93      0.90      0.91     27902

    accuracy                           0.92     56000
   macro avg       0.92      0.92      0.92     56000
weighted avg       0.92      0.92      0.92     56000



In [34]:
# print metrics for training dataset

from sklearn.metrics import precision_score, recall_score, accuracy_score

print("Precision = {}".format(precision_score(y_train, predict_train)))
print("Recall = {}".format(recall_score(y_train, predict_train)))
print("Accuracy = {}".format(accuracy_score(y_train, predict_train)))

Precision = 0.9304025227230569
Recall = 0.89882445702817
Accuracy = 0.9160892857142857


In [35]:
# print metrics for testing dataset

print("Precision = {}".format(precision_score(y_test, predict)))
print("Recall = {}".format(recall_score(y_test, predict)))
print("Accuracy = {}".format(accuracy_score(y_test, predict)))

Precision = 0.7409855769230769
Recall = 0.6969054684188215
Accuracy = 0.7236428571428571


In [36]:
# plot the confusion matrix

# from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))


[[5199 1724]
 [2145 4932]]
              precision    recall  f1-score   support

           0       0.71      0.75      0.73      6923
           1       0.74      0.70      0.72      7077

    accuracy                           0.72     14000
   macro avg       0.72      0.72      0.72     14000
weighted avg       0.72      0.72      0.72     14000



In [20]:
param_grid = {
        'gamma': [0.5, 1, 5],   # regularization parameter 
        'subsample': [0.6, 0.8, 1.0], # % of rows taken to build each tree
        'colsample_bytree': [0.6, 0.8, 1.0], # number of columns used by each tree
        'max_depth': [3, 4, 5] # depth of each tree
        }

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
grid = GridSearchCV(XGBClassifier(), param_grid=param_grid,scoring="f1",n_jobs=-1, cv=5,verbose=3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 284 tasks      | elapsed: 25.7min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 38.1min finished




GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_c...
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, use_label_encoder=True,
                                     validate_parameters=N

In [29]:
grid.best_params_, grid.best_score_

({'colsample_bytree': 0.6, 'gamma': 1, 'max_depth': 3, 'subsample': 0.6},
 0.7242237639329457)

In [26]:
y_predict_optim = grid.predict(X_test)

In [27]:
predict_train = grid.predict(X_train)
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_train, predict_train))
print(classification_report(y_train, predict_train))

[[21995  6103]
 [ 8408 19494]]
              precision    recall  f1-score   support

           0       0.72      0.78      0.75     28098
           1       0.76      0.70      0.73     27902

    accuracy                           0.74     56000
   macro avg       0.74      0.74      0.74     56000
weighted avg       0.74      0.74      0.74     56000



In [30]:
# print metrics for testing dataset

print("Precision = {}".format(precision_score(y_test, y_predict_optim)))
print("Recall = {}".format(recall_score(y_test, y_predict_optim)))
print("Accuracy = {}".format(accuracy_score(y_test, y_predict_optim)))

Precision = 0.7675104829942537
Recall = 0.6983184965380811
Accuracy = 0.7405714285714285


# TASK #7: PERFORM DIMENSIONALITY REDUCTION USING PCA ( USING SAGEMAKER)

In [37]:
# Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python
# Boto3 allows Python developer to write software that makes use of services like Amazon S3 and Amazon EC2



import sagemaker
import boto3
from sagemaker import Session

# Let's create a Sagemaker session
sagemaker_session = sagemaker.Session()
bucket = Session().default_bucket() 
prefix = 'pca'  # prefix is the subfolder within the bucket.

#Let's get the execution role for the notebook instance. 
# This is the IAM role that you created when you created your notebook instance. You pass the role to the training job.
# Note that AWS Identity and Access Management (IAM) role that Amazon SageMaker can assume to perform tasks on your behalf (for example, reading training results, called model artifacts, from the S3 bucket and writing training results to Amazon S3). 

role = sagemaker.get_execution_role()

In [38]:
import io # The io module allows for dealing with various types of I/O (text I/O, binary I/O and raw I/O). 
import numpy as np
import sagemaker.amazon.common as smac # sagemaker common libary

# Code below converts the data in numpy array format to RecordIO format
# This is the format required by Sagemaker PCA

buf = io.BytesIO() # create an in-memory byte array (buf is a buffer I will be writing to)
df_matrix = df_final.to_numpy() # convert the dataframe into 2-dimensional array
smac.write_numpy_to_dense_tensor(buf, df_matrix)
buf.seek(0)

# When you write to in-memory byte arrays, it increments 1 every time you write to it
# Let's reset that back to zero 

0

In [39]:
import os

# Code to upload RecordIO data to S3
 
# Key refers to the name of the file 
 
key = 'pca'

#following code uploads the data in record-io format to S3 bucket to be accessed later for training
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)

# Let's print out the training data location in s3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)


print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://sagemaker-us-east-1-618725421363/pca/train/pca


In [40]:
# create output placeholder in S3 bucket to store the PCA output

output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

training artifacts will be uploaded to: s3://sagemaker-us-east-1-618725421363/pca/output


In [5]:
# This code is used to get the training container of sagemaker built-in algorithms
# all we have to do is to specify the name of the algorithm, that we want to use

# Let's obtain a reference to the pca container image
# Note that all  models are named estimators
# You don't have to specify (hardcode) the region, get_image_uri will get the current region name using boto3.Session


# from sagemaker.amazon.amazon_estimator import get_image_uri  #old version
import sagemaker, boto3


container = sagemaker.image_uris.retrieve( 'pca', boto3.Session().region_name)

In [6]:
# We have pass in the container, the type of instance that we would like to use for training 
# output path and sagemaker session into the Estimator. 
# We can also specify how many instances we would like to use for training


pca = sagemaker.estimator.Estimator(container,
                                    role,
                                    tinstance_count=1,
                                    istance_type='ml.c4.xlarge',
                                    output_path=output_location,
                                    use_spot_instance = True,
                                    sagemaker_session=sagemaker_session)

# We can tune parameters like the number of features that we are passing in, mode of algorithm, mini batch size and number of pca components


pca.set_hyperparameters(feature_dim=11,
                        num_components=6,
                        subtract_mean=False,
                        algorithm_mode='regular',
                        mini_batch_size=100)


# Pass in the training data from S3 to train the pca model


# pca.fit({'train': s3_train_data})

# Let's see the progress using cloudwatch logs

NameError: name 'role' is not defined

MINI CHALLENGE:
 - Retrain the model with the following number of components 5, 4, and 7 

# TASK #8: DEPLOY THE TRAINED PCA MODEL 

In [44]:
# Deploy the model to perform inference 

pca_reduction = pca.deploy(initial_instance_count = 1,
                                          instance_type = 'ml.m4.xlarge')

----------!

In [45]:
from sagemaker.predictor import csv_serializer, json_deserializer


# Content type overrides the data that will be passed to the deployed model, since the deployed model expects data in text/csv format.

# Serializer accepts a single argument, the input data, and returns a sequence of bytes in the specified content type

# Deserializer accepts two arguments, the result data and the response content type, and return a sequence of bytes in the specified content type.

# Reference: https://sagemaker.readthedocs.io/en/stable/predictors.html


# pca_reduction.content_type = 'text/csv'
pca_reduction.serializer = csv_serializer
pca_reduction.deserializer = json_deserializer

In [46]:
# make prediction on the test data

result = pca_reduction.predict(np.array(df_final))

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The json_deserializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [54]:
pd.DataFrame.from_dict(result).iloc[:10] # results are in Json format

Unnamed: 0,projections
0,"{'projection': [-0.2891036570072174, 3.9943747..."
1,"{'projection': [1.0957019329071045, -4.7776770..."
2,"{'projection': [1.336683988571167, 1.799760699..."
3,"{'projection': [-0.4163884222507477, 6.1496973..."
4,"{'projection': [-0.14980435371398926, 2.675003..."
5,"{'projection': [0.8930057883262634, -10.548048..."
6,"{'projection': [0.9225766658782959, -9.4765748..."
7,"{'projection': [1.9741971492767334, -4.1789298..."
8,"{'projection': [-0.27406933903694153, 2.647378..."
9,"{'projection': [-0.3863908648490906, -1.092898..."


In [55]:
# Since the results are in Json format, we access the scores by iterating through the scores in the predictions
predictions = np.array([r['projection'] for r in result['projections']])

In [56]:
predictions

array([[-2.89103657e-01,  3.99437475e+00, -1.25495701e+01,
         6.42405853e+01,  3.68210945e+01, -2.17848068e+02],
       [ 1.09570193e+00, -4.77767706e+00,  1.23644390e+01,
         4.45821114e+01,  4.56663399e+01, -2.41025101e+02],
       [ 1.33668399e+00,  1.79976070e+00, -9.73582458e+00,
         5.34248428e+01,  5.57539673e+01, -2.23016754e+02],
       ...,
       [ 9.36102509e-01,  6.50153160e+00,  2.10844498e+01,
         4.50918350e+01,  7.45691071e+01, -2.82263916e+02],
       [ 1.15454197e-03, -8.17430115e+00, -2.97180176e+00,
         5.16770020e+01,  5.14335823e+01, -2.34024872e+02],
       [ 3.68915290e-01, -1.09633231e+00, -4.88934517e+00,
         6.41318741e+01,  4.39791260e+01, -2.28002136e+02]])

In [42]:
predictions.shape

(70000, 6)

In [57]:
# Delete the end-point

pca_reduction.delete_endpoint()

# TASK #9: TRAIN AND EVALUATE XGBOOST MODEL ON DATA AFTER DIMENSIONALITY REDUCTION (USING SAGEMAKER)

In [None]:
predictions.shape

In [59]:
# Convert the array into dataframe in a way that target variable is set as the first column and is followed by feature columns
# This is because sagemaker built-in algorithm expects the data in this format

train_data = pd.DataFrame({'Target':df_target})
train_data

Unnamed: 0,Target
0,0
1,1
2,1
3,1
4,0
...,...
69995,0
69996,1
69997,1
69998,1


In [60]:
for i in range(predictions.shape[1]):
    train_data[i] = predictions[:,i]

In [61]:
train_data.head()

Unnamed: 0,Target,0,1,2,3,4,5
0,0,-0.289104,3.994375,-12.54957,64.240585,36.821095,-217.848068
1,1,1.095702,-4.777677,12.364439,44.582111,45.66634,-241.025101
2,1,1.336684,1.799761,-9.735825,53.424843,55.753967,-223.016754
3,1,-0.416388,6.149697,5.682127,43.042397,45.009624,-257.243164
4,0,-0.149804,2.675003,-13.209843,63.61092,44.346691,-192.799316


In [63]:
train_data_size = int(0.9 * train_data.shape[0])
train_data_size

63000

In [64]:
# shuffle the data in dataframe and then split the dataframe into train, test and validation sets.

import sklearn 

train_data = sklearn.utils.shuffle(train_data)
train, test, valid = train_data[:train_data_size], train_data[train_data_size:train_data_size + 3500], train_data[train_data_size + 3500:]


In [65]:
train.shape, test.shape,valid.shape

((63000, 7), (3500, 7), (3500, 7))

In [66]:
X_test, y_test = test.drop(columns = ['Target']), test['Target']

In [67]:
# save train_data and validation_data as csv files

train.to_csv('train.csv',header = False, index = False)
valid.to_csv('valid.csv',header = False, index = False)

In [68]:
prefix = 'XGBoost-Classifier'
key = 'XGBoost-Classifier'

In [69]:
# read the data from csv file and then upload the data to s3 bucket
with open('train.csv','rb') as f:
    # The following code uploads the data into S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(f)

# Let's print out the training data location in s3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://sagemaker-us-east-1-618725421363/XGBoost-Classifier/train/XGBoost-Classifier


In [70]:
# reading the data from csv file and then upload the data to s3 bucket
with open('valid.csv','rb') as f:
    # The following code uploads the data into S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'valid', key)).upload_fileobj(f)

# Let's print out the validation data location in s3
s3_valid_data = 's3://{}/{}/valid/{}'.format(bucket, prefix, key)
print('uploaded validation data location: {}'.format(s3_valid_data))

uploaded validation data location: s3://sagemaker-us-east-1-618725421363/XGBoost-Classifier/valid/XGBoost-Classifier


In [71]:
# creates output placeholder in S3 bucket to store the linear learner output

output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

training artifacts will be uploaded to: s3://sagemaker-us-east-1-618725421363/XGBoost-Classifier/output


In [72]:
# This code is used to get the training container of sagemaker built-in algorithms
# all we have to do is to specify the name of the algorithm, that we want to use

# Let's obtain a reference to the XGBoost container image
# Note that all  models are named estimators
# You don't have to specify (hardcode) the region, get_image_uri will get the current region name using boto3.Session


container = sagemaker.image_uris.retrieve('xgboost',boto3.Session().region_name,'1.3-1')

In [76]:
# We have pass in the container, the type of instance that we would like to use for training 
# output path and sagemaker session into the Estimator. 
# We can also specify how many instances we would like to use for training


Xgboost_classifier = sagemaker.estimator.Estimator(container,
                                       role, 
                                       instance_count=1, 
                                       instance_type='ml.m4.xlarge',
                                       output_path=output_location,
                                                   use_spot_instance=True,
                                       sagemaker_session=sagemaker_session)

# To improve the performance of the model, a hyperparameters tuning job need to be run 

Xgboost_classifier.set_hyperparameters(max_depth=8,
                           objective='multi:softmax',
                           num_class= 2,
                           eta = 0.5,
                           num_round = 150
                           )

In [77]:
# Create "train", "validation" channels to feed in the model
# Source: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html


train_input = sagemaker.session.s3_input(s3_data = s3_train_data, content_type='csv',s3_data_type = 'S3Prefix')
valid_input = sagemaker.session.s3_input(s3_data = s3_valid_data, content_type='csv',s3_data_type = 'S3Prefix')

Xgboost_classifier.fit({'train': train_input, 'validation': valid_input})

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2022-02-11 10:56:12 Starting - Starting the training job...ProfilerReport-1644576971: InProgress
......
2022-02-11 10:57:38 Starting - Preparing the instances for training......
2022-02-11 10:58:32 Downloading - Downloading input data...
2022-02-11 10:59:13 Training - Downloading the training image......
2022-02-11 11:00:13 Training - Training image download completed. Training in progress...[34m[2022-02-11 11:00:17.892 ip-10-2-90-224.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2022-02-11:11:00:18:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2022-02-11:11:00:18:INFO] Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34m[2022-02-11:11:00:18:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2022-02-11:11:00:18:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2022-02-11:11:00:18:INFO] Determined delimiter of CSV input is ','[0m
[34m[2022-02

MINI CHALLENGE
- Retrain the XGBoost model with deeper trees (max_depth)

# TASK #10: DEPLOY AND TEST THE TRAINED XGBOOST MODEL 

In [None]:
# Deploy the model to perfrom inference 

Xgboost_classifier = Xgboost_classifier.deploy(initial_instance_count = 1,
                                          instance_type = 'ml.m4.xlarge')

In [None]:
# Content type over-rides the data that will be passed to the deployed model, since the deployed model expects data in text/csv format, we specify this as content -type.
# Serializer accepts a single argument, the input data, and returns a sequence of bytes in the specified content type
#Reference: https://sagemaker.readthedocs.io/en/stable/predictors.html

from sagemaker.predictor import csv_serializer, json_deserializer

Xgboost_classifier.serializer = csv_serializer


In [None]:
# make prediction

XGB_prediction = Xgboost_classifier.predict(np.array(X_test))

In [None]:
XGB_prediction

In [None]:
# custom code to convert the values in bytes format to array

def bytes_2_array(x):
    
    #makes entire prediction as string and splits based on ','
    l = str(x).split(',')
    
    #Since the first element contains unwanted characters like (b,',') we remove them
    l[0] = l[0][2:]
    #same-thing as above remove the unwanted last character (')
    l[-1] = l[-1][:-1]
    
    #iterating through the list of strings and converting them into float type
    for i in range(len(l)):
        l[i] = float(l[i])
        
    #converting the list to into array
    l = np.array(l).astype('float32')
    
    #reshape one-dimensional array to two-dimentaional array
    return l.reshape(-1,1)

In [None]:
predicted_values = bytes_2_array(XGB_prediction)

In [None]:
predicted_values

In [None]:
y_test = np.array(y_test)
y_test = y_test.reshape(-1,1)

In [None]:
y_test

In [None]:
# plot metrics

from sklearn.metrics import precision_score, recall_score, accuracy_score

print("Precision = {}".format(precision_score(y_test, predicted_values, average='macro')))
print("Recall = {}".format(recall_score(y_test, predicted_values, average='macro')))
print("Accuracy = {}".format(accuracy_score(y_test, predicted_values)))

In [None]:
# plot confusion matrix

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predicted_values)
plt.figure()
sns.heatmap(cm, annot=True)

In [None]:
# Delete the end-point

Xgboost_classifier.delete_endpoint()

# EXCELLENT JOB

MINI CHALLENGE SOLUTIONS

In [None]:
cardio_df [ cardio_df['ap_hi'] == 16020]
cardio_df [ cardio_df['age'] > 64.8]

In [None]:
cardio_df.hist(bins = 30, figsize = (20,20), color = 'r')
# get the correlation matrix

corr_matrix = cardio_df.corr()
corr_matrix
# plotting the correlation matrix
plt.figure(figsize = (16,16))
sns.heatmap(corr_matrix, annot = True)
plt.show()


In [None]:
xgb_model = XGBClassifier(learning_rate=0.01, n_estimators=100, objective='binary:logistic')
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(xgb_model, param_grid, refit = True, verbose = 4)
grid.fit(X_train, y_train)