# LAB: ML with SageMaker and XGBoost

NOTE: Run this notebook in an AWS SageMaker Python environment.

In [None]:
import pandas as pd
import numpy as np
import boto3
import urllib.request, json, os, sagemaker
from sagemaker import get_execution_role
from time import gmtime, strftime
from sagemaker.predictor import csv_serializer
from sagemaker.tuner import (
    IntegerParameter,
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
)
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Region of the default boto3 session; used later for bucket creation and image lookup.
default_session = boto3.session.Session()
my_region = default_session.region_name

# S3 key prefix under which this lab's files are stored.
prefix = 'sagemaker/MLI-LAB-xgboost'

print("Region:", my_region)

Region: us-west-2


Create boto3 object

In [None]:
# S3 resource handle; the bucket-creation cell below uses it to look up and create the bucket.
s3 = boto3.resource('s3')

Create your bucket, either manually or from this notebook.

Set bucket name

In [None]:
# Name of the S3 bucket that receives the train/validation CSVs and the training output.
bucket_name = 'bah-bucket-sagemaker-course-2023'

In [None]:
try:
    # A falsy creation_date is treated as "the bucket does not exist yet".
    if s3.Bucket(bucket_name).creation_date:
        print('Bucket already exists!')
    else:
        create_kwargs = {'Bucket': bucket_name}
        # us-east-1 is created without a location constraint; every other
        # region passes its name explicitly.
        if my_region != 'us-east-1':
            create_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': my_region}
        s3.create_bucket(**create_kwargs)
        print('S3 bucket created successfully')
except Exception as e:
    # Best effort: report the failure and let the notebook continue.
    print('S3 error: ', e)

Bucket already exists!


Load `pima-indians-diabetes.csv` dataset

In [None]:
DATASET = 'pima-indians-diabetes.csv'
DATA_FOLDER = 's3://bah-data'

# Join the S3 URI with "/" explicitly: os.path.join uses the local OS path
# separator, which is not guaranteed to be "/" and is meant for filesystem paths.
# header=None: the CSV is read without a header row, so columns get integer labels.
data = pd.read_csv('{}/{}'.format(DATA_FOLDER, DATASET), header=None)

Preprocess dataset for modeling

In [None]:
# Preview the first five rows; columns are integer-labelled because the file was read with header=None.
data.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Split the data into train (70%), validation (20%), and test (10%) datasets, which you will upload to the S3 bucket

In [None]:
# Shuffle once with a fixed seed so the split is reproducible.
shuffled = data.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(data))
validation_end = int(0.9 * len(data))

# 70% train / 20% validation / 10% test, sliced positionally.
# iloc slicing replaces np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas.
train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# Every row must land in exactly one split.
assert len(train_data) + len(validation_data) + len(test_data) == len(data)

# Note the print order: train, test, validation.
print(train_data.shape, test_data.shape, validation_data.shape)

(537, 9) (77, 9) (154, 9)


Upload train dataset to s3 bucket

In [None]:
# Integer label of the target column (last of the nine header-less columns, values 0/1 in the preview).
label_column_name = 8

In [None]:
# Move the label to the first column: the training log reports label_column=0
# for the CSV input, and the file is written without header or index.
label_column = train_data[label_column_name]
train_data = pd.concat(
    [label_column, train_data.drop([label_column_name], axis=1)],
    axis=1,
)

train_data.to_csv('train.csv', index=False, header=False)

# S3 object keys use "/" separators, so build the key explicitly rather than
# with os.path.join (which follows the local OS). Reuse the existing `s3`
# resource instead of opening another boto3 session.
train_key = '{}/train/train.csv'.format(prefix)
s3.Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# Training channel pointing at the uploaded folder.
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

Upload validation dataset to s3 bucket

In [None]:
# Move the label to the first column, matching the layout of the training CSV,
# and write the file without header or index.
label_column = validation_data[label_column_name]
validation_data = pd.concat(
    [label_column, validation_data.drop([label_column_name], axis=1)],
    axis=1,
)

validation_data.to_csv('validation.csv', index=False, header=False)

# S3 object keys use "/" separators, so build the key explicitly rather than
# with os.path.join (which follows the local OS). Reuse the existing `s3`
# resource instead of opening another boto3 session.
validation_key = '{}/validation/validation.csv'.format(prefix)
s3.Bucket(bucket_name).Object(validation_key).upload_file('validation.csv')

# Validation channel pointing at the uploaded folder.
s3_input_validation = sagemaker.TrainingInput(s3_data='s3://{}/{}/validation'.format(bucket_name, prefix), content_type='csv')

Create SageMaker session

In [None]:
# SageMaker session with default settings; passed to the Estimator below.
sess = sagemaker.Session()

Define IAM role

In [None]:
# IAM role handed to the Estimator for training jobs.
# NOTE(review): presumably requires running inside SageMaker (see the note at the top) — confirm.
role = get_execution_role()

Specify XGBoost ECR container

In [None]:
# Look up the XGBoost training image URI for the current region.
# NOTE(review): "latest" appears to resolve to a legacy XGBoost image; consider
# pinning an explicit version, and re-check the `silent` hyperparameter set
# further down if you do — confirm against the SageMaker docs.
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=my_region,
    version="latest",
)

Create XGBoost Estimator

In [None]:
# Estimator for the XGBoost container: one ml.m4.xlarge instance, with model
# artifacts written under <bucket>/<prefix>/output.
# Uses the SageMaker SDK v2 argument names (instance_count / instance_type);
# the v1 names train_instance_count / train_instance_type were renamed.
xgb = sagemaker.estimator.Estimator(
    xgboost_container,
    role,
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(bucket_name, prefix),
    sagemaker_session=sess,
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


Set initial hyperparameters

In [None]:
# Starting hyperparameters for the baseline training job. The tuning job
# further down searches `alpha` and `lambda`, which are not set here.
initial_hyperparameters = {
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.8,
    'silent': 0,
    'objective': 'binary:logistic',
    'num_round': 100,
}
xgb.set_hyperparameters(**initial_hyperparameters)

Fit the model

In [None]:
# Pass the validation channel along with the training channel. With only
# 'train' supplied, the training log reports that the validation path does not
# exist, so the uploaded validation set is never evaluated.
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

INFO:sagemaker:Creating training-job with name: xgboost-2023-03-17-20-54-56-957


2023-03-17 20:54:57 Starting - Starting the training job......
2023-03-17 20:55:39 Starting - Preparing the instances for training......
2023-03-17 20:56:41 Downloading - Downloading input data...
2023-03-17 20:57:05 Training - Downloading the training image...
2023-03-17 20:57:51 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2023-03-17:20:58:06:INFO] Running standalone xgboost training.[0m
[34m[2023-03-17:20:58:06:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2023-03-17:20:58:06:INFO] File size need to be processed in the node: 0.02mb. Available memory size in the node: 8607.31mb[0m
[34m[2023-03-17:20:58:06:INFO] Determined delimiter of CSV input is ','[0m
[34m[20:58:06] S3DistributionType set as FullyReplicated[0m
[34m[20:58:06] 537x8 matrix with 4296 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[20:58:06] src/tree/updater_prune.cc:74: tree pruning end, 1

Hyperparameter tuning: tune any hyperparameters you like from the XGBoost algorithm

In [None]:
# Metric the tuning job optimises, computed on the validation channel.
objective_metric_name = "validation:auc"
# Total number of training jobs the tuner may launch.
MAX_JOBS = 3
# Concurrency cap; kept <= MAX_JOBS because more parallel slots than total jobs can never be used.
MAX_PARALLEL_JOBS = 3
# Search strategy passed to HyperparameterTuner.
STRATEGY = 'Random'
# Scaling applied to the continuous hyperparameter ranges.
SCALING_TYPE = 'Logarithmic'

In [None]:
# Timestamped (UTC) tuning job name. The identifier's spelling is kept as is
# because the analytics cell below looks the job up through this variable.
# NOTE(review): the "linsearch"/"_linear" names look carried over from another
# lab; this job tunes the XGBoost estimator — confirm before renaming.
tuninig_job_name = "xgb-linsearch-{}".format(strftime("%Y%m%d-%H-%M-%S", gmtime()))

# Continuous search ranges for the two tuned hyperparameters.
hyperparameter_ranges_linear = {
    "alpha": ContinuousParameter(0.01, 0.5, scaling_type=SCALING_TYPE),
    "lambda": ContinuousParameter(0.05, 0.5, scaling_type=SCALING_TYPE),
}

tuner_linear = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges_linear,
    strategy=STRATEGY,
    max_jobs=MAX_JOBS,
    max_parallel_jobs=MAX_PARALLEL_JOBS,
)

# Both channels are required here: the objective metric is a validation metric.
tuning_channels = {"train": s3_input_train, "validation": s3_input_validation}
tuner_linear.fit(
    tuning_channels,
    job_name=tuninig_job_name,
    include_cls_metadata=False,
)

INFO:sagemaker:Creating hyperparameter tuning job with name: xgb-linsearch-20230317-21-09-16


................................................................!


Show all tuning results as a DataFrame

In [None]:
# Fetch per-training-job results of the tuning job as a DataFrame.
tuner = sagemaker.HyperparameterTuningJobAnalytics(tuninig_job_name)

full_df = tuner.dataframe()

# Drop jobs whose objective is -inf; an empty frame is passed through unchanged
# so that `df` is always defined.
df = full_df[full_df["FinalObjectiveValue"] > -float("inf")] if len(full_df) > 0 else full_df

# Must be the cell's top-level last expression: a bare name nested inside an
# `if` block is never rendered by the notebook.
df

Terminate your resources

In [None]:
# WARNING: destructive. This deletes EVERY object in the bucket, not only those
# under this lab's `prefix` (the recorded output lists keys from another prefix).
# The bucket itself is not deleted by this cell.
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
# The call's return value (one response per delete batch) is rendered as the cell output.
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'D7EFCWCMA4PZ5DRF',
   'HostId': 'acN5Q+rPaG7igm4mX7JEEoy1w146r6Gehc07nDxH3B/5qFxYqL15HBueaHXLMGRJq0Zst9vPIFE=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'acN5Q+rPaG7igm4mX7JEEoy1w146r6Gehc07nDxH3B/5qFxYqL15HBueaHXLMGRJq0Zst9vPIFE=',
    'x-amz-request-id': 'D7EFCWCMA4PZ5DRF',
    'date': 'Fri, 17 Mar 2023 21:28:05 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'sagemaker/MLI-Lab_solution/output/linear-learner-2023-03-17-21-04-59-395/profiler-output/system/incremental/2023031721/1679087160.algo-1.json'},
   {'Key': 'sagemaker/MLI-Lab_solution/output/linear-learner-2023-03-17-21-12-05-625/profiler-output/system/incremental/2023031721/1679087760.algo-1.json'},
   {'Key': 'sagemaker/MLI-Lab_solution/output/linear-learner-2023-03-17-21-21-59-547/profiler-output/system/incremental/2023031721/1679088240.alg