In [52]:
import sagemaker
import boto3 # Used for reading S3 buckets if public access is given for that bucket.
from sagemaker.image_uris import retrieve
from sagemaker.session import s3_input, Session # To create a Session of Sagemaker

### Creating an S3 bucket using boto3

In [30]:
bucket_name = 'sagemaker-ap-south-1-043309360841' # Has to be unique
my_region = boto3.session.Session().region_name
print(my_region) # Making sure I am currently running in Acia Pacific South region (Mumbai)

ap-south-1


In [31]:
s3 = boto3.resource('s3')
try:
    if my_region == "ap-south-1":
        s3.create_bucket(Bucket = bucket_name, CreateBucketConfiguration={'LocationConstraint': 'ap_south_1'})
    print('S3 Bucket Created Successfully!')
except Exception as e:
    print(f"S3 error: {e}")

S3 error: An error occurred (IllegalLocationConstraintException) when calling the CreateBucket operation: The ap_south_1 location constraint is incompatible for the region specific endpoint this request was sent to.


In [32]:
prefix = 'xgboost-as-a-built-in-algo'
output_path = 's3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://sagemaker-ap-south-1-043309360841/xgboost-as-a-built-in-algo/output


### Downloading and uploading the data into S3 Buckets

In [35]:
import pandas as pd
import urllib
try:
    urllib.request.urlretrieve("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print("Successfully downloaded the dataset!")
except Exception as e:
    print(f"Failed: {e}")
    
try:
    model_data = pd.read_csv('./bank_clean.csv', index_col = 0)
    print('Success: Data loaded into dataframe.')
except Exception as e:
    print(f'Failed: {e}')

Successfully downloaded the dataset!
Success: Data loaded into dataframe.


Data has been downloaded, now we split it for training and testing before we insert it into the S3 bucket. We are not separating the input from the output, rather entirely dumping the training data and the testing data.

In [38]:
import numpy as np

train_data, test_data = np.split(model_data.sample(frac = 1, random_state = 402), [int(0.75 * len(model_data))])
print(train_data.shape, test_data.shape)

(30891, 61) (10297, 61)


The format of data in Amazon Sagemaker is different from the usual format which is used for training in the sense that the dependent feature appears as the first column and all the independent features follow afterwards. Therefore we need to tune the dataset to push the dependent features (ones that will be predicted by the model) to the front of the dataset and allow all other independent features to follow.

In [39]:
import os
pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'],
                                               axis = 1)],
         axis = 1).to_csv('train.csv', index = False, header = False)

Saving training data into the bucket and recalling it from the bucket for use:

In [43]:
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.TrainingInput(s3_data = 's3://{}/{}/train'.format(bucket_name, prefix), content_type = 'csv')

Now we do the same for the testing data:

In [47]:
pd.concat([test_data['y_yes'], test_data.drop(['y_no', 'y_yes'],
                                               axis = 1)],
         axis = 1).to_csv('test.csv', index = False, header = False)

boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')
s3_input_test = sagemaker.TrainingInput(s3_data = 's3://{}/{}/test'.format(bucket_name, prefix), content_type = 'csv')

### Using an inbuilt XGBoost algorithm to perform the classification task:

Whenever you want to use an inbuilt algorithm from Sagemaker, you need to pull instances of those algorithms into your runtime. These algorithms are usually an image or in a container.

In [56]:
container = retrieve(framework = 'xgboost',
                    region = boto3.Session().region_name,
                    version = '1.7-1')

You also need to initialize the hyperparameters if you want to save running time on Sagemaker, this step might cost you money since training on Sagemaker requires credits. However, the tutorial you were following provided you with these optimized hyperparameters:

In [57]:
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":50
        }

We create an estimator to finally compile everything we have done so far into one single step that can be used for training.

In [60]:
estimator = sagemaker.estimator.Estimator(image_uri=container, # This pulls up the container we have mentioned that needs to be used for training, here, XGBoost Algorithm.
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(), # This is the session IAM role, which provides details on the project and the buckets that can be used.
                                          instance_count=1, # Number of Amazon EC2 instances that need to be used for training.
                                          instance_type='ml.m5.2xlarge', 
                                          volume_size=5, # 5 GB, size that might be needed to store the output models (I think) 
                                          output_path=output_path,
                                          # The following three arguments reduce the cost of running the training.
                                          use_spot_instances=True, 
                                          max_run=300,
                                          max_wait=600)

In [61]:
estimator.fit({'train': s3_input_train,'validation': s3_input_test}) # s3_input_train and s3_input_test are the datasets in the bucket that we are using.

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-10-21-14-53-06-475


2024-10-21 14:53:06 Starting - Starting the training job...
2024-10-21 14:53:26 Starting - Preparing the instances for training...
2024-10-21 14:54:03 Downloading - Downloading the training image......
2024-10-21 14:55:09 Training - Training image download completed. Training in progress....
2024-10-21 14:55:30 Uploading - Uploading generated training model[34m[2024-10-21 14:55:25.719 ip-10-0-66-127.ap-south-1.compute.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-10-21 14:55:25.742 ip-10-0-66-127.ap-south-1.compute.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-10-21:14:55:26:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-10-21:14:55:26:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2024-10-21:14:55:26:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-10-21:14:55:26:INFO] Running XGBoos

### Deploying trained model

You deploy the model with the same Estimator object.

In [62]:
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-10-21-15-00-35-617
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-10-21-15-00-35-617
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-10-21-15-00-35-617


------!

### Predicting using test data

After deploying your model, it takes tabulated data in a serialized form and makes inferences on that data.

In [87]:
from sagemaker.serializers import CSVSerializer
import io

test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = CSVSerializer()
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')
predictions_array = np.genfromtxt(io.StringIO(predictions[1:]), delimiter=',')
print(predictions_array.shape)

(10297,)


In [88]:
predictions_array

array([0.0443828 , 0.04712602, 0.04159706, ..., 0.05219468, 0.03978461,
       0.02386106])

In [89]:
cm = pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 89.9%

Predicted      No Purchase    Purchase
Observed
No Purchase    91% (9017)    37% (141)
Purchase        9% (894)     63% (245) 



### Deleting endpoints and S3 Bucket data to avoid charges

After finishing this tutorial, make sure you delete all your endpoints (estimator, buckets) to avoid being charged.

In [96]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2024-10-21-15-00-35-617


ClientError: An error occurred (ValidationException) when calling the DeleteEndpoint operation: Could not find endpoint "sagemaker-xgboost-2024-10-21-15-00-35-617".