# Importing the models

In [1]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input,Session

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


# Creating the variables of bucket name and region of aws

In [2]:
bucket_name = 'aws-titanic'
my_region = boto3.session.Session().region_name
print(my_region)

us-east-1


# Figuring out if the S3 bucket is in the us-east or not

In [3]:
s3=boto3.resource('s3')
try:
    if my_region=='us-east-1':
        s3.create_bucket(Bucket= bucket_name)
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error:',e)

S3 bucket created successfully


# Gernerating the S3 path for storing the output data with given prefix

In [4]:
prefix= 'xgboost-as-a-built-in-algo'
output_path = 's3://{}/{}/output'.format(bucket_name,prefix)
print(output_path)

s3://aws-titanic/xgboost-as-a-built-in-algo/output


# Importing the dataset using pandas library

In [8]:
import pandas as pd
df = pd.read_excel('./titanic3.xlsx')
print('Success: Data loaded into dataframe.')

Success: Data loaded into dataframe.


# Creating a train data and test data with random state

In [9]:
from sklearn.preprocessing import StandardScaler
#dropping the age column because it contains too many null values to impute with mean
df.dropna(subset=['age'], inplace=True)
#dropping any rows with null values for fare (only 1)
df.dropna(subset=['fare'], inplace=True)
#extracting the title of each person on board from the name column
df['title'] = df['name'].str.extract(r',\s*([^\.]*)\.', expand=False).str.strip()
# create dictionary to map titles to encoded values
title_encoding = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rev': 5, 'Dr': 5, 'Col': 5, 
                  'Mlle': 6, 'Ms': 6, 'Major': 6, 'Capt': 6, 'Sir': 7, 'Dona': 7, 'Jonkheer': 7, 
                  'the Countess': 7, 'Don': 8, 'Mme': 8, 'Lady': 8}
# map titles to encoded values
df['title_encoded'] = df['title'].map(title_encoding)
columns_to_encode = ['sex']
df = pd.get_dummies(df, columns=columns_to_encode)

df['boat'].fillna(-999, inplace=True)
df['body'].fillna(-999, inplace=True)

# convert the series to numeric type
df['boat'] = pd.to_numeric(df['boat'], errors='coerce')

columns_to_drop = ['cabin', 'home.dest', 'embarked', 'name', 'ticket', 'title']
df.drop(columns=columns_to_drop, axis=1, inplace=True)
# select numerical columns to normalize
numerical_cols = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'boat', 'body', 'title_encoded', 'sex_male', 'sex_female']
# create scaler object
scaler = StandardScaler()
# fit and transform selected columns
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
df

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,boat,body,title_encoded,sex_female,sex_male
0,-1.434601,1,-0.059228,-0.551897,-0.501462,3.135243,1.320890,-0.357055,0.152718,1.301268,-1.301268
1,-1.434601,1,-2.011855,0.544553,1.880484,2.061972,1.339498,-0.357055,1.878024,-0.768481,0.768481
2,-1.434601,0,-1.936534,0.544553,1.880484,2.061972,-0.748661,-0.357055,0.152718,1.301268,-1.301268
3,-1.434601,0,0.010302,0.544553,1.880484,2.061972,-0.748661,2.710703,-0.709934,-0.768481,0.768481
4,-1.434601,0,-0.337347,0.544553,1.880484,2.061972,-0.748661,-0.357055,1.015371,1.301268,-1.301268
...,...,...,...,...,...,...,...,...,...,...,...
1301,0.943128,0,1.088014,-0.551897,-0.501462,-0.528869,-0.748661,3.189533,-0.709934,-0.768481,0.768481
1304,0.943128,0,-1.067411,0.544553,-0.501462,-0.399094,-0.748661,3.232817,0.152718,1.301268,-1.301268
1306,0.943128,0,-0.233052,-0.551897,-0.501462,-0.528869,-0.748661,3.167891,-0.709934,-0.768481,0.768481
1307,0.943128,0,-0.198288,-0.551897,-0.501462,-0.528869,-0.748661,-0.357055,-0.709934,-0.768481,0.768481


# Creating a train and test data set with random state

In [10]:
import numpy as np
train_data,test_data = np.split(df.sample(frac=1,random_state=1729),[int(0.7*len(df))])
print(train_data.shape,test_data.shape)

(731, 11) (314, 11)


# Converting training data set into csv and upload it to S3

In [11]:
import os
from sagemaker.inputs import TrainingInput
file_name = 'train.csv'
pd.concat([train_data['survived'],train_data.drop(['survived','body'],axis=1)],axis=1).to_csv('train.csv',index=False,header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix,'train/train.csv')).upload_file(file_name)
s3_input_train = TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

# Converting test data set into csv and upload it to S3

In [12]:
file_name = 'test.csv'
pd.concat([test_data['survived'],test_data.drop(['survived','body'],axis=1)],axis=1).to_csv('test.csv',index=False,header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix,'test/test.csv')).upload_file(file_name)
s3_input_test = TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name,prefix),content_type='csv')

# Container helps to hold uri for the xgboost container image. It helps to used for the sagemaker tasks like training model.

In [13]:
import sagemaker
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="1.0-1"
)

# Hyperparameters helps to control the XGBoost algorithm behavior

In [14]:
hyperparameters={
    "max_depth":"5",
    "eta":"0.2",
    "gamma":"4",
    "min_child_weight":"6",
    "subsample":"0.7",
    "objective":"binary:logistic",
    "num_round": "100",
}

# Setup of the sagemaker estimator to configure the training model 

In [15]:
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_count=1,  # Update this line
    instance_type='ml.m5.2xlarge',  # Update this line
    volume_size=5,  # Update this line
    output_path=output_path,
    use_spot_instances=True,  # Update this line
    max_run=300,  # Update this line
    max_wait=600,# Update this line
      # Add TrainingImageConfig to specify your private Docker registry
    training_image_config={
        "RepositoryAccessMode": "Vpc",  # Specify Vpc if it's a private Docker registry
        # Add any other required configurations for your private registry
    },
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [16]:
estimator.fit({'train':s3_input_train,'validation':s3_input_test})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-11-05-19-20-50-185


2023-11-05 19:20:50 Starting - Starting the training job...
2023-11-05 19:21:06 Starting - Preparing the instances for training.........
2023-11-05 19:22:38 Downloading - Downloading input data...
2023-11-05 19:23:18 Training - Training image download completed. Training in progress....
2023-11-05 19:23:49 Uploading - Uploading generated training model.[34m[2023-11-05 19:23:41.784 ip-10-0-191-95.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','

# xgb predictor helps to send input data for inference and receive prediction from the deployment in xgboost model.

In [17]:
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-11-05-19-29-36-403
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-11-05-19-29-36-403
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-11-05-19-29-36-403


-------!

# Making prediction that deploys in xgboost model and prints out the shape of the prediction array

In [21]:
from sagemaker.serializers import CSVSerializer
test_data_array = test_data.drop(['survived','body'],axis=1).values

xgb_predictor.serializer = CSVSerializer()

predictions = xgb_predictor.predict(test_data_array).decode('utf-8')

predictions_array = np.fromstring(predictions[1:],sep=',')
print(predictions_array.shape)

(314,)


In [22]:
predictions_array

array([0.01613514, 0.01059572, 0.01059572, 0.01613514, 0.99080044,
       0.01059572, 0.16792849, 0.09694657, 0.01059572, 0.99249333,
       0.01059572, 0.99249333, 0.99080044, 0.01973535, 0.03610218,
       0.01059572, 0.01059572, 0.99249333, 0.93900681, 0.99249333,
       0.11644396, 0.01613514, 0.98284411, 0.0086481 , 0.98598051,
       0.98598051, 0.01973535, 0.0086481 , 0.9593097 , 0.01059572,
       0.9777084 , 0.01973535, 0.0086481 , 0.01059572, 0.98176628,
       0.98598051, 0.01613514, 0.9593097 , 0.01059572, 0.99249333,
       0.11644396, 0.01613514, 0.01973535, 0.0086481 , 0.01613514,
       0.01059572, 0.93900681, 0.99249333, 0.99249333, 0.93900681,
       0.9777084 , 0.96660233, 0.01059572, 0.99249333, 0.93900681,
       0.11644396, 0.09694657, 0.01059572, 0.0086481 , 0.01973535,
       0.11644396, 0.99249333, 0.01059572, 0.01059572, 0.01059572,
       0.98598051, 0.01973535, 0.99249333, 0.11644396, 0.95889938,
       0.09694657, 0.01613514, 0.99080044, 0.99080044, 0.01973

# Confusion matrix

In [23]:
cm= pd.crosstab(index=test_data['survived'],columns=np.round(predictions_array),rownames=['Observed'],colnames=['Predict'])
tn = cm.iloc[0,0];fn=cm.iloc[1,0];tp=cm.iloc[1,1];fp=cm.iloc[0,1];p=(tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall classification rate:",p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted","No survived","Survived"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No survived",tn/(tn+fn)*100,tn,fp/(tp+fp)*100,fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("survived",fn/(tn+fn*100),fn,tp/(tp+fp)*100,tp))


Overall classification rate:92.0%

Predicted      No survived    Survived
Observed
No survived    88% (170)     1% (1)
survived        0% (24)     99% (119) 



# Clean up the resources after deploying the model

In [None]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
buckte_to_delete=boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()