# XGBoost Demo

## Download dataset

Dataset used - Titanic @ https://www.kaggle.com/competitions/titanic/data

In [1]:
!wget -nv https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv titanic.csv

2022-11-24 21:23:47 URL:https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv [60302/60302] -> "titanic.csv.1" [1]
wget: unable to resolve host address ‘titanic.csv’
FINISHED --2022-11-24 21:23:47--
Total wall clock time: 0.2s
Downloaded: 1 files, 59K in 0.002s (27.8 MB/s)


## Clean and transform data

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv("titanic.csv")
df = df.dropna()

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


In [4]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,183.0,183.0,183.0,183.0,183.0,183.0,183.0
mean,455.36612,0.672131,1.191257,35.674426,0.464481,0.47541,78.682469
std,247.052476,0.470725,0.515187,15.643866,0.644159,0.754617,76.347843
min,2.0,0.0,1.0,0.92,0.0,0.0,0.0
25%,263.5,0.0,1.0,24.0,0.0,0.0,29.7
50%,457.0,1.0,1.0,36.0,0.0,0.0,57.0
75%,676.0,1.0,1.0,47.5,1.0,1.0,90.0
max,890.0,1.0,3.0,80.0,3.0,4.0,512.3292


In [5]:
df_features = df[["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]]

# Binary encode gender
df_features["Sex"] = df_features["Sex"].apply(lambda x: 1 if x == "female" else 0)

# Get dummy variables for embarked
df_features = df_features.join(pd.get_dummies(df_features["Embarked"], prefix="Embarked")).drop("Embarked", axis=1)

df_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_features["Sex"] = df_features["Sex"].apply(lambda x: 1 if x == "female" else 0)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
1,1,1,1,38.0,1,0,71.2833,1,0,0
3,1,1,1,35.0,1,0,53.1,0,0,1
6,0,1,0,54.0,0,0,51.8625,0,0,1
10,1,3,1,4.0,1,1,16.7,0,0,1
11,1,1,1,58.0,0,0,26.55,0,0,1


In [6]:
df_train, df_test = train_test_split(df_features, test_size=0.1)

len(df_train), len(df_test)

(164, 19)

In [7]:
train_path = "titanic_train.csv"
test_path = "titanic_test.csv"

df_train.to_csv(train_path, header=False, index=False)
df_test.to_csv(test_path, header=False, index=False)

## Save data to S3

In [8]:
import sagemaker
import boto3
import os

In [9]:
sess = sagemaker.Session()

bucket_name = sess.default_bucket()
s3 = boto3.Session().resource("s3").Bucket(bucket_name)
role = sagemaker.get_execution_role()

bucket_prefix = "xgboost-demo"

In [27]:
train_path_s3 = os.path.join(bucket_prefix, "data", train_path)
test_path_s3 = os.path.join(bucket_prefix, "data", test_path)

s3.Object(train_path_s3).upload_file(train_path)
s3.Object(test_path_s3).upload_file(test_path)

## Train and tune model

In [15]:
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker.estimator import Estimator
from sagemaker.tuner import IntegerParameter, HyperparameterTuner

In [28]:
container = sagemaker.image_uris.retrieve("xgboost", sess.boto_region_name, "latest")

train_s3 = TrainingInput(s3_data=f"s3://{bucket_name}/{train_path_s3}", content_type="csv")
test_s3 = TrainingInput(s3_data=f"s3://{bucket_name}/{test_path_s3}", content_type="csv")

model_s3 = f"s3://{bucket_name}/{os.path.join(bucket_prefix, 'model')}"

model = Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=model_s3,
    sagemaker_session=sess
)

model.set_hyperparameters(
    objective="binary:logistic",
    num_round=50,
)

In [31]:
hyperparameter_ranges = {
    "num_round": IntegerParameter(48, 52)
}

objective_metric_name = "validation:auc"

tuner = HyperparameterTuner(
    model,
    objective_metric_name,
    hyperparameter_ranges,
    objective_type="Maximize",
    max_jobs=2,
    max_parallel_jobs=2
)

In [32]:
tuner.fit({"train": train_s3, "validation": test_s3})

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


.............................................!


In [39]:
analytics = tuner.analytics()

if not analytics.dataframe().empty:
    df_analytics = analytics.dataframe().sort_values(["FinalObjectiveValue"])
    
df_analytics

Unnamed: 0,num_round,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
0,52.0,xgboost-221124-2140-002-3e497100,Completed,0.911765,2022-11-24 21:42:53+00:00,2022-11-24 21:44:00+00:00,67.0
1,50.0,xgboost-221124-2140-001-c7f4fc70,Completed,0.911765,2022-11-24 21:42:47+00:00,2022-11-24 21:43:54+00:00,67.0


## Deploy model

In [44]:
predictor = tuner.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge", serializer=CSVSerializer())


2022-11-24 21:43:55 Starting - Preparing the instances for training
2022-11-24 21:43:55 Downloading - Downloading input data
2022-11-24 21:43:55 Training - Training image download completed. Training in progress.
2022-11-24 21:43:55 Uploading - Uploading generated training model
2022-11-24 21:43:55 Completed - Resource released due to keep alive period expiry
-------!

In [46]:
test_data = [1, 1, 38.0, 1, 0, 71.2833, 1, 0, 0]

predictions = predictor.predict(test_data)

predictions

b'0.9991437196731567'

In [48]:
# Delete model to avoid incurring costs
predictor.delete_endpoint()

ClientError: An error occurred (ValidationException) when calling the DeleteEndpointConfig operation: Could not find endpoint configuration "arn:aws:sagemaker:ap-southeast-2:641490273710:endpoint-config/xgboost-221124-2140-001-c7f4fc70".