In [2]:
!pip install kaggle
!kaggle competitions download -c titanic
!unzip titanic.zip

Downloading titanic.zip to /home/ec2-user/SageMaker
  0%|                                               | 0.00/34.1k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 23.6MB/s]
Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [3]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

import time
import string
import warnings
warnings.filterwarnings('ignore')

SEED = 42

In [4]:
from sagemaker.xgboost.estimator import XGBoost
from sagemaker import image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

In [5]:
####################################
# Importing data and merging
####################################

# Reading dataset
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Adding a column in each dataset before merging
train['Type'] = 'train'
test['Type'] = 'test'

# Merging train and test
data = train.append(test)

####################################
# Missing values and new features
####################################

# Title
data['Title'] = data['Name']

# Cleaning name and extracting Title
for name_string in data['Name']:
    data['Title'] = data['Name'].str.extract('([A-Za-z]+)\.', expand=True)
    
# Replacing rare titles 
mapping = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs', 'Major': 'Other', 
           'Col': 'Other', 'Dr' : 'Other', 'Rev' : 'Other', 'Capt': 'Other', 
           'Jonkheer': 'Royal', 'Sir': 'Royal', 'Lady': 'Royal', 
           'Don': 'Royal', 'Countess': 'Royal', 'Dona': 'Royal'}
           
data.replace({'Title': mapping}, inplace=True)
titles = ['Miss', 'Mr', 'Mrs', 'Royal', 'Other', 'Master']

# Replacing missing age by median/title 
for title in titles:
    age_to_impute = data.groupby('Title')['Age'].median()[titles.index(title)]
    data.loc[(data['Age'].isnull()) & (data['Title'] == title), 'Age'] = age_to_impute
    
# New feature : Family_size
data['Family_Size'] = data['Parch'] + data['SibSp'] + 1
data.loc[:,'FsizeD'] = 'Alone'
data.loc[(data['Family_Size'] > 1),'FsizeD'] = 'Small'
data.loc[(data['Family_Size'] > 4),'FsizeD'] = 'Big'

# Replacing missing Fare by median/Pclass 
fa = data[data["Pclass"] == 3]
data['Fare'].fillna(fa['Fare'].median(), inplace = True)

#  New feature : Child
data.loc[:,'Child'] = 1
data.loc[(data['Age'] >= 18),'Child'] =0

# New feature : Family Survival (https://www.kaggle.com/konstantinmasich/titanic-0-82-0-83)
data['Last_Name'] = data['Name'].apply(lambda x: str.split(x, ",")[0])
DEFAULT_SURVIVAL_VALUE = 0.5

data['Family_Survival'] = DEFAULT_SURVIVAL_VALUE
for grp, grp_df in data[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
                               
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin == 0.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
                
for _, grp_df in data.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin == 0.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
                    
####################################
# Encoding and pre-modeling
####################################                  

# dropping useless features
data = data.drop(columns = ['Age','Cabin','Embarked','Name','Last_Name',
                            'Parch', 'SibSp','Ticket', 'Family_Size'])

# Encoding features
target_col = ["Survived"]
id_dataset = ["Type"]
cat_cols   = data.nunique()[data.nunique() < 12].keys().tolist()
cat_cols   = [x for x in cat_cols ]
# numerical columns
num_cols   = [x for x in data.columns if x not in cat_cols + target_col + id_dataset]
# Binary columns with 2 values
bin_cols   = data.nunique()[data.nunique() == 2].keys().tolist()
# Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]
# Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    data[i] = le.fit_transform(data[i])
# Duplicating columns for multi value columns
data = pd.get_dummies(data = data,columns = multi_cols )
# Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled,columns = num_cols)
# dropping original values merging scaled values for numerical columns
df_data_og = data.copy()
data = data.drop(columns = num_cols,axis = 1)
data = data.merge(scaled,left_index = True,right_index = True,how = "left")
data = data.drop(columns = ['PassengerId'],axis = 1)

# Target = 1st column
cols = data.columns.tolist()
cols.insert(0, cols.pop(cols.index('Survived')))
data = data.reindex(columns= cols)

# Cutting train and test
train = data[data['Type'] == 1].drop(columns = ['Type'])
test = data[data['Type'] == 0].drop(columns = ['Type'])


In [6]:
import sagemaker
from sagemaker import get_execution_role
import boto3

region = boto3.Session().region_name
session = sagemaker.Session()

role = get_execution_role()
bucket = session.default_bucket()

In [7]:
msk = np.random.rand(len(train)) < 0.8
df_train = train[msk]
df_valid = train[~msk]
df_train.shape, df_valid.shape

((736, 19), (155, 19))

In [8]:
prefix = 'sagemaker/titanic/xgboost'
train_file = 'df_train.csv';
df_train.to_csv(train_file, index=False, header=False)
train_data_s3_path = session.upload_data(path=train_file, key_prefix=prefix + "/train")
print('Train data uploaded to: ' + train_data_s3_path)

test_file = 'df_test.csv';
df_valid.to_csv(test_file, index=False, header=False)
test_data_s3_path = session.upload_data(path=test_file, key_prefix=prefix + "/test")
print('Test data uploaded to: ' + test_data_s3_path)

Train data uploaded to: s3://sagemaker-us-east-1-475414269301/sagemaker/titanic/xgboost/train/df_train.csv
Test data uploaded to: s3://sagemaker-us-east-1-475414269301/sagemaker/titanic/xgboost/test/df_test.csv


In [16]:
# initialize hyperparameters
hyperparameters = {
#         "max_depth":"5",
#         "eta":"0.2",
#         "gamma":"4",
#         "min_child_weight":"6",
#         "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":"100"}


output_path = 's3://{}/{}/{}/output'.format(bucket, prefix, 'titanic-xgb-built-in-algo')

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, 'latest') #"1.2-1")  #06/03/2021

# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container, 
                                          #hyperparameters=hyperparameters, #06/03/2021
                                          role=role,
                                          instance_count=1, 
                                          instance_type='ml.m5.xlarge', 
                                          sagemaker_session=session, #06/03/2021
                                          output_path=output_path)
### Start Working 06/03/2021
estimator.set_hyperparameters(
    eval_metric='auc',
    objective='binary:logistic',
    num_round=100,
    rate_drop=0.3,
    tweedie_variance_power=1.4
)
objective_metric_name = 'validation:auc'

### Adding hyperparameter tunning
https://github.com/aws/amazon-sagemaker-examples/blob/master/hyperparameter_tuning/xgboost_random_log/hpo_xgboost_random_log.ipynb

https://github.com/aws/amazon-sagemaker-examples/tree/master/hyperparameter_tuning

In [17]:
# Working 06/03/2021
# Adding Hyperparameter Tuning
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
hyperparameter_ranges = {
    'alpha': ContinuousParameter(0.01, 10, scaling_type="Logarithmic"),
    'lambda': ContinuousParameter(0.01, 10, scaling_type="Logarithmic")
}

In [20]:
# Working 06/03/2021
# Random Search
tuner_log = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=10,
    strategy='Random'
)
# define the data type and paths to the training and validation datasets
content_type = "csv"

train_input = TrainingInput(train_data_s3_path, content_type=content_type)
validation_input = TrainingInput(test_data_s3_path, content_type=content_type)

tuner_log.fit({'train': train_input, 'validation': validation_input}, include_cls_metadata=False)

..........................................................................................!


In [22]:
# Working 06/03/2021
# Linear Scaling
hyperparameter_ranges_linear = {
    'eta': ContinuousParameter(0, 1, scaling_type='Linear'),
    'min_child_weight': ContinuousParameter(1, 10, scaling_type='Linear'),
    'alpha': ContinuousParameter(0.01, 10, scaling_type="Linear"),
    'lambda': ContinuousParameter(0.01, 10, scaling_type="Linear")
}
tuner_linear = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges_linear,
    max_jobs=20,
    max_parallel_jobs=10,
    strategy='Random'
)

# custom job name to avoid a duplicate name
job_name = tuner_log.latest_tuning_job.job_name + 'linear'
tuner_linear.fit({'train': train_input, 'validation': validation_input}, include_cls_metadata=False, job_name=job_name)

............................................................................................!


### Analyze tuning job results - after tuning job is completed
Once the tuning jobs have completed, we can compare the distribution of the hyperparameter configurations chosen in the two cases.
https://github.com/aws/amazon-sagemaker-examples/blob/master/hyperparameter_tuning/xgboost_random_log/hpo_xgboost_random_log.ipynb

https://github.com/aws/amazon-sagemaker-examples/blob/master/hyperparameter_tuning/analyze_results/HPO_Analyze_TuningJob_Results.ipynb

In [29]:
sage_client = boto3.Session().client('sagemaker')
tuning_job_result_log = sage_client.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuner_log.latest_tuning_job.job_name)
tuning_job_result_linear = sage_client.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuner_linear.latest_tuning_job.job_name)
from pprint import pprint
if tuning_job_result_log.get('BestTrainingJob',None):
    print("Best model found so far Log:")
    pprint(tuning_job_result_log['BestTrainingJob'])
else:
    print("No training jobs have reported results yet.")

if tuning_job_result_linear.get('BestTrainingJob',None):
    print("Best model found so far Linear:")
    pprint(tuning_job_result_linear['BestTrainingJob'])
else:
    print("No training jobs have reported results yet.")


Best model found so far Log:
{'CreationTime': datetime.datetime(2021, 3, 6, 11, 44, 23, tzinfo=tzlocal()),
 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'validation:auc',
                                                 'Value': 0.91075199842453},
 'ObjectiveStatus': 'Succeeded',
 'TrainingEndTime': datetime.datetime(2021, 3, 6, 11, 47, 23, tzinfo=tzlocal()),
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:475414269301:training-job/xgboost-210306-1143-010-db470969',
 'TrainingJobName': 'xgboost-210306-1143-010-db470969',
 'TrainingJobStatus': 'Completed',
 'TrainingStartTime': datetime.datetime(2021, 3, 6, 11, 46, 30, tzinfo=tzlocal()),
 'TunedHyperParameters': {'alpha': '1.31666837097785',
                          'lambda': '0.011917045099302331'}}
Best model found so far Linear:
{'CreationTime': datetime.datetime(2021, 3, 6, 11, 55, 56, tzinfo=tzlocal()),
 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'validation:auc',
                               

### Hosting an endpoint using the tuner object and predicting
https://aws.amazon.com/blogs/machine-learning/simplify-machine-learning-with-xgboost-and-amazon-sagemaker/

https://www.youtube.com/watch?v=yiiLzvAry1o

https://www.youtube.com/watch?v=2xc-dddX0LU

In [39]:
from sagemaker.serializers import CSVSerializer
xgb_predictor = tuner_log.deploy(
    initial_instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    serializer = CSVSerializer())


2021-03-06 11:47:23 Starting - Preparing the instances for training
2021-03-06 11:47:23 Downloading - Downloading input data
2021-03-06 11:47:23 Training - Training image download completed. Training in progress.
2021-03-06 11:47:23 Uploading - Uploading generated training model
2021-03-06 11:47:23 Completed - Training job completed
---------------!

### Predict

In [31]:
def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

In [64]:
# X Test
X_test = test.iloc[:, 1:20]
predictions = xgb_predictor.predict(X_test.values).decode('utf-8')
predictions = np.fromstring(predictions, sep=',')
#predictions = predict(X_test.values, rows=len(X_test))

array([0.067606  , 0.87085861, 0.21798635, 0.23259716, 0.91707128,
       0.1703591 , 0.78770638, 0.09917094, 0.64056659, 0.04564077,
       0.11451034, 0.66265136, 0.89045721, 0.07535743, 0.83802766,
       0.88281065, 0.07741386, 0.16620497, 0.14529867, 0.63558495,
       0.17684659, 0.90192366, 0.89971453, 0.1119828 , 0.98093814,
       0.0295784 , 0.71770281, 0.14340948, 0.08338515, 0.02911644,
       0.06721346, 0.05723634, 0.94319779, 0.41366693, 0.2404746 ,
       0.1809684 , 0.78428042, 0.03184228, 0.11451034, 0.1672986 ,
       0.15539147, 0.17184539, 0.06923978, 0.96346319, 0.9741897 ,
       0.12303372, 0.14817947, 0.067606  , 0.98671746, 0.97762555,
       0.22591685, 0.08030577, 0.98669887, 0.97659749, 0.10959198,
       0.04080033, 0.08589345, 0.14360453, 0.10428815, 0.98753244,
       0.15264778, 0.09002801, 0.13125384, 0.49471632, 0.52501965,
       0.9110229 , 0.39243606, 0.05503313, 0.30592132, 0.9728862 ,
       0.39243606, 0.11259524, 0.78770638, 0.11064842, 0.96331

In [61]:
df_test = pd.read_csv('test.csv')
submission_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
submission_df['PassengerId'] = df_test['PassengerId']
submission_df['Survived'] = predictions
submission_df['Survived'] = submission_df['Survived'].apply(lambda x: 1 if x > 0.5 else 0)
submission_df.to_csv('submissions_xgboost.csv', header=True, index=False)
submission_df.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [62]:
submission_df['Survived'].sum()

146

In [63]:
!kaggle competitions submit -c titanic -f submissions_xgboost.csv -m "Submission using xgboost log model tunning"

100%|██████████████████████████████████████| 2.77k/2.77k [00:00<00:00, 4.79kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster

In [65]:
# Delete hosted endpoint
import boto3
sm_boto3 = boto3.client('sagemaker')
sm_boto3.delete_endpoint(EndpointName=xgb_predictor.endpoint_name)

{'ResponseMetadata': {'RequestId': 'e5dbbfd5-dab8-45e8-8a15-e91d3f8fe286',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e5dbbfd5-dab8-45e8-8a15-e91d3f8fe286',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sat, 06 Mar 2021 12:53:03 GMT'},
  'RetryAttempts': 0}}

Without using Hyperparameter Tuning

In [32]:
# # define the data type and paths to the training and validation datasets
# content_type = "csv"

# train_input = TrainingInput(train_data_s3_path, content_type=content_type)
# validation_input = TrainingInput(test_data_s3_path, content_type=content_type)

# # execute the XGBoost training job
# estimator.fit({'train': train_input, 'validation': validation_input})

# xgb_predictor = estimator.deploy(
#     initial_instance_count = 1,
#     instance_type = 'ml.m4.xlarge',
#     serializer = CSVSerializer())

### Using a model running locally

In [46]:
# To predict without deploying the instance
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.3.3-py3-none-manylinux2010_x86_64.whl (157.5 MB)
[K     |████████████████████████████████| 157.5 MB 28 kB/s s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-1.3.3


In [48]:
model_prefix = 'sagemaker/titanic/xgboost/titanic-xgb-built-in-algo/output/sagemaker-xgboost-2021-03-05-18-12-37-598/output/'
session.download_data(path='./', bucket='sagemaker-us-east-1-475414269301', 
                                 key_prefix = model_prefix)

In [49]:
import pickle as pkl 
import tarfile
import xgboost

t = tarfile.open('model.tar.gz', 'r:gz')
t.extractall()
model = pkl.load(open('xgboost-model', 'rb'))

In [58]:
# prediction with test data
pred = model.predict(xgboost.DMatrix(X_test.values))
df_test = pd.read_csv('test.csv')
submission_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
submission_df['PassengerId'] = df_test['PassengerId']
submission_df['Survived'] = pred
submission_df['Survived'] = submission_df['Survived'].apply(lambda x: 1 if x > 0.55 else 0)
submission_df.to_csv('submissions_xgboost.csv', header=True, index=False)
submission_df.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [60]:
!kaggle competitions submit -c titanic -f submissions_xgboost.csv -m "Submission using xgboost local"

100%|██████████████████████████████████████| 2.77k/2.77k [00:00<00:00, 5.76kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster