In [2]:
#https://www.kaggle.com/c/tabular-playground-series-apr-2021/

In [18]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.2.2-py2.py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 8.3 MB/s  eta 0:00:01
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders.cat_boost import CatBoostEncoder

from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


In [24]:
plt.rc('figure', figsize=(12.0, 6.0))

In [25]:
def converter(x):
    c, n = '', ''
    x = str(x).replace('.', '').replace('/','').replace(' ', '')
    for i in x:
        if i.isnumeric():
            n += i
        else :
            c += i 
    if n != '':
        return c, int(n)
    return c, np.nan

In [62]:
####################################
# Importing data and merging
####################################

# Reading dataset
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
ids = test['PassengerId']
# Adding a column in each dataset before merging
train['Type'] = 'train'
test['Type'] = 'test'

# Merging train and test
data = train.append(test)

####################################
# Missing values and new features
####################################
    
# New feature : Family_size
data['Family_Size'] = data['Parch'] + data['SibSp'] + 1
data.loc[:,'FsizeD'] = 'Alone'
data.loc[(data['Family_Size'] > 1),'FsizeD'] = 'Small'
data.loc[(data['Family_Size'] > 4),'FsizeD'] = 'Big'

# Replacing missing Fare by median/Pclass 
fa = data[data["Pclass"] == 3]
data['Fare'].fillna(fa['Fare'].median(), inplace = True)

#  New feature : Child
data.loc[:,'Child'] = 1
data.loc[(data['Age'] >= 18),'Child'] =0

data['Ticket_type'] = data['Ticket'].map(lambda x: converter(x)[0])
#data['Ticket_number'] = data['Ticket'].map(lambda x: converter(x)[1])

# New feature : Family Survival 
# data['Last_Name'] = data['Name'].apply(lambda x: str.split(x, ",")[0])
# DEFAULT_SURVIVAL_VALUE = 0.5

# data['Family_Survival'] = DEFAULT_SURVIVAL_VALUE
# for grp, grp_df in data[['Survived','Fare', 'Ticket', 'PassengerId',
#                            'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
                               
#     if (len(grp_df) != 1):
#         # A Family group is found.
#         for ind, row in grp_df.iterrows():
#             smax = grp_df.drop(ind)['Survived'].max()
#             smin = grp_df.drop(ind)['Survived'].min()
#             passID = row['PassengerId']
#             if (smax == 1.0):
#                 data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
#             elif (smin == 0.0):
#                 data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
                
# for _, grp_df in data.groupby('Ticket'):
#     if (len(grp_df) != 1):
#         for ind, row in grp_df.iterrows():
#             if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
#                 smax = grp_df.drop(ind)['Survived'].max()
#                 smin = grp_df.drop(ind)['Survived'].min()
#                 passID = row['PassengerId']
#                 if (smax == 1.0):
#                     data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
#                 elif (smin == 0.0):
#                     data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
                    
####################################
# Encoding and pre-modeling
####################################                  

# dropping useless features
data = data.drop(columns = ['Age','Cabin','Embarked', 'Name',
                            'Parch', 'SibSp','Ticket', 'Family_Size'])

# Encoding features
target_col = ["Survived"]
id_dataset = ["Type"]
cat_cols   = data.nunique()[data.nunique() < 12].keys().tolist()
cat_cols   = [x for x in cat_cols ]
# numerical columns
num_cols   = [x for x in data.columns if x not in cat_cols + target_col + id_dataset]

# Binary columns with 2 values
bin_cols   = data.nunique()[data.nunique() == 2].keys().tolist()

# Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]

# Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    data[i] = le.fit_transform(data[i])
    
# Duplicating columns for multi value columns
data = pd.get_dummies(data = data,columns = multi_cols )

# Scaling Numerical columns
print(num_cols)
num_cols.remove('Ticket_type')
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled,columns = num_cols)

# dropping original values merging scaled values for numerical columns
df_data_og = data.copy()
data = data.drop(columns = num_cols,axis = 1)
data = data.merge(scaled,left_index = True,right_index = True,how = "left")
data = data.drop(columns = ['PassengerId'],axis = 1)

# Target = 1st column
cols = data.columns.tolist()
cols.insert(0, cols.pop(cols.index('Survived')))
data = data.reindex(columns= cols)

# Cutting train and test
train = data[data['Type'] == 1].drop(columns = ['Type'])
test = data[data['Type'] == 0].drop(columns = ['Type'])

ce = CatBoostEncoder()
train['Ticket_type'] = ce.fit_transform(train['Ticket_type'], train['Survived'])
test['Ticket_type'] = ce.transform(test['Ticket_type'])
test.drop('Survived', axis=1, inplace=True)

['PassengerId', 'Fare', 'Ticket_type']


In [63]:
train.shape, test.shape

((100000, 11), (100000, 10))

In [54]:
train.isna().any().sum(), test.isna().any().sum()

(0, 0)

In [60]:
train.head()

Unnamed: 0,Sex,Child,Ticket_type,Pclass_1,Pclass_2,Pclass_3,FsizeD_Alone,FsizeD_Big,FsizeD_Small,Fare
0,1,1,0.42774,1,0,0,0,0,1,-0.259156
1,1,1,0.71387,0,0,1,1,0,0,-0.463749
2,1,1,0.42774,0,0,1,0,0,1,0.39587
3,1,0,0.42774,0,0,1,1,0,0,-0.468349
4,1,0,0.475913,0,0,1,1,0,0,-0.546685


pretty neat

## Predicting using simple classification models

In [55]:
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier, RandomForestClassifier, StackingClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [32]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('Survived', axis=1), train.Survived, test_size=0.2, random_state=0, stratify=train.Survived)

### Simple function to train and display metrics of generalised models using GridSearch to find good parameters

In [61]:
def FitModel(algorithm,gridSearchParams,cv):
    grid = GridSearchCV(
        estimator=algorithm,
        param_grid=gridSearchParams,
        cv=cv,  verbose=1)
    
    grid_result = grid.fit(X_train, y_train)
    best_params = grid_result.best_params_
    
    print('Best Params :',best_params)
    
    pred = grid.predict(X_test) 
    print(accuracy_score(y_test, pred))
    print(classification_report(y_test, pred))
    print(confusion_matrix(y_test, pred))
    
    return grid_result # returning the model

pd.options.display.float_format = '{:.2f}'.format

### Starting with Logistic Regression

In [64]:
lr = FitModel(LogisticRegression(), {}, 5)
#76

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params : {}
0.76345
              precision    recall  f1-score   support

           0       0.79      0.80      0.79     11445
           1       0.73      0.72      0.72      8555

    accuracy                           0.76     20000
   macro avg       0.76      0.76      0.76     20000
weighted avg       0.76      0.76      0.76     20000

[[9132 2313]
 [2418 6137]]


## AdaBoostClassifier

In [74]:
%%time
params = {'n_estimators': [30,50,100,500], 'learning_rate': [0.1,0.5,1.0]}
ada = FitModel(AdaBoostClassifier(), {}, 5)
#75 Best Params : {'learning_rate': 0.1, 'n_estimators': 30}

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Params : {}
0.7614
              precision    recall  f1-score   support

           0       0.78      0.82      0.80     11445
           1       0.74      0.69      0.71      8555

    accuracy                           0.76     20000
   macro avg       0.76      0.75      0.75     20000
weighted avg       0.76      0.76      0.76     20000

[[9365 2080]
 [2692 5863]]
CPU times: user 12.8 s, sys: 0 ns, total: 12.8 s
Wall time: 12.8 s


## RandomForestClassifier
StackingClassifier, VotingClassifier

In [57]:
%%time
params = {'n_estimators': [30,50,100,500]}
rf = FitModel(RandomForestClassifier(), params, 5)
#69 Best Params : {'n_estimators': 500}

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Params : {'n_estimators': 500}
0.68852
              precision    recall  f1-score   support

           0       0.72      0.74      0.73     14307
           1       0.64      0.62      0.63     10693

    accuracy                           0.69     25000
   macro avg       0.68      0.68      0.68     25000
weighted avg       0.69      0.69      0.69     25000

[[10614  3693]
 [ 4094  6599]]
CPU times: user 54 s, sys: 540 ms, total: 54.5 s
Wall time: 3min 36s


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'n_estimators': [30, 50, 100, 500]}, verbose=1)

## GradientBoostClassifier

In [71]:
%%time
params={"learning_rate"    : [0.01, 0.1, 0.3] ,
 "max_depth"        : [3,5,10]}
gb = FitModel(GradientBoostingClassifier(), params, 5)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best Params : {'learning_rate': 0.1, 'max_depth': 3}
0.76565
              precision    recall  f1-score   support

           0       0.80      0.79      0.79     11445
           1       0.72      0.73      0.73      8555

    accuracy                           0.77     20000
   macro avg       0.76      0.76      0.76     20000
weighted avg       0.77      0.77      0.77     20000

[[9028 2417]
 [2270 6285]]


## XGBoostClassifier

In [79]:
from xgboost import XGBClassifier

ModuleNotFoundError: No module named 'xgboost'

In [63]:
%%time
params={"learning_rate"    : [0.1] ,
 "max_depth"        : [5],
 "min_child_weight" : [3],
 "gamma"            : [0.3],
 "colsample_bytree" : [0.5] }
xgb = FitModel(XGBClassifier(), params, 4)
# 0.76 Best Params : {'colsample_bytree': 0.5, 'gamma': 0.3, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 3}

Fitting 4 folds for each of 12 candidates, totalling 48 fits
Best Params : {'colsample_bytree': 0.5, 'gamma': 0.3, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 3}
0.76204
              precision    recall  f1-score   support

           0       0.80      0.77      0.79     14307
           1       0.71      0.75      0.73     10693

    accuracy                           0.76     25000
   macro avg       0.76      0.76      0.76     25000
weighted avg       0.76      0.76      0.76     25000

[[11067  3240]
 [ 2709  7984]]
CPU times: user 3.48 s, sys: 28.1 ms, total: 3.51 s
Wall time: 3min 20s


GridSearchCV(cv=4,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None,

In [56]:
import lightgbm as lgb

In [75]:
%%time
params=dict(num_leaves=[10,50,100,500], max_depth=[3,5,10])
FitModel(lgb.LGBMClassifier(), params, 5)
# NO parameter tunning 0,766
# With parameter tunning (best params: ) Accuracy: 0,765

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Params : {'max_depth': 5, 'num_leaves': 10}
0.7658
              precision    recall  f1-score   support

           0       0.80      0.79      0.79     11445
           1       0.72      0.73      0.73      8555

    accuracy                           0.77     20000
   macro avg       0.76      0.76      0.76     20000
weighted avg       0.77      0.77      0.77     20000

[[9037 2408]
 [2276 6279]]
CPU times: user 54.2 s, sys: 616 ms, total: 54.8 s
Wall time: 28.5 s


GridSearchCV(cv=5, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=True,
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             n_jobs=None,
             param_grid={'max_depth': [3, 5, 10],
                         'num_leaves': [10, 50, 100, 500]},
  

## VotingClassifier + StackingClassifier + First submissions (for each model)

In [57]:
ada_clf = AdaBoostClassifier(n_estimators=30, learning_rate=0.1)
lr_clf = LogisticRegression()
rf_clf = RandomForestClassifier()
lgb_clf = lgb.LGBMClassifier()
gb_clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=3)
#xgb_clf = XGBClassifier(learning_rate=0.1, max_depth=5, min_child_weight=3, gamma=0.3, colsample_bytree=0.5)
estimators = [('lr',lr_clf),('lgb',lgb_clf),('rf',rf_clf),('gb',gb_clf)]
voting_clf = VotingClassifier(estimators=estimators)
stacking_clf = StackingClassifier(estimators=estimators, final_estimator=lgb.LGBMClassifier())

In [49]:
def generate_submission(predictions, csv_name):
    df = pd.DataFrame(columns=['PassengerId','Survived'])
    df['PassengerId'] = ids
    df['Survived'] = predictions#pd.Series(predictions).astype(int)
    df.to_csv(csv_name, header=True, index=False)
    return df.head(3)

In [66]:
train.head()

Unnamed: 0,Survived,Sex,Child,Ticket_type,Pclass_1,Pclass_2,Pclass_3,FsizeD_Alone,FsizeD_Big,FsizeD_Small,Fare
0,1,1,1,0.42774,1,0,0,0,0,1,-0.259156
1,0,1,1,0.71387,0,0,1,1,0,0,-0.463749
2,0,1,1,0.42774,0,0,1,0,0,1,0.39587
3,0,1,0,0.42774,0,0,1,1,0,0,-0.468349
4,1,1,0,0.475913,0,0,1,1,0,0,-0.546685


In [68]:
X = train.drop('Survived', axis=1)
y = train.Survived

In [71]:
for name, clf in estimators+[('voting',voting_clf), ('stacking',stacking_clf)]:
    clf.fit(X, y)
    y_pred = clf.predict(test)
#     print(clf.__class__.__name__,
#           accuracy_score(y_test, y_pred))
    generate_submission(y_pred, f'{name}.csv')

In [82]:
for name, clf in estimators+[('voting',voting_clf), ('stacking',stacking_clf)]:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__,
          accuracy_score(y_test, y_pred))
    #y_pred = clf.predict(test.drop('Survived', axis=1))
    #generate_submission(y_pred, f'{name}.csv')

LogisticRegression 0.76345
LGBMClassifier 0.76625
RandomForestClassifier 0.7279
GradientBoostingClassifier 0.76565
VotingClassifier 0.76525
StackingClassifier 0.76585


## KerasClassifier (ROOM FOR IMPROVEMENT)

In [84]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.4.1-cp36-cp36m-manylinux2010_x86_64.whl (394.3 MB)
[K     |████████████████████████████████| 394.3 MB 7.2 kB/s  eta 0:00:01    |██▋                             | 32.0 MB 14.6 MB/s eta 0:00:25
[?25hCollecting astunparse~=1.6.3
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting tensorflow-estimator<2.5.0,>=2.4.0
  Downloading tensorflow_estimator-2.4.0-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 49.3 MB/s eta 0:00:01
Collecting grpcio~=1.32.0
  Downloading grpcio-1.32.0-cp36-cp36m-manylinux2014_x86_64.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 55.6 MB/s eta 0:00:01
[?25hCollecting absl-py~=0.10
  Downloading absl_py-0.12.0-py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 69.7 MB/s eta 0:00:01
[?25hCollecting flatbuffers~=1.12.0
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting keras-preprocessing~=1.1.2
  Download

In [85]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [91]:
def create_model():
    dim=X_train.shape[1]
    model = Sequential()
    model.add(Dense(dim, activation='relu'))
    #model.add(Dropout(0.3))
              
    model.add(Dense(dim/2, activation='relu'))
   # model.add(Dropout(0.5))
              
    model.add(Dense(1, activation='sigmoid'))
    
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [92]:
model = create_model()

In [93]:
model.fit(X_train, y_train, epochs=40, validation_data=(X_test,y_test), callbacks=[EarlyStopping(patience=4, monitor='accuracy')])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40


<tensorflow.python.keras.callbacks.History at 0x7f572bfabbe0>

In [57]:
pred = model.predict(test.drop('Survived', axis=1)).flatten()
pred = pred>0.5

In [58]:
pred = pred.astype(int)
generate_submission(pred, 'keras.csv')

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,0
2,100002,1


In [75]:
!kaggle competitions submit -c tabular-playground-series-apr-2021 -f voting.csv -m "Voting Full dataset"

100%|████████████████████████████████████████| 879k/879k [00:00<00:00, 2.00MB/s]
Successfully submitted to Tabular Playground Series - Apr 2021

## SageMaker XGBoost + Hiperparameter tunning job  

In [3]:
# Imports
from sagemaker.xgboost.estimator import XGBoost
from sagemaker import image_uris
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker import get_execution_role
import boto3

region = boto3.Session().region_name
session = sagemaker.Session()

role = get_execution_role()
bucket = session.default_bucket()

In [103]:
# Prepare datasets
df_train = X_train.copy()
df_valid = X_test.copy()
df_train['Survived'] = y_train
df_valid['Survived'] = y_test
cols = df_train.columns.tolist()
cols.insert(0, cols.pop(cols.index('Survived')))
df_train = df_train.reindex(columns= cols)
df_valid = df_valid.reindex(columns= cols)
df_train.head()

Unnamed: 0,Survived,Sex,Child,Ticket_type,Pclass_1,Pclass_2,Pclass_3,FsizeD_Alone,FsizeD_Big,FsizeD_Small,Fare
62069,1,0,1,0.44,0,1,0,1,0,0,-0.23
24290,1,0,1,0.44,0,0,1,1,0,0,-0.25
99145,1,0,0,0.44,1,0,0,1,0,0,-0.21
47991,1,0,0,0.44,1,0,0,1,0,0,2.12
34366,1,0,0,0.44,1,0,0,0,0,1,0.64


In [104]:
# Sending data to s3
prefix = 'sagemaker/tps-titanic/xgboost'
train_file = 'df_train.csv';
df_train.to_csv(train_file, index=False, header=False)
train_data_s3_path = session.upload_data(path=train_file, key_prefix=prefix + "/train")
print('Train data uploaded to: ' + train_data_s3_path)

test_file = 'df_valid.csv';
df_valid.to_csv(test_file, index=False, header=False)
test_data_s3_path = session.upload_data(path=test_file, key_prefix=prefix + "/test")
print('Test data uploaded to: ' + test_data_s3_path)

Train data uploaded to: s3://sagemaker-us-east-1-475414269301/sagemaker/tps-titanic/xgboost/train/df_train.csv
Test data uploaded to: s3://sagemaker-us-east-1-475414269301/sagemaker/tps-titanic/xgboost/test/df_valid.csv


In [119]:
# initialize hyperparameters
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":"100"}

output_path = 's3://{}/{}/{}/output'.format(bucket, prefix, 'titanic-xgb-built-in-algo')

xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, 'latest') 

# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container, 
                                          hyperparameters=hyperparameters, 
                                          role=role,
                                          instance_count=1, 
                                          instance_type='ml.m5.2xlarge', 
                                          sagemaker_session=session, 
                                          output_path=output_path)
objective_metric_name = 'validation:auc'

### Adding hyperparameter tunning  
https://github.com/aws/amazon-sagemaker-examples/blob/master/hyperparameter_tuning/xgboost_random_log/hpo_xgboost_random_log.ipynb  

https://github.com/aws/amazon-sagemaker-examples/tree/master/hyperparameter_tuning

In [116]:
# Adding Hyperparameter Tuning
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
hyperparameter_ranges = {
    'alpha': ContinuousParameter(0.01, 10, scaling_type="Logarithmic"),
    'lambda': ContinuousParameter(0.01, 10, scaling_type="Logarithmic"),
    'max_depth': IntegerParameter(1, 8, scaling_type="Linear"),
    'eta': ContinuousParameter(0.01, 0.5, scaling_type="Logarithmic"),
    'min_child_weight': IntegerParameter(1, 8, scaling_type="Linear"),
    'gamma': ContinuousParameter(0.01, 0.5, scaling_type="Logarithmic")
}

In [110]:
# Creating the inputs
content_type = "csv"

train_input = TrainingInput(train_data_s3_path, content_type=content_type)
validation_input = TrainingInput(test_data_s3_path, content_type=content_type)

In [120]:
tuner = HyperparameterTuner(
    estimator,
    objective_metric_name,
    hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=10,
    strategy='Random'
)

In [121]:
%%time
tuner.fit({'train': train_input, 'validation': validation_input}, include_cls_metadata=False)

...................................................................................................!


In [126]:
# Analysing results
from pprint import pprint
sage_client = boto3.Session().client('sagemaker')
tuning_job_result = sage_client.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tuner.latest_tuning_job.job_name)
if tuning_job_result.get('BestTrainingJob',None):
    print("Best model found so far:")
    pprint(tuning_job_result['BestTrainingJob'])
else:
    print("No training jobs have reported results yet.")

Best model found so far:
{'CreationTime': datetime.datetime(2021, 4, 8, 12, 27, 28, tzinfo=tzlocal()),
 'FinalHyperParameterTuningJobObjectiveMetric': {'MetricName': 'validation:auc',
                                                 'Value': 0.8331210017204285},
 'ObjectiveStatus': 'Succeeded',
 'TrainingEndTime': datetime.datetime(2021, 4, 8, 12, 30, 56, tzinfo=tzlocal()),
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:475414269301:training-job/xgboost-210408-1223-019-2027c63b',
 'TrainingJobName': 'xgboost-210408-1223-019-2027c63b',
 'TrainingJobStatus': 'Completed',
 'TrainingStartTime': datetime.datetime(2021, 4, 8, 12, 29, 54, tzinfo=tzlocal()),
 'TunedHyperParameters': {'alpha': '7.7160706497411',
                          'eta': '0.07923087742761344',
                          'gamma': '0.3579007773695287',
                          'lambda': '0.3147415941655465',
                          'max_depth': '6',
                          'min_child_weight': '6'}}


In [127]:
# Creating Predictor
%%time
from sagemaker.serializers import CSVSerializer
xgb_predictor = tuner.deploy(
    initial_instance_count = 1,
    instance_type = 'ml.m4.xlarge',
    serializer = CSVSerializer())


2021-04-08 12:30:56 Starting - Preparing the instances for training
2021-04-08 12:30:56 Downloading - Downloading input data
2021-04-08 12:30:56 Training - Training image download completed. Training in progress.
2021-04-08 12:30:56 Uploading - Uploading generated training model
2021-04-08 12:30:56 Completed - Training job completed
-----------------!

In [133]:
# Predict
predictions = xgb_predictor.predict(X_test.values).decode('utf-8')
predictions = np.fromstring(predictions, sep=',')
predictions = predictions>0.5
predictions.astype(int)
accuracy_score(y_test, predictions)

0.7661

In [151]:
half_test_len = int(len(test)/2)
predictions_p1 = xgb_predictor.predict(test[:half_test_len].values).decode('utf-8')
predictions_p1 = np.fromstring(predictions_p1, sep=',')
predictions_p1 = predictions_p1>0.5
predictions_p1 = predictions_p1.astype(int)
predictions_p2 = xgb_predictor.predict(test[half_test_len:].values).decode('utf-8')
predictions_p2 = np.fromstring(predictions_p2, sep=',')
predictions_p2 = predictions_p2>0.5
predictions_p2 = predictions_p2.astype(int)

50000

In [163]:
predictions_final = predictions_p1.tolist() + predictions_p2.tolist()
len(predictions_final)

100000

In [164]:
generate_submission(predictions_final, 'hpt.csv')

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,1
2,100002,1


In [167]:
!kaggle competitions submit -c tabular-playground-series-apr-2021 -f hpt.csv -m "Hyperparameter Tunning XGB SageMaker alpha: 7.716 eta: 0.079 gamma: 0.35 lambda: 0.314 max_depth: 6 min_child_weight: 6"

100%|████████████████████████████████████████| 879k/879k [00:00<00:00, 1.98MB/s]
Successfully submitted to Tabular Playground Series - Apr 2021

In [168]:
# Delete hosted endpoint
sage_client.delete_endpoint(EndpointName=xgb_predictor.endpoint_name)

{'ResponseMetadata': {'RequestId': '2c2fb9c4-70be-42ce-a6ff-12fa122e49b0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '2c2fb9c4-70be-42ce-a6ff-12fa122e49b0',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Thu, 08 Apr 2021 13:25:21 GMT'},
  'RetryAttempts': 0}}

## SageMaker Autopilot

In [169]:
# Autopilot config
input_data_config = [{
      'DataSource': {
        'S3DataSource': {
          'S3DataType': 'S3Prefix',
          'S3Uri': 's3://{}/{}/train'.format(bucket,prefix)
        }
      },
      'TargetAttributeName': 'Survived'
    }
  ]

output_data_config = {
    'S3OutputPath': 's3://{}/{}/output'.format(bucket,prefix)
  }

In [5]:
sm = boto3.Session().client('sagemaker')

In [4]:
from time import gmtime, strftime, sleep

In [173]:
# Sending the data with header to s3 (autopilot needs the header, xgboost / autotunning doesnt)
df_train.to_csv(train_file, index=False, header=True)
train_data_s3_path = session.upload_data(path=train_file, key_prefix=prefix + "/train")
print('Train data uploaded to: ' + train_data_s3_path)

Train data uploaded to: s3://sagemaker-us-east-1-475414269301/sagemaker/tps-titanic/xgboost/train/df_train.csv


In [174]:
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())

auto_ml_job_name = 'titanic-' + timestamp_suffix
print('AutoMLJobName: ' + auto_ml_job_name)

sm.create_auto_ml_job(AutoMLJobName=auto_ml_job_name,
                      InputDataConfig=input_data_config,
                      OutputDataConfig=output_data_config,
                      AutoMLJobConfig={'CompletionCriteria':
                                       {'MaxCandidates': 100}
                                      },
                      RoleArn=role)

AutoMLJobName: titanic-08-13-32-56


{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:475414269301:automl-job/titanic-08-13-32-56',
 'ResponseMetadata': {'RequestId': 'a9c07fe4-f03b-4ae3-a7b8-bf3ab05f3c2e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a9c07fe4-f03b-4ae3-a7b8-bf3ab05f3c2e',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '90',
   'date': 'Thu, 08 Apr 2021 13:32:56 GMT'},
  'RetryAttempts': 0}}

In [11]:
auto_ml_job_name = 'titanic-08-13-32-56'

In [12]:
print ('JobStatus - Secondary Status')
print('------------------------------')


describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
print (describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
job_run_status = describe_response['AutoMLJobStatus']
    
while job_run_status not in ('Failed', 'Completed', 'Stopped'):
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    
    print (describe_response['AutoMLJobStatus'] + " - " + describe_response['AutoMLJobSecondaryStatus'])
    sleep(30)

JobStatus - Secondary Status
------------------------------
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineerin

In [13]:
from pprint import pprint
best_candidate = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)['BestCandidate']
best_candidate_name = best_candidate['CandidateName']
print("CandidateName: " + best_candidate_name)
pprint("FinalAutoMLJobObjectiveMetricName: " + best_candidate['FinalAutoMLJobObjectiveMetric']['MetricName'])
pprint("FinalAutoMLJobObjectiveMetricValue: " + str(best_candidate['FinalAutoMLJobObjectiveMetric']['Value']))

CandidateName: tuning-job-1-8fb3334eb2ac482dad-075-0cc189ff
'FinalAutoMLJobObjectiveMetricName: validation:f1'
'FinalAutoMLJobObjectiveMetricValue: 0.7666800022125244'


In [14]:
sm_dict =sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name)

In [18]:
model_name = best_candidate_name + "1145-model"
model_arn = sm.create_model(Containers=best_candidate['InferenceContainers'],
                            ModelName=model_name,
                            ExecutionRoleArn=role)

epc_name = best_candidate_name + "-epc"
ep_config = sm.create_endpoint_config(EndpointConfigName = epc_name,
                                      ProductionVariants=[{'InstanceType': 'ml.m5.xlarge',
                                                           'InitialInstanceCount': 1,
                                                           'ModelName': model_name,
                                                           'VariantName': 'main'}])

ep_name = best_candidate_name + "-ep"
create_endpoint_response = sm.create_endpoint(EndpointName=ep_name,
                                              EndpointConfigName=epc_name)

In [19]:
sm.get_waiter('endpoint_in_service').wait(EndpointName=ep_name)

In [20]:
from io import StringIO
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
predictor = Predictor(
    endpoint_name=ep_name,
    sagemaker_session=session,
    serializer=CSVSerializer(),
    deserializer=CSVDeserializer())

In [44]:
# Predict
predictions = np.array(predictor.predict(X_test.values), dtype='int8')
accuracy_score(y_test, predictions)

0.76655

In [47]:
predictions = np.array(predictor.predict(test.values), dtype='int8')

In [50]:
generate_submission(predictions, 'auto_ml.csv')

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,1
2,100002,1


In [51]:
!kaggle competitions submit -c tabular-playground-series-apr-2021 -f auto_ml.csv -m "SageMaker AutoPilot "

100%|████████████████████████████████████████| 879k/879k [00:00<00:00, 1.94MB/s]
Successfully submitted to Tabular Playground Series - Apr 2021