### Reference
https://www.kaggle.com/vincentlugat/titanic-neural-networks-keras-81-8

https://gitlab.com/juliensimon/dlnotebooks/-/blob/master/keras/05-keras-blog-post/Fashion%20MNIST-SageMaker.ipynb

In [5]:
!pip install kaggle

Collecting kaggle
  Downloading kaggle-1.5.10.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 3.6 MB/s  eta 0:00:01
Collecting python-slugify
  Downloading python-slugify-4.0.1.tar.gz (11 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 1.2 MB/s  eta 0:00:01
Building wheels for collected packages: kaggle, python-slugify
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.10-py3-none-any.whl size=73269 sha256=5afb0841f65574ebad03ba027d628d470e78430b5e35f09fe5c5a4975d150967
  Stored in directory: /home/ec2-user/.cache/pip/wheels/1c/dd/dd/c493e6f981182c1411e288c553310f76e212bac3afbdac1294
  Building wheel for python-slugify (setup.py) ... [?25ldone
[?25h  Created wheel for python-slugify: filename=python_slugify-4.0.1-py2.py3-none-any.whl size=6767 sha256=b87edb4610ef903c781f9582b4964c98df4d5d4cbd979a325cdaa28b673f59b3
 

In [6]:
!kaggle competitions download -c titanic

Downloading titanic.zip to /home/ec2-user/SageMaker
  0%|                                               | 0.00/34.1k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 34.1k/34.1k [00:00<00:00, 22.8MB/s]


In [7]:
!unzip titanic.zip

Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [1]:
####################################
#  Libraries
####################################

import numpy as np 
import pandas as pd 
# Data processing, metrics and modeling
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout
# Reproductibility
from numpy.random import seed
seed(1002)
import tensorflow
tensorflow.random.set_seed(1002)


Using TensorFlow backend.


### All EDA and FE steps combined in one cell
Not a good practice, but my purpose with this notebook is to build and deploy model in sagemaker

In [2]:
####################################
# Importing data and merging
####################################

# Reading dataset
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Adding a column in each dataset before merging
train['Type'] = 'train'
test['Type'] = 'test'

# Merging train and test
data = train.append(test)

####################################
# Missing values and new features
####################################

# Title
data['Title'] = data['Name']

# Cleaning name and extracting Title
for name_string in data['Name']:
    data['Title'] = data['Name'].str.extract('([A-Za-z]+)\.', expand=True)
    
# Replacing rare titles 
mapping = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs', 'Major': 'Other', 
           'Col': 'Other', 'Dr' : 'Other', 'Rev' : 'Other', 'Capt': 'Other', 
           'Jonkheer': 'Royal', 'Sir': 'Royal', 'Lady': 'Royal', 
           'Don': 'Royal', 'Countess': 'Royal', 'Dona': 'Royal'}
           
data.replace({'Title': mapping}, inplace=True)
titles = ['Miss', 'Mr', 'Mrs', 'Royal', 'Other', 'Master']

# Replacing missing age by median/title 
for title in titles:
    age_to_impute = data.groupby('Title')['Age'].median()[titles.index(title)]
    data.loc[(data['Age'].isnull()) & (data['Title'] == title), 'Age'] = age_to_impute
    
# New feature : Family_size
data['Family_Size'] = data['Parch'] + data['SibSp'] + 1
data.loc[:,'FsizeD'] = 'Alone'
data.loc[(data['Family_Size'] > 1),'FsizeD'] = 'Small'
data.loc[(data['Family_Size'] > 4),'FsizeD'] = 'Big'

# Replacing missing Fare by median/Pclass 
fa = data[data["Pclass"] == 3]
data['Fare'].fillna(fa['Fare'].median(), inplace = True)

#  New feature : Child
data.loc[:,'Child'] = 1
data.loc[(data['Age'] >= 18),'Child'] =0

# New feature : Family Survival (https://www.kaggle.com/konstantinmasich/titanic-0-82-0-83)
data['Last_Name'] = data['Name'].apply(lambda x: str.split(x, ",")[0])
DEFAULT_SURVIVAL_VALUE = 0.5

data['Family_Survival'] = DEFAULT_SURVIVAL_VALUE
for grp, grp_df in data[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):
                               
    if (len(grp_df) != 1):
        # A Family group is found.
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin == 0.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
                
for _, grp_df in data.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin == 0.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
                    
####################################
# Encoding and pre-modeling
####################################                  

# dropping useless features
data = data.drop(columns = ['Age','Cabin','Embarked','Name','Last_Name',
                            'Parch', 'SibSp','Ticket', 'Family_Size'])

# Encoding features
target_col = ["Survived"]
id_dataset = ["Type"]
cat_cols   = data.nunique()[data.nunique() < 12].keys().tolist()
cat_cols   = [x for x in cat_cols ]
# numerical columns
num_cols   = [x for x in data.columns if x not in cat_cols + target_col + id_dataset]
# Binary columns with 2 values
bin_cols   = data.nunique()[data.nunique() == 2].keys().tolist()
# Columns more than 2 values
multi_cols = [i for i in cat_cols if i not in bin_cols]
# Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    data[i] = le.fit_transform(data[i])
# Duplicating columns for multi value columns
data = pd.get_dummies(data = data,columns = multi_cols )
# Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled,columns = num_cols)
# dropping original values merging scaled values for numerical columns
df_data_og = data.copy()
data = data.drop(columns = num_cols,axis = 1)
data = data.merge(scaled,left_index = True,right_index = True,how = "left")
data = data.drop(columns = ['PassengerId'],axis = 1)

# Target = 1st column
cols = data.columns.tolist()
cols.insert(0, cols.pop(cols.index('Survived')))
data = data.reindex(columns= cols)

# Cutting train and test
train = data[data['Type'] == 1].drop(columns = ['Type'])
test = data[data['Type'] == 0].drop(columns = ['Type'])


In [3]:
train.shape, test.shape

((891, 19), (418, 19))

In [4]:
import sagemaker
from sagemaker import get_execution_role
session = sagemaker.Session()
role = get_execution_role()

In [40]:
%%writefile script_keras.py

import argparse
import joblib
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold
from tensorflow import keras
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
# Reproductibility
from numpy.random import seed
seed(1002)
import tensorflow
tensorflow.random.set_seed(1002)


def keras_model_fn():
    # create model
    model = Sequential()
    model.add(Dense(13, input_dim = 18, activation = 'relu'))
    model.add(Dropout(0.2))
    model.add(Dense(8, activation = 'relu'))
    model.add(Dense(1, activation = 'sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model


if __name__ =='__main__':

    print('extracting arguments')
    parser = argparse.ArgumentParser()

    # Data, model, and output directories
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train-file', type=str, default='df_train.csv')
    parser.add_argument('--test-file', type=str, default='df_test.csv')

    args, _ = parser.parse_known_args()
    print(args.train)
    SEED = 42
    print('reading data')
    df_train = pd.read_csv(os.path.join(args.train, args.train_file))
    df_test = pd.read_csv(os.path.join(args.test, args.test_file))
    
    # X and Y
    X_train = df_train.iloc[:, 1:20].values
    y_train = df_train.iloc[:,0].values

    model = keras_model_fn()
    model.fit(X_train, y_train, epochs = 20, batch_size = 10)
    
    model_dir = os.environ.get('SM_MODEL_DIR')
    
    print(f'model_dir {model_dir}')
    # save Keras model for Tensorflow Serving
    version='0000'
    tensorflow.saved_model.save(model, os.path.join(model_dir, version))

Overwriting script_keras.py


In [6]:
prefix = 'sagemaker/titanic/keras'

# Send Train data to s3
train_file = 'df_train.csv'
train.to_csv(train_file, index=False, header=True)
train_data_s3_path = session.upload_data(path=train_file, key_prefix=prefix + "/train")
print('Train data uploaded to: ' + train_data_s3_path)

# Send Test data to s3
test_file = 'df_test.csv';
test.to_csv(test_file, index=False, header=True)
test_data_s3_path = session.upload_data(path=test_file, key_prefix=prefix + "/test")
print('Test data uploaded to: ' + test_data_s3_path)

Train data uploaded to: s3://sagemaker-us-east-1-475414269301/sagemaker/titanic/keras/train/df_train.csv
Test data uploaded to: s3://sagemaker-us-east-1-475414269301/sagemaker/titanic/keras/test/df_test.csv


In [7]:
from sagemaker.tensorflow import TensorFlow

In [20]:
estimator = TensorFlow(
    entry_point='script_keras.py',
    #source_dir='code', # directory of your training script
    role=role,
    framework_version='2.3.0',
    #model_dir=False, # don't pass --model_dir to your training script
    py_version='py37',
    instance_type='ml.m5.xlarge',
    instance_count=1,
    #output_path=output_path,
)

In [21]:
estimator.fit({
    'train': train_data_s3_path,
    'test': test_data_s3_path
})

2021-03-06 15:20:19 Starting - Starting the training job...
2021-03-06 15:20:29 Starting - Launching requested ML instancesProfilerReport-1615044019: InProgress
.........
2021-03-06 15:22:11 Starting - Preparing the instances for training...
2021-03-06 15:22:40 Downloading - Downloading input data...
2021-03-06 15:23:14 Training - Training image download completed. Training in progress..[34m2021-03-06 15:23:17,972 sagemaker-training-toolkit INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2021-03-06 15:23:17,980 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-03-06 15:23:21,690 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-03-06 15:23:21,706 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-03-06 15:23:21,722 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-03-06 15

In [23]:
estimator.model_data # where my model is located

's3://sagemaker-us-east-1-475414269301/tensorflow-training-2021-03-06-15-20-19-021/output/model.tar.gz'

In [11]:
# X Test
X_test = test.iloc[:, 1:20].values

In [24]:
from sagemaker.serializers import CSVSerializer
predictor = estimator.deploy(instance_type='ml.m4.xlarge',
                           initial_instance_count=1,
                           serializer = CSVSerializer())

update_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


---------------!

In [25]:
predictor.endpoint_name # in order to use with invoke_endpoint in lambda with API gateway

'tensorflow-training-2021-03-06-15-44-49-854'

In [26]:
X_test[:1]

array([[ 1.       ,  0.       ,  0.       ,  0.       ,  1.       ,
         0.       ,  0.       ,  1.       ,  0.       ,  0.       ,
         0.       ,  1.       ,  0.       ,  0.       ,  0.       ,
         1.       ,  0.       , -0.5031762]])

In [27]:
predictions = predictor.predict(X_test)
prediction_df = pd.DataFrame(predictions['predictions'])
prediction_df[0] = prediction_df[0].astype(float)
prediction_df[0] = prediction_df[0].apply(lambda x: 1 if x > 0.5 else 0)
y_pred = prediction_df[0].astype(int)

df_test = pd.read_csv('test.csv')
submission_df = pd.DataFrame(columns=['PassengerId', 'Survived'])
submission_df['PassengerId'] = df_test['PassengerId']
submission_df['Survived'] = y_pred.values
submission_df.to_csv('submission_keras.csv', header=True, index=False)
submission_df.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [76]:
!kaggle competitions submit -c titanic -f submission_keras.csv -m "Submission using Keras Custom"

100%|██████████████████████████████████████| 2.77k/2.77k [00:00<00:00, 10.8kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster

In [28]:
import boto3
sm_boto3 = boto3.client('sagemaker')
sm_boto3.delete_endpoint(EndpointName=predictor.endpoint_name)

{'ResponseMetadata': {'RequestId': 'a2591bcb-ce75-43ff-a035-2918a3191a56',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a2591bcb-ce75-43ff-a035-2918a3191a56',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sat, 06 Mar 2021 16:18:01 GMT'},
  'RetryAttempts': 0}}