In [1]:
!pip install kaggle

Collecting kaggle
  Downloading kaggle-1.5.10.tar.gz (59 kB)
[K     |████████████████████████████████| 59 kB 9.1 MB/s  eta 0:00:01
Collecting python-slugify
  Downloading python-slugify-4.0.1.tar.gz (11 kB)
Collecting text-unidecode>=1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 738 kB/s  eta 0:00:01
Building wheels for collected packages: kaggle, python-slugify
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.10-py3-none-any.whl size=73269 sha256=0db242022c50840e4218c022c1e0c0b518cadcac3c4ef061e4bf27c08de14ad7
  Stored in directory: /home/ec2-user/.cache/pip/wheels/1c/dd/dd/c493e6f981182c1411e288c553310f76e212bac3afbdac1294
  Building wheel for python-slugify (setup.py) ... [?25ldone
[?25h  Created wheel for python-slugify: filename=python_slugify-4.0.1-py2.py3-none-any.whl size=6767 sha256=aa9c73266cfe4e0192bba24af63aafaed2b6fc68007bd32a956a015700e21a2c
 

In [8]:
# Download the files of the "titanic" competition with the Kaggle CLI.
!kaggle competitions download -c titanic

/bin/sh: kaggle: command not found


In [11]:
!unzip titanic.zip

Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

import time
import string
import warnings
warnings.filterwarnings('ignore')

SEED = 42

In [2]:
####################################
# Importing data and merging
####################################

# Reading dataset
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Tag every row with its origin so the two sets can be separated again after
# the shared feature engineering.
train['Type'] = 'train'
test['Type'] = 'test'

# Stack train on top of test. DataFrame.append was removed in pandas 2.0, so
# pd.concat is used; ignore_index=True gives each row of the combined frame its
# own label instead of repeating the labels of the two source frames.
data = pd.concat([train, test], ignore_index=True)

####################################
# Missing values and new features
####################################

# Title: the run of letters that is immediately followed by a period in Name.
# A single vectorised extract handles the whole column, so no per-name loop is
# needed; the raw string keeps `\.` a regex escape rather than a string escape.
data['Title'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Replacing rare titles
mapping = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs', 'Major': 'Other',
           'Col': 'Other', 'Dr' : 'Other', 'Rev' : 'Other', 'Capt': 'Other',
           'Jonkheer': 'Royal', 'Sir': 'Royal', 'Lady': 'Royal',
           'Don': 'Royal', 'Countess': 'Royal', 'Dona': 'Royal'}

data['Title'] = data['Title'].replace(mapping)
# The title values expected after the mapping; used for the age imputation
titles = ['Miss', 'Mr', 'Mrs', 'Royal', 'Other', 'Master']

# Replacing missing age by median/title
# The groupby result is ordered by Title label, not by the order of `titles`,
# so each median is looked up by its label; indexing by list position would
# hand most titles the median of a different title.
median_age_by_title = data.groupby('Title')['Age'].median()
for title in titles:
    if title not in median_age_by_title.index:
        # No passenger carries this title, so there is nothing to impute
        continue
    age_to_impute = median_age_by_title[title]
    data.loc[(data['Age'].isnull()) & (data['Title'] == title), 'Age'] = age_to_impute
    
# New feature : Family_size (Parch + SibSp + 1)
data['Family_Size'] = data['Parch'] + data['SibSp'] + 1
# FsizeD buckets the size: above 4 -> 'Big', above 1 -> 'Small', otherwise 'Alone'
family_size = data['Family_Size']
data['FsizeD'] = np.select([family_size > 4, family_size > 1],
                           ['Big', 'Small'],
                           default='Alone')

# Replacing missing Fare by the median Fare of the Pclass 3 rows
fa = data[data["Pclass"] == 3]
# Assign the filled column back: an in-place fillna on the selection
# data['Fare'] is not guaranteed to write through to `data`.
data['Fare'] = data['Fare'].fillna(fa['Fare'].median())

#  New feature : Child -- 0 when Age >= 18, 1 otherwise (a missing Age also gives 1)
is_adult = data['Age'] >= 18
data['Child'] = (~is_adult).astype('int64')

# New feature : Family Survival (https://www.kaggle.com/konstantinmasich/titanic-0-82-0-83)
# Family_Survival summarises what is known about the *other* members of a
# passenger's group: 1 if any of them has Survived == 1, otherwise 0 if any of
# them has Survived == 0, otherwise DEFAULT_SURVIVAL_VALUE.
data['Last_Name'] = data['Name'].apply(lambda x: str.split(x, ",")[0])
DEFAULT_SURVIVAL_VALUE = 0.5

data['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

# Pass 1: rows sharing both Last_Name and Fare are treated as one family.
for grp, grp_df in data[['Survived','Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Last_Name', 'Fare']):

    if (len(grp_df) != 1):
        # A Family group is found.
        for _, row in grp_df.iterrows():
            passID = row['PassengerId']
            # Leave out the current passenger by PassengerId (already used below
            # as the row identifier) rather than by row label: drop(label)
            # removes every row carrying that label, which would discard extra
            # group members whenever labels are not unique.
            others = grp_df.loc[grp_df['PassengerId'] != passID, 'Survived']
            smax = others.max()
            smin = others.min()
            if (smax == 1.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin == 0.0):
                data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0

# Pass 2: rows sharing a Ticket form a group; only passengers whose value is
# still 0 or the default are revisited, so a 1 from pass 1 is never overwritten.
for _, grp_df in data.groupby('Ticket'):
    if (len(grp_df) != 1):
        for _, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) or (row['Family_Survival'] == 0.5):
                passID = row['PassengerId']
                others = grp_df.loc[grp_df['PassengerId'] != passID, 'Survived']
                smax = others.max()
                smin = others.min()
                if (smax == 1.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin == 0.0):
                    data.loc[data['PassengerId'] == passID, 'Family_Survival'] = 0
                    
####################################
# Encoding and pre-modeling
####################################                  

# dropping useless features
data = data.drop(columns=['Age', 'Cabin', 'Embarked', 'Name', 'Last_Name',
                          'Parch', 'SibSp', 'Ticket', 'Family_Size'])

# Encoding features
target_col = ["Survived"]
id_dataset = ["Type"]

# Number of distinct values per column, computed once and reused below
n_unique = data.nunique()

# Categorical columns: fewer than 12 distinct values
cat_cols = n_unique[n_unique < 12].index.tolist()
# Numerical columns: everything that is neither categorical, target nor dataset id
non_numeric = set(cat_cols + target_col + id_dataset)
num_cols = [col for col in data.columns if col not in non_numeric]
# Binary columns: exactly 2 distinct values
bin_cols = n_unique[n_unique == 2].index.tolist()
# Categorical columns with more than 2 values
multi_cols = [col for col in cat_cols if col not in bin_cols]

# Label encoding Binary columns (the same encoder object is refitted per column)
le = LabelEncoder()
for col in bin_cols:
    data[col] = le.fit_transform(data[col])

# One indicator column per value for the multi value columns
data = pd.get_dummies(data=data, columns=multi_cols)
# Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled,columns = num_cols)
# Keep a copy of the frame as it was before the raw numeric columns are replaced
df_data_og = data.copy()
# Replace the raw numeric columns with their scaled versions.
# `scaled` is labelled 0..n-1 in row order, which need not match the row labels
# of `data`; an index merge would then give a row the scaled values of a
# different row. The rows are in the same order, so copy the values across by
# position instead.
data = data.drop(columns = num_cols)
for col in num_cols:
    data[col] = scaled[col].values
data = data.drop(columns = ['PassengerId'])

# Target = 1st column
cols = data.columns.tolist()
cols.remove('Survived')
cols = ['Survived'] + cols
data = data.reindex(columns=cols)

# Cutting train and test: rows with Type == 1 go to train, rows with Type == 0
# go to test; the Type column itself is dropped from both.
type_flag = data['Type']
train = data.loc[type_flag == 1].drop(columns='Type')
test = data.loc[type_flag == 0].drop(columns='Type')


In [3]:
import sagemaker
from sagemaker import get_execution_role
import boto3

# Region name of the default boto3 session; used when the SageMaker client is created
region = boto3.Session().region_name
# SageMaker session used for the S3 uploads/downloads and by the Predictor
session = sagemaker.Session()

# Execution role passed to the AutoML job and model creation calls
role = get_execution_role()
# Default bucket of the SageMaker session; all S3 URIs in this notebook are built from it
bucket = session.default_bucket()

In [4]:
# Random split of `train`: each row lands in df_train with probability 0.8.
# The generator is seeded with SEED so the split is the same on every run.
msk = np.random.RandomState(SEED).rand(len(train)) < 0.8
df_train = train[msk]
df_valid = train[~msk]
df_train.shape, df_valid.shape

((722, 19), (169, 19))

In [7]:
prefix = 'sagemaker/titanic/autopilot'

# The whole training frame is uploaded (with a header row), as Autopilot
# doesn't need a separate validation dataset.
train_file = 'df_train.csv'
train.to_csv(train_file, header=True, index=False)
train_key_prefix = prefix + "/train"
train_data_s3_path = session.upload_data(path=train_file, key_prefix=train_key_prefix)
print('Train data uploaded to: ' + train_data_s3_path)

# df_valid is written without a header row and uploaded under the test prefix
test_file = 'df_test.csv'
df_valid.to_csv(test_file, header=False, index=False)
test_key_prefix = prefix + "/test"
test_data_s3_path = session.upload_data(path=test_file, key_prefix=test_key_prefix)
print('Test data uploaded to: ' + test_data_s3_path)

Train data uploaded to: s3://sagemaker-us-east-1-475414269301/sagemaker/titanic/autopilot/train/df_train.csv
Test data uploaded to: s3://sagemaker-us-east-1-475414269301/sagemaker/titanic/autopilot/test/df_test.csv


In [41]:
### END OF FEATURE ENGINEERING. NOW WE ARE GOING TO TRY DIFFERENT APPROACHES TO MODELING.
# Following the Kaggle kernel, this is where the author starts to train models; he uses Random Forest.

In [6]:
# import libraries
import re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role                                           
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   


# boto3 SageMaker client for `region`; used below for the AutoML job, model,
# endpoint-config and endpoint API calls
sm = boto3.Session().client(service_name='sagemaker',region_name=region)

### START: AUTOPILOT FOR THIS FIRST SUBMISSION

In [8]:
# Autopilot config
# Input: the objects under the train/ prefix; 'Survived' is the target column
train_s3_uri = 's3://{}/{}/train'.format(bucket,prefix)
input_data_config = [
    {
        'DataSource': {
            'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': train_s3_uri},
        },
        'TargetAttributeName': 'Survived',
    },
]

# Output: S3 location handed to the job as its output path
output_s3_uri = 's3://{}/{}/output'.format(bucket,prefix)
output_data_config = {'S3OutputPath': output_s3_uri}

In [9]:
# Job name: 'titanic-' followed by the current UTC day-of-month, hour, minute and second
timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())
auto_ml_job_name = 'titanic-' + timestamp_suffix
print('AutoMLJobName: ' + auto_ml_job_name)

# Completion criterion: at most 100 candidates
auto_ml_job_config = {'CompletionCriteria': {'MaxCandidates': 100}}

# Start the job; the API response is the cell output
sm.create_auto_ml_job(
    AutoMLJobName=auto_ml_job_name,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    AutoMLJobConfig=auto_ml_job_config,
    RoleArn=role,
)

AutoMLJobName: titanic-06-16-54-20


{'AutoMLJobArn': 'arn:aws:sagemaker:us-east-1:475414269301:automl-job/titanic-06-16-54-20',
 'ResponseMetadata': {'RequestId': '3c5d42ad-f31c-4686-9c43-23f02b71614d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '3c5d42ad-f31c-4686-9c43-23f02b71614d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '90',
   'date': 'Sat, 06 Mar 2021 16:54:20 GMT'},
  'RetryAttempts': 0}}

In [10]:
# Store the AutoMLJobName with the %store magic for use in subsequent notebooks
%store auto_ml_job_name
# Echo the job name as the cell output
auto_ml_job_name

Stored 'auto_ml_job_name' (str)


'titanic-06-16-54-20'

In [12]:
print ('JobStatus - Secondary Status')
print('------------------------------')

# Statuses after which the job will not change any more
TERMINAL_STATUSES = ('Failed', 'Completed', 'Stopped')

# Poll every 30 seconds. The status is checked right after each describe call,
# so the loop ends as soon as a terminal status is seen (no trailing sleep) and
# the first poll is not immediately repeated.
while True:
    describe_response = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response['AutoMLJobStatus']
    print (job_run_status + " - " + describe_response['AutoMLJobSecondaryStatus'])
    if job_run_status in TERMINAL_STATUSES:
        break
    time.sleep(30)

JobStatus - Secondary Status
------------------------------
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - AnalyzingData
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
InProgress - FeatureEngineering
I

In [13]:
# S3 locations of the two notebooks listed in the job's artifacts, taken from
# the last describe response; each is printed and kept for the download step.
job_artifacts = describe_response['AutoMLJobArtifacts']

candidate_nbk = job_artifacts['CandidateDefinitionNotebookLocation']
print(candidate_nbk)

data_explore_nbk = job_artifacts['DataExplorationNotebookLocation']
print(data_explore_nbk)

s3://sagemaker-us-east-1-475414269301/sagemaker/titanic/autopilot/output/titanic-06-16-54-20/sagemaker-automl-candidates/pr-1-d909d0738fc7453f995410df019bbd84cb52a8cfa0e44da4836dd5fa1e/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb
s3://sagemaker-us-east-1-475414269301/sagemaker/titanic/autopilot/output/titanic-06-16-54-20/sagemaker-automl-candidates/pr-1-d909d0738fc7453f995410df019bbd84cb52a8cfa0e44da4836dd5fa1e/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb


In [49]:
def split_s3_path(s3_path):
    """Split an S3 URI into its bucket and key.

    Only a leading ``s3://`` scheme is stripped, so a key that itself contains
    the text ``s3://`` is returned unchanged.

    Args:
        s3_path: S3 location such as ``"s3://my-bucket/some/key.txt"``. The
            ``s3://`` scheme is optional.

    Returns:
        A ``(bucket, key)`` tuple. ``key`` is ``""`` when nothing follows the
        bucket name.
    """
    scheme = "s3://"
    if s3_path.startswith(scheme):
        s3_path = s3_path[len(scheme):]
    # Everything before the first "/" is the bucket, the rest is the key
    bucket, _, key = s3_path.partition("/")
    return bucket, key

# Split both notebook URIs into bucket and key. Only the bucket parsed from the
# candidate-definition URI is kept; it is reused for both downloads.
s3_bucket, candidate_nbk_key = split_s3_path(candidate_nbk)
_, data_explore_nbk_key = split_s3_path(data_explore_nbk)

print(s3_bucket, candidate_nbk_key, data_explore_nbk_key)

# Download the candidate-definition notebook into the current directory
session.download_data(path='./', bucket=s3_bucket, 
                                 key_prefix = candidate_nbk_key)

# Download the data-exploration notebook into the current directory
session.download_data(path='./', bucket=s3_bucket, 
                                 key_prefix = data_explore_nbk_key)

sagemaker-us-east-1-475414269301 sagemaker/titanic/output/automl-churn-03-15-42-12/sagemaker-automl-candidates/pr-1-c4cf1a7550d04a7ba4a1e624712982dc421136ae7c4345a1a981f305f9/notebooks/SageMakerAutopilotCandidateDefinitionNotebook.ipynb sagemaker/titanic/output/automl-churn-03-15-42-12/sagemaker-automl-candidates/pr-1-c4cf1a7550d04a7ba4a1e624712982dc421136ae7c4345a1a981f305f9/notebooks/SageMakerAutopilotDataExplorationNotebook.ipynb


In [14]:
# Fetch the job description again and pull out its best candidate
job_description = sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
best_candidate = job_description['BestCandidate']
best_candidate_name = best_candidate['CandidateName']

# Full candidate record first, then the name and the objective metric
print(best_candidate)
print('\n')
print("CandidateName: " + best_candidate_name)
objective_metric = best_candidate['FinalAutoMLJobObjectiveMetric']
print("FinalAutoMLJobObjectiveMetricName: " + objective_metric['MetricName'])
print("FinalAutoMLJobObjectiveMetricValue: " + str(objective_metric['Value']))

{'CandidateName': 'tuning-job-1-14df8e6e0b94403b80-083-d8dd10fb', 'FinalAutoMLJobObjectiveMetric': {'MetricName': 'validation:f1', 'Value': 0.8369100093841553}, 'ObjectiveStatus': 'Succeeded', 'CandidateSteps': [{'CandidateStepType': 'AWS::SageMaker::ProcessingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:475414269301:processing-job/db-1-d1e5fd82b1fe4d6f97cb244c23571409ab46f4fa6fe64d3481e28991d1', 'CandidateStepName': 'db-1-d1e5fd82b1fe4d6f97cb244c23571409ab46f4fa6fe64d3481e28991d1'}, {'CandidateStepType': 'AWS::SageMaker::TrainingJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:475414269301:training-job/titanic-06-dpp4-1-88a2c8acde80405c974a62f0205b8920378983434e0a4', 'CandidateStepName': 'titanic-06-dpp4-1-88a2c8acde80405c974a62f0205b8920378983434e0a4'}, {'CandidateStepType': 'AWS::SageMaker::TransformJob', 'CandidateStepArn': 'arn:aws:sagemaker:us-east-1:475414269301:transform-job/titanic-06-dpp4-csv-1-afe3af40db614a63b871a973523d3ab44417e2674', 'CandidateStepName': '

In [15]:
# Alternative calls kept for reference (disabled):
#sm.describe_auto_ml_job(AutoMLJobName=auto_ml_job_name)
#sm.list_auto_ml_jobs()
# Response of a single list-candidates call for the job; its 'Candidates' entry
# is iterated in the next cell.
# NOTE(review): only one response is kept -- if this API paginates, the
# remaining candidates are not fetched here; confirm against the boto3 docs.
sm_dict =sm.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name)

In [16]:
# One entry per candidate: name and objective metric, then the image of the
# second inference container (index 1) followed by a blank line
for candidate in sm_dict['Candidates']:
    print(candidate['CandidateName'], candidate['FinalAutoMLJobObjectiveMetric'])
    print(candidate['InferenceContainers'][1]['Image'], "\n")

tuning-job-1-14df8e6e0b94403b80-097-18e80781 {'MetricName': 'validation:f1', 'Value': 0.7937899827957153}
683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3 

tuning-job-1-14df8e6e0b94403b80-098-bc33fa59 {'MetricName': 'validation:f1', 'Value': 0.8098000288009644}
683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3 

tuning-job-1-14df8e6e0b94403b80-099-8008e3da {'MetricName': 'validation:f1', 'Value': 0.8162999749183655}
683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3 

tuning-job-1-14df8e6e0b94403b80-096-f459ef34 {'MetricName': 'validation:f1', 'Value': 0.8194000124931335}
683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3 

tuning-job-1-14df8e6e0b94403b80-094-961beb7b {'MetricName': 'validation:f1', 'Value': 0.8235899806022644}
683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.0-1-cpu-py3 

tuning-job-1-14df8e6e0b94403b80-093-5d5a7977 {'MetricName': 'validation:bin

In [17]:
# The model, endpoint config and endpoint names share one stem:
# <best candidate name><timestamp suffix>
resource_stem = best_candidate_name + timestamp_suffix

# Model built from the best candidate's inference containers
model_name = resource_stem + "-model"
model_arn = sm.create_model(
    Containers=best_candidate['InferenceContainers'],
    ModelName=model_name,
    ExecutionRoleArn=role,
)

# Endpoint config with a single variant: one ml.m5.xlarge instance serving the model
epc_name = resource_stem + "-epc"
main_variant = {
    'InstanceType': 'ml.m5.xlarge',
    'InitialInstanceCount': 1,
    'ModelName': model_name,
    'VariantName': 'main',
}
ep_config = sm.create_endpoint_config(
    EndpointConfigName=epc_name,
    ProductionVariants=[main_variant],
)

# Endpoint created from that config
ep_name = resource_stem + "-ep"
create_endpoint_response = sm.create_endpoint(
    EndpointName=ep_name,
    EndpointConfigName=epc_name,
)


In [None]:
# Block on the 'endpoint_in_service' waiter for endpoint `ep_name`
sm.get_waiter('endpoint_in_service').wait(EndpointName=ep_name)

In [20]:
from io import StringIO
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
# Predictor bound to endpoint `ep_name`: requests go through CSVSerializer and
# responses through CSVDeserializer
predictor = Predictor(
    endpoint_name=ep_name,
    sagemaker_session=session,
    serializer=CSVSerializer(),
    deserializer=CSVDeserializer())


In [21]:
# Display the name of the endpoint the predictor is bound to
predictor.endpoint_name

'tuning-job-1-14df8e6e0b94403b80-083-d8dd10fb06-16-54-20-ep'

In [24]:
# X Test: every column of `test` except the target, which the feature
# engineering cell moved to the first position
X_test = test.drop(columns=['Survived'])
# The predictor was built with CSVDeserializer, which returns the parsed CSV
# rows (a list of lists of strings) rather than bytes, so there is nothing to
# .decode(). Assumes the prediction is the first field of each returned row --
# TODO confirm against the endpoint's response format.
response_rows = predictor.predict(X_test.values)
predictions = np.array([row[0] for row in response_rows], dtype=float)

ValidationError: An error occurred (ValidationError) when calling the InvokeEndpoint operation: Endpoint tuning-job-1-14df8e6e0b94403b80-083-d8dd10fb06-16-54-20-ep of account 475414269301 not found.

In [None]:
# Remove the target column from X_train_df
# NOTE(review): X_train_df and y_train are not assigned in any visible cell of
# this notebook -- confirm where they come from before running this cell.
test_data_inference = X_train_df.drop('Survived', axis=1)
#print(test_data_inference.head())

# Obtain predictions from SageMaker endpoint (payload: CSV text without header or index)
prediction = predictor.predict(test_data_inference.to_csv(sep=',', header=False, index=False))

# Load prediction in pandas and compare to ground truth
prediction_df = pd.DataFrame(prediction)
y_train_df = pd.DataFrame(y_train)
# Binarise column 0 of the response: values above 0.6 become 1.0, the rest 0.0
# NOTE(review): the submission cell thresholds at 0.5 -- confirm 0.6 is intended here.
prediction_df[0] = prediction_df[0].astype(float).apply(lambda x: 1.0 if x > 0.6 else 0.0)
# Fraction of rows whose binarised prediction equals column 0 of y_train_df
accuracy = (y_train_df[0] == prediction_df[0]).sum() / len(test_data_inference)
print('Accuracy: {}'.format(accuracy))

In [62]:
# Predict on the test data
# NOTE(review): X_test_df is not assigned in any visible cell, and the result
# is stored in `prediction`/`prediction_df` while the submission cell reads
# `predictions` -- confirm whether this cell is still needed.
test_data_inference = X_test_df.drop('Survived', axis=1)
# Payload: CSV text without header or index
prediction = predictor.predict(test_data_inference.to_csv(sep=',', header=False, index=False))
prediction_df = pd.DataFrame(prediction)
# Column 0 of the response converted to float
prediction_df[0] = prediction_df[0].astype(float)

In [63]:
# Submission frame: PassengerId from the raw test file, Survived from `predictions`
df_test = pd.read_csv('test.csv')
submission_df = df_test[['PassengerId']].copy()
submission_df['Survived'] = predictions
# Turn the prediction into a label: 1 when above 0.5, otherwise 0
submission_df['Survived'] = (submission_df['Survived'] > 0.5).astype('int64')
submission_df.to_csv('submissions.csv', index=False, header=True)
submission_df.head(10)

Unnamed: 0,PassengerId,Survived
891,892,0
892,893,1
893,894,0
894,895,0
895,896,1
896,897,0
897,898,0
898,899,0
899,900,1
900,901,0


In [68]:
# Submit submissions.csv to the titanic competition with the Kaggle CLI.
# NOTE(review): the message says 50 candidates, while the job-creation cell
# sets MaxCandidates to 100 -- confirm which one is right.
!kaggle competitions submit -c titanic -f submissions.csv -m "Submission using autopilot model with 50 candidates only"

100%|██████████████████████████████████████| 2.77k/2.77k [00:00<00:00, 6.10kB/s]
Successfully submitted to Titanic - Machine Learning from Disaster

In [22]:
# Deleting the hosted model and deleting the model (Frugality)
sm.delete_endpoint(EndpointName=ep_name) 
sm.delete_endpoint_config(EndpointConfigName=epc_name) 
sm.delete_model(ModelName=model_name)

{'ResponseMetadata': {'RequestId': 'b7bcd880-4054-4de0-b54b-e21152f3d400',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b7bcd880-4054-4de0-b54b-e21152f3d400',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sat, 06 Mar 2021 19:32:23 GMT'},
  'RetryAttempts': 0}}

### END Autopilot submission