#### Preprocess data

In [1]:
# Load the CSV stored under features-output/ in the project bucket into a DataFrame.
import pandas as pd

bucket = 'imba-andy'
data_key = 'features-output/data.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)
data = pd.read_csv(filepath_or_buffer=data_location)

In [2]:
# Read the gzipped order_products "train" CSV from the same bucket and preview its first rows.
order_product_train_key = 'data/orders_products/order_products__train.csv.gz'
order_product_train_location = 's3://{}/{}'.format(bucket, order_product_train_key)
order_product_train = pd.read_csv(filepath_or_buffer=order_product_train_location)
order_product_train.head(5)

In [4]:
# Read the orders CSV from the same bucket; later cells use its
# user_id, order_id and eval_set columns.
orders_key = 'data/orders/orders.csv'
orders_location = 's3://{}/{}'.format(bucket, orders_key)
orders = pd.read_csv(filepath_or_buffer=orders_location)

In [5]:
# All three joins rely on pandas' default of joining on the columns the two frames share.

# 1) Give every order_product_train row the user_id of its order.
order_product_train = pd.merge(order_product_train, orders[['user_id', 'order_id']])

# 2) Tag each feature row with the eval_set of its user, taken from the non-'prior' orders.
data = pd.merge(
    data,
    orders.loc[orders['eval_set'] != 'prior', ['user_id', 'eval_set']],
)

# 3) Bring in the target column; the left join keeps feature rows without a match
#    (their 'reordered' value is NaN).
data = pd.merge(
    data,
    order_product_train[['user_id', 'product_id', 'reordered']],
    how='left',
)

In [8]:
# Derived features. New columns are appended in a fixed order, which matters because
# the frames are later written to CSV without a header row.

# prod_* ratios, built from the raw counters that are dropped right after.
data['prod_reorder_probability'] = data['prod_second_orders'] / data['prod_first_orders']
data['prod_reorder_times'] = 1 + data['prod_reorders'] / data['prod_first_orders']
data['prod_reorder_ratio'] = data['prod_reorders'] / data['prod_orders']
data.drop(columns=['prod_reorders', 'prod_first_orders', 'prod_second_orders'], inplace=True)

# user_* and up_* ratios.
data['user_average_basket'] = data['user_total_products'] / data['user_orders']
data['up_order_rate'] = data['up_order'] / data['user_orders']
data['up_orders_since_last_order'] = data['user_orders'] - data['up_last_order']
data['up_order_rate_since_first_order'] = data['up_order'] / (
    data['user_orders'] - data['up_first_order'] + 1
)

In [10]:
# Partition the rows by eval_set; .copy() gives each split its own frame so the
# in-place edits in later cells do not touch `data`.
# (Original author's note: the test set does not have the target variable.)
train = data.loc[data['eval_set'] == 'train'].copy()
test = data.loc[data['eval_set'] == 'test'].copy()

In [12]:
# Set the identifier columns aside in test_id, then remove them (and the
# 'reordered' column) from the test frame.
test_id = test.loc[:, ['product_id', 'user_id', 'eval_set']]
test.drop(columns=['product_id', 'user_id', 'eval_set', 'reordered'], inplace=True)

In [13]:
# Build the training label and feature frames.
# Rows whose 'reordered' is missing after the left join are filled with 0, then cast to int.
train['reordered'] = train['reordered'].fillna(0).astype(int)
train.drop(columns=['eval_set', 'user_id', 'product_id'], inplace=True)
# Double brackets keep train_y a one-column DataFrame rather than a Series.
train_y = train.loc[:, ['reordered']]
train_X = train.drop(columns=['reordered'])

#### Classification

In [16]:
import pandas as pd

# Hold out the first VAL_SIZE rows for validation and train on the remainder.
# Both splits are derived from `train` instead of re-slicing train_X / train_y in
# place, so re-running this cell reproduces the same split rather than silently
# discarding another VAL_SIZE training rows each time (and it still works after
# the later cell that sets train_X / train_y to None).
# NOTE(review): the hold-out is taken in row order, not sampled at random - confirm
# that the row order of `train` makes this a representative validation set.
VAL_SIZE = 20000

_features = train.drop(columns=['reordered'])
_labels = train[['reordered']]

val_X = _features.iloc[:VAL_SIZE]
train_X = _features.iloc[VAL_SIZE:]

val_y = _labels.iloc[:VAL_SIZE]
train_y = _labels.iloc[VAL_SIZE:]

# Drop the temporaries so they do not linger in the kernel namespace.
del _features, _labels

In [17]:
import os

# Local staging directory for the CSV files exchanged with SageMaker.
data_dir = 'data/xgboost'
# exist_ok=True creates the directory only when needed, without the separate
# exists() check (which can race with another process creating the directory).
os.makedirs(data_dir, exist_ok=True)

In [18]:
# Write the three datasets into data_dir as CSV without header or index columns.

# Test rows as they stand in `test`.
test_csv_path = os.path.join(data_dir, 'test.csv')
test.to_csv(test_csv_path, header=False, index=False)

# Validation and training rows: the label frame is placed first, followed by the features.
validation_csv_path = os.path.join(data_dir, 'validation.csv')
pd.concat([val_y, val_X], axis='columns').to_csv(validation_csv_path, header=False, index=False)

train_csv_path = os.path.join(data_dir, 'train.csv')
pd.concat([train_y, train_X], axis='columns').to_csv(train_csv_path, header=False, index=False)

In [19]:
# Drop the references to the in-memory train/validation splits
# (presumably to free memory now that they have been written to disk).
train_X = None
val_X = None
train_y = None
val_y = None

#### Uploading Training / Validation files to S3


In [20]:
import sagemaker

# Keep a handle on the current SageMaker session; it is reused for uploads,
# the region lookup and the estimator.
session = sagemaker.Session()
prefix = 'imba-xgboost'

# Upload the three local CSVs under the same key prefix (test, validation, train - in that order)
# and keep the locations returned by upload_data.
test_location, val_location, train_location = (
    session.upload_data(os.path.join(data_dir, csv_name), key_prefix=prefix)
    for csv_name in ('test.csv', 'validation.csv', 'train.csv')
)

#### Creating a tuned XGBoost model

In [21]:
from sagemaker import get_execution_role
# IAM role obtained from get_execution_role(); it is passed to the Estimator
# constructor in a later cell.
role = get_execution_role()

In [None]:
# Look up the Amazon-provided XGBoost container image (version '0.90-1') for the
# session's region.
# sagemaker.image_uris.retrieve is the SDK v2 replacement for the deprecated
# sagemaker.amazon.amazon_estimator.get_image_uri helper; the rest of this notebook
# already uses the v2 API (instance_count, TrainingInput, image_uris).
# NOTE: the following cell reassigns `container` to the 'latest' image.
import sagemaker

container = sagemaker.image_uris.retrieve(framework='xgboost',
                                          region=session.boto_region_name,
                                          version='0.90-1')

In [23]:
# Resolve the URI of the Amazon-provided XGBoost image ('latest' tag) for the session's region.
import sagemaker

container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=session.boto_region_name,
    version='latest',
)

In [24]:
# Configure the training job: container image, IAM role, hardware and S3 output location.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,               # container image resolved in the previous cell
    role=role,                         # IAM role from get_execution_role()
    instance_count=1,                  # number of compute instances
    instance_type='ml.m4.xlarge',      # kind of compute instance
    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
    sagemaker_session=session,
)

# Starting hyperparameters. max_depth, eta, gamma, min_child_weight and subsample
# are also given search ranges in the tuner cell further down.
xgb.set_hyperparameters(
    objective='binary:logistic',
    num_round=500,
    early_stopping_rounds=10,
    max_depth=5,
    eta=0.1,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
)

#### Create the hyperparameter tuner

In [25]:
# Tuner classes from the SageMaker SDK.
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner


# Tuner around the `xgb` estimator: 4 training jobs in total, at most 2 in parallel,
# minimising 'validation:rmse' over the ranges below.
# NOTE(review): the estimator's objective is 'binary:logistic'; confirm that RMSE
# (rather than e.g. AUC) is the intended tuning metric for this classifier.
xgb_hyperparameter_tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name='validation:rmse',
    objective_type='Minimize',
    max_jobs=4,
    max_parallel_jobs=2,
    hyperparameter_ranges=dict(
        max_depth=IntegerParameter(3, 12),
        eta=ContinuousParameter(0.05, 0.5),
        min_child_weight=IntegerParameter(2, 8),
        subsample=ContinuousParameter(0.5, 0.9),
        gamma=ContinuousParameter(0, 10),
    ),
)

#### Fit the hyperparameter tuner

In [None]:
# Wrap the uploaded train / validation CSV locations as training inputs
# and start the tuning job with one channel for each.
s3_input_train, s3_input_validation = (
    sagemaker.TrainingInput(s3_data=csv_location, content_type='csv')
    for csv_location in (train_location, val_location)
)
xgb_hyperparameter_tuner.fit(dict(train=s3_input_train, validation=s3_input_validation))

#### Testing model

In [None]:
# Build an estimator attached to the tuner's best training job.
best_training_job_name = xgb_hyperparameter_tuner.best_training_job()
xgb_attached = sagemaker.estimator.Estimator.attach(best_training_job_name)

In [28]:
# Batch-transform helper built from the attached estimator (one ml.m4.xlarge instance).
xgb_transformer = xgb_attached.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
)

In [None]:
# Run the batch transform over the uploaded test CSV.
# The content type and the line-by-line split must match how test.csv was written.
xgb_transformer.transform(
    test_location,
    content_type='text/csv',
    split_type='Line',
)

In [None]:
prediction=pd.read_csv('data/xgboost/test.csv.out', header=None, names=["prob"])
test_id = test_id.reset_index().drop(['index','eval_set'],axis = 1)
test = test.reset_index().drop(['index'],axis = 1)
pd.concat([test_id,test],axis=1).to_csv('data/xgboost/test_final.csv', index = False)
!aws s3 cp 'data/xgboost/test_final.csv' 's3://imba-andy/model_output/test_final.csv'

#### Deploy model

In [33]:
import boto3

# Low-level client for the 'sagemaker-runtime' service, used to invoke the endpoint directly.
boto_session = boto3.Session()
runtime = boto_session.client('sagemaker-runtime')

In [None]:
# Deploy the attached model behind a real-time endpoint (one ml.m4.xlarge instance).
xgb_predictor = xgb_attached.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

In [None]:
# Check that the endpoint has been created by displaying its name.
# `endpoint_name` is the SDK v2 attribute; `endpoint` survives only as a deprecated alias.
xgb_predictor.endpoint_name

In [None]:
# Invoke the deployed endpoint with a single CSV-encoded feature row.
# `endpoint_name` is the SDK v2 attribute; `endpoint` survives only as a deprecated alias.
response = runtime.invoke_endpoint(EndpointName = xgb_predictor.endpoint_name,
                                   ContentType = 'text/csv',
                                   Body = '1,6.67578125,3418,209,0.595703125,514,10.0,11,57,11,1599,0.1498791297340854,1.2884770346494763,0.22388993120700437,9.017543859649123,0.017543859649122806,46,0.02127659574468085')

In [None]:
# Decode the endpoint's reply: per the original note, the probability that the user buys the product.
response_bytes = response['Body'].read()
response_bytes.decode('utf-8')

#### Output data

In [37]:
# Load the transform output (one score per line, no header) and round every
# score to the nearest integer.
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
predictions = list(map(round, predictions.squeeze().values))