In [22]:
# SageMaker SDK plus the helper that resolves this notebook's IAM role.
import sagemaker
from sagemaker import get_execution_role

# S3 bucket and key prefix under which this notebook stores its artifacts.
bucket = 'battleship-do-not-delete'
prefix = 'battleship'

# Session object used for SageMaker / S3 calls made from this notebook.
sagemaker_session = sagemaker.Session()

# SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

In [23]:
# S3 location of the training CSV. Built from `bucket` so the bucket name is
# defined in exactly one place; the resulting URI is unchanged.
train_input = 's3://{}/board-raw-data/board.csv'.format(bucket)

In [26]:
from sagemaker.sklearn.estimator import SKLearn

# Configure a scikit-learn estimator whose training entry point is `model.py`.
sklearn = SKLearn(
    entry_point= 'model.py',
    # "local" instance type: the fit log further down shows the job attaching
    # to a container on this machine.
    train_instance_type="local",
    role=role,
    # Output location is s3://<bucket>/<prefix>/output.
    output_path='s3://{}/{}/output'.format(bucket, prefix) )

# NOTE(review): the explicit session argument below is disabled, so the
# estimator is built without `sagemaker_session` -- confirm this is intentional.
#     sagemaker_session=sagemaker_session)

In [27]:
# Run training, passing the S3 CSV under the 'train' key of the inputs dict.
sklearn.fit({'train': train_input})

Creating tmpxcyo_12x_algo-1-z4z3b_1 ... 
[1BAttaching to tmpxcyo_12x_algo-1-z4z3b_12mdone[0m
[36malgo-1-z4z3b_1  |[0m 2019-07-14 05:14:52,458 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
[36malgo-1-z4z3b_1  |[0m 2019-07-14 05:14:52,460 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)
[36malgo-1-z4z3b_1  |[0m 2019-07-14 05:14:52,471 sagemaker_sklearn_container.training INFO     Invoking user training script.
[36malgo-1-z4z3b_1  |[0m 2019-07-14 05:14:52,588 sagemaker-containers INFO     Module model does not provide a setup.py. 
[36malgo-1-z4z3b_1  |[0m Generating setup.py
[36malgo-1-z4z3b_1  |[0m 2019-07-14 05:14:52,588 sagemaker-containers INFO     Generating setup.cfg
[36malgo-1-z4z3b_1  |[0m 2019-07-14 05:14:52,588 sagemaker-containers INFO     Generating MANIFEST.in
[36malgo-1-z4z3b_1  |[0m 2019-07-14 05:14:52,588 sagemaker-containers INFO     Installing module with the following command:
[36malg

## Using the trained model to make inference requests <a class="anchor" id="inference"></a>

### Deploy the model <a class="anchor" id="deploy"></a>

Deploying the model to SageMaker hosting just requires a `deploy` call on the fitted model. This call takes an instance count and instance type.

In [None]:
# Deploy the fitted estimator on one ml.m4.xlarge instance and keep the
# returned predictor, which later cells use to send prediction requests.
predictor = sklearn.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

### Choose some data and use it for a prediction <a class="anchor" id="prediction_request"></a>

In order to do some predictions, we'll extract some of the data we used for training and do predictions against it. This is, of course, bad statistical practice, but a good way to see how the mechanism works.

In [None]:
import itertools
import pandas as pd

# Read the CSV without a header row, so columns are addressed by position.
shape = pd.read_csv("data/iris.csv", header=None)

# Row positions 40-49, 90-99 and 140-149: three runs of ten rows, fifty apart.
a = list(range(0, 150, 50))
b = list(range(40, 50))
indices = [sum(pair) for pair in itertools.product(a, b)]

# Keep every selected row except the last one. Column 0 is treated as the
# label and the remaining columns as the features.
test_data = shape.iloc[indices[:-1]]
test_y = test_data.iloc[:, 0]
test_X = test_data.iloc[:, 1:]

Prediction is as easy as calling `predict` on the predictor we got back from `deploy`, passing the data we want predictions for. The endpoint returns a numerical representation of the classification prediction; in the original dataset, these are flower names, but in this example the labels are numerical. We can compare against the original labels that we parsed.

In [None]:
# Send the feature rows to the endpoint and print its predictions, then print
# the true labels of the same rows for a side-by-side comparison.
print(predictor.predict(test_X.values))
print(test_y.values)

### Endpoint cleanup <a class="anchor" id="endpoint_cleanup"></a>

When you're done with the endpoint, you'll want to clean it up.

In [None]:
# Delete the endpoint through the predictor returned by `deploy`. The
# estimator-level `delete_endpoint()` is a legacy API that newer SageMaker SDK
# releases no longer provide; the predictor method is the supported way to
# tear the endpoint down.
predictor.delete_endpoint()

## Batch Transform <a class="anchor" id="batch_transform"></a>
We can also use the trained model for asynchronous batch inference on S3 data using SageMaker Batch Transform.

In [None]:
# Define a SKLearn Transformer from the trained SKLearn Estimator.
# It is configured with a single ml.m4.xlarge instance.
transformer = sklearn.transformer(instance_count=1, instance_type='ml.m4.xlarge')

### Prepare Input Data <a class="anchor" id="prepare_input_data"></a>
We will extract 10 random samples of 100 rows from the training data, then split the features (X) from the labels (Y). We then upload the input data to a given location in S3.

In [None]:
%%bash
# Build ten random 100-row samples of the iris CSV, then split each sample
# into its features (fields 2 onward) and its label (field 1).
mkdir -p batch_data/XY batch_data/X batch_data/Y
for i in {0..9}; do
    sample="batch_data/XY/iris_sample_${i}.csv"
    shuf -n 100 data/iris.csv > "${sample}"
    cut -d',' -f2- "${sample}" > "batch_data/X/iris_sample_X_${i}.csv"
    cut -d',' -f1 "${sample}" > "batch_data/Y/iris_sample_Y_${i}.csv"
done

In [None]:
# Upload input data from local filesystem to S3, under the key prefix
# "<prefix>/batch_input". The returned value is what the transform job is
# given as its input location.
# NOTE(review): no bucket is passed here, so presumably the session's default
# bucket is used rather than `bucket` -- confirm that is the intended target.
batch_input_s3 = sagemaker_session.upload_data('batch_data/X', key_prefix=prefix + '/batch_input')

### Run Transform Job <a class="anchor" id="run_transform_job"></a>
Using the Transformer, run a transform job on the S3 input data.

In [None]:
# Start a transform job over the uploaded CSV input and wait for it to finish.
transformer.transform(batch_input_s3, content_type='text/csv')
# Print the job name before blocking on wait().
print('Waiting for transform job: ' + transformer.latest_transform_job.job_name)
transformer.wait()

### Check Output Data  <a class="anchor" id="check_output_data"></a>
After the transform job has completed, download the output data from S3. For each file "f" in the input data, we have a corresponding file "f.out" containing the predicted labels for each input row. We can compare the predicted labels to the true labels saved earlier.

In [None]:
# Download the output data from S3 to local filesystem
batch_output = transformer.output_path
# Make sure the local target directory exists, then copy everything under the
# transformer's output path into it ($batch_output is interpolated by IPython).
!mkdir -p batch_data/output
!aws s3 cp --recursive $batch_output/ batch_data/output/
# Head to see what the batch output looks like
!head batch_data/output/*

In [None]:
%%bash
# For each sample file, compare the predicted labels from batch output to the true labels.
# The samples are numbered 0 through 9, so the loop starts at 0; starting at 1
# would silently skip the first sample.
#
# The two sed filters turn each .out file into one label per line: the first
# strips '[' and '"' characters, the second replaces ", " and "]" with newlines.
# The final sed rewrites the process-substitution path (/dev/fd/63) in diff's
# report to the real output file name.
for i in {0..9}; do
    diff -s batch_data/Y/iris_sample_Y_${i}.csv \
        <(cat batch_data/output/iris_sample_X_${i}.csv.out | sed 's/[["]//g' | sed 's/, \|]/\n/g') \
        | sed "s/\/dev\/fd\/63/batch_data\/output\/iris_sample_X_${i}.csv.out/"
done