## Import necessary modules, specify bucket, construct error handling structure.

In [None]:
import io
import os
import sys
import time

import boto3
import botocore
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
import sagemaker.amazon.common as smac
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer, json_deserializer
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

# S3 bucket and key prefix under which the data and model output are stored.
bucket = 'es-code-backup422'
prefix = 'SageMaker'
# Execution role of this notebook; handed to the estimator when training.
execution_role = sagemaker.get_execution_role()

# Probe the bucket up front so a bad name or missing permission is reported
# here rather than as a failure in a later upload or training step.
try:
    boto3.Session().client('s3').head_bucket(Bucket=bucket)
except botocore.exceptions.ParamValidationError:
    # The bucket name failed botocore's parameter validation.
    print('No bucket specified/wrong name error.')
except botocore.exceptions.ClientError as e:
    error_code = e.response['Error']['Code']
    if error_code == '403':
        print("Permission error for this bucket, {}.".format(bucket))
    elif error_code == '404':
        print("Bucket {} was not found.".format(bucket))
    else:
        # Any other service error is unexpected here; let it surface.
        raise
else:
    print('All data and output will be stored in: s3://{}/{}'.format(bucket, prefix))


### Create some sample data in 2D space using make_blobs

In [None]:
# Synthetic data set: blobs generated around three fixed 2-D centers.
n_samples = 50000
centers = [(-5, -5), (0, 0), (2, 2)]

X, y = make_blobs(
    n_samples=n_samples,
    centers=centers,
    shuffle=False,
    random_state=42,
)

# Features and labels are both cast to float32 before being split.
X = X.astype('float32')
y = y.astype('float32')

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### Specify kNN, specify training data bucket as well as test data bucket, verify successful upload

In [None]:
# Object name shared by both uploads; train and test differ only in the
# channel folder ("train" / "test") that precedes it.
key = 'kNN'

# Serialize the training split into an in-memory buffer, then rewind the
# buffer so the upload reads it from the first byte.
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, X_train, y_train)
buf.seek(0)

# S3 keys always use "/" as the separator, so build them explicitly instead of
# with os.path.join (which follows the local OS convention).
train_key = '{}/train/{}'.format(prefix, key)
boto3.resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(buf)

s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploading training data to location: {}'.format(s3_train_data))

# Same procedure for the held-out test split.
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, X_test, y_test)
buf.seek(0)

test_key = '{}/test/{}'.format(prefix, key)
boto3.resource('s3').Bucket(bucket).Object(test_key).upload_fileobj(buf)
s3_test_data = 's3://{}/{}/test/{}'.format(bucket, prefix, key)
print('uploaded test data location: {}'.format(s3_test_data))

### Specify hyperparameters for the kNN model, and output path for the model itself. Finally, fit kNN to data.

In [None]:
# Hyperparameters forwarded to the kNN training job.
hyperparams = dict(
    feature_dim=2,
    k=3,
    sample_size=500,
    predictor_type='classifier',
)

# S3 location where the training job writes its output.
output_path = 's3://{}/{}/default_example/output'.format(bucket, prefix)

# Look up the kNN container image for the region of the current boto3 session.
image_for_knn = get_image_uri(boto3.Session().region_name, "knn")

knn = sagemaker.estimator.Estimator(
    image_for_knn,
    execution_role,
    train_instance_count=1,
    train_instance_type='ml.m5.2xlarge',  # $.583/hour
    output_path=output_path,
    sagemaker_session=sagemaker.Session(),
)
knn.set_hyperparameters(**hyperparams)

# The "train" channel is always supplied; the "test" channel is added only
# when a test data location is available.
fit_input = {'train': s3_train_data}
if s3_test_data is not None:
    fit_input.update(test=s3_test_data)

knn.fit(fit_input)

### Declare instance type, construct name and endpoint. Deploy kNN predictor with the specified information.

In [None]:
# Instance type that hosts the endpoint.
instance_type = 'ml.m4.xlarge'
model_name = 'knn_%s' % instance_type

# Build the endpoint name from instance_type (dots replaced by hyphens) so the
# name cannot drift from the instance actually deployed. The timestamp suffix
# gives every deployment a distinct name.
endpoint_name = 'knn-%s-%s' % (
    instance_type.replace('.', '-'),
    str(time.time()).replace('.', '-'),
)
print('setting up endpoint...')

knn_predictor = knn.deploy(
    initial_instance_count=1,
    instance_type=instance_type,
    endpoint_name=endpoint_name,
)

# Requests are sent as CSV and responses are parsed as JSON.
knn_predictor.content_type = 'text/csv'
knn_predictor.serializer = csv_serializer
knn_predictor.deserializer = json_deserializer

### Models are optimized to run on batches, so split the testing data into 100 batches. Print accuracy upon conclusion of fitting and prediction.

In [None]:
# Split the test features into 100 batches (np.array_split takes the number of
# sections, not the size of each one).
batches = np.array_split(X_test, 100)

# Time the full round trip: one endpoint request per batch.
start_time = time.time()
predictions = []
for batch in batches:
    result = knn_predictor.predict(batch)
    # Pull the predicted label out of each entry of the response.
    cur_predictions = np.array(
        [entry['predicted_label'] for entry in result['predictions']]
    )
    predictions.append(cur_predictions)
predictions = np.concatenate(predictions)
run_time = time.time() - start_time

test_size = y_test.shape[0]
# Vectorized count of matches; the builtin sum() would iterate element by
# element in Python.
num_correct = np.sum(predictions == y_test)
accuracy = num_correct / float(test_size)
print('time required for predicting %d data point: %.2f seconds' % (test_size, run_time))
print('accuracy of model: %.1f%%' % (accuracy*100))

### Visualize the classification boundaries of the kNN model 

In [None]:
# Wrap the test features and the predicted labels in DataFrames so they can
# be joined side by side.
X_test = pd.DataFrame(X_test)
predictions = pd.DataFrame(predictions)

df = pd.concat([X_test, predictions], axis=1)
df.columns = ['Feature_1', 'Feature_2', 'Class']

# Scatter the two features, colored by predicted class.
plt.figure(figsize=(10, 10))
plt.scatter(
    df['Feature_1'],
    df['Feature_2'],
    c=df['Class'],
    cmap='cividis',
    alpha=0.2,
)

### Visualize the actual class membership

In [None]:
# Wrap the test features and the true labels in DataFrames so they can be
# joined side by side.
X_test = pd.DataFrame(X_test)
y_test = pd.DataFrame(y_test)

df = pd.concat([X_test, y_test], axis=1)

plt.figure(figsize=(10, 10))
df.columns = ['Feature_1', 'Feature_2', 'Class']

# Scatter the two features, colored by actual class membership.
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], alpha=0.2, c=df.Class, cmap='cividis')

### Conclude by deleting the endpoint

In [None]:
def delete_endpoint(predictor):
    """Delete the SageMaker endpoint behind *predictor* and report the outcome.

    Args:
        predictor: Object whose ``endpoint`` attribute holds the name of the
            endpoint to delete.

    A ``ClientError`` returned by the SageMaker API is reported as
    "Already deleted" instead of being raised, so the cleanup cell can be
    re-run safely. Any other exception (including KeyboardInterrupt)
    propagates to the caller.
    """
    endpoint = predictor.endpoint
    try:
        boto3.client('sagemaker').delete_endpoint(EndpointName=endpoint)
    except botocore.exceptions.ClientError:
        print('Already deleted: {}'.format(endpoint))
    else:
        print('Deleted {}'.format(endpoint))

# Tear down the endpoint created by knn.deploy(); it is bound to
# `knn_predictor` (there is no variable named `predictor`).
delete_endpoint(knn_predictor)