This notebook was modified from the DSBA-6190 Project 2 notebook to incorporate use of Amazon Sagemaker.

The Wine Quality Dataset (https://archive.ics.uci.edu/ml/datasets/Wine+Quality) was used as the data input. Several regression models were generated, using the provided data to predict red wine quality.

# Import

## Libraries / Packages

In [44]:
# General
import pandas as pd
import numpy as np

import json
import os
import tarfile
import datetime

#SageMaker
import boto3
import io
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.tuner import IntegerParameter, CategoricalParameter

# Data Preparation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Modeling
from sklearn.ensemble import RandomForestRegressor

# Output
from sklearn.metrics import mean_squared_error,r2_score

# Export
from sklearn.externals import joblib

seed = 5590

## Sagemaker Settings

In [88]:
region = boto3.session.Session().region_name
print('Region: {}'.format(region))

role = get_execution_role()
print('Role: {}'.format(role))

sm_boto3 = boto3.client('sagemaker')
print(sm_boto3)

Region: us-east-1
Role: arn:aws:iam::726963482731:role/service-role/AmazonSageMaker-ExecutionRole-20200222T172260
<botocore.client.SageMaker object at 0x7f4f28ec37f0>


## Data

Initialize the S3 connection and name the target bucket.

In [3]:
s3_client = boto3.client('s3')
data_bucket_name='dsba-6190-final-team-project'

Check the contents of the bucket, and assign the import data file path and name to the variable **file_data**.

In [4]:
obj_list=s3_client.list_objects(Bucket=data_bucket_name)
file=[]
for contents in obj_list['Contents']:
    file.append(contents['Key'])
print(file)
file_data=file[1]

['data/', 'data/winequality-red.csv']


In [5]:
file_data

'data/winequality-red.csv'

Import the data into the variable **response_body** then read into a dataframe.

In [6]:
response = s3_client.get_object(Bucket=data_bucket_name, Key=file_data)
print(list(response))
response_body = response['Body'].read()
df_wine_red = pd.read_csv(io.BytesIO(response_body), header=0, delimiter=";", low_memory=False) 

['ResponseMetadata', 'AcceptRanges', 'LastModified', 'ContentLength', 'ETag', 'ContentType', 'Metadata', 'Body']


# Preprocessing

We need to define input file as a string to the S3 bucket location for the SKLearn process to be able to call.

In [7]:
# Input File Path Check
input_data = 's3://dsba-6190-final-team-project/data/winequality-red.csv'.format(region)

print('The input data is found at the following path: {}'.format(input_data))

# Pull input data locally to check correct data is used. Full data set not needed. Pull 5 rows.w
df_verify_input = pd.read_csv(input_data, header=0, delimiter=";", low_memory=False, nrows = 5)
print('Shape of Verification Dataframe: {}'.format(df_verify_input.shape))
df_verify_input.head()

The input data is found at the following path: s3://dsba-6190-final-team-project/data/winequality-red.csv
Shape of Verification Dataframe: (5, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5


## Preprocessing

In [14]:
#Initialize SKLearn Preprocessing Function
sklearn_processor = SKLearnProcessor(framework_version='0.20.0',
                                     role=role, 
                                     base_job_name = "scikit-preprocessing",
                                     instance_type='ml.m4.xlarge',
                                     instance_count=1)

### Preprocessing Script
The following script is used to run the preproocessing stage.

In [12]:
%%writefile preprocessing.py

import argparse
import os
import warnings

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

seed = 5590

if __name__=='__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-test-split-ratio', type=float, default=0.2)
    args, _ = parser.parse_known_args()
    
    print('Received arguments {}'.format(args))
    
    input_data_path = os.path.join('/opt/ml/processing/input', 'winequality-red.csv')
    
    print('Reading input data from {}'.format(input_data_path))
    df = pd.read_csv(input_data_path, header=0, delimiter=";", low_memory=False)
    
      
    split_ratio = args.train_test_split_ratio
    
    print('Splitting data into train and test sets with ratio {}'.format(split_ratio))
    
    #Split Data into training / test
    X_train, X_test, y_train, y_test = train_test_split(df.drop('quality', axis=1), 
                                                        df['quality'], 
                                                        test_size=split_ratio, 
                                                        random_state=seed)
    
    # Transform Features
    scaler = StandardScaler().fit(X_train)
    train_features = scaler.transform(X_train)
    test_features = scaler.transform(X_test)
    
    # Convert Data to Float32
    train_features = train_features.astype('float32')
    test_features = test_features.astype('float32')
    
    # Create local output directories
    try:
        os.makedirs('/opt/ml/processing/output/train')
        os.makedirs('/opt/ml/processing/output/test')
    except:
        pass
    
    print('Train Features shape after preprocessing: {}'.format(train_features.shape))
    print('Train Labels shape after preprocessing: {}'.format(y_train.shape))
    print('Test Features shape after preprocessing: {}'.format(test_features.shape))
    print('Test Labels shape after preprocessing: {}'.format(y_test.shape))
    
    train_features_output_path = os.path.join('/opt/ml/processing/output/train', 'train_features.csv')
    train_labels_output_path = os.path.join('/opt/ml/processing/output/train', 'train_labels.csv')
    
    test_features_output_path = os.path.join('/opt/ml/processing/output/test', 'test_features.csv')
    test_labels_output_path = os.path.join('/opt/ml/processing/output/test', 'test_labels.csv')
    
    print('Saving training features to {}'.format(train_features_output_path))
    pd.DataFrame(train_features).to_csv(train_features_output_path, header= False, index=False)
    
    print('Saving test features to {}'.format(test_features_output_path))
    pd.DataFrame(test_features).to_csv(test_features_output_path, header=False, index=False)
    
    print('Saving training labels to {}'.format(train_labels_output_path))
    y_train.to_csv(train_labels_output_path, header=False, index=False)
    
    print('Saving test labels to {}'.format(test_labels_output_path))
    y_test.to_csv(test_labels_output_path, header=False, index=False)

Overwriting preprocessing.py


The preprocessing script can now be run with the SKLearnProcessor.run() method. 

In [52]:
hyperparameters = {
    'train-test-split-ratio': 0.3
}

In [53]:
hyperparameters

{'train-test-split-ratio': 0.3}

In [15]:
sklearn_processor.run(code='preprocessing.py',
                      arguments=['--train-test-split-ratio', '0.3'],
                      inputs=[ProcessingInput(
                        source=input_data,
                        destination='/opt/ml/processing/input')],
                      outputs=[ProcessingOutput(output_name='train_data',
                                                source='/opt/ml/processing/output/train'),
                               ProcessingOutput(output_name='test_data',
                                                source='/opt/ml/processing/output/test')]
                     )

preprocessing_job_description = sklearn_processor.jobs[-1].describe()

output_config = preprocessing_job_description['ProcessingOutputConfig']
for output in output_config['Outputs']:
    if output['OutputName'] == 'train_data':
        preprocessed_training_data = output['S3Output']['S3Uri']
    if output['OutputName'] == 'test_data':
        preprocessed_test_data = output['S3Output']['S3Uri']


Job Name:  rf-scikit-preprocessing-2020-02-29-12-45-13-372
Inputs:  [{'InputName': 'input-1', 'S3Input': {'S3Uri': 's3://dsba-6190-final-team-project/data/winequality-red.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-726963482731/rf-scikit-preprocessing-2020-02-29-12-45-13-372/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'train_data', 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-726963482731/rf-scikit-preprocessing-2020-02-29-12-45-13-372/output/train_data', 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'test_data', 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-72

### Check Preprocessing Outputs

In [16]:
output_config

{'Outputs': [{'OutputName': 'train_data',
   'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-726963482731/rf-scikit-preprocessing-2020-02-29-12-45-13-372/output/train_data',
    'LocalPath': '/opt/ml/processing/output/train',
    'S3UploadMode': 'EndOfJob'}},
  {'OutputName': 'test_data',
   'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-726963482731/rf-scikit-preprocessing-2020-02-29-12-45-13-372/output/test_data',
    'LocalPath': '/opt/ml/processing/output/test',
    'S3UploadMode': 'EndOfJob'}}]}

Check to see that the training features were processed.

In [17]:
training_features = pd.read_csv(preprocessed_training_data + '/train_features.csv', nrows=5,  header=None)
print('Training features shape: {}'.format(training_features.shape))
training_features

Training features shape: (5, 11)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.489548,-0.961003,0.659811,-0.499674,-0.588405,0.493872,0.295467,-0.919712,-0.873225,0.963819,0.517527
1,-0.26886,0.191906,0.245717,-0.499674,-0.393727,0.96992,2.319748,-0.234313,-0.160127,-0.768973,-1.045407
2,-0.152182,-0.082596,-1.203608,-0.567101,-0.372096,-0.553431,-0.327389,-0.497928,0.164009,-0.529967,-0.401846
3,-0.735573,0.850711,-0.996562,-0.027688,0.082155,-0.077384,-0.763388,-0.513745,0.423317,-0.350713,0.793339
4,-0.560555,-0.302198,0.297479,-0.297394,-0.545143,1.445967,0.264324,0.308733,-0.0953,-0.051955,-0.401846


In [18]:
training_labels = pd.read_csv(preprocessed_training_data + '/train_labels.csv', nrows=5,  header=None)
print('Training labels shape: {}'.format(training_labels.shape))
training_labels

Training labels shape: (5, 1)


Unnamed: 0,0
0,7
1,5
2,5
3,6
4,7


In [19]:
X_train = pd.read_csv(preprocessed_training_data + '/train_features.csv', header=None)

#Checks are for previous inconsistencies in data shape
print('The Training Data Features Dataframe has the following shape: {}'.format(X_train.shape))
print('The Training Data Features Numpy Array has the following shape: {}'.format(np.array(X_train).shape))

X_train.head(5)

The Training Data Features Dataframe has the following shape: (1119, 11)
The Training Data Features Numpy Array has the following shape: (1119, 11)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.489548,-0.961003,0.659811,-0.499674,-0.588405,0.493872,0.295467,-0.919712,-0.873225,0.963819,0.517527
1,-0.26886,0.191906,0.245717,-0.499674,-0.393727,0.96992,2.319748,-0.234313,-0.160127,-0.768973,-1.045407
2,-0.152182,-0.082596,-1.203608,-0.567101,-0.372096,-0.553431,-0.327389,-0.497928,0.164009,-0.529967,-0.401846
3,-0.735573,0.850711,-0.996562,-0.027688,0.082155,-0.077384,-0.763388,-0.513745,0.423317,-0.350713,0.793339
4,-0.560555,-0.302198,0.297479,-0.297394,-0.545143,1.445967,0.264324,0.308733,-0.0953,-0.051955,-0.401846


In [20]:
y_train = pd.read_csv(preprocessed_training_data + '/train_labels.csv', header=None)

#Checks are for previous inconsistencies in data shape
print('The Training Data Labels Dataframe has the following shape: {}'.format(y_train.shape))
print('The Training Data Labels Numpy Array has the following shape: {}'.format(np.array(y_train).ravel().shape))

y_train.head(5)

The Training Data Labels Dataframe has the following shape: (1119, 1)
The Training Data Labels Numpy Array has the following shape: (1119,)


Unnamed: 0,0
0,7
1,5
2,5
3,6
4,7


## Training

### Training Script

In [57]:
%%writefile train.py

import argparse
import os

import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

seed = 5590

# Inference Function
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

if __name__=="__main__":
    
    print('Extracting Arguments')
    parser = argparse.ArgumentParser()
    
    # Hyperparameters
    parser.add_argument('--n-estimators', type=int, default=750)
    parser.add_argument('--max-features', type=str, default='sqrt')
    
    # Directories
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])
    parser.add_argument('--train_file_features', type=str, default = 'train_features.csv')
    parser.add_argument('--train_file_labels', type=str, default = 'train_labels.csv')
    parser.add_argument('--test_file_features', type=str, default = 'test_features.csv')
    parser.add_argument('--test_file_labels', type=str, default = 'test_labels.csv')
    
    args = parser.parse_args()
    
    # Data Paths
    train_features_data = os.path.join(args.train, args.train_file_features)
    train_labels_data = os.path.join(args.train, args.train_file_labels)
    
    #test_features_data = os.path.join(args.test, args.test_file_features)
    #test_labels_data = os.path.join(args.test, args.test_file_labels)
    
    
    print('Reading input data')
    # Import Data
    ## Training
    X_train = pd.read_csv(train_features_data)
    y_train = pd.read_csv(train_labels_data)
    
    ## Test
    #X_test = pd.read_csv(test_features_data)
    #y_test = pd.read_csv(test_labels_data)
    
    # Convert Data to NP Array
    ## Training
    X_train = np.array(X_train)
    y_train = np.array(y_train).ravel()
    
    ## Test
    #X_test = np.array(X_test)
    #y_test = np.array(y_test).ravel()
    
    # Check Data Shape
    print(X_train.shape)
    print(y_train.shape)
    
    # Initialize Random Forest Regressor Model. 
    # Hyperparameters based on earlier local analysis
        
    model = RandomForestRegressor(n_estimators = args.n_estimators,
                                  max_features = args.max_features,
                                  random_state = seed)
    
    print('Training Random Forest Regression model')
    model.fit(X_train, y_train)
    
    print('Training Complete')
    
    # Metrics
    
    mse_neg = cross_val_score(model, X_train, y_train, scoring = 'neg_mean_squared_error', cv=10)
    mse_abs = np.absolute(mse_neg)
    rsme = np.sqrt(mse_abs)
    rsme_avg = np.mean(rsme)
    print('RMSE: {}'.format(rsme_avg))
    
    model_output_directory = os.path.join(args.model_dir, "model.joblib")
    print('Saving model to {}'.format(model_output_directory))
    joblib.dump(model, model_output_directory)

Overwriting train.py


### Create SKLearn Estimator For Training

In [58]:
sklearn_est = SKLearn(
    entry_point='train.py',
    role=role,
    train_instance_count=1,
    train_instance_type="ml.c5.xlarge",
    framework_version='0.20.0',
    base_job_name='red-wine-rf-scikit-training',
    metric_definitions=[
        {'Name': 'RSME',
        'Regex': "RSME: ([0-9.]+).*$"}],
    hyperparameters = {'n-estimators': 500}
)

### Fit SKLearn Esitmator to Training Data

In [59]:
sklearn_est.fit(
    {
        'train': preprocessed_training_data, 
        'test': preprocessed_test_data
    },
    wait = False
)

### Training Job Output Information

In [60]:
training_job_description = sklearn_est.jobs[-1].describe()

model_data_s3_uri = '{}{}/{}'.format(
    training_job_description['OutputDataConfig']['S3OutputPath'],
    training_job_description['TrainingJobName'],
    'output/model.tar.gz')

In [61]:
training_job_description

{'TrainingJobName': 'red-wine-rf-scikit-training-2020-02-29-15-03-44-768',
 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:726963482731:training-job/red-wine-rf-scikit-training-2020-02-29-15-03-44-768',
 'ModelArtifacts': {'S3ModelArtifacts': 's3://sagemaker-us-east-1-726963482731/red-wine-rf-scikit-training-2020-02-29-15-03-44-768/output/model.tar.gz'},
 'TrainingJobStatus': 'Completed',
 'SecondaryStatus': 'Completed',
 'HyperParameters': {'n-estimators': '500',
  'sagemaker_container_log_level': '20',
  'sagemaker_enable_cloudwatch_metrics': 'false',
  'sagemaker_job_name': '"red-wine-rf-scikit-training-2020-02-29-15-03-44-768"',
  'sagemaker_program': '"train.py"',
  'sagemaker_region': '"us-east-1"',
  'sagemaker_submit_directory': '"s3://sagemaker-us-east-1-726963482731/red-wine-rf-scikit-training-2020-02-29-15-03-44-768/source/sourcedir.tar.gz"'},
 'AlgorithmSpecification': {'TrainingImage': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.20.0-cpu-py3',
  

In [62]:
model_data_s3_uri

's3://sagemaker-us-east-1-726963482731/red-wine-rf-scikit-training-2020-02-29-15-03-44-768/output/model.tar.gz'

## Hyperparameter Tuning Test

In [67]:
seed = 5590

# Define exploration boundaries
hyperparameter_ranges = {
    'n-estimators': IntegerParameter(10, 1000),
    'max-features': CategoricalParameter(['auto', 'sqrt', 'log2'])}

# create Optimizer
Optimizer = sagemaker.tuner.HyperparameterTuner(
    estimator=sklearn_est,
    hyperparameter_ranges=hyperparameter_ranges,
    base_tuning_job_name='red-wine-rf-hpo',
    objective_type='Minimize',
    objective_metric_name='RMSE',
    metric_definitions=[
        {'Name': 'RMSE',
        'Regex': "RMSE: ([0-9.]+).*$"}],
    max_jobs=15,
    max_parallel_jobs=3)

In [68]:
Optimizer.fit(
    {
        'train': preprocessed_training_data, 
        'test': preprocessed_test_data
    },
    wait = False
)

In [83]:
is_minimize = (tuning_job_result['HyperParameterTuningJobConfig']['HyperParameterTuningJobObjective']['Type'] != 'Maximize')

NameError: name 'tuning_job_result' is not defined

In [84]:
tuner = Optimizer.analytics()
df_tune_ouput_all = tuner.dataframe()

df = df_tune_ouput_all[df_tune_ouput_all['FinalObjectiveValue'] > -float('inf')]

df

Unnamed: 0,FinalObjectiveValue,TrainingElapsedTimeSeconds,TrainingEndTime,TrainingJobName,TrainingJobStatus,TrainingStartTime,max-features,n-estimators
0,0.59819,55.0,2020-02-29 17:50:30+00:00,red-wine-rf-hpo-200229-1732-015-60904f48,Completed,2020-02-29 17:49:35+00:00,"""log2""",153.0
1,0.600965,41.0,2020-02-29 17:48:52+00:00,red-wine-rf-hpo-200229-1732-014-c78d88a4,Completed,2020-02-29 17:48:11+00:00,"""log2""",89.0
2,0.608505,54.0,2020-02-29 17:49:45+00:00,red-wine-rf-hpo-200229-1732-013-3a0ecf82,Completed,2020-02-29 17:48:51+00:00,"""log2""",28.0
3,0.597289,84.0,2020-02-29 17:47:40+00:00,red-wine-rf-hpo-200229-1732-012-00dbcba5,Completed,2020-02-29 17:46:16+00:00,"""sqrt""",665.0
4,0.597413,70.0,2020-02-29 17:45:54+00:00,red-wine-rf-hpo-200229-1732-011-1fcf2b15,Completed,2020-02-29 17:44:44+00:00,"""log2""",1000.0
5,0.597129,80.0,2020-02-29 17:45:58+00:00,red-wine-rf-hpo-200229-1732-010-3eab2de1,Completed,2020-02-29 17:44:38+00:00,"""log2""",706.0
6,0.597207,90.0,2020-02-29 17:43:14+00:00,red-wine-rf-hpo-200229-1732-009-c1f213f3,Completed,2020-02-29 17:41:44+00:00,"""log2""",919.0
7,0.597364,63.0,2020-02-29 17:42:16+00:00,red-wine-rf-hpo-200229-1732-008-ac409c29,Completed,2020-02-29 17:41:13+00:00,"""sqrt""",259.0
8,0.597138,68.0,2020-02-29 17:42:42+00:00,red-wine-rf-hpo-200229-1732-007-f82de863,Completed,2020-02-29 17:41:34+00:00,"""sqrt""",704.0
9,0.59679,52.0,2020-02-29 17:38:45+00:00,red-wine-rf-hpo-200229-1732-006-46f3de97,Completed,2020-02-29 17:37:53+00:00,"""log2""",302.0


In [87]:
import bokeh
import bokeh.io
bokeh.io.output_notebook()
from bokeh.plotting import figure, show
from bokeh.models import HoverTool

class HoverHelper():

    def __init__(self, tuning_analytics):
        self.tuner = tuning_analytics

    def hovertool(self):
        tooltips = [
            ("FinalObjectiveValue", "@FinalObjectiveValue"),
            ("TrainingJobName", "@TrainingJobName"),
        ]
        for k in self.tuner.tuning_ranges.keys():
            tooltips.append( (k, "@{%s}" % k) )

        ht = HoverTool(tooltips=tooltips)
        return ht

    def tools(self, standard_tools='pan,crosshair,wheel_zoom,zoom_in,zoom_out,undo,reset'):
        return [self.hovertool(), standard_tools]

hover = HoverHelper(tuner)

p = figure(plot_width=900, plot_height=400, tools=hover.tools(), x_axis_type='datetime')
p.circle(source=df, x='TrainingStartTime', y='FinalObjectiveValue')
show(p)

# Deploy Model

In [89]:
sklearn_est.latest_training_job.wait(logs='None')


2020-02-29 15:06:27 Starting - Preparing the instances for training
2020-02-29 15:06:27 Downloading - Downloading input data
2020-02-29 15:06:27 Training - Training image download completed. Training in progress.
2020-02-29 15:06:27 Uploading - Uploading generated training model
2020-02-29 15:06:27 Completed - Training job completed


In [90]:
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_est.latest_training_job.name)['ModelArtifacts']['S3ModelArtifacts']

print('Model artifact persisted at ' + artifact)

Model artifact persisted at s3://sagemaker-us-east-1-726963482731/red-wine-rf-scikit-training-2020-02-29-15-03-44-768/output/model.tar.gz


In [91]:
sklearn_predictor = sklearn_est.deploy(initial_instance_count=1, 
                                   instance_type='ml.m4.xlarge')

---------------!

# Create Predictions

In [92]:
sklearn_predictor.content_type = 'text/csv'
sklearn_predictor.serializer = csv_serializer
sklearn_predictor.deserializer = None

NameError: name 'csv_serializer' is not defined

In [None]:
def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

predictions = predict(test_data.as_matrix()[:, 1:])

# Shut Down Endopoint

In [93]:
sagemaker.Session().delete_endpoint(sklearn_predictor.endpoint)