# Prequisites and Preprocessing
### Permissions and environment variables
### Here we set up the linkage and authentication to AWS services.

* The roles used to give learning and hosting access to your data. See the documentation for how to specify these.
* The S3 bucket that you want to use for training and model data.

In [5]:
%%time

import os
import boto3
import re
import copy
import time
from time import gmtime, strftime
import sagemaker
import sklearn
from sagemaker import get_execution_role
import pandas as pd
import numpy as np

role = get_execution_role()

region = boto3.Session().region_name

sess = sagemaker.Session()

bucket='vlgsagemakersimplemodel' # put your s3 bucket name here, and create s3 bucket
prefix = 'sagemaker/simple-model-xgboost-multiclass-classification'

CPU times: user 935 ms, sys: 55 ms, total: 990 ms
Wall time: 1.12 s


# Data ingestion

In [1]:
! wget https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data

--2020-02-05 19:29:13--  https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4551 (4.4K) [application/x-httpd-php]
Saving to: ‘iris.data’


2020-02-05 19:29:14 (87.2 MB/s) - ‘iris.data’ saved [4551/4551]



In [91]:
iris = pd.read_csv('iris.data', header=None, names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'])
iris.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [92]:
iris.species.unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [93]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

le.fit(iris.species.values)

np.save('classes.npy', le.classes_)

In [95]:
encoder = LabelEncoder()
encoder.classes_ = np.load('classes.npy')

In [96]:
iris['species'] = encoder.transform(iris.species.values)

iris.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [98]:
encoder.transform(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])

array([0, 1, 2])

In [99]:
iris = pd.concat([iris['species'], iris.drop(['species'], axis=1)], axis=1)
iris.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,0,5.1,3.5,1.4,0.2
1,0,4.9,3.0,1.4,0.2
2,0,4.7,3.2,1.3,0.2
3,0,4.6,3.1,1.5,0.2
4,0,5.0,3.6,1.4,0.2


In [103]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(iris, test_size=0.2, random_state=1)

train_data, validation_data = train_test_split(train_data,test_size=0.2, random_state=1)

train_data.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
80,1,5.5,2.4,3.8,1.1
60,1,5.0,2.0,3.5,1.0
104,2,6.5,3.0,5.8,2.2
62,1,6.0,2.2,4.0,1.0
115,2,6.4,3.2,5.3,2.3


In [104]:
test_data.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
14,0,5.8,4.0,1.2,0.2
98,1,5.1,2.5,3.0,1.1
75,1,6.6,3.0,4.4,1.4
16,0,5.4,3.9,1.3,0.4
131,2,7.9,3.8,6.4,2.0


In [105]:
validation_data.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
81,1,5.5,2.4,3.7,1.0
38,0,4.4,3.0,1.3,0.2
23,0,5.1,3.3,1.7,0.5
72,1,6.3,2.5,4.9,1.5
43,0,5.0,3.5,1.6,0.6


In [106]:
train_data.to_csv('train.csv', header=False, index=False)
test_data.to_csv('test.csv', header=False, index=False)
validation_data.to_csv('validation.csv', header=False, index=False)

In [107]:
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation/validation.csv')).upload_file('validation.csv')

In [122]:
s3_training_file_location  = 's3://{}/{}/train/train.csv'.format(bucket, prefix)
print(s3_training_file_location)
s3_validation_file_location = 's3://{}/{}//validation/validation.csv'.format(bucket, prefix)
print(s3_validation_file_location)
training_input_config = sagemaker.session.s3_input(s3_data=s3_training_file_location,content_type="csv")
validation_input_config = sagemaker.session.s3_input(s3_data=s3_validation_file_location,content_type="csv")

print(training_input_config)
print(validation_input_config)

s3://vlgsagemakersimplemodel/sagemaker/simple-model-xgboost-multiclass-classification/train/train.csv
s3://vlgsagemakersimplemodel/sagemaker/simple-model-xgboost-multiclass-classification//validation/validation.csv
<sagemaker.inputs.s3_input object at 0x7ff8f81c6cc0>
<sagemaker.inputs.s3_input object at 0x7ff8f81147b8>


# Training the XGBoost model

In [123]:
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(region, 'xgboost', '0.90-1')

In [125]:
xgb = sagemaker.estimator.Estimator(container,
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)

In [129]:
xgb.set_hyperparameters(
                        num_round=12,
                        objective = "multi:softmax", 
                        num_class = 3, 
                        eval_metric = "mlogloss"
                    )

In [130]:
xgb.fit({'train': training_input_config, 'validation': training_input_config})

2020-02-05 20:50:37 Starting - Starting the training job...
2020-02-05 20:50:39 Starting - Launching requested ML instances.........
2020-02-05 20:52:17 Starting - Preparing the instances for training......
2020-02-05 20:53:20 Downloading - Downloading input data...
2020-02-05 20:53:42 Training - Downloading the training image..[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value mlogloss to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter booster value gbtree to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm 

In [131]:
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

-------------------!

In [133]:
from sagemaker.predictor import csv_serializer
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer
xgb_predictor.deserializer = None

def predict(data, rows=500):
    split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))
    predictions = ''
    for array in split_array:
        predictions = ','.join([predictions, xgb_predictor.predict(array).decode('utf-8')])

    return np.fromstring(predictions[1:], sep=',')

predictions = predict(test_data.values[:, 1:])
predictions

array([0., 1., 1., 0., 2., 1., 2., 0., 0., 2., 1., 0., 2., 1., 1., 0., 1.,
       1., 0., 0., 1., 1., 2., 0., 2., 1., 0., 0., 1., 2.])

In [135]:
test_data.values[:, :1]

array([[0.],
       [1.],
       [1.],
       [0.],
       [2.],
       [1.],
       [2.],
       [0.],
       [0.],
       [2.],
       [1.],
       [0.],
       [2.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [0.],
       [2.],
       [1.],
       [0.],
       [0.],
       [1.],
       [2.]])

In [137]:
from sklearn import metrics
cm = metrics.confusion_matrix(test_data.values[:, :1], predictions)
print(cm)

[[11  0  0]
 [ 0 12  1]
 [ 0  0  6]]
