In [50]:
import os
import boto3
import re
import copy
import time
from time import gmtime, strftime
from sagemaker import get_execution_role
role = get_execution_role()
region = boto3.Session().region_name
bucket='projet01' # Put your s3 bucket name here
prefix = 'sagemaker/xgboost-mnist2' # Used as part of the path in the bucket where you store data
# customize to your bucket where you will store data
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/tic-tac-toe/tic-tac-toe.data'


In [51]:
import pickle, gzip, urllib.request, json
import numpy as np # Load the dataset
import pandas as pd
from sklearn.model_selection import train_test_split


urllib.request.urlretrieve("http://deeplearning.net/data/mnist/mnist.pkl.gz", "mnist.pkl.gz")
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train2_set, valid2_set, test2_set = pickle.load(f, encoding='latin1') 
print(train2_set[0].shape)

data = pd.read_csv(url, header=None)
train_set, test_set = train_test_split(data ,test_size=0.2)
train_set, valid_set = train_test_split(train_set ,test_size=0.25)
print(data.shape)
print(train_set.shape)
print(test_set.shape)
print(valid_set.shape)
data.head()

(50000, 784)
(958, 10)
(574, 10)
(192, 10)
(192, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


In [52]:
import struct
import io
import csv
import boto3

def label_to_int(string):
    if string == "positive" or string == 'p':
        return 1
    return 0

def data_to_int(string):
    if string == "o":
        return 111
    elif string == "x":
        return 120
    return 98

def transform_data(data):
    i = 0
    for item in data:
        item = [data_to_int(e) for e in item]
        data[i] = item
        i = i + 1
    return data
        
def convert_data():
    data_partitions = [('train', train_set), ('validation', valid_set), ('test', test_set)]
    for data_partition_name, data_partition in data_partitions:
        print('{}: {} {}'.format(data_partition_name, data_partition[0].shape,data_partition[1].shape))
        labels = [label_to_int(t) for t in data_partition[9]]
        features = [ t.tolist() for t in data_partition.as_matrix(columns=data_partition.columns[0:9])]
        features = transform_data(features)
        if data_partition_name != 'test':
            examples = np.insert(features, 0, labels, axis=1)
        else:
            examples = features 
            #print(examples[50000,:])
        print(examples)
        np.savetxt('data.csv', examples, delimiter=',', fmt='%s')
        key = "{}/{}/examples".format(prefix,data_partition_name) 
        url ='s3n://{}/{}'.format(bucket, key)
        boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_file('data.csv')
        print('Done writing to {}'.format(url))
convert_data()

train: (574,) (574,)
[[  0  98  98 ... 111 111 111]
 [  1 120  98 ... 120 111 120]
 [  1  98  98 ...  98 111 120]
 ...
 [  1 111 120 ...  98 111 111]
 [  1 120  98 ...  98 111 111]
 [  1  98 111 ...  98  98 120]]
Done writing to s3n://projet01/sagemaker/xgboost-mnist2/train/examples
validation: (192,) (192,)
[[  1 120 120 ...  98 111 111]
 [  1 120 111 ... 120 111  98]
 [  1 111 111 ...  98 120 111]
 ...
 [  1  98 120 ...  98 111 111]
 [  1 120 120 ... 111 120 111]
 [  1  98 120 ... 120 111  98]]




Done writing to s3n://projet01/sagemaker/xgboost-mnist2/validation/examples
test: (192,) (192,)
[[111, 111, 111, 98, 98, 120, 120, 98, 120], [98, 120, 111, 98, 120, 111, 98, 120, 98], [120, 120, 111, 120, 111, 120, 111, 98, 111], [98, 98, 120, 98, 120, 111, 120, 98, 111], [120, 111, 111, 120, 120, 98, 98, 111, 120], [120, 98, 111, 98, 120, 111, 111, 120, 120], [120, 120, 111, 120, 111, 111, 120, 111, 120], [120, 120, 111, 98, 98, 111, 98, 120, 111], [120, 111, 120, 111, 111, 120, 120, 111, 98], [98, 111, 120, 111, 120, 120, 120, 111, 98], [120, 120, 111, 120, 111, 98, 120, 111, 98], [98, 111, 111, 98, 98, 98, 120, 120, 120], [120, 98, 98, 111, 111, 111, 120, 120, 98], [120, 120, 120, 98, 98, 111, 111, 120, 111], [120, 120, 111, 111, 111, 98, 111, 120, 120], [120, 120, 98, 120, 98, 111, 120, 111, 111], [120, 111, 120, 120, 98, 111, 120, 98, 111], [111, 120, 120, 120, 111, 111, 111, 120, 120], [98, 111, 120, 120, 120, 111, 120, 111, 98], [120, 120, 111, 120, 120, 111, 98, 111, 111], [120

In [53]:
import sagemaker 
from sagemaker.amazon.amazon_estimator import get_image_uri
container = get_image_uri(boto3.Session().region_name, 'xgboost')

train_data = 's3://{}/{}/{}'.format(bucket, prefix, 'train') 
validation_data = 's3://{}/{}/{}'.format(bucket, prefix, 'validation')
s3_output_location = 's3://{}/{}/{}'.format(bucket, prefix, 'xgboost_model_sdk')
print(train_data)

xgb_model = sagemaker.estimator.Estimator(container, role, train_instance_count=1, train_instance_type='ml.m4.xlarge', train_volume_size = 5, output_path=s3_output_location, sagemaker_session=sagemaker.Session())
xgb_model.set_hyperparameters(max_depth = 5,
 eta = .2,
 gamma = 4,
 min_child_weight = 6,
 silent = 0,
 objective = "multi:softmax",
 num_class = 10,
 num_round = 10)

train_channel = sagemaker.session.s3_input(train_data, content_type='text/csv')
valid_channel = sagemaker.session.s3_input(validation_data, content_type='text/csv')
data_channels = {'train': train_channel, 'validation': valid_channel}
xgb_model.fit(inputs=data_channels, logs=True)


	get_image_uri(region, 'xgboost', '0.90-1').


s3://projet01/sagemaker/xgboost-mnist2/train
2020-02-01 13:04:33 Starting - Starting the training job...
2020-02-01 13:04:34 Starting - Launching requested ML instances......
2020-02-01 13:05:33 Starting - Preparing the instances for training......
2020-02-01 13:06:34 Downloading - Downloading input data...
2020-02-01 13:07:32 Training - Training image download completed. Training in progress.
2020-02-01 13:07:32 Uploading - Uploading generated training model.[34mArguments: train[0m
[34m[2020-02-01:13:07:27:INFO] Running standalone xgboost training.[0m
[34m[2020-02-01:13:07:27:INFO] File size need to be processed in the node: 0.03mb. Available memory size in the node: 8523.55mb[0m
[34m[2020-02-01:13:07:27:INFO] Determined delimiter of CSV input is ','[0m
[34m[13:07:27] S3DistributionType set as FullyReplicated[0m
[34m[13:07:27] 574x9 matrix with 5166 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-02-01:13:07:27:INFO] Determi

In [54]:
common_training_params = \
{
 "AlgorithmSpecification": {
 "TrainingImage": container,
 "TrainingInputMode": "File"
 },
 "RoleArn": role,
 "OutputDataConfig": {
 "S3OutputPath": bucket_path + "/"+ prefix + "/xgboost"
 },
 "ResourceConfig": {
 "InstanceCount": 1,
 "InstanceType": "ml.m4.xlarge",
 "VolumeSizeInGB": 5
 },
 "HyperParameters": {
 "max_depth":"5",
 "eta":"0.2",
 "gamma":"4",
 "min_child_weight":"6",
 "silent":"0",
 "objective": "multi:softmax",
 "num_class": "10",
 "num_round": "10"
 },
 "StoppingCondition": {
 "MaxRuntimeInSeconds": 86400
 },
 "InputDataConfig": [
 {
 "ChannelName": "train",
 "DataSource": {
 "S3DataSource": {
 "S3DataType": "S3Prefix",
 "S3Uri": bucket_path + "/"+ prefix+ '/train/',
 "S3DataDistributionType": "FullyReplicated"
 }
 },
 "ContentType": "text/csv",
 "CompressionType": "None"
 },
 {
 "ChannelName": "validation",
 "DataSource": {
 "S3DataSource": {
 "S3DataType": "S3Prefix",
 "S3Uri": bucket_path + "/"+ prefix+ '/validation/',
 "S3DataDistributionType": "FullyReplicated"
 }
 },
 "ContentType": "text/csv",
 "CompressionType": "None"
 }
 ]
}

training_job_name = 'xgboost-mnist' + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Job name is:", training_job_name)
training_job_params = copy.deepcopy(common_training_params)
training_job_params['TrainingJobName'] = training_job_name
training_job_params['ResourceConfig']['InstanceCount'] = 1

Job name is: xgboost-mnist2020-02-01-13-14-11


In [56]:
sm = boto3.Session().client('sagemaker')
sm.create_training_job(**training_job_params)
status = sm.describe_training_job(TrainingJobName=training_job_name)['TrainingJobStatus']
print(status)
sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=training_job_name)
status = sm.describe_training_job(TrainingJobName=training_job_name)['TrainingJobStatus']
print("Training job ended with status: " + status)
if status == 'Failed':
 message = sm.describe_training_job(TrainingJobName=training_job_name)['FailureReason']
 print('Training failed with the following error: {}'.format(message))
 raise Exception('Training job failed')

InProgress
Training job ended with status: Completed


In [57]:
xgb_predictor = xgb_model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge',)

-------------!

In [113]:
def transform_data2(data):
    l = []
    for item in data:
        if item == '111':
            l += 'o'
        elif item == '120':
            l += 'x'
        else:
            l += 'b'
    return l

s3 = boto3.resource('s3')
test_key = "{}/test/examples".format(prefix)
s3.Bucket(bucket).download_file(test_key, 'test_data')
with open('test_data', 'r') as f:
    for j in range(0,100):
        single_test = f.readline()
        origin_data = transform_data2(single_test[:-1].split(','))
        transformed_data.append(origin_data)
        result = xgb_predictor.predict(single_test)
        f_result = float(result)
        print("Data: "+str(origin_data))
        if (f_result == 1):
            print("Prediction: Positive")
        else:
            print("Prediction: Negative")
        print("-------------")

Data: ['o', 'o', 'o', 'b', 'b', 'x', 'x', 'b', 'x']
Prediction: Positive
-------------
Data: ['b', 'x', 'o', 'b', 'x', 'o', 'b', 'x', 'b']
Prediction: Positive
-------------
Data: ['x', 'x', 'o', 'x', 'o', 'x', 'o', 'b', 'o']
Prediction: Negative
-------------
Data: ['b', 'b', 'x', 'b', 'x', 'o', 'x', 'b', 'o']
Prediction: Positive
-------------
Data: ['x', 'o', 'o', 'x', 'x', 'b', 'b', 'o', 'x']
Prediction: Positive
-------------
Data: ['x', 'b', 'o', 'b', 'x', 'o', 'o', 'x', 'x']
Prediction: Positive
-------------
Data: ['x', 'x', 'o', 'x', 'o', 'o', 'x', 'o', 'x']
Prediction: Positive
-------------
Data: ['x', 'x', 'o', 'b', 'b', 'o', 'b', 'x', 'o']
Prediction: Negative
-------------
Data: ['x', 'o', 'x', 'o', 'o', 'x', 'x', 'o', 'b']
Prediction: Positive
-------------
Data: ['b', 'o', 'x', 'o', 'x', 'x', 'x', 'o', 'b']
Prediction: Positive
-------------
Data: ['x', 'x', 'o', 'x', 'o', 'b', 'x', 'o', 'b']
Prediction: Positive
-------------
Data: ['b', 'o', 'o', 'b', 'b', 'b', 'x', '