# Hit or Miss
   (1) The dataset starts as a Board. This is a class in our Python app: it takes ships and places them on a 10x10 2-D array. 
    
   (2) The board is converted into a feature set. This set of features considers each unique point on the 2-D array and generates a vector of features for that point. A 10x10 array will have 100 rows and 22 columns: 10 columns for game board row indicators, 10 for game board column indicators, 1 for a miss label, and another for hit. The hit column is our label.
    
   (3) We need a modeling solution that will read the board and automatically select a point to hit. This solution should have train, validation, and test sets.

In [21]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import boto3
import os
from sagemaker.amazon.amazon_estimator import get_image_uri
import sagemaker
from sagemaker import get_execution_role
from sklearn.model_selection import train_test_split
import numpy as np

import sagemaker
from random import shuffle
import multiprocessing
from multiprocessing import Pool
import csv
import nltk
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner

In [22]:
import pandas as pd

# Load the three splits. Explicit integer column names 0..21 are supplied, so
# the first row of every file is read as data rather than as a header.
# NOTE(review): `val` comes from test.csv and `test` comes from validation.csv.
# The names look swapped relative to the files -- confirm this is intended
# before relying on either name.
train, val, test = (
    pd.read_csv(csv_path, names=list(range(22)))
    for csv_path in (
        '../Data/train.csv',
        '../Data/test.csv',
        '../Data/validation.csv',
    )
)

In [31]:
# Copy the local validation and training CSVs to the validation/ and train/
# prefixes of the bucket (test.csv is not uploaded here).
# NOTE(review): the bucket name is hard-coded, while later cells use
# sess.default_bucket() -- confirm both refer to the same bucket.
!aws s3 cp ../Data/validation.csv s3://sagemaker-us-west-1-023375022819/validation/
!aws s3 cp ../Data/train.csv s3://sagemaker-us-west-1-023375022819/train/

upload: ../Data/validation.csv to s3://sagemaker-us-west-1-023375022819/validation/validation.csv
upload: ../Data/train.csv to s3://sagemaker-us-west-1-023375022819/train/train.csv


In [23]:
# Split every frame into (labels, features): column 0 is taken as the label and
# all remaining columns as features. Both are converted to float32 NumPy arrays.
(
    (train_labels, train_features),
    (test_labels, test_features),
    (val_labels, val_features),
) = (
    (
        np.array(frame[0]).astype("float32"),
        np.array(frame.drop(0, axis=1)).astype("float32"),
    )
    for frame in (train, test, val)
)

In [24]:
def get_base_estimator(clf, sess, role):
    """Build a generic ``Estimator`` for the built-in algorithm named ``clf``.

    The container image is looked up for the region of a default
    ``boto3.Session()``. Model artifacts are written to
    ``s3://<bucket>/<clf>/output``, where ``bucket`` is the module-level global.

    Args:
        clf: Algorithm name passed to ``get_image_uri``.
        sess: SageMaker session handed to the estimator.
        role: IAM role handed to the estimator.

    Returns:
        A ``sagemaker.estimator.Estimator`` using one ml.m4.xlarge instance.
    """
    region = boto3.Session().region_name
    image = get_image_uri(region, clf)
    output_location = 's3://{}/{}/output'.format(bucket, clf)

    return sagemaker.estimator.Estimator(
        image,
        role,
        train_instance_count=1,
        train_instance_type='ml.m4.xlarge',
        output_path=output_location,
        sagemaker_session=sess,
    )

In [25]:
def get_estimator(clf, sess, role):
    """Return an untrained SageMaker estimator for the algorithm named ``clf``.

    Every estimator trains on a single ml.m4.xlarge instance and is created
    with the fixed starting hyperparameters listed below.

    Args:
        clf: One of 'xgboost', 'linear-learner', 'knn' or
            'factorization-machines'.
        sess: SageMaker session the estimator should use.
        role: IAM role the training job runs under.

    Returns:
        The configured estimator object.

    Raises:
        ValueError: If ``clf`` is not one of the supported names.
    """
    if clf == 'xgboost':
        est = get_base_estimator(clf, sess, role)
        est.set_hyperparameters(max_depth=5,
                                eta=0.2,
                                gamma=4,
                                min_child_weight=6,
                                subsample=0.8,
                                silent=0,
                                objective='binary:logistic',
                                num_round=100)
        return est

    # Settings shared by the first-party estimator classes. The caller's role
    # and session are honoured here instead of looking up a fresh execution
    # role and implicitly creating a new session for every estimator.
    common = dict(role=role,
                  train_instance_count=1,
                  train_instance_type='ml.m4.xlarge',
                  sagemaker_session=sess)

    if clf == 'linear-learner':
        return sagemaker.LinearLearner(predictor_type='binary_classifier',
                                       num_classes=2,
                                       **common)

    if clf == 'knn':
        return sagemaker.KNN(k=10,
                             predictor_type='classifier',
                             sample_size=200,
                             **common)

    if clf == 'factorization-machines':
        return sagemaker.FactorizationMachines(predictor_type='binary_classifier',
                                               num_factors=2,
                                               **common)

    # Fail loudly instead of hitting an UnboundLocalError on `return est`.
    raise ValueError('Unsupported classifier: {!r}'.format(clf))

In [29]:
# add k-fold cross validation here
# Shared AWS handles: SageMaker session, execution role, low-level SageMaker
# client, and the session's default bucket.
sess = sagemaker.Session()
role = get_execution_role()
client = boto3.client('sagemaker')
bucket = sess.default_bucket()

# CSV channels read from S3. Note that the "test" input points at the
# validation/ prefix of the bucket.
s3_input_train = sagemaker.s3_input(
    content_type='csv',
    s3_data='s3://{}/train'.format(bucket),
)
s3_input_test = sagemaker.s3_input(
    content_type='csv',
    s3_data='s3://{}/validation/'.format(bucket),
)



In [27]:
import sagemaker
from sagemaker.amazon.amazon_estimator import RecordSet
import boto3

# Stand-alone Linear Learner on a single ml.m4.xlarge instance. Despite the
# variable name, it is configured as a binary classifier.
# NOTE(review): num_classes is passed together with 'binary_classifier' --
# confirm it is needed for this predictor type.
multiclass_estimator = sagemaker.LinearLearner(
    role=sagemaker.get_execution_role(),
    predictor_type='binary_classifier',
    num_classes=2,
    train_instance_type='ml.m4.xlarge',
    train_instance_count=1,
)



In [16]:
# Turn the in-memory arrays into RecordSet objects, one per channel.
train_records = multiclass_estimator.record_set(
    train_features,
    labels=train_labels,
    channel='train',
)
test_records = multiclass_estimator.record_set(
    test_features,
    labels=test_labels,
    channel='test',
)

# Kick off a training job with both channels.
multiclass_estimator.fit(
    [train_records, test_records]
)

2019-07-14 01:35:42 Starting - Starting the training job...
2019-07-14 01:35:52 Starting - Launching requested ML instances......
2019-07-14 01:36:57 Starting - Preparing the instances for training...
2019-07-14 01:37:43 Downloading - Downloading input data...
2019-07-14 01:38:12 Training - Downloading the training image...
2019-07-14 01:38:43 Uploading - Uploading generated training model
2019-07-14 01:38:43 Completed - Training job completed

[31mDocker entrypoint called with argument(s): train[0m
[31m[07/14/2019 01:38:33 INFO 140235397719872] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u

Billable seconds: 61


In [28]:
def get_tuner(clf, est, max_jobs=30, max_parallel_jobs=3):
    """Build a ``HyperparameterTuner`` for ``est`` using per-algorithm settings.

    Args:
        clf: One of 'xgboost', 'knn', 'linear-learner' or
            'factorization-machines'. Selects the objective metric and the
            hyperparameter search ranges.
        est: The estimator to tune.
        max_jobs: Total number of training jobs the tuner may launch.
        max_parallel_jobs: Number of training jobs the tuner may run at once.
            Lower this when several tuners run concurrently, so the combined
            instance count stays within the account's training-instance limit.

    Returns:
        The configured ``HyperparameterTuner`` (not yet started).

    Raises:
        ValueError: If ``clf`` is not one of the supported names.
    """
    if clf == 'xgboost':
        objective_metric_name = 'validation:auc'
        hyperparameter_ranges = {
            'eta': ContinuousParameter(0, 1),
            'min_child_weight': ContinuousParameter(1, 10),
            'alpha': ContinuousParameter(0, 2),
            'max_depth': IntegerParameter(1, 10),
        }

    elif clf == 'knn':
        objective_metric_name = 'test:accuracy'
        hyperparameter_ranges = {
            'k': IntegerParameter(1, 1024),
            'sample_size': IntegerParameter(256, 20000000),
        }

    elif clf == 'linear-learner':
        objective_metric_name = 'test:recall'
        hyperparameter_ranges = {
            'l1': ContinuousParameter(0.0000001, 1),
            'use_bias': CategoricalParameter([True, False]),
        }

    elif clf == 'factorization-machines':
        objective_metric_name = 'test:binary_classification_accuracy'
        hyperparameter_ranges = {
            'bias_wd': IntegerParameter(1, 1000),
        }

    else:
        # Without this branch an unknown name surfaced as an UnboundLocalError
        # on objective_metric_name further down.
        raise ValueError('Unsupported classifier: {!r}'.format(clf))

    return HyperparameterTuner(est,
                               objective_metric_name,
                               hyperparameter_ranges,
                               max_jobs=max_jobs,
                               max_parallel_jobs=max_parallel_jobs)

In [35]:
def run_training_job(clf):
    """Start a hyperparameter tuning job for the algorithm named ``clf``.

    Relies on the module-level ``sess``, ``role``, S3 inputs and feature/label
    arrays.

    XGBoost reads its CSV data straight from S3. Every other algorithm has the
    in-memory feature/label arrays converted to RecordSets first.

    Args:
        clf: Algorithm name understood by ``get_estimator`` and ``get_tuner``.
    """
    # build the estimator
    est = get_estimator(clf, sess, role)

    # get the hyperparameter tuner config
    # TODO: set this to look for recall somehow
    tuner = get_tuner(clf, est)

    if clf == 'xgboost':
        # The tuner's objective for XGBoost is 'validation:auc', which is
        # computed on the channel named 'validation'; the built-in algorithm
        # does not define a 'test' channel.
        tuner.fit({'train': s3_input_train, 'validation': s3_input_test})
    else:
        # set the records
        train_records = est.record_set(train_features, train_labels, channel='train')
        test_records = est.record_set(test_features, test_labels, channel='test')

        tuner.fit([train_records, test_records])
    

In [36]:
def magic_loop(models_to_run, processes=None):
    """Run ``run_training_job`` for every model name in a process pool.

    Args:
        models_to_run: Iterable of algorithm names.
        processes: Number of worker processes. Defaults to the machine's CPU
            count. Pass a smaller number to limit how many tuning jobs are
            submitted at the same time.

    Returns:
        A list with the return value of ``run_training_job`` for each model,
        in input order. Empty input returns an empty list without starting
        a pool.
    """
    models = list(models_to_run)
    if not models:
        return []

    if processes is None:
        processes = multiprocessing.cpu_count()

    pool = Pool(processes=processes)
    try:
        return pool.map(run_training_job, models)
    finally:
        # Always release the workers, even when a job raised.
        pool.close()
        pool.join()

In [37]:
# Launch one tuning job per algorithm, all at once.
# NOTE(review): the recorded run of this cell failed with ResourceLimitExceeded
# (account limit of 4 training instances) -- the combined parallelism of the
# tuners needs to be reduced or the limit raised.
clfs = [
    'xgboost',
    'linear-learner',
    'factorization-machines',
    'knn',
]
magic_loop(clfs)

ClientError: An error occurred (ResourceLimitExceeded) when calling the CreateHyperParameterTuningJob operation: The account-level service limit 'Number of instances across all training jobs' is 4 Instances, with current utilization of 6 Instances and a request delta of 3 Instances. Please contact AWS support to request an increase for this limit.