This is a basic classification problem example using AWS SageMaker's XGBoost Algorithm to predict whether a person makes <= 50K per year or > 50K per year based on census data. It is modeled after the AWS tutorial found here:
https://docs.aws.amazon.com/sagemaker/latest/dg/gs-console.html

The census data set can be found here:
https://archive.ics.uci.edu/ml/datasets/Census+Income

In [None]:
import boto3
from sagemaker import get_execution_role

# Enter the s3 bucket and path where you want to store the training and test data
bucket = 'bucket'
prefix = 'prefix'

# Region of the default boto3 session
region = boto3.Session().region_name

# Execution role that is later handed to the SageMaker estimator
role = get_execution_role()

# HTTPS form of the bucket location (not referenced elsewhere in the visible notebook)
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

In [None]:
import urllib.request

# Get the training and test data sets
# This data was originally obtained from https://archive.ics.uci.edu/ml/datasets/Census+Income
# Each entry is (source URL, local file name)
census_downloads = [
    ("https://raw.githubusercontent.com/FINRAOS/CodeSamples/master/machine-learning-samples/src/main/resources/adult.data", "adult.data"),
    ("https://raw.githubusercontent.com/FINRAOS/CodeSamples/master/machine-learning-samples/src/main/resources/adult.test", "adult.test"),
]

for source_url, local_name in census_downloads:
    urllib.request.urlretrieve(source_url, local_name)

In [None]:
# Field names of the data set, in file column order (the label column is handled separately)
fields = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
]

# Fields treated as categorical; every other field is treated as continuous
_categorical_fields = {
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
}

# Positions (within `fields`) of the categorical and continuous field types
categoricalFieldIndexes = [
    position for position, name in enumerate(fields) if name in _categorical_fields
]
continuousFieldIndexes = [
    position for position, name in enumerate(fields) if name not in _categorical_fields
]

In [None]:
import pandas as pd

# Load the data set into a dataframe and format it
def load_and_format_data(csv_name):
    """Read a headerless census CSV into a dataframe with the label column first.

    csv_name: path of the CSV file to read. Its columns are named after the
        entries of ``fields``, followed by a final ``'label'`` column.

    Returns a dataframe whose columns are ordered ``['label'] + fields``, with
    one leading space removed from every string value and a trailing ``'.'``
    removed from the label values.
    """
    df = pd.read_csv(csv_name, header=None, names=fields + ['label'])

    # Put the label column first
    df = df.reindex(columns=['label'] + fields)

    # Strip the single leading space from string values
    df.replace(regex=r'^ ', value='', inplace=True)
    # Raw string: '\.' must reach the regex engine as an escaped dot, and in a
    # normal string literal it is an invalid escape sequence
    df.replace({'label': r'\.$'}, {'label': ''}, regex=True, inplace=True)

    return df

# Build the training and test dataframes from the two downloaded files
train_data, test_data = [
    load_and_format_data(file_name) for file_name in ('adult.data', 'adult.test')
]

In [None]:
def removeField(field_names):
    """Drop the named fields from the field list, both index lists and both dataframes.

    field_names: iterable of names present in ``fields``; a name that is not
        in ``fields`` makes ``fields.index`` raise ValueError.
    """
    for name in field_names:
        position = fields.index(name)
        # index() found the first occurrence, so deleting by position removes
        # the same element that remove(name) would
        del fields[position]

        # Both index lists refer to positions in `fields`, so each must be adjusted
        for index_list in (categoricalFieldIndexes, continuousFieldIndexes):
            updateIndexes(index_list, position)

        for frame in (train_data, test_data):
            frame.drop(columns=[name], inplace=True)

def updateIndexes(indexes, removeIndex):
    """Adjust a list of field positions in place after one position is removed.

    indexes: list of integer positions; modified in place.
    removeIndex: the position that has been removed.

    Every entry equal to removeIndex is dropped and every entry greater than
    removeIndex is decremented by one; the remaining order is kept. Returns
    None.
    """
    # Rebuild in a single pass and assign through a slice so the caller's list
    # object is updated, instead of removing by value while walking the list
    # with a hand-maintained cursor
    indexes[:] = [
        index - 1 if index > removeIndex else index
        for index in indexes
        if index != removeIndex
    ]
        
# Uncomment to drop these fields from the field list, the index lists and both dataframes
#removeField(['fnlwgt', 'education-num', 'relationship'])

# Show the current field names followed by the categorical and continuous index lists
for current in (fields, categoricalFieldIndexes, continuousFieldIndexes):
    print(current)

In [None]:
# Show the first ten rows of the training set
train_data.iloc[:10]

In [None]:
# Create a dictionary to store the mapping of categorical text values to numerical index values
def get_category_index_map(column_names):
    """Build, for each column, a mapping from category value to integer index.

    column_names: names of columns present in both ``train_data`` and
        ``test_data``.

    Returns a dict ``{column: {category_value: index}}`` in which the indexes
    run from 0 to the number of distinct categories minus one.
    """
    category_indexes = {}

    for column in column_names:
        # Union over both data sets so a value that appears in only one of
        # them still receives an index
        categories = set(
            train_data[column].astype('category').cat.categories.tolist()
            + test_data[column].astype('category').cat.categories.tolist()
        )
        # Number the categories in sorted order: iteration order of a set of
        # strings can differ between interpreter runs, which would make the
        # encoding (including the label's 0/1 assignment) irreproducible.
        # key=str keeps mixed-type categories sortable.
        category_indexes[column] = {
            category: index
            for index, category in enumerate(sorted(categories, key=str))
        }

    return category_indexes

# Encode the label together with every categorical field
categorical_columns = ['label'] + [fields[position] for position in categoricalFieldIndexes]
category_index_map = get_category_index_map(categorical_columns)

In [None]:
# Swap each categorical text value for its numerical index, in place, in both dataframes
for frame in (train_data, test_data):
    frame.replace(category_index_map, inplace=True)

In [None]:
# Show the first ten rows of the training set now that the categorical values are encoded
train_data.iloc[:10]

In [None]:
# Show the first ten rows of the test set with the label column left out
test_preview = test_data.head(10)
test_preview.drop(columns=['label'])

In [None]:
# Export dataframes to CSV without a header row or index column
# (test data set should not contain the label)
# header=False is the documented way to suppress the header row; header=None
# only has that effect because None is falsy
train_data.to_csv(path_or_buf='train.csv', header=False, index=False)
test_data.drop(columns=['label']).to_csv(path_or_buf='test.csv', header=False, index=False)

In [None]:
# Upload CSV files to S3 as <prefix>/train/data.csv and <prefix>/test/data.csv
# The bucket handle does not change between uploads, so create it once
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

for dataset_name in ['train', 'test']:
    key = '{}/{}/data.csv'.format(prefix, dataset_name)
    s3_bucket.Object(key).upload_file('{}.csv'.format(dataset_name))

In [None]:
import sagemaker

from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the 'xgboost' container image for the session's region, requesting '0.90-1'
# NOTE(review): get_image_uri looks like a SageMaker Python SDK v1 helper — confirm it exists in the installed SDK version
container = get_image_uri(boto3.Session().region_name, 'xgboost', '0.90-1')

In [None]:
# Both locations live under s3://<bucket>/<prefix>/
s3_location_template = 's3://{}/{}/{}'

# Prefix under which the training CSV was uploaded
train_data_loc = s3_location_template.format(bucket, prefix, 'train')

# Prefix handed to the estimator as its output path
s3_output_location = s3_location_template.format(bucket, prefix, 'xgboost_model_sdk')

In [None]:
# Configure the estimator: the XGBoost container run under `role` on a single
# ml.m4.xlarge instance with a volume size of 5, writing output to s3_output_location
# NOTE(review): train_instance_count / train_instance_type / train_volume_size look like
# SageMaker Python SDK v1 parameter names — confirm against the installed SDK version
xgb_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         train_instance_count=1, 
                                         train_instance_type='ml.m4.xlarge',
                                         train_volume_size = 5,
                                         output_path=s3_output_location,
                                         sagemaker_session=sagemaker.Session())

In [None]:
# Hyperparameters for training: multi:softmax objective with 2 classes, 10 rounds
# NOTE(review): the evaluation cell parses each prediction as an integer class index, which
# presumably relies on multi:softmax emitting class labels rather than probabilities — confirm
xgb_model.set_hyperparameters(objective = "multi:softmax",
                              num_class = 2,
                              num_round = 10)

In [None]:
# Describe the training input as text/csv data located at train_data_loc
# NOTE(review): sagemaker.session.s3_input looks like a SageMaker Python SDK v1 name — confirm against the installed SDK version
train_channel = sagemaker.session.s3_input(train_data_loc, content_type='text/csv')

# Channel name -> input definition, as passed to fit()
data_channels = {'train': train_channel}

In [None]:
# Train the model on the 'train' channel (logs=True presumably streams the job logs into the notebook — confirm)
xgb_model.fit(inputs=data_channels, logs=True)

In [None]:
# Run batch transform job
# Input: the label-free test CSV that was uploaded under <prefix>/test/
batch_input = 's3://{}/{}/test/data.csv'.format(bucket, prefix)

# Output prefix for the transform results
batch_output = 's3://{}/{}/batch-inference'.format(bucket, prefix)

# Transformer built from the trained estimator, using one ml.m4.xlarge instance
transformer = xgb_model.transformer(instance_count=1, instance_type='ml.m4.xlarge', output_path=batch_output)

# Submit the CSV input, split by line
transformer.transform(data=batch_input, data_type='S3Prefix', content_type='text/csv', split_type='Line')

# Wait for the transform job before the next cell downloads its output
transformer.wait()

In [None]:
# Download the batch transform output to the local file 'batch_results'
s3 = boto3.resource('s3')

# presumably the transform job names its output after the input object plus '.out' — verify against the bucket contents
s3.Bucket(bucket).download_file(prefix + '/batch-inference/data.csv.out',  'batch_results')

In [None]:
import re

# Read the downloaded batch transform output, one prediction per line
with open('batch_results') as f:
    results = f.readlines()

expected_labels = test_data['label'].tolist()

# Fail with a clear message rather than an IndexError part-way through the loop
if len(results) < len(expected_labels):
    raise ValueError(
        'batch_results has {} predictions but the test set has {} rows'.format(
            len(results), len(expected_labels)))

count = 0

# Check the batch transform results against the test data
for expected, line in zip(expected_labels, results):
    # Lines are expected to look like '1.0'. Parse them numerically: the former
    # regex '.0$' had an unescaped dot, so it stripped any character followed
    # by '0' (for example '100' became '1' and '10' became '').
    result = int(float(line))

    if expected != result:
        count = count + 1

print('Test error rate: {}'.format(count / len(expected_labels)))