In [None]:
import boto3
from sagemaker import get_execution_role

# Region name reported by a default boto3 session
region = boto3.Session().region_name

# Execution role as returned by the SageMaker SDK helper
role = get_execution_role()

# S3 location for the training and test data.
# Replace both placeholder values with your own bucket name and key prefix.
bucket = 'bucket'
prefix = 'prefix'

# HTTPS address of the bucket, built from the region and bucket name above
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region, bucket)

In [None]:
import urllib.request

# Download the training data set into the local file "adult.data".
# The data originates from https://archive.ics.uci.edu/ml/datasets/Census+Income
urllib.request.urlretrieve(
    url="https://raw.githubusercontent.com/FINRAOS/CodeSamples/master/machine-learning-samples/src/main/resources/adult.data",
    filename="adult.data",
)

In [None]:
# Column names of the data set, in file order (the label column is not included)
fields = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
]

# Positions (into `fields`) of the categorical columns
categoricalFieldIndexes = [1, 3, 5, 6, 7, 8, 9, 13]

# Every remaining position is treated as a continuous column
continuousFieldIndexes = [
    position
    for position in range(len(fields))
    if position not in categoricalFieldIndexes
]

In [None]:
import pandas as pd

# Load the data set into a dataframe and format it
def load_and_format_data(csv_name, field_names=None):
    """Read a header-less CSV into a DataFrame with the label as first column.

    The file is expected to contain one column per entry of ``field_names``
    followed by a final label column.

    Parameters
    ----------
    csv_name : str or file-like
        Path (or any object accepted by ``pandas.read_csv``) of the CSV to load.
    field_names : list of str, optional
        Names of the feature columns, in file order. Defaults to the
        notebook-level ``fields`` list when omitted.

    Returns
    -------
    pandas.DataFrame
        Columns ordered as ``['label'] + field_names``. A single leading space
        is removed from every string value, and a trailing '.' is removed from
        the label values.
    """
    if field_names is None:
        # Fall back to the notebook-global column list (original behaviour)
        field_names = fields
    field_names = list(field_names)

    df = pd.read_csv(csv_name, header=None, names=field_names + ['label'])

    # Move the label to the front
    df = df.reindex(columns=['label'] + field_names)

    # Values are separated by ", " in the file, so string cells start with a space
    df = df.replace(regex=r'^ ', value='')
    # Raw string: '\.' in a plain literal is an invalid escape sequence
    df = df.replace({'label': r'\.$'}, {'label': ''}, regex=True)

    return df

# Build the training frame from the downloaded file
train_data = load_and_format_data(csv_name='adult.data')

In [None]:
# Preview the first ten rows of the training set
train_data.head(n=10)

In [None]:
# Display first ten rows of test set without label
# NOTE(review): `test_data` is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel (Restart & Run All).
# Presumably a cell that loads the test set is missing — TODO confirm and restore it.
test_data.drop(columns=['label']).head(10)

In [None]:
# Write both frames to local CSV files without the index column:
# the training set as-is, the test set with its label column removed
train_data.to_csv('train.csv', index=False)
test_data.drop('label', axis=1).to_csv('test.csv', index=False)

In [None]:
# Upload the local training CSV to S3 under <prefix>/train/data.csv
key = '{}/{}/data.csv'.format(prefix, 'train')
(
    boto3.Session()
    .resource('s3')
    .Bucket(bucket)
    .Object(key)
    .upload_file('{}.csv'.format('train'))
)