## Import required libraries including SageMaker Python SDK

In [1]:
import pandas as pd
import json
import time
import uuid

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.feature_store.feature_group import FeatureGroup

### Update SageMaker SDK if necessary 

In [2]:
if int(sagemaker.__version__.split('.')[0]) != 2:
    !pip install sagemaker==2.24.1
    print("Updating SageMakerVersion. Please restart the kernel")
else:
    print("SageMaker SDK version is good")

SageMaker SDK version is good


### Set region, boto3 and SageMaker SDK variables

In [3]:
# Execution role of this notebook and a default SageMaker session
# (the session is used further down for bucket lookup and uploads).
role = get_execution_role()
sess = sagemaker.Session()

# boto3 session; its region is reused when creating other service clients.
boto_session = boto3.Session()
region = boto_session.region_name
print(f"Region = {region}")

# Low-level SageMaker client and a SageMaker session bound to that client.
sagemaker_boto_client = boto_session.client(service_name='sagemaker')
sagemaker_session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_boto_client,
)

Region = us-west-2


### Create directories in the SageMaker default bucket for this tutorial


In [9]:
# Alternatively, you can use your own custom bucket here.
default_bucket = sess.default_bucket()

# Use this prefix to store all files pertaining to this workshop.
prefix = 'sagemaker-toturial'

# One S3 key prefix per kind of artifact, all nested under `prefix`.
dataprefix = f'{prefix}/data'
traindataprefix = f'{prefix}/train_data'
testdataprefix = f'{prefix}/test_data'
testdatanolabelprefix = f'{prefix}/test_data_no_label'
trainheaderprefix = f'{prefix}/train_headers'

## Upload raw data to S3

Use the following code snippet to download the dataset into the local `./data/` folder.

In [5]:
!mkdir ./data
!wget -O ./data/adult.data https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data

mkdir: cannot create directory ‘./data’: File exists
--2021-03-03 01:14:51--  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3974305 (3.8M) [application/x-httpd-php]
Saving to: ‘./data/adult.data’


2021-03-03 01:14:51 (16.4 MB/s) - ‘./data/adult.data’ saved [3974305/3974305]



In [7]:
# change column names and save the file as .csv
data_path = './data/adult.data'

# Column names, from https://archive.ics.uci.edu/ml/datasets/Adult
col_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

# The raw file is read without a header row (comma is the default separator);
# the column names are attached afterwards.
# NOTE(review): if the raw file has a space after each comma, string values keep
# a leading space here -- confirm whether skipinitialspace=True is wanted.
df = pd.read_csv(data_path, header=None)
df.columns = col_names

# Save a copy with a header row and without the index column.
df.to_csv('./data/adult.csv', index=False)

df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [10]:
# Upload the CSV under the data prefix of the bucket and print what upload_data returns.
response = sess.upload_data(
    path='./data/adult.csv',
    bucket=default_bucket,
    key_prefix=dataprefix,
)
print(response)

s3://sagemaker-us-west-2-367158743199/sagemaker-toturial/data/adult.csv


# Data preparation using SageMaker Data Wrangler

Now that we have the raw data uploaded to S3, we will use SageMaker Data Wrangler to prepare it. With SageMaker Data Wrangler, you can simplify the process of data preparation and feature engineering, and complete each step of the data preparation workflow, including data selection, cleansing, exploration, and visualization from a single visual interface. 

For this tutorial, we will do the following:

* Move the label column to the first position, because the Amazon SageMaker built-in XGBoost algorithm requires the label to be the first column in the dataframe
* Check the columns for missing values and drop the rows that contain them

**TODO:** placeholder for Data Wrangler images.

### Upload the Data Wrangler `.flow` file to Amazon S3

Data Wrangler generates a `.flow` file, which contains a reference to the S3 bucket used during wrangling. Running the cells below uploads the `.flow` file to Amazon S3 so that it can be used as an input to the
processing job.

In [None]:
# Local path of the flow file to upload.
# NOTE(review): "Untitiled" looks like a typo of "Untitled" -- confirm the actual file name.
flow_file_name = "./Untitled Folder/Untitiled.flow"

# Identifier built from the current UTC day-hour-minute-second and the first
# 8 characters of a random UUID.
flow_timestamp = time.strftime('%d-%H-%M-%S', time.gmtime())
flow_suffix = str(uuid.uuid4())[:8]
flow_id = f"{flow_timestamp}-{flow_suffix}"

flow_name = "flow-" + flow_id
flow_uri = f"s3://{default_bucket}/{prefix}/{flow_name}.flow"

In [None]:
# Parse the .flow file as JSON; `flow` is read again later to build the output name.
with open(flow_file_name) as flow_file:
    flow = json.load(flow_file)

# Upload the same file to S3 under the key that flow_uri refers to.
flow_key = f"{prefix}/{flow_name}.flow"
s3_client = boto3.client("s3")
s3_client.upload_file(Filename=flow_file_name, Bucket=default_bucket, Key=flow_key)

print(f"Data Wrangler Flow uploaded to {flow_uri}")

After running the cell above, you can continue the guide using the provided `./data/credit_preprocessed.csv` file.

# SageMaker Feature Store

Amazon SageMaker Feature Store is a purpose-built repository where you can store and access features so it’s much easier to name, organize, and reuse them across teams. SageMaker Feature Store provides a unified store for features during training and real-time inference without the need to write additional code or create manual processes to keep features consistent. SageMaker Feature Store keeps track of the metadata of stored features (e.g. feature name or version number) so that you can query the features for the right attributes in batches or in real time using Amazon Athena, an interactive query service. SageMaker Feature Store also keeps features updated, because as new data is generated during inference, the single repository is updated so new features are always available for models to use during training and inference.

A feature store consists of an offline component stored in S3 and an online component stored in a low-latency database. The online database is optional, but very useful if you need supplemental features to be available at inference. In this section, we will create a feature group for our preprocessed credit dataset. After inserting the data into the feature group, you need to query the offline store with Athena to build the training dataset.

You can reference the [SageMaker Developer Guide](https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store.html) for more information about the SageMaker Feature Store.

In [None]:
# Runtime client for the Feature Store, created in the notebook's region.
featurestore_runtime = boto_session.client(
    'sagemaker-featurestore-runtime',
    region_name=region,
)

# SageMaker session that carries the Feature Store runtime client in addition
# to the regular SageMaker client.
feature_store_session = sagemaker.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_boto_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

In [None]:
# Display the pandas dtype of every column in df.
df.dtypes

### Configure the feature groups

The data type for each feature is set by passing a dataframe and inferring the proper data type. Feature data types can also be set via a config variable, but they will have to match the corresponding Python data types in the Pandas dataframe when it is ingested into the Feature Group.



In [None]:
fg_name = f'{prefix}-credit'

# Load the preprocessed data that the feature definitions are inferred from.
# No earlier cell defines `credit_preprocessed`, so without this line the
# load_feature_definitions call below raises NameError on a fresh kernel.
credit_preprocessed = pd.read_csv('./data/credit_preprocessed.csv')

credit_feature_group = FeatureGroup(
    name=fg_name,
    sagemaker_session=feature_store_session)

# The trailing semicolon keeps the notebook from displaying the returned value.
credit_feature_group.load_feature_definitions(data_frame=credit_preprocessed);

# Pipelines

### Define the first Data Wrangler step's inputs

In [None]:
# Base directory used to build the destination path of the flow ProcessingInput.
processing_dir = "/opt/ml/processing"

In [None]:
# Inputs of the Data Wrangler processing step.
flow_step_inputs = []

# flow file contains the code for each transformation
flow_file_input = sagemaker.processing.ProcessingInput(
    source=flow_uri,
    destination=f"{processing_dir}/flow",
    input_name='flow')

# Register the input; without this the list stays empty even though the
# input object was created (the outputs cell appends in the same way).
flow_step_inputs.append(flow_file_input)

### Define outputs for the first Data Wrangler step

In [None]:
# Output name has the form "<node_id>.<output name>", taken from the last node
# of the flow and that node's first output.
last_flow_node = flow['nodes'][-1]
flow_output_name = f"{last_flow_node['node_id']}.{last_flow_node['outputs'][0]['name']}"

# Outputs of the Data Wrangler processing step.
flow_step_outputs = []

# Route the step output to the feature group named by `fg_name`.
# (The previous reference, `claims_fg_name`, is not defined anywhere in this
# notebook and raised NameError.)
flow_output = sagemaker.processing.ProcessingOutput(
    output_name=flow_output_name,
    feature_store_output=sagemaker.processing.FeatureStoreOutput(
        feature_group_name=fg_name),
    app_managed=True)

flow_step_outputs.append(flow_output)