In [160]:
import os
import boto3
import pandas as pd
from sklearn.model_selection import train_test_split

import sagemaker
from sagemaker.session import Session
from sagemaker.xgboost.estimator import XGBoost

In [161]:
# Set region, boto3 and SageMaker SDK variables

# The region is resolved by the default SageMaker session; assign `region`
# explicitly here if the notebook should run against a different one.
region = sagemaker.Session().boto_region_name
print("Using AWS Region: {}".format(region))

# Pin both the boto3 default session and an explicit session to that region.
boto3.setup_default_session(region_name=region)
boto_session = boto3.Session(region_name=region)

# Create every AWS client from the same boto session so they share its region.
s3_client = boto_session.client('s3')
sagemaker_boto_client = boto_session.client('sagemaker')

sagemaker_session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_boto_client)

sagemaker_role = sagemaker.get_execution_role()
account_id = boto_session.client('sts').get_caller_identity()["Account"]

# Seed reused for the train/validation split further down.
random_state = 42

Using AWS Region: us-east-1


In [26]:
# List the variables currently saved in IPython's %store database.
%store
# Restore all stored variables (e.g. default_bucket, prefix, data_prefix) into this kernel.
%store -r

Stored variables and their in-db values:
data_prefix                    -> 'sagemaker-tutorial/data'
default_bucket                 -> 'sagemaker-us-east-1-367158743199'
feature_group_name             -> 'FG-flow-sm-tutorial-31-16-16-17-9f41d66b'
hyperparameters                -> {'max_depth': '3', 'eta': '0.2', 'objective': 'bin
model_data                     -> 's3://sagemaker-us-east-1-367158743199/tf2-resnet-
prefix                         -> 'sagemaker-tutorial'
s3_raw_data                    -> 's3://sagemaker-us-east-1-367158743199/sagemaker-t


## Get the data from offline feature store

Feature Store provides offline storage for feature values in your S3 bucket. Your data is stored in your S3 bucket using a prefixing scheme based on event time. The offline store is an append-only store, enabling Feature Store to maintain a historical record of all feature values. Data is stored in the offline store in Parquet format for optimized storage and query access.

You can query, explore, and visualize features using Data Wrangler from Amazon SageMaker Studio. Feature Store supports combining data to produce training, validation, and test data sets, and allows you to extract data as it existed at different points in time.

 
<span style="color:red">**TODO:  THE CODE NEEDS TO CHANGE**</span>


In [132]:
# NOTE(review): everything in this cell is disabled draft code, kept for reference only.
# It sketches building a feature-store session and locating/downloading the
# offline-store files from S3. `offline_feature_prefix` (used in
# download_all_objects_in_folder) is not defined anywhere in the visible
# notebook -- define it before re-enabling this code.

# featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', 
#                                            region_name=region
#                                           )

# feature_store_session = Session(
#     boto_session=boto_session,
#     sagemaker_client=sagemaker_boto_client,
#     sagemaker_featurestore_runtime_client=featurestore_runtime
# )

# offline_feature_store_bucket = f's3://{default_bucket}/{account_id}/sagemaker/{region}/offline-store/{feature_group_name}/data/year=2021/month=03/day=31/hour=16/'
# offline_feature_store_bucket

# !aws s3 cp $offline_feature_store_bucket ../sm-tutorial/02_build_train/ --recursive

# # offline_feature_store_bucket = f's3://{default_bucket}/'
# fg_prefix = f'sagemaker/{region}/offline-store/{feature_group_name}/data/'
# s3_client.list_objects_v2(Bucket=default_bucket,
#                          Prefix=fg_prefix,
#                          Delimiter='/')

# def download_all_objects_in_folder():
#     s3_resource = boto3.resource('s3')
#     my_bucket = s3_resource.Bucket(default_bucket)
#     objects = my_bucket.objects.filter(Prefix=offline_feature_prefix)
#     for obj in objects:
#         path, filename = os.path.split(obj.key)
#         my_bucket.download_file(obj.key, filename)

In [135]:
file_names = ['20210331T162204Z_ATxVKv9V8rL9hJyQ.parquet',
             '20210331T162204Z_BVs0QiuqNaVyrXTY.parquet',
             '20210331T162204Z_KnPgBMRO3yEo3BP3.parquet	']

local_processed_data = '../sm-tutorial/02_build_train/processed_data/'
for f in file_names:
    s3_path = f's3://sagemaker-us-east-1-367158743199/367158743199/sagemaker/us-east-1/offline-store/FG-flow-sm-tutorial-31-16-16-17-9f41d66b-1617207382/data/year=2021/month=03/day=31/hour=16/{f}'

    ! aws s3 cp $s3_path $local_processed_data

download: s3://sagemaker-us-east-1-367158743199/367158743199/sagemaker/us-east-1/offline-store/FG-flow-sm-tutorial-31-16-16-17-9f41d66b-1617207382/data/year=2021/month=03/day=31/hour=16/20210331T162204Z_ATxVKv9V8rL9hJyQ.parquet to ../sm-tutorial/02_build_train/processed_data/20210331T162204Z_ATxVKv9V8rL9hJyQ.parquet
download: s3://sagemaker-us-east-1-367158743199/367158743199/sagemaker/us-east-1/offline-store/FG-flow-sm-tutorial-31-16-16-17-9f41d66b-1617207382/data/year=2021/month=03/day=31/hour=16/20210331T162204Z_BVs0QiuqNaVyrXTY.parquet to ../sm-tutorial/02_build_train/processed_data/20210331T162204Z_BVs0QiuqNaVyrXTY.parquet
download: s3://sagemaker-us-east-1-367158743199/367158743199/sagemaker/us-east-1/offline-store/FG-flow-sm-tutorial-31-16-16-17-9f41d66b-1617207382/data/year=2021/month=03/day=31/hour=16/20210331T162204Z_KnPgBMRO3yEo3BP3.parquet to ../sm-tutorial/02_build_train/processed_data/20210331T162204Z_KnPgBMRO3yEo3BP3.parquet


In [158]:
import pyarrow.parquet as pq

def join_parquet_files(dir_path=local_processed_data):
    """Read every Parquet file in ``dir_path`` and stack them into one DataFrame.

    Only entries ending in ``.parquet`` are read, so other files that live in
    the same directory (this notebook later writes ``train.csv`` and
    ``validation.csv`` there) do not break a re-run. Files are processed in
    sorted name order so the row order -- and therefore any seeded split of
    the result -- is reproducible.

    Parameters
    ----------
    dir_path : str
        Directory containing the Parquet files. Defaults to the value of
        ``local_processed_data`` at the time this function is defined.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all Parquet files (original per-file
        indices are kept); an empty DataFrame if the directory holds none.
    """
    parquet_names = sorted(
        name for name in os.listdir(dir_path) if name.endswith('.parquet')
    )
    if not parquet_names:
        # pd.concat raises on an empty list; mirror the original empty result.
        return pd.DataFrame()

    # Read everything first and concatenate once (avoids quadratic re-copying).
    frames = [
        pq.read_table(os.path.join(dir_path, name)).to_pandas()
        for name in parquet_names
    ]
    return pd.concat(frames, axis=0)

In [164]:
# Load the files in the default directory (local_processed_data) into a single DataFrame.
df_processed = join_parquet_files()

## Split DataFrame into Train & Validation Sets

In [172]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val = train_test_split(
    df_processed,
    test_size=0.3,
    random_state=random_state,
)

In [181]:
X_train.to_csv(f'{local_processed_data}/train.csv', header=False, index=False)

response = sagemaker_session.upload_data(f'{local_processed_data}/train.csv',
                                         bucket=default_bucket, 
                                         key_prefix=data_prefix)
train_data_uri = response
%store train_data_uri

Stored 'train_data_uri' (str)


In [182]:
X_val.to_csv(f'{local_processed_data}/validation.csv', header=False, index=False)

response = sagemaker_session.upload_data(f'{local_processed_data}/validation.csv',
                                         bucket=default_bucket, 
                                         key_prefix=data_prefix)
validation_data_uri = response
%store validation_data_uri

Stored 'validation_data_uri' (str)


# Train a model using XGBoost

 
<span style="color:red">**TODO:  XGBoost details**</span>


## Set the hyperparameters

 
<span style="color:red">**TODO:  XGBoost hyperparameters details**</span>


In [184]:
# XGBoost training hyperparameters; every value is kept as a string.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round="50",
)

%store hyperparameters

Stored 'hyperparameters' (dict)


## Create and fit the estimator

In [186]:
# Display the S3 key prefix restored by %store -r.
prefix

'sagemaker-tutorial'

In [188]:
# Training job settings.
train_instance_count = 1
train_instance_type = "ml.m4.xlarge"
content_type = "text/csv"
estimator_output_path = f's3://{default_bucket}/{prefix}/training_jobs'

# Look up the URI of the XGBoost container image for this region.
# Change the version string ("1.2-1") to use a different XGBoost release.
# `region` is reused from the configuration cell instead of creating another Session.
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.2-1",
)


# Construct a SageMaker estimator that runs the XGBoost container. The role and
# session created in the configuration cell are reused so the whole notebook
# talks to AWS through one consistently configured session.
xgb_estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    hyperparameters=hyperparameters,
    role=sagemaker_role,
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    volume_size=5,  # 5 GB
    output_path=estimator_output_path,
    sagemaker_session=sagemaker_session,
)

In [None]:
if 'training_job_1_name' not in locals():
    
    xgb_estimator.fit(inputs = {'train': train_data_uri})
    training_job_1_name = xgb_estimator.latest_training_job.job_name
    %store training_job_1_name
    
else:
    print(f'Using previous training job: {training_job_1_name}')

2021-04-01 14:56:01 Starting - Starting the training job...
2021-04-01 14:56:25 Starting - Launching requested ML instancesProfilerReport-1617288961: InProgress
.....