In [None]:
!python -m pip install -q scramp==1.2.2 awswrangler==2.2.0

## Import required libraries including SageMaker Python SDK

In [56]:
import pandas as pd
import numpy as np
import re
import json
import time
import uuid
from time import gmtime, strftime
import pathlib
from sklearn.model_selection import train_test_split

import boto3
import awswrangler as wr
import sagemaker
from sagemaker.workflow.pipeline import Pipeline
from sagemaker import get_execution_role
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.parameters import ParameterInteger, ParameterFloat, ParameterString
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CreateModelStep
from sagemaker.workflow.step_collections import RegisterModel


### Update SageMaker SDK if necessary 

In [2]:
if int(sagemaker.__version__.split('.')[0]) != 2:
    !pip install sagemaker==2.24.1
    print("Updating SageMakerVersion. Please restart the kernel")
else:
    print("SageMaker SDK version is good")

SageMaker SDK version is good


### Set region, boto3 and SageMaker SDK variables

In [3]:
# IAM execution role and a default SageMaker SDK session
role = get_execution_role()
sess = sagemaker.Session()

# boto3 session; its region is reused by the AWS clients created from it
boto_session = boto3.Session()
region = boto_session.region_name
print(f"Region = {region}")

# low-level SageMaker client, plus an SDK session bound to that client
sagemaker_boto_client = boto_session.client('sagemaker')
sagemaker_session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_boto_client,
)

sagemaker_role = sagemaker.get_execution_role()

Region = us-west-2


### Create directories in the SageMaker default bucket for this tutorial


In [11]:
default_bucket = sess.default_bucket()  # Alternatively, you can use your own custom bucket here.

prefix = 'sagemaker-tutorial'  # all files pertaining to this workshop are stored under this prefix

# one sub-prefix per kind of data
dataprefix = f'{prefix}/data'
traindataprefix = f'{prefix}/train_data'
testdataprefix = f'{prefix}/test_data'
testdatanolabelprefix = f'{prefix}/test_data_no_label'
trainheaderprefix = f'{prefix}/train_headers'

# Preprocess data

Use the following code snippet to download the dataset to /data/ folder

In [20]:
!mkdir ./data
!wget -O ./data/default_of_credit_card.xls  https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls

mkdir: cannot create directory ‘./data’: File exists
--2021-03-04 19:43:11--  https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5539328 (5.3M) [application/x-httpd-php]
Saving to: ‘./data/default_of_credit_card.xls’


2021-03-04 19:43:12 (19.1 MB/s) - ‘./data/default_of_credit_card.xls’ saved [5539328/5539328]



In [21]:
# Load the downloaded workbook; header=1 takes the column names from the
# second row of the sheet.
data_path = './data/default_of_credit_card.xls'

df = pd.read_excel(data_path, header=1)

df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [22]:
# count the missing values in each column
df.isna().sum()

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default payment next month    0
dtype: int64

In [23]:
# Stamp every record with the current Unix epoch time (in seconds); this
# column is used as the event-time feature of the feature group.
# time.time() does not depend on the kernel's time zone or the pandas version.
timestamp = time.time()
df['EVENT_TIME'] = timestamp

# Cast every column (including ID and the label) to float64 so that all
# features share a single numeric dtype.
df = df.astype(np.float64)


For Amazon SageMaker built-in XGBoost algorithm, the label column needs to be the first column in the dataframe. Use the code below to make that change:

In [24]:
# Put the label column at position 0 and keep the remaining columns in order.
label_col = 'default payment next month'
cols = list(df)
cols.remove(label_col)  # raises ValueError when the label column is absent
cols = [label_col] + cols
df = df.loc[:, cols]

In [25]:
# give the label column a short name
df = df.rename(columns={"default payment next month": "LABEL"})

Now we need to upload the data to S3. Also after running the cell below, you can continue the guide using the provided `./data/dataset.csv`

In [26]:
# write the full dataset to disk, then copy it to the data prefix in S3
dataset_path = './data/dataset.csv'
df.to_csv(dataset_path, index=False)

response = sess.upload_data(dataset_path, bucket=default_bucket, key_prefix=dataprefix)
print(response)

s3://sagemaker-us-west-2-367158743199/sagemaker-tutorial/data/dataset.csv


### create train and test data set

In [27]:
# 80/20 split: sample the training rows with a fixed seed; every row that
# was not sampled goes to the test set.
df_train = df.sample(frac=.80, random_state=42)
df_test = df.drop(df_train.index)

# NOTE(review): both files are written with a header row and still contain
# the ID and EVENT_TIME columns -- confirm this is what the training job
# that reads them expects.
train_path = './data/train.csv'
test_path = './data/test.csv'
df_train.to_csv(train_path, index=False)
df_test.to_csv(test_path, index=False)

# upload both splits to S3 and remember their URIs
train_data_uri = sess.upload_data(train_path, bucket=default_bucket, key_prefix=dataprefix)
test_data_uri = sess.upload_data(test_path, bucket=default_bucket, key_prefix=dataprefix)
response = test_data_uri

# SageMaker Feature Store

Amazon SageMaker Feature Store is a purpose-built repository where you can store and access features so it’s much easier to name, organize, and reuse them across teams. SageMaker Feature Store provides a unified store for features during training and real-time inference without the need to write additional code or create manual processes to keep features consistent. SageMaker Feature Store keeps track of the metadata of stored features (e.g. feature name or version number) so that you can query the features for the right attributes in batches or in real time using Amazon Athena, an interactive query service. SageMaker Feature Store also keeps features updated, because as new data is generated during inference, the single repository is updated so new features are always available for models to use during training and inference.

A feature store consists of an offline component stored in S3 and an online component stored in a low-latency database. The online database is optional, but very useful if you need supplemental features to be available at inference. In this section, we will create a feature group for our credit card default dataset. After inserting the data into the feature group, you can query the offline store with Athena to build a training dataset.

You can reference the [SageMaker Developer Guide](https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store.html) for more information about the SageMaker Feature Store.

In [67]:
# runtime client used to put and get Feature Store records
featurestore_runtime = boto_session.client(
    'sagemaker-featurestore-runtime', region_name=region)

# SDK session that is aware of the Feature Store runtime client
feature_store_session = sagemaker.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_boto_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

### Configure the feature groups

When you set up your feature groups, you need to customize the feature names with a unique name and set up each feature group with the FeatureGroup class. 


The datatype for each feature is set by passing a dataframe and inferring the proper datatype. Feature data types can also be set via a config variable, but they have to match the corresponding Python data types in the Pandas dataframe when it is ingested into the Feature Group.

In [68]:
# feature group name: 'credit-' followed by the current UTC day-hour-minute-second
fg_name = strftime('credit-%d-%H-%M-%S', gmtime())

credit_feature_group = FeatureGroup(name=fg_name, sagemaker_session=feature_store_session)

# Load the feature definitions from the data frame that holds the feature data.
credit_feature_group.load_feature_definitions(data_frame=df);

### Create the feature groups

You must tell the Feature Group which columns in the dataframe correspond to the required record identifier and event time features.

In [69]:
# show which feature group name this run uses
print("{} is the feature group name in use".format(fg_name))

credit-04-20-45-51 is the feature group name in use


In this step, you use the create function to create the feature group. The following code shows all of the available parameters. The online store is not created by default, so you must set this as True if you want to enable it. The s3_uri is the S3 bucket location of your offline store.

In [70]:
# show the S3 location used for the offline store
print("\n Using s3://{}/{}".format(default_bucket, prefix))


 Using s3://sagemaker-us-west-2-367158743199/sagemaker-tutorial


In [71]:
# columns that serve as the record identifier and the event time
record_identifier_feature_name = "ID"
event_time_feature_name = "EVENT_TIME"


def wait_for_feature_group_creation_complete(feature_group, poll_interval=5):
    """Block until ``feature_group`` is no longer in the "Creating" state.

    Args:
        feature_group: object with a ``describe()`` method that returns a
            mapping containing "FeatureGroupStatus", and a ``name`` attribute.
        poll_interval: seconds to sleep between two status checks.

    Raises:
        RuntimeError: if the final status is anything other than "Created".
    """
    description = feature_group.describe()
    while description.get("FeatureGroupStatus") == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(poll_interval)
        description = feature_group.describe()

    status = description.get("FeatureGroupStatus")
    if status != "Created":
        # include the final status and failure reason (if any) to ease debugging
        reason = description.get("FailureReason")
        raise RuntimeError(
            f"Failed to create feature group {feature_group.name} "
            f"(status: {status}, reason: {reason})"
        )
    print(f"FeatureGroup {feature_group.name} successfully created.")
    

# S3 location of the offline store
offline_store_uri = f"s3://{default_bucket}/{prefix}"
print(f"\n Using {offline_store_uri}")

# create the feature group with the online store enabled, then wait for it
credit_feature_group.create(
    s3_uri=offline_store_uri,
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=sagemaker_role,
    enable_online_store=True,
)

wait_for_feature_group_creation_complete(credit_feature_group)


 Using s3://sagemaker-us-west-2-367158743199/sagemaker-tutorial
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup credit-04-20-45-51 successfully created.


In [72]:
# Display the metadata that describe() returns for the feature group.
credit_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-west-2:367158743199:feature-group/credit-04-20-45-51',
 'FeatureGroupName': 'credit-04-20-45-51',
 'RecordIdentifierFeatureName': 'ID',
 'EventTimeFeatureName': 'EVENT_TIME',
 'FeatureDefinitions': [{'FeatureName': 'LABEL', 'FeatureType': 'Fractional'},
  {'FeatureName': 'ID', 'FeatureType': 'Fractional'},
  {'FeatureName': 'LIMIT_BAL', 'FeatureType': 'Fractional'},
  {'FeatureName': 'SEX', 'FeatureType': 'Fractional'},
  {'FeatureName': 'EDUCATION', 'FeatureType': 'Fractional'},
  {'FeatureName': 'MARRIAGE', 'FeatureType': 'Fractional'},
  {'FeatureName': 'AGE', 'FeatureType': 'Fractional'},
  {'FeatureName': 'PAY_0', 'FeatureType': 'Fractional'},
  {'FeatureName': 'PAY_2', 'FeatureType': 'Fractional'},
  {'FeatureName': 'PAY_3', 'FeatureType': 'Fractional'},
  {'FeatureName': 'PAY_4', 'FeatureType': 'Fractional'},
  {'FeatureName': 'PAY_5', 'FeatureType': 'Fractional'},
  {'FeatureName': 'PAY_6', 'FeatureType': 'Fractional'},
  {'FeatureName'

### Ingest records into the Feature Groups

After the Feature Groups have been created, we can put data into each store by using the PutRecord API. This API can handle high TPS and is designed to be called by different streams. The data from all of these Put requests is buffered and written to s3 in chunks. The files will be written to the offline store within a few minutes of ingestion.

In [None]:
# Ingest the data frame unless a `credit_table` variable already exists in
# this session.
if 'credit_table' not in locals():
    credit_feature_group.ingest(data_frame=df, max_workers=5, wait=True);
else:
    print("You may have already ingested the data into your Feature Groups. If you'd like to do this again, you can run the ingest methods outside of the 'if/else' statement.")

# Train a model using XGBoost built-in algorithm

Once the training and test datasets have been persisted in S3, you can start training a model by defining which SageMaker Estimator you'd like to use. For this guide, you will use the XGBoost built-in algorithm to build an XGBoost training container as shown in the following code example.

In [None]:
# XGBoost hyperparameters (all values are given as strings)
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="binary:logistic",
    num_round="50",
)

# S3 location where the trained model will be saved
model_prefix = 'xgboost'
output_path = f's3://{default_bucket}/{prefix}/output'

# look up the URI of the built-in XGBoost image (version 1.2-1) for this region
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")

# estimator that runs the XGBoost container on a single ml.m5.2xlarge instance
estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    hyperparameters=hyperparameters,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    volume_size=5,  # 5 GB
    output_path=output_path,
)

# both input channels are CSV files in S3
content_type = "text/csv"
train_input = TrainingInput(train_data_uri, content_type=content_type)
validation_input = TrainingInput(test_data_uri, content_type=content_type)
data_channels = {'train': train_input, 'validation': validation_input}

# run the XGBoost training job
estimator.fit(data_channels)

# Model lineage with artifacts and associations


Amazon SageMaker ML Lineage Tracking creates and stores information about the steps of a machine learning (ML) workflow from data preparation to model deployment. With the tracking information you can reproduce the workflow steps, track model and dataset lineage, and establish model governance and audit standards. With SageMaker Lineage Tracking data scientists and model builders can do the following:

* Keep a running history of model discovery experiments.
* Establish model governance by tracking model lineage artifacts for auditing and compliance verification.
* Clone and rerun workflows to experiment with what-if scenarios while developing models.
* Share a workflow that colleagues can reproduce and enhance (for example, while collaborating on solving a business problem).
* Clone and rerun workflows with additional debugging or logging routines, or new input variations for troubleshooting issues in production models.

### Register artifacts

Although the `estimator` object retains much of the data we need to learn about how the model was trained, it is, in fact, an ephemeral object which SageMaker does not persist and which cannot be re-instantiated at a later time. Although we lose some of its conveniences once it is gone, we can still get back all the data we need by accessing the training jobs it once created.



In [None]:
# fetch the persisted description of the training job the estimator just ran
latest_training_job_name = estimator.latest_training_job.job_name
training_job_1_info = sagemaker_boto_client.describe_training_job(
    TrainingJobName=latest_training_job_name)

#### Training data artifact

In [None]:
# S3 URI of the first input channel of the training job
first_channel = training_job_1_info['InputDataConfig'][0]
training_data_s3_uri = first_channel['DataSource']['S3DataSource']['S3Uri']

# look for lineage artifacts that already point at this URI
matching_artifacts = list(
    sagemaker.lineage.artifact.Artifact.list(
        source_uri=training_data_s3_uri,
        sagemaker_session=sagemaker_session,
    )
)

if not matching_artifacts:
    # none registered yet: create the dataset artifact
    training_data_artifact = sagemaker.lineage.artifact.Artifact.create(
        artifact_name='TrainingData',
        source_uri=training_data_s3_uri,
        artifact_type='Dataset',
        sagemaker_session=sagemaker_session,
    )
    print(f'Create artifact {training_data_artifact.artifact_arn}: SUCCESSFUL')
else:
    training_data_artifact = matching_artifacts[0]
    print(f'Using existing artifact: {training_data_artifact.artifact_arn}')

#### Model artifact

In [None]:
# S3 URI of the model artifact produced by the training job
trained_model_s3_uri = training_job_1_info['ModelArtifacts']['S3ModelArtifacts']

# look for lineage artifacts that already point at this URI
matching_artifacts = list(
    sagemaker.lineage.artifact.Artifact.list(
        source_uri=trained_model_s3_uri,
        sagemaker_session=sagemaker_session,
    )
)

if not matching_artifacts:
    # none registered yet: create the model artifact
    model_artifact = sagemaker.lineage.artifact.Artifact.create(
        artifact_name='TrainedModel',
        source_uri=trained_model_s3_uri,
        artifact_type='Model',
        sagemaker_session=sagemaker_session,
    )
    print(f'Create artifact {model_artifact.artifact_arn}: SUCCESSFUL')
else:
    model_artifact = matching_artifacts[0]
    print(f'Using existing artifact: {model_artifact.artifact_arn}')

### Set artifact associations


In [None]:
# the trial component is looked up under the training job name plus the
# '-aws-training-job' suffix
training_job_1_name = training_job_1_info['TrainingJobName']
trial_component = sagemaker_boto_client.describe_trial_component(
    TrialComponentName=f'{training_job_1_name}-aws-training-job')
trial_component_arn = trial_component['TrialComponentArn']

#### Input artifacts

In [None]:
# Link each input artifact to the trial component of the training job.
input_artifacts = [training_data_artifact]

for a in input_artifacts:
    try:
        sagemaker.lineage.association.Association.create(
            source_arn=a.artifact_arn,
            destination_arn=trial_component_arn,
            association_type='ContributedTo',
            sagemaker_session=sagemaker_session)
        print(f"Association with {a.artifact_type}: SUCCESSFUL")
    except sagemaker_boto_client.exceptions.ClientError as err:
        # Only service-side rejections are tolerated (presumably a duplicate
        # association); the service message is shown in case the cause is
        # something else. Any other exception propagates.
        print(f"Association already exists with {a.artifact_type}")
        print(f"  ({err})")

In [None]:
# Link each output artifact to the trial component of the training job.
output_artifacts = [model_artifact]

for a in output_artifacts:
    try:
        sagemaker.lineage.association.Association.create(
            source_arn=a.artifact_arn,
            destination_arn=trial_component_arn,
            association_type='Produced',
            sagemaker_session=sagemaker_session)
        print(f"Association with {a.artifact_type}: SUCCESSFUL")
    except sagemaker_boto_client.exceptions.ClientError as err:
        # Only service-side rejections are tolerated (presumably a duplicate
        # association); the service message is shown in case the cause is
        # something else. Any other exception propagates.
        print(f"Association already exists with {a.artifact_type}")
        print(f"  ({err})")

# Evaluate model for bias with Clarify

Amazon SageMaker Clarify helps improve your machine learning (ML) models by detecting potential bias and helping explain the predictions that models make. It helps you identify various types of bias in pretraining data and in posttraining that can emerge during model training or when the model is in production. SageMaker Clarify helps explain how these models make predictions using a feature attribution approach. It also monitors inferences models make in production for bias or feature attribution drift. The fairness and explainability functionality provided by SageMaker Clarify provides components that help AWS customers build less biased and more understandable machine learning models. It also provides tools to help you generate model governance reports which you can use to inform risk and compliance teams, and external regulators.

You can reference the SageMaker Developer Guide for more information about SageMaker Clarify.

In [None]:
# Create a SageMaker model from the training job, unless a model with exactly
# this name already exists.
model_1_name = f'{prefix}-xgboost-pre-smote'

# NameContains is a substring filter, so compare the returned names exactly.
model_matches = [
    m for m in sagemaker_boto_client.list_models(NameContains=model_1_name)['Models']
    if m['ModelName'] == model_1_name
]

if not model_matches:
    model_1 = sagemaker_session.create_model_from_job(
        name=model_1_name,
        training_job_name=training_job_1_info['TrainingJobName'],
        role=sagemaker_role,
        image_uri=training_job_1_info['AlgorithmSpecification']['TrainingImage'])
else:
    # keep model_1 defined on both paths
    model_1 = model_1_name
    print(f"Model {model_1_name} already exists.")



### Check for data set bias and model bias
With SageMaker, we can check for pre-training and post-training bias. Pre-training metrics show pre-existing bias in the data, while post-training metrics show bias in the predictions from the model. Using the SageMaker SDK, we can specify which groups we want to check bias across and which metrics we'd like to show.

To run the full Clarify job, run the cells below. Running the job will take ~15 minutes. If you wish to save time, you can skip them and view the results in the later cell, which loads a pre-generated output if no bias job was run.

In [None]:
# S3 location of the bias report and the instance type used by Clarify
bias_report_1_output_path = 's3://{}/{}/clarify-output/bias_1'.format(default_bucket, prefix)
train_instance_type = "ml.m4.xlarge"

# the column names are taken from the header row of the training CSV
train_cols = list(wr.s3.read_csv(training_data_s3_uri).columns)

clarify_processor = sagemaker.clarify.SageMakerClarifyProcessor(
    role=sagemaker_role,
    instance_type=train_instance_type,
    instance_count=1,
    sagemaker_session=sagemaker_session,
)

In [None]:
# where Clarify reads the dataset from and where it writes the report
bias_data_config = sagemaker.clarify.DataConfig(
    s3_data_input_path=train_data_uri,
    s3_output_path=bias_report_1_output_path,
    dataset_type='text/csv',
    headers=train_cols,
    label='LABEL',
)

# the model Clarify queries for predictions; requests and responses are CSV
csv_mime_type = 'text/csv'
model_config = sagemaker.clarify.ModelConfig(
    model_name=model_1_name,
    instance_count=1,
    instance_type=train_instance_type,
    content_type=csv_mime_type,
    accept_type=csv_mime_type,
)

# predicted probabilities are turned into labels with a 0.5 threshold
predictions_config = sagemaker.clarify.ModelPredictedLabelConfig(probability_threshold=0.5)

Use `BiasConfig` to provide information on which columns contain the facets (sensitive groups, Sex), what the sensitive features (facet_values_or_threshold) might be, and what the desirable outcomes are (label_values_or_threshold).

You can run both the pretraining and posttraining analysis in the processing job at the same time with run_bias().

In [None]:
# LABEL is binary (the renamed "default payment next month" column): it only
# takes the values 0 and 1, so the favorable outcome "no default" is 0.
# facet_values_or_threshold=[1] selects the rows with SEX == 1 as the facet
# to compare against the rest.
bias_config = sagemaker.clarify.BiasConfig(
    label_values_or_threshold=[0],
    facet_name='SEX',
    facet_values_or_threshold=[1])


# run the pre-training and post-training bias analysis in one processing job
clarify_processor.run_bias(
    data_config=bias_data_config,
    bias_config=bias_config,
    model_config=model_config,
    model_predicted_label_config=predictions_config,
    pre_training_methods='all',
    post_training_methods='all')

clarify_bias_job_1_name = clarify_processor.latest_job.name


#### View results of Clarify job
Running Clarify on your dataset or model can take ~15 minutes. Once it is done, you can view the results in Studio or download them from the bias_report_output_path S3 bucket.

If you don't have time to run the job, you can view the pre-generated results included with this demo. Otherwise, you can run the job by running the cells above.

See also the [Fairness and Explainability with SageMaker Clarify example notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb).

In [None]:
# Local copy of the Clarify analysis (either downloaded below or pre-generated).
analysis_dir = pathlib.Path('clarify_output/bias_1')
analysis_file = analysis_dir / 'analysis.json'

if 'clarify_bias_job_1_name' in locals():
    # A bias job ran in this session: fetch its report from the S3 output path.
    analysis_dir.mkdir(parents=True, exist_ok=True)
    boto_session.client('s3').download_file(
        Bucket=default_bucket,
        Key=f'{prefix}/clarify-output/bias_1/analysis.json',
        Filename=str(analysis_file))
    print(f'Downloaded analysis from previous Clarify job: {clarify_bias_job_1_name}')
else:
    print('Loading pre-generated analysis file...')

with open(analysis_file, 'r') as f:
    bias_analysis = json.load(f)

# Pre-training metrics are keyed by facet name; this notebook analyses 'SEX'.
# Fall back to the first facet present so a pre-generated file that uses a
# different facet name can still be displayed.
facets = bias_analysis['pre_training_bias_metrics']['facets']
facet_name = 'SEX' if 'SEX' in facets else next(iter(facets))
results = facets[facet_name][0]['metrics'][1]
print(json.dumps(results, indent=4))

# Deposit Model and Lineage in SageMaker Model Registry

Once a useful model has been trained and its artifacts properly associated, the next step is to save the model in a registry for future reference and possible deployment.


## Create Model Package Group
A Model Package Group holds multiple versions or iterations of a model. Though it is not required to create them for every model in the registry, they help organize various models which all have the same purpose and provide automatic versioning.

In [51]:
# Define the Model Package Group name only if it is not already set in this
# session, and persist it with the %store magic.
if 'mpg_name' not in locals():
    mpg_name = prefix
    %store mpg_name
    print(f'Model Package Group name: {mpg_name}')

Stored 'mpg_name' (str)
Model Package Group name: sagemaker-tutorial


In [None]:
# Request parameters for creating the Model Package Group. The description
# matches this notebook's use case (credit card default prediction).
mpg_input_dict = {
    'ModelPackageGroupName': mpg_name,
    'ModelPackageGroupDescription': 'Credit card default prediction'
}

In [None]:
matching_mpg = sagemaker_boto_client.list_model_package_groups(NameContains=mpg_name)['ModelPackageGroupSummaryList']

if matching_mpg:
    print(f'Using existing Model Package Group: {mpg_name}')
else:
    mpg_response = sagemaker_boto_client.create_model_package_group(**mpg_input_dict)
    print(f'Create Model Package Group {mpg_name}: SUCCESSFUL')
    %store mpg_name


## Create Model Package for trained model


In [None]:
# Create and upload a metrics report

# one entry per final metric of the training job: {name: {'value': value}}
model_metrics_report = {
    'classification_metrics': {
        metric['MetricName']: {'value': metric['Value']}
        for metric in training_job_1_info['FinalMetricDataList']
    }
}

metrics_file = './data/training_metrics.json'
with open(metrics_file, 'w') as f:
    json.dump(model_metrics_report, f)

# the report is stored under a prefix named after the training job
metrics_prefix = f"{prefix}/training_jobs/{training_job_1_info['TrainingJobName']}"

sess.upload_data(metrics_file, bucket=default_bucket, key_prefix=metrics_prefix)

### Define the inference spec


In [None]:
from inf import InferenceSpecification 

In [None]:
# Build the inference specification for the model package from the training job:
# the training image is reused as the inference image.
training_image_uri = training_job_1_info['AlgorithmSpecification']['TrainingImage']

mp_inference_spec = InferenceSpecification().get_inference_specification_dict(
    ecr_image=training_image_uri,
    supports_gpu=False,
    supported_content_types=['text/csv'],
    supported_mime_types=['text/csv'],
)

# Point the first container of the specification at the trained model artifacts.
first_container = mp_inference_spec['InferenceSpecification']['Containers'][0]
first_container['ModelDataUrl'] = training_job_1_info['ModelArtifacts']['S3ModelArtifacts']

In [None]:
# Metrics attached to the model package: the training metrics report
# ("ModelQuality") and the bias analysis report ("Bias").
model_metrics = {
    'ModelQuality': {
        'Statistics': {
            'ContentType': 'application/json',
            # Must match the object written by `sess.upload_data(...)`, which stores
            # the file as <key_prefix>/<file name> inside `default_bucket`.
            'S3Uri': f's3://{default_bucket}/{metrics_prefix}/training_metrics.json'
        }
    },
    'Bias': {
        'Report': {
            'ContentType': 'application/json',
            'S3Uri': f'{bias_report_1_output_path}/analysis.json'
        }
    }
}

In [None]:
# Request for `create_model_package`: registry metadata first, followed by the
# entries of the inference specification.
mp_input_dict = {
    'ModelPackageGroupName': mpg_name,
    'ModelPackageDescription': 'XGBoost classifier to detect insurance fraud.',
    'ModelApprovalStatus': 'PendingManualApproval',
    'ModelMetrics': model_metrics,
    **mp_inference_spec,
}

mp1_response = sagemaker_boto_client.create_model_package(**mp_input_dict)

In [None]:
# Poll the model package until it reaches a terminal status, printing each status once.
while True:
    mp_info = sagemaker_boto_client.describe_model_package(ModelPackageName=mp1_response['ModelPackageArn'])
    mp_status = mp_info['ModelPackageStatus']
    print(f'model package status: {mp_status}')
    if mp_status in ['Completed', 'Failed']:
        break
    time.sleep(5)

# Stop here on failure so that later cells do not approve or deploy a broken package.
if mp_status == 'Failed':
    raise RuntimeError(
        f"Model package creation failed: {mp_info.get('ModelPackageStatusDetails')}")

In [None]:
# List the packages of the group newest first, so that `mp_list[0]` is the most
# recently created model package (the API default sort order is ascending).
mp_list = sagemaker_boto_client.list_model_packages(
    ModelPackageGroupName=mpg_name,
    SortBy='CreationTime',
    SortOrder='Descending')['ModelPackageSummaryList']
mp_list

### Approve the model


In the real-life MLOps lifecycle, a model package gets approved after evaluation by data scientists, subject matter experts and auditors.



In [None]:
# Mark the first model package in `mp_list` as approved.
model_package_update = dict(
    ModelPackageArn=mp_list[0]['ModelPackageArn'],
    ModelApprovalStatus='Approved',
)

update_response = sagemaker_boto_client.update_model_package(**model_package_update)


# Create an inference endpoint

Deploy an approved model to make predictions using features from the Feature Store. This might take about 10 minutes.


In [None]:
primary_container = {'ModelPackageName': mp_list[0]['ModelPackageArn']}
endpoint_name = f'{model_1_name}-endpoint'
endpoint_config_name=f'{model_1_name}-endpoint-config'
existing_configs = len(sagemaker_boto_client.list_endpoint_configs(NameContains=endpoint_config_name, MaxResults = 30)['EndpointConfigs'])

endpoint_instance_count = 1
endpoint_instance_type = "ml.m4.xlarge"

if existing_configs == 0:
    create_ep_config_response = sagemaker_boto_client.create_endpoint_config(
        EndpointConfigName=endpoint_config_name,
        ProductionVariants=[{
            'InstanceType': endpoint_instance_type,
            'InitialVariantWeight': 1,
            'InitialInstanceCount': endpoint_instance_count,
            'ModelName': model_1_name,
            'VariantName': 'AllTraffic'
        }]
    )
    %store endpoint_config_name

In [None]:
existing_endpoints = sagemaker_boto_client.list_endpoints(NameContains=endpoint_name, MaxResults = 30)['Endpoints']
if not existing_endpoints:
    create_endpoint_response = sagemaker_boto_client.create_endpoint(
        EndpointName=endpoint_name,
        EndpointConfigName=endpoint_config_name)
    %store endpoint_name

endpoint_info = sagemaker_boto_client.describe_endpoint(EndpointName=endpoint_name)
endpoint_status = endpoint_info['EndpointStatus']

while endpoint_status == 'Creating':
    endpoint_info = sagemaker_boto_client.describe_endpoint(EndpointName=endpoint_name)
    endpoint_status = endpoint_info['EndpointStatus']
    print('Endpoint status:', endpoint_status)
    if endpoint_status == 'Creating':
        time.sleep(30)

### create a predictor

In [None]:
# Client-side handle used to send inference requests to the deployed endpoint.
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session)

#### Sample an application from the test data and get its features from the online feature store

In [None]:
# Pick the ID of one randomly sampled row of the test set (no seed is set, so the
# chosen row differs between runs).
sample_id = df_test.sample(1).iloc[0]['ID']
sample_id

In [None]:
# Fetch the record of the sampled ID from the feature group.
response = featurestore_runtime.get_record(
    FeatureGroupName=fg_name,
    RecordIdentifierValueAsString=str(sample_id))

# One row per feature, indexed by feature name; the label is removed because it
# is not part of the model input.
df_sample = (
    pd.DataFrame(response['Record'])
    .set_index('FeatureName')
    .drop('LABEL')
)

# Serialize the remaining feature values as a single CSV line.
data_input = ','.join(df_sample['ValueAsString'])
data_input

In [None]:
# Send the CSV line to the endpoint, declaring the payload content type explicitly.
results = predictor.predict(data_input, initial_args = {"ContentType": "text/csv"})
# The response body is parsed as JSON to obtain the predicted value.
prediction = json.loads(results)

print (f'Probablitity of the default payment next month for ID:{int(sample_id)} is :', prediction)

# Create a SageMaker Pipeline to Automate All the Steps from Data Prep to Model Deployment
Now that you've manually done each step in our machine learning workflow, you can create a pipeline which trains a new model, persists the model in SageMaker and then adds the model to the registry.

### Pipeline parameters
An important feature of SageMaker Pipelines is the ability to define the steps ahead of time, but be able to change the parameters to those steps at execution without having to re-define the pipeline. This can be achieved by using `ParameterInteger`, `ParameterFloat` or `ParameterString` to define a value upfront which can be modified when you call `pipeline.start(parameters=parameters)` later. Only certain parameters can be defined this way.

In [12]:
# Pipeline parameters: values that can be overridden for each pipeline execution.
train_instance_param = ParameterString(
    name="TrainingInstance",
    default_value="ml.m4.xlarge"
)

# Approval status assigned to the registered model package. It is referenced by the
# register step and listed in the pipeline's parameters, so it must be defined here.
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval"
)

# set an output path where the trained model will be saved
model_prefix = 'xgboost'
output_path = 's3://{}/{}/output'.format(default_bucket, prefix)

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")

## Train XGBoost Model

In [29]:
# XGBoost hyperparameters, passed to the training container as strings.
hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":"50"}

train_instance_count = 1
# Kept for reference; the estimator takes its instance type from the
# `TrainingInstance` pipeline parameter, whose default is the same value.
train_instance_type = "ml.m4.xlarge"
content_type = "text/csv"


# construct a SageMaker estimator that calls the xgboost-container
xgb_estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container,
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=train_instance_count,
                                          # Use the pipeline parameter so the instance type can be
                                          # overridden when an execution is started.
                                          instance_type=train_instance_param,
                                          volume_size=5, # 5 GB
                                          output_path=output_path)

# Pipeline step that trains the estimator on the CSV training data.
train_step = TrainingStep(
    name='XgboostTrain',
    estimator=xgb_estimator,
    inputs={
        'train': TrainingInput(train_data_uri, content_type=content_type)
    }
)



## Model Pre-Deployment Step

In [32]:
# Model definition whose image and artifacts are taken from the training step's
# properties, i.e. they are resolved when the pipeline runs.
model = sagemaker.model.Model(
    name='credit-default-demo-pipeline-xgboost',
    image_uri=train_step.properties.AlgorithmSpecification.TrainingImage,
    model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
    sagemaker_session=sagemaker_session,
    role=sagemaker_role
)

# Instance type supplied as input to the model-creation step.
inputs = sagemaker.inputs.CreateModelInput(
    instance_type="ml.m4.xlarge"
)

# Pipeline step that creates the SageMaker model from the definition above.
create_model_step = CreateModelStep(
    name="ModelPreDeployment",
    model=model,
    inputs=inputs
)

## Run Bias Metrics with Clarify

In [41]:
# clarify config
bias_report_output_path = f's3://{default_bucket}/{prefix}/clarify-output/bias'
s3_client = boto3.client('s3', region_name=region)


bias_data_config = sagemaker.clarify.DataConfig(
    s3_data_input_path=train_data_uri,
    label='LABEL',
    dataset_type='text/csv',
s3_output_path=bias_report_output_path)

bias_config = sagemaker.clarify.BiasConfig(
    label_values_or_threshold=[0],
    facet_name='SEX',
    facet_values_or_threshold=[1])

analysis_config = bias_data_config.get_config()
analysis_config.update(bias_config.get_config())
analysis_config["methods"] = {"pre_training_bias": {"methods": "all"}}

clarify_config_dir = pathlib.Path('config')
clarify_config_dir.mkdir(exist_ok=True)
with open(clarify_config_dir / 'analysis_config.json', 'w') as f:
    json.dump(analysis_config, f)
    
s3_client.upload_file(Filename='config/analysis_config.json', Bucket=default_bucket, Key=f'{prefix}/clarify-config/analysis_config.json')

In [45]:
# clarify processing step

# Generic processor running the Clarify container image for this region.
clarify_processor = sagemaker.processing.Processor(
    base_job_name='fraud-detection-demo-clarify-processor',
    image_uri=sagemaker.clarify.image_uris.retrieve(framework='clarify', region=region),
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.c5.xlarge')

# Pipeline step: the analysis config and the training dataset are mounted as inputs,
# and the resulting analysis.json is copied to the bias report output path.
clarify_step = ProcessingStep(
    name="ClarifyProcessor",
    processor=clarify_processor,
    inputs=[
        sagemaker.processing.ProcessingInput(
            input_name="analysis_config",
            source=f's3://{default_bucket}/{prefix}/clarify-config/analysis_config.json',
            destination="/opt/ml/processing/input/config"),
        sagemaker.processing.ProcessingInput(
            input_name="dataset",
            source=train_data_uri,
            destination="/opt/ml/processing/input/data")  
    ],
    outputs=[
        sagemaker.processing.ProcessingOutput(
            source="/opt/ml/processing/output/analysis.json",
            destination=bias_report_output_path,
            output_name="analysis_result")
    ]
)

## Register Model

In [53]:
# Disabled: attaching the Clarify bias report as model metrics. The `model_metrics`
# argument of the register step below is commented out accordingly.
# model_metrics = demo_helpers.ModelMetrics(
#     bias=sagemaker.model_metrics.MetricsSource(
#         s3_uri=clarify_step.properties.ProcessingOutputConfig.Outputs['analysis_result'].S3Output.S3Uri,
#         content_type="application/json"
#     )
# )

# Default the Model Package Group name to the tutorial prefix, unless `mpg_name`
# is already defined in this kernel.
if 'mpg_name' not in locals():
    mpg_name = prefix
    %store mpg_name
    print(f'Model Package Group name: {mpg_name}')
    
# Pipeline step that registers the trained model in the model package group.
# NOTE(review): `model_approval_status` must already be defined when this cell runs;
# presumably it is a pipeline parameter, since it is also listed in the pipeline's parameters.
register_step = RegisterModel(
    name="XgboostRegisterModel",
    estimator=xgb_estimator,
    model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
    model_package_group_name=mpg_name,
    approval_status=model_approval_status,
#     model_metrics=model_metrics
)

## Deploy Model

# Combine the Pipeline Steps and Run

In [59]:
pipeline_name = f'credit-default'
%store pipeline_name

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        train_instance_param, 
        model_approval_status],
    steps=[
#         claims_flow_step,
#         customers_flow_step,
#         create_dataset_step,
        train_step, 
        create_model_step, 
        clarify_step, 
        register_step,
#         deploy_step
    ])

Stored 'pipeline_name' (str)


## Submit the pipeline definition to the SageMaker Pipeline service

Note: If an existing pipeline has the same name it will be overwritten.


In [62]:
# Create the pipeline in SageMaker, or overwrite the existing pipeline of the same name.
pipeline.upsert(role_arn=sagemaker_role)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


{'PipelineArn': 'arn:aws:sagemaker:us-west-2:367158743199:pipeline/credit-default',
 'ResponseMetadata': {'RequestId': 'b3a9bfdb-b1b2-4826-a75d-da5d737648f1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b3a9bfdb-b1b2-4826-a75d-da5d737648f1',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '82',
   'date': 'Thu, 04 Mar 2021 19:59:17 GMT'},
  'RetryAttempts': 0}}

In [66]:
# Start a pipeline execution; no parameter overrides are passed, so defaults apply.
start_response = pipeline.start()

# Block until the execution finishes, then show its description.
start_response.wait()
start_response.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-west-2:367158743199:pipeline/credit-default',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-west-2:367158743199:pipeline/credit-default/execution/ging0sljodj3',
 'PipelineExecutionDisplayName': 'execution-1614888032576',
 'PipelineExecutionStatus': 'Succeeded',
 'CreationTime': datetime.datetime(2021, 3, 4, 20, 0, 32, 495000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2021, 3, 4, 20, 4, 59, 15000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-west-2:367158743199:user-profile/d-cquuj3sidhuj/default-1613588520951',
  'UserProfileName': 'default-1613588520951',
  'DomainId': 'd-cquuj3sidhuj'},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-west-2:367158743199:user-profile/d-cquuj3sidhuj/default-1613588520951',
  'UserProfileName': 'default-1613588520951',
  'DomainId': 'd-cquuj3sidhuj'},
 'ResponseMetadata': {'RequestId': '2f5cf3c4-4a45-4201-860c-462f5bba7cae',
  'HTTPStatusCode': 200,
  'HTTPHe