In [207]:
import os
import boto3
import pandas as pd
from sklearn.model_selection import train_test_split

import sagemaker
from sagemaker import clarify
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost

In [161]:
# Set region, boto3 and SageMaker SDK variables

# You can change this to a region of your choice.
region = sagemaker.Session().boto_region_name
print(f"Using AWS Region: {region}")

# Pin both the default boto3 session and an explicit session to `region`;
# every AWS client below is derived from the explicit session so they all
# share the same region and credentials.
boto3.setup_default_session(region_name=region)
boto_session = boto3.Session(region_name=region)

s3_client = boto_session.client('s3')
sagemaker_boto_client = boto_session.client('sagemaker')

sagemaker_session = sagemaker.session.Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_boto_client)

sagemaker_role = sagemaker.get_execution_role()
account_id = boto_session.client('sts').get_caller_identity()["Account"]

# Seed reused by the train/validation/test splits further down
random_state = 42

Using AWS Region: us-east-1


In [26]:
# List the variables currently persisted with %store, then restore all of
# them (default_bucket, prefix, data_prefix, ...) into this kernel.
%store
%store -r

Stored variables and their in-db values:
data_prefix                    -> 'sagemaker-tutorial/data'
default_bucket                 -> 'sagemaker-us-east-1-367158743199'
feature_group_name             -> 'FG-flow-sm-tutorial-31-16-16-17-9f41d66b'
hyperparameters                -> {'max_depth': '3', 'eta': '0.2', 'objective': 'bin
model_data                     -> 's3://sagemaker-us-east-1-367158743199/tf2-resnet-
prefix                         -> 'sagemaker-tutorial'
s3_raw_data                    -> 's3://sagemaker-us-east-1-367158743199/sagemaker-t


## Get the data from offline feature store

Feature Store provides offline storage for feature values in your S3 bucket. Your data is stored in your S3 bucket using a prefixing scheme based on event time. The offline store is an append-only store, enabling Feature Store to maintain a historical record of all feature values. Data is stored in the offline store in Parquet format for optimized storage and query access.

You can query, explore, and visualize features using Data Wrangler from Amazon SageMaker Studio.  Feature Store supports combining data to produce, train, validate, and test data sets, and allows you to extract data at different points in time.

 
<span style="color:red">**TODO:  THE CODE NEEDS TO CHANGE**</span>


In [132]:
# featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', 
#                                            region_name=region
#                                           )

# feature_store_session = Session(
#     boto_session=boto_session,
#     sagemaker_client=sagemaker_boto_client,
#     sagemaker_featurestore_runtime_client=featurestore_runtime
# )

# offline_feature_store_bucket = f's3://{default_bucket}/{account_id}/sagemaker/{region}/offline-store/{feature_group_name}/data/year=2021/month=03/day=31/hour=16/'
# offline_feature_store_bucket

# !aws s3 cp $offline_feature_store_bucket ../sm-tutorial/02_build_train/ --recursive

# # offline_feature_store_bucket = f's3://{default_bucket}/'
# fg_prefix = f'sagemaker/{region}/offline-store/{feature_group_name}/data/'
# s3_client.list_objects_v2(Bucket=default_bucket,
#                          Prefix=fg_prefix,
#                          Delimiter='/')

# def download_all_objects_in_folder():
#     s3_resource = boto3.resource('s3')
#     my_bucket = s3_resource.Bucket(default_bucket)
#     objects = my_bucket.objects.filter(Prefix=offline_feature_prefix)
#     for obj in objects:
#         path, filename = os.path.split(obj.key)
#         my_bucket.download_file(obj.key, filename)

In [135]:
file_names = ['20210331T162204Z_ATxVKv9V8rL9hJyQ.parquet',
             '20210331T162204Z_BVs0QiuqNaVyrXTY.parquet',
             '20210331T162204Z_KnPgBMRO3yEo3BP3.parquet	']

local_processed_data = '../sm-tutorial/02_build_train/processed_data/'
for f in file_names:
    s3_path = f's3://sagemaker-us-east-1-367158743199/367158743199/sagemaker/us-east-1/offline-store/FG-flow-sm-tutorial-31-16-16-17-9f41d66b-1617207382/data/year=2021/month=03/day=31/hour=16/{f}'

    ! aws s3 cp $s3_path $local_processed_data

download: s3://sagemaker-us-east-1-367158743199/367158743199/sagemaker/us-east-1/offline-store/FG-flow-sm-tutorial-31-16-16-17-9f41d66b-1617207382/data/year=2021/month=03/day=31/hour=16/20210331T162204Z_ATxVKv9V8rL9hJyQ.parquet to ../sm-tutorial/02_build_train/processed_data/20210331T162204Z_ATxVKv9V8rL9hJyQ.parquet
download: s3://sagemaker-us-east-1-367158743199/367158743199/sagemaker/us-east-1/offline-store/FG-flow-sm-tutorial-31-16-16-17-9f41d66b-1617207382/data/year=2021/month=03/day=31/hour=16/20210331T162204Z_BVs0QiuqNaVyrXTY.parquet to ../sm-tutorial/02_build_train/processed_data/20210331T162204Z_BVs0QiuqNaVyrXTY.parquet
download: s3://sagemaker-us-east-1-367158743199/367158743199/sagemaker/us-east-1/offline-store/FG-flow-sm-tutorial-31-16-16-17-9f41d66b-1617207382/data/year=2021/month=03/day=31/hour=16/20210331T162204Z_KnPgBMRO3yEo3BP3.parquet to ../sm-tutorial/02_build_train/processed_data/20210331T162204Z_KnPgBMRO3yEo3BP3.parquet


In [158]:
import pyarrow.parquet as pq

def join_parquet_files(dir_path=local_processed_data):
    """Read every ``.parquet`` file in ``dir_path`` into one DataFrame.

    Only files whose name ends in ``.parquet`` are read, so other files in
    the same directory (for example the CSV splits written there later in
    this notebook) are ignored instead of being passed to the Parquet reader.
    Files are read in sorted name order so the row order of the result, and
    therefore any seeded split of it, is the same on every run.

    Parameters
    ----------
    dir_path : str
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of all Parquet files, keeping each file's
        own index; an empty DataFrame when the directory has no Parquet files.
    """
    parquet_names = sorted(name for name in os.listdir(dir_path)
                           if name.endswith('.parquet'))

    # pd.concat raises on an empty list, so keep the "nothing to read" result explicit
    if not parquet_names:
        return pd.DataFrame()

    # Read everything first and concatenate once instead of growing a frame in a loop
    frames = [pq.read_table(os.path.join(dir_path, name)).to_pandas()
              for name in parquet_names]
    return pd.concat(frames, axis=0)

In [164]:
# Combine the files found in local_processed_data into a single DataFrame
df_processed = join_parquet_files()

## Split DataFrame into Train, Validation & Test Sets

In [193]:
# Hold out 20% of all rows as the test set.
X_train_full, X_test = train_test_split(df_processed, test_size=0.2, random_state=random_state)

# Take the validation set from the remaining training rows (not from X_test),
# so the train, validation and test sets do not share any rows.
X_train, X_val = train_test_split(X_train_full, test_size=0.2, random_state=random_state)

In [194]:
X_train.to_csv(f'{local_processed_data}/train.csv', header=False, index=False)

response = sagemaker_session.upload_data(f'{local_processed_data}/train.csv',
                                         bucket=default_bucket, 
                                         key_prefix=data_prefix)
train_data_uri = response
%store train_data_uri

Stored 'train_data_uri' (str)


In [195]:
X_val.to_csv(f'{local_processed_data}/validation.csv', header=False, index=False)

response = sagemaker_session.upload_data(f'{local_processed_data}/validation.csv',
                                         bucket=default_bucket, 
                                         key_prefix=data_prefix)
validation_data_uri = response
%store validation_data_uri

Stored 'validation_data_uri' (str)


In [196]:
X_test.to_csv(f'{local_processed_data}/test.csv', header=False, index=False)

response = sagemaker_session.upload_data(f'{local_processed_data}/test.csv',
                                         bucket=default_bucket, 
                                         key_prefix=data_prefix)
test_data_uri = response
%store test_data_uri

Stored 'test_data_uri' (str)


# Train a model using XGBoost

 
<span style="color:red">**TODO:  XGBoost details**</span>


## Set the hyperparameters

 
<span style="color:red">**TODO:  XGBoost hyperparameters details**</span>


In [184]:
hyperparameters = {
    "max_depth": "5",
    "eta": "0.2",
    "gamma": "4",
    "min_child_weight": "6",
    "subsample": "0.7",
    "objective": "binary:logistic",
    "num_round": "50"}

%store hyperparameters

Stored 'hyperparameters' (dict)


## Create and fit the estimator

In [186]:
# Show the S3 key prefix restored by %store -r; it is used for the output paths below
prefix

'sagemaker-tutorial'

In [188]:
train_instance_count = 1
train_instance_type = "ml.m4.xlarge"
estimator_output_path = f's3://{default_bucket}/{prefix}/training_jobs'

# Look up the XGBoost container image URI for the notebook's region.
# The last argument is the container version; change it to suit your needs.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")

# Construct a SageMaker estimator that runs the XGBoost container. It reuses
# the role and session created in the configuration cell so the training job
# uses the same region and clients as the rest of the notebook.
xgb_estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container,
                                              hyperparameters=hyperparameters,
                                              role=sagemaker_role,
                                              instance_count=train_instance_count,
                                              instance_type=train_instance_type,
                                              volume_size=5,  # 5 GB
                                              output_path=estimator_output_path,
                                              sagemaker_session=sagemaker_session)

In [200]:
# define the data type and paths to the training and validation datasets
content_type = "text/csv"
train_input = TrainingInput(train_data_uri, content_type=content_type)
validation_input = TrainingInput(validation_data_uri, content_type=content_type)

# execute the XGBoost training job
xgb_estimator.fit({'train': train_input, 'validation': validation_input})

training_job_1_name = xgb_estimator.latest_training_job.job_name
%store training_job_1_name

2021-04-01 15:06:03 Starting - Starting the training job...
2021-04-01 15:06:32 Starting - Launching requested ML instancesProfilerReport-1617289563: InProgress
.........
2021-04-01 15:07:53 Starting - Preparing the instances for training......
2021-04-01 15:08:58 Downloading - Downloading input data...
2021-04-01 15:09:33 Training - Downloading the training image......
2021-04-01 15:10:33 Uploading - Uploading generated training model
2021-04-01 15:10:33 Completed - Training job completed
ProfilerReport-1617289563: NoIssuesFound
[34m[2021-04-01 15:10:15.010 ip-10-0-179-132.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_c

Here we create the SageMaker model.

In [206]:
# Name the model after the training job that produced it.
model_name = f'{training_job_1_name}-model'

# Build the container definition from the fitted estimator ...
model = xgb_estimator.create_model(name=model_name)
container_def = model.prepare_container_def()

# ... and register the model with SageMaker under that name.
sagemaker_session.create_model(
    name=model_name,
    role=sagemaker_role,
    container_defs=container_def,
)

'sagemaker-xgboost-2021-04-01-15-06-03-055-model'

# Amazon SageMaker Clarify
Now that you have your model set up, let’s say hello to SageMaker Clarify!



In [210]:
# Processor that runs the Clarify jobs on a single ml.m5.xlarge instance,
# using the notebook's execution role and SageMaker session.
clarify_processor = clarify.SageMakerClarifyProcessor(
    role=sagemaker_role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    sagemaker_session=sagemaker_session,
)

## Detecting Bias

SageMaker Clarify helps you detect possible pre- and post-training biases using a variety of metrics. 

A `DataConfig` object communicates some basic information about data I/O to SageMaker Clarify. We specify where to find the input dataset, where to store the output, the target column (`label`), the header names, and the dataset type.

In [216]:
# S3 location where Clarify writes the bias report.
bias_report_output_path = f's3://{default_bucket}/{prefix}/clarify-bias'

# train.csv was written without a header row, so the column names are passed
# explicitly. `label` must be one of those names: the target column of
# df_processed is called 'LABEL'.
bias_data_config = clarify.DataConfig(s3_data_input_path=train_data_uri,
                                      s3_output_path=bias_report_output_path,
                                      label='LABEL',
                                      headers=df_processed.columns.to_list(),
                                      dataset_type='text/csv')

A `ModelConfig` object communicates information about your trained model. To avoid additional traffic to your production models, SageMaker Clarify sets up and tears down a dedicated endpoint when processing.

* `instance_type` and `instance_count` specify your preferred instance type and instance count used to run your model during SageMaker Clarify’s processing. The testing dataset is small, so a single standard instance is good enough to run this example. If you have a large, complex dataset, you may want to use a better instance type to speed up processing, or add more instances to enable Spark parallelization.
* `accept_type` denotes the endpoint response payload format, and `content_type` denotes the payload format of requests to the endpoint.

In [217]:
# Tell Clarify which model to host on its temporary endpoint, on what
# hardware, and that requests and responses are both CSV.
model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    accept_type='text/csv',
    content_type='text/csv',
)

A `ModelPredictedLabelConfig` provides information on the format of your predictions. XGBoost model outputs probabilities of samples, so SageMaker Clarify invokes the endpoint then uses `probability_threshold` to convert the probability to binary labels for bias analysis. Prediction above the threshold is interpreted as label value 1 and below or equal as label value 0.

In [218]:
# Predicted probabilities above 0.8 are interpreted as label 1; values at or below 0.8 as label 0
predictions_config = clarify.ModelPredictedLabelConfig(probability_threshold=0.8)

### Writing BiasConfig

SageMaker Clarify also needs information on what the sensitive columns (`facets`) are, what the sensitive features (`facet_values_or_threshold`) may be, and what the desirable outcomes are (`label_values_or_threshold`). SageMaker Clarify can handle both categorical and continuous data for `facet_values_or_threshold` and for `label_values_or_threshold`. In this case we are using categorical data.



In [219]:
# Preview the first rows to see which columns are available as facets and label
df_processed.head()

Unnamed: 0,LABEL,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,...,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,EVENT_TIME,write_time,api_invocation_time,is_deleted
0,0.0,23659.0,180000.0,2.0,2.0,2.0,36.0,0.0,0.0,0.0,...,1300.0,2902.0,3047.0,0.0,2293.0,2000.0,1617208000.0,2021-03-31 16:27:27.182000+00:00,2021-03-31 16:22:26+00:00,False
1,0.0,20877.0,280000.0,1.0,1.0,1.0,38.0,1.0,-1.0,-1.0,...,5410.0,2380.0,3709.0,5849.0,5606.0,5299.0,1617208000.0,2021-03-31 16:27:27.182000+00:00,2021-03-31 16:22:26+00:00,False
2,1.0,28236.0,100000.0,2.0,1.0,1.0,37.0,1.0,-2.0,-2.0,...,0.0,0.0,0.0,0.0,0.0,150.0,1617208000.0,2021-03-31 16:27:27.182000+00:00,2021-03-31 16:22:26+00:00,False
3,0.0,23660.0,120000.0,2.0,2.0,1.0,32.0,0.0,0.0,0.0,...,5596.0,5498.0,5500.0,4000.0,4300.0,5012.0,1617208000.0,2021-03-31 16:27:27.182000+00:00,2021-03-31 16:22:26+00:00,False
4,0.0,17126.0,60000.0,2.0,2.0,2.0,26.0,1.0,-2.0,-2.0,...,0.0,0.0,2306.0,40367.0,1416.0,1419.0,1617208000.0,2021-03-31 16:27:27.182000+00:00,2021-03-31 16:22:26+00:00,False


We specify this information in the BiasConfig API. Here we use `SEX` as the sensitive group.

`group_name` is used to form subgroups for the measurement of Conditional Demographic Disparity in Labels (CDDL) and Conditional Demographic Disparity in Predicted Labels (CDDPL) with regard to Simpson’s paradox.

In [221]:
# Bias analysis settings: label value 1 is the outcome of interest, SEX is the
# sensitive column (facet), and AGE forms the subgroups for CDDL / CDDPL.
# NOTE(review): the preview of df_processed only shows SEX values 1.0 and 2.0,
# so a facet value of 0 may not match any row - confirm which SEX value should
# be treated as the sensitive group.
bias_config = clarify.BiasConfig(label_values_or_threshold=[1],
                                facet_name='SEX',
                                facet_values_or_threshold=[0],
                                group_name='AGE')

 
<span style="color:red">**TODO:  PRETRAINING BIAS WITH WRANGLER**</span>


refer to smote

## Post-training Bias

Computing post-training bias metrics does require a trained model.

Unbiased training data (as determined by concepts of fairness measured by bias metric) may still result in biased model predictions after training. Whether this occurs depends on several factors including hyperparameter choices.

You can run the pre-training and post-training bias analyses separately with `run_pre_training_bias()` and `run_post_training_bias()`, or both at the same time with `run_bias()`, as shown below.


In [None]:
# Run the pre-training and post-training bias analyses in a single Clarify
# job, computing all available metrics for both.
clarify_processor.run_bias(
    data_config=bias_data_config,
    bias_config=bias_config,
    model_config=model_config,
    model_predicted_label_config=predictions_config,
    pre_training_methods='all',
    post_training_methods='all',
)


Job Name:  Clarify-Bias-2021-04-01-15-27-05-077
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-367158743199/sagemaker-tutorial/data/train.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-367158743199/Clarify-Bias-2021-04-01-15-27-05-077/input/analysis_config/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-1-367158743199/sagemaker-tutorial/clarify-bias', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
..............

<span style="color:red">**TODO:  SCREENSHOTS OF BIAS REPORT IN STUDIO**</span>

https://sagemaker-examples.readthedocs.io/en/latest/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.html
