In [1]:
# Reload values saved with %store so this notebook can reuse them
# (presumably persisted by an earlier notebook in this series — confirm).
%store -r s3_bucket_name
%store -r prefix
%store -r training_data_path

In [2]:
import sagemaker

# SageMaker session for this notebook; later cells pass it on explicitly.
session = sagemaker.Session()
# AWS region the session is bound to; used when looking up the container image.
region = session.boto_region_name
# IAM execution role handed to the estimator, the model and the Clarify processor.
role = sagemaker.get_execution_role()

In [3]:
# S3 location of the training CSV (the value restored via %store).
s3_training_data_path = training_data_path
# S3 prefix given to Clarify's DataConfig as the destination for the analysis output.
s3_output_path = f"s3://{s3_bucket_name}/{prefix}/output"

In [4]:
# Download the training CSV from S3 to tmp/training_data.csv for local inspection.
!aws s3 cp {s3_training_data_path} tmp/training_data.csv

download: s3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv to tmp/training_data.csv


In [5]:
import pandas as pd

# Load the downloaded CSV and display it; its columns are reused later as the
# Clarify DataConfig headers.
training_data = pd.read_csv("tmp/training_data.csv")
training_data

Unnamed: 0,label,a,b,c,d
0,1,-8.837413,-6.551265,23,-75
1,1,-9.216749,-2.483494,2,-51
2,1,-2.017317,-6.326533,91,34
3,1,-10.748736,-4.622519,8,-78
4,0,-3.675848,12.629029,47,32
...,...,...,...,...,...
2995,0,-5.786462,-6.790668,-65,70
2996,1,-2.552410,-1.793217,42,4
2997,0,-10.692197,1.583437,-90,-62
2998,1,-14.109003,-4.745680,37,64


In [6]:
from sagemaker.image_uris import retrieve

# Look up the image URI of the 'xgboost' container, version 1.2-1, for this region.
container = retrieve('xgboost', region, version='1.2-1')

In [7]:
from sagemaker.estimator import Estimator

# Generic Estimator that runs the retrieved container image on a single
# ml.m5.large instance, using this notebook's role and session.
estimator = Estimator(
    container,
    role,
    instance_count=1,
    instance_type='ml.m5.large',
    sagemaker_session=session
)

In [8]:
# Hyperparameters handed to the XGBoost training container.
estimator.set_hyperparameters(
    objective='binary:logistic',  # the `label` column in the training data holds 0/1
    max_depth=8,
    eta=0.1,
    min_child_weight=4,
    num_round=500
)

In [9]:
from sagemaker.inputs import TrainingInput

# Training channel definition: the CSV in S3, declared with content type 'csv'.
train_input = TrainingInput(
    s3_training_data_path, 
    content_type='csv'
)

In [10]:
%%time

estimator.fit({'train': train_input}, wait='True')

2021-05-24 14:02:25 Starting - Starting the training job...
2021-05-24 14:02:48 Starting - Launching requested ML instancesProfilerReport-1621864945: InProgress
......
2021-05-24 14:03:49 Starting - Preparing the instances for training.........
2021-05-24 14:05:09 Downloading - Downloading input data...
2021-05-24 14:05:49 Training - Downloading the training image..[34m[2021-05-24 14:05:59.704 ip-10-2-124-51.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV 

In [15]:
import random
from string import ascii_uppercase

def generate_model_name(length=5, prefix='model-'):
    """Return a random model name such as ``'model-ACBOE'``.

    The suffix is drawn at random, so names are unlikely to repeat but are
    not guaranteed to be unique.

    Parameters
    ----------
    length : int, default 5
        Number of random uppercase ASCII letters appended to ``prefix``.
    prefix : str, default 'model-'
        Fixed text placed in front of the random suffix.

    Returns
    -------
    str
        ``prefix`` followed by ``length`` letters drawn with replacement
        from ``string.ascii_uppercase``.

    Raises
    ------
    ValueError
        If ``length`` is negative.
    """
    # random.choices() silently returns an empty list for a negative k,
    # which would hide a caller mistake; reject it explicitly instead.
    if length < 0:
        raise ValueError("length must be non-negative")
    suffix = ''.join(random.choices(ascii_uppercase, k=length))
    return prefix + suffix

In [17]:
# Pick a random name for the model and display it.
model_name = generate_model_name()
model_name

'model-ACBOE'

In [18]:
# Build a Model object from the trained estimator under the generated name.
model = estimator.create_model(name=model_name)

In [19]:
# Inspect which class create_model() returned.
type(model)

sagemaker.model.Model

In [20]:
# Dump the model's attributes (artifact location, image URI, role, name, ...).
# NOTE(review): the rendered output exposes the AWS account ID and role ARN;
# consider clearing it before sharing the notebook.
model.__dict__

{'model_data': 's3://sagemaker-us-east-1-581320662326/sagemaker-xgboost-2021-05-24-14-02-24-886/output/model.tar.gz',
 'image_uri': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-1',
 'role': 'arn:aws:iam::581320662326:role/SuperAdminRole',
 'predictor_cls': <function sagemaker.estimator.Estimator.create_model.<locals>.predict_wrapper(endpoint, session)>,
 'env': {},
 'name': 'model-ACBOE',
 '_base_name': None,
 'vpc_config': None,
 'sagemaker_session': <sagemaker.session.Session at 0x7fe5d010b5d0>,
 'endpoint_name': None,
 '_is_compiled_model': False,
 '_compilation_job_name': None,
 '_is_edge_packaged_model': False,
 '_enable_network_isolation': False,
 'model_kms_key': None,
 'image_config': None}

In [22]:
# Container definition for the model: image, environment and model artifact URL.
container_def = model.prepare_container_def()
container_def

{'Image': '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.2-1',
 'Environment': {},
 'ModelDataUrl': 's3://sagemaker-us-east-1-581320662326/sagemaker-xgboost-2021-05-24-14-02-24-886/output/model.tar.gz'}

In [23]:
# Create the model through the session from the container definition, so it can
# be referenced by name in the Clarify ModelConfig; the call returns the model name.
session.create_model(
    model_name,
    role,
    container_def
)

'model-ACBOE'

In [28]:
from sagemaker.clarify import SageMakerClarifyProcessor

# Clarify processor that will run the bias analysis job on a single
# ml.m5.large instance with this notebook's role and session.
processor = SageMakerClarifyProcessor(
    role=role,                                                  
    instance_count=1,                                                  
    instance_type='ml.m5.large',
    sagemaker_session=session
)

In [29]:
from sagemaker.clarify import DataConfig

# Describe the dataset for Clarify: where the CSV lives, where results go,
# which column is the label, and the column headers.
data_config = DataConfig(
    s3_data_input_path=s3_training_data_path,
    s3_output_path=s3_output_path,
    label='label',
    # Headers are taken from the DataFrame loaded earlier, so that cell must run first.
    headers=training_data.columns.to_list(),
    dataset_type='text/csv'
)

# Show the resulting configuration, including the generated analysis_config.
data_config.__dict__

{'s3_data_input_path': 's3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv',
 's3_output_path': 's3://sagemaker-cookbook-bucket/chapter07/output',
 's3_data_distribution_type': 'FullyReplicated',
 's3_compression_type': 'None',
 'label': 'label',
 'headers': ['label', 'a', 'b', 'c', 'd'],
 'features': None,
 'analysis_config': {'dataset_type': 'text/csv',
  'headers': ['label', 'a', 'b', 'c', 'd'],
  'label': 'label'}}

In [30]:
from sagemaker.clarify import ModelConfig

# Point Clarify at the model created earlier, by name.
# NOTE(review): instance_type/instance_count presumably size the temporary
# endpoint Clarify uses to obtain predictions — confirm in the ModelConfig docs.
model_config = ModelConfig(
    model_name=model_name,
    instance_type='ml.c5.xlarge',
    instance_count=1,
    accept_type='text/csv'
)

In [31]:
from sagemaker.clarify import ModelPredictedLabelConfig

# How Clarify should interpret the model output.
# NOTE(review): 0.75 is presumably the cut-off that turns a predicted
# probability into a positive label — confirm in the ModelPredictedLabelConfig docs.
predictions_config = ModelPredictedLabelConfig(
    probability_threshold=0.75
)

In [43]:
from sagemaker.clarify import BiasConfig

# Bias analysis setup: label value 1 is the outcome of interest and the facet is
# column 'a' with threshold 5 (the report lists the facet group as "(5.0, ...]").
bias_config = BiasConfig(
    label_values_or_threshold=[1],              
    facet_name='a',                             
    facet_values_or_threshold=[5]
)

In [33]:
%%time

# Run the Clarify post-training bias job using the dataset, bias, model and
# prediction configurations defined in the preceding cells.
processor.run_post_training_bias(
    data_config=data_config, 
    data_bias_config=bias_config,
    # DPPL = Difference in Positive Proportions in Predicted Labels, RD = Recall Difference
    methods=['DPPL', 'RD'],
    model_config=model_config,
    model_predicted_label_config=predictions_config
)


Job Name:  Clarify-Posttraining-Bias-2021-05-24-14-20-24-904
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/output/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/output', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
...............................[34mINFO:sagemaker-clarify-processing:Starting SageMaker Cl

In [34]:
# Take the first output of the most recent Clarify job and show its S3 destination.
output = processor.latest_job.outputs[0]
output_destination = output.destination
output_destination

's3://sagemaker-cookbook-bucket/chapter07/output'

In [35]:
# Copy every file under the job's S3 output prefix into the local tmp/ directory.
!aws s3 cp {output_destination}/ tmp/ --recursive

download: s3://sagemaker-cookbook-bucket/chapter07/output/analysis_config.json to tmp/analysis_config.json
download: s3://sagemaker-cookbook-bucket/chapter07/output/analysis.json to tmp/analysis.json
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.pdf to tmp/report.pdf
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.ipynb to tmp/report.ipynb
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.html to tmp/report.html


In [36]:
# List tmp/ to confirm the report and analysis files were downloaded.
!ls -lahF tmp/

total 840K
drwxr-xr-x 2 root root 6.0K May 24 14:34 ./
drwxr-xr-x 4 root root 6.0K May 24 14:33 ../
-rw-r--r-- 1 root root  837 May 24 14:32 analysis.json
-rw-r--r-- 1 root root  470 May 24 14:20 analysis_config.json
-rw-r--r-- 1 root root 268K May 24 14:32 report.html
-rw-r--r-- 1 root root 1.5K May 24 14:32 report.ipynb
-rw-r--r-- 1 root root  29K May 24 14:32 report.pdf
-rw-r--r-- 1 root root  46K May 24 13:26 test_data.csv
-rw-r--r-- 1 root root  46K May 24 13:26 test_data_no_header.csv
-rw-r--r-- 1 root root  45K May 24 14:32 test_features.csv
-rw-r--r-- 1 root root 139K May 24 13:26 training_data.csv
-rw-r--r-- 1 root root 139K May 24 13:26 training_data_no_header.csv
-rw-r--r-- 1 root root  47K May 24 13:26 validation_data.csv
-rw-r--r-- 1 root root  47K May 24 13:26 validation_data_no_header.csv


In [37]:
# Print the bias metrics JSON downloaded from the job's output location.
!cat tmp/analysis.json

{
    "version": "1.0",
    "post_training_bias_metrics": {
        "label": "label",
        "facets": {
            "a": [
                {
                    "value_or_threshold": "(5.0, 13.99152988349206]",
                    "metrics": [
                        {
                            "name": "DPPL",
                            "description": "Difference in Positive Proportions in Predicted Labels (DPPL)",
                            "value": 0.483991825613079
                        },
                        {
                            "name": "RD",
                            "description": "Recall Difference (RD)",
                            "value": 0.9460719041278296
                        }
                    ]
                }
            ]
        },
        "label_value_or_threshold": "1"
    }
}

In [42]:
# Persist the model name with %store (presumably for a later notebook — confirm),
# then display it.
%store model_name
model_name

Stored 'model_name' (str)


'model-ACBOE'