In [16]:
# Restore variables previously persisted with %store (presumably by an earlier
# notebook in this series -- confirm). They are used below to build S3 paths.
# Comments are kept on their own lines so they are not parsed as magic arguments.
%store -r s3_bucket_name
%store -r prefix
%store -r training_data_path

In [17]:
import sagemaker

# SageMaker session; passed to the Clarify processor further down.
session = sagemaker.Session()
# Region name taken from the session (not referenced in the cells visible here).
region = session.boto_region_name
# Execution role for this notebook; reused as the role of the processing job.
role = sagemaker.get_execution_role()

In [18]:
import pandas as pd
import numpy as np

In [19]:
# Local scratch directory for the downloaded dataset and the job outputs.
# -p makes the command succeed even when tmp/ already exists.
!mkdir -p tmp

In [20]:
# Alias the restored dataset location under an explicit "s3_" name.
s3_training_data_path = training_data_path
# Destination prefix for the Clarify job; analysis_config.json and the report
# files are written under this path.
s3_output_path = f"s3://{s3_bucket_name}/{prefix}/output"

In [21]:
# Download a local copy of the training CSV so it can be inspected with pandas.
# {s3_training_data_path} is interpolated by IPython before the shell runs.
!aws s3 cp {s3_training_data_path} tmp/training_data.csv

download: s3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv to tmp/training_data.csv


In [22]:
# Load the local copy of the dataset; its column names are reused later as the
# `headers` argument of the Clarify DataConfig.
# NOTE(review): the rendered frame shows an "Unnamed: 0" first column, which
# suggests the CSV was written with its index included -- confirm this is
# intended, since that name is forwarded to Clarify as a header.
training_data = pd.read_csv("tmp/training_data.csv")
# Bare last expression -> rich (truncated) display of the frame.
training_data

Unnamed: 0,approved,sex,math,science,technology,random1,random2
0,1,1,97,97,98,93,82
1,1,1,85,68,62,92,65
2,1,1,99,100,80,71,60
3,1,1,91,79,84,60,70
4,1,1,73,86,66,70,98
...,...,...,...,...,...,...,...
595,1,1,99,86,85,98,87
596,1,1,71,97,90,86,99
597,1,1,95,86,62,69,73
598,1,1,78,71,68,72,68


In [23]:
from sagemaker import clarify

# Processor that launches SageMaker Clarify processing jobs: a single
# ml.m5.large instance, running under the notebook's execution role.
processor = clarify.SageMakerClarifyProcessor(
    sagemaker_session=session,
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
)

In [24]:
# Describes the dataset to analyse and where the results go.
#   - input:   the CSV in S3 (the same file that was downloaded to tmp/)
#   - headers: column names taken from the locally loaded copy
#   - label:   the "approved" column is the outcome being checked for bias
data_config = clarify.DataConfig(
    dataset_type="text/csv",
    s3_data_input_path=s3_training_data_path,
    headers=list(training_data.columns),
    label="approved",
    s3_output_path=s3_output_path,
)

In [25]:
# Bias configuration: measure bias with respect to the "sex" column (the
# facet), treating label value 1 as the outcome of interest.
bias_config = clarify.BiasConfig(
    facet_name="sex",
    label_values_or_threshold=[1],
)

In [26]:
%%time

processor.run_pre_training_bias(
    data_config=data_config, 
    data_bias_config=bias_config,
    methods=['CI']
)


Job Name:  Clarify-Pretraining-Bias-2021-06-13-17-25-31-783
Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/input/training_data.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/output/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-cookbook-bucket/chapter07/output', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]
..................................[34mINFO:sagemaker-clarify-processing:Starting SageMaker 

In [27]:
# First output of the most recent job run by the processor.
output = processor.latest_job.outputs[0]
# S3 URI the job wrote its results to.
output_destination = output.destination
# Bare last expression -> display the URI.
output_destination

's3://sagemaker-cookbook-bucket/chapter07/output'

In [28]:
# Copy everything under the job's output prefix into tmp/ (including
# analysis.json and the report files).
!aws s3 cp {output_destination}/ tmp/ --recursive

download: s3://sagemaker-cookbook-bucket/chapter07/output/analysis_config.json to tmp/analysis_config.json
download: s3://sagemaker-cookbook-bucket/chapter07/output/analysis.json to tmp/analysis.json
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.ipynb to tmp/report.ipynb
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.pdf to tmp/report.pdf
download: s3://sagemaker-cookbook-bucket/chapter07/output/explanations_shap/baseline.csv to tmp/explanations_shap/baseline.csv
download: s3://sagemaker-cookbook-bucket/chapter07/output/report.html to tmp/report.html
download: s3://sagemaker-cookbook-bucket/chapter07/output/explanations_shap/out.csv to tmp/explanations_shap/out.csv


In [29]:
# List tmp/ to confirm which files were downloaded.
!ls -lahF tmp/

total 880K
drwxr-xr-x 4 root root 6.0K Jun 13 17:31 ./
drwxr-xr-x 6 root root 6.0K Jun 13 17:30 ../
drwxr-xr-x 2 root root 6.0K May 29 23:03 .ipynb_checkpoints/
-rw-r--r-- 1 root root  913 Jun 13 17:31 analysis.json
-rw-r--r-- 1 root root  320 Jun 13 17:25 analysis_config.json
-rw-r--r-- 1 root root 139K May 24 17:15 baseline.csv
-rw-r--r-- 1 root root 133K May 24 17:28 baseline_no_label.csv
-rw-r--r-- 1 root root 1.1K May 24 20:29 constraints.json
drwxr-xr-x 2 root root 6.0K Jun 13 17:31 explanations_shap/
-rw-r--r-- 1 root root 288K Jun 13 17:31 report.html
-rw-r--r-- 1 root root  22K Jun 13 17:31 report.ipynb
-rw-r--r-- 1 root root  43K Jun 13 17:31 report.pdf
-rw-r--r-- 1 root root  407 May 24 15:11 sample.jsonl
-rw-r--r-- 1 root root 122K May 24 20:29 statistics.json
-rw-r--r-- 1 root root 3.8K Jun 13 17:23 test_data.csv
-rw-r--r-- 1 root root 3.8K Jun 13 17:23 test_data_no_header.csv
-rw-r--r-- 1 root root  45K May 30 09:03 test_features.csv
-rw-r--r-- 1 root root  12K Jun 13 17:

In [30]:
# Print the bias analysis produced by the job (metric values per facet value).
!cat tmp/analysis.json

{
    "version": "1.0",
    "pre_training_bias_metrics": {
        "label": "approved",
        "facets": {
            "sex": [
                {
                    "value_or_threshold": "1",
                    "metrics": [
                        {
                            "name": "CI",
                            "description": "Class Imbalance (CI)",
                            "value": -0.5933333333333334
                        }
                    ]
                },
                {
                    "value_or_threshold": "0",
                    "metrics": [
                        {
                            "name": "CI",
                            "description": "Class Imbalance (CI)",
                            "value": 0.5933333333333334
                        }
                    ]
                }
            ]
        },
        "label_value_or_threshold": "1"
    }
}