# Run Ad-Hoc Data Bias Analysis

In [3]:
from smclarify.bias import report
from typing import Dict
from collections import defaultdict
import pandas as pd
import seaborn as sns

# Read Dataset From S3

In [6]:
# Restore `bias_data_s3_uri` from IPython's %store persistence. The value must have
# been saved earlier with `%store bias_data_s3_uri` (presumably by an upstream
# notebook -- confirm), otherwise the name stays undefined in this kernel.
%store -r bias_data_s3_uri

In [7]:
# Echo the restored URI so the reader can verify which S3 object is being used.
print(bias_data_s3_uri)

s3://ads-508-team4/bias-detection-1647856853/master_unbal.csv


In [8]:
# Restore `balanced_bias_data_s3_uri` from IPython's %store persistence. The value
# must have been saved earlier with `%store balanced_bias_data_s3_uri` (presumably
# by an upstream notebook -- confirm).
%store -r balanced_bias_data_s3_uri

In [9]:
# Echo the restored URI of the balanced dataset so the reader can verify it.
print(balanced_bias_data_s3_uri)

s3://ads-508-team4/bias-detection-1647856853/master_bal.csv


In [10]:
!aws s3 cp $bias_data_s3_uri ./data-clarify/

download: s3://ads-508-team4/bias-detection-1647856853/master_unbal.csv to data-clarify/master_unbal.csv


In [11]:
!aws s3 cp $balanced_bias_data_s3_uri ./data-clarify/

download: s3://ads-508-team4/bias-detection-1647856853/master_bal.csv to data-clarify/master_bal.csv


In [46]:
# Read the local copy of the balanced dataset (master_bal.csv) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="./data-clarify/master_bal.csv")

# Cell result: the (rows, columns) tuple of the loaded frame.
df.shape

(17472, 2)

# Calculate Bias Metrics on Balanced Data

In [50]:
# Label under analysis: the '"price"' column. The column name itself contains
# literal double quotes, hence the nested quoting.
# NOTE(review): 150 is the value marked as the positive outcome; how smclarify
# interprets it for a numeric label (exact match vs. threshold) should be confirmed.
label_column = report.LabelColumn(
    name='"price"',
    data=df['"price"'],
    positive_label_values=[150],
)

# Facet (the attribute examined for bias): the '"product_category_name"' column.
facet_column = report.FacetColumn(name='"product_category_name"')

# Run SageMaker Clarify Bias Report

In [51]:
# Compute pre-training bias metrics for the facet column against the label column.
# The call is the last expression in the cell, so its return value (a list with one
# entry per facet value) is what the notebook displays.
report.bias_report(
    stage_type=report.StageType.PRE_TRAINING,
    df=df,
    label_column=label_column,
    facet_column=facet_column,
    metrics=[
        "CI",   # Class Imbalance
        "DPL",  # Difference in Positive Proportions in Labels
        "KL",   # Kullback-Leibler Divergence
        "JS",   # Jensen-Shannon Divergence
        "LP",   # L-p Norm
        "TVD",  # Total Variation Distance
        "KS",   # Kolmogorov-Smirnov Distance
    ],
)

[{'value_or_threshold': 'agro_industria_e_comercio',
  'metrics': [{'name': 'CI',
    'description': 'Class Imbalance (CI)',
    'value': 0.9896978021978022},
   {'name': 'DPL',
    'description': 'Difference in Positive Proportions in Labels (DPL)',
    'value': -0.22074176351014452},
   {'name': 'JS',
    'description': 'Jensen-Shannon Divergence (JS)',
    'value': 0.04972970448491889},
   {'name': 'KL',
    'description': 'Kullback-Liebler Divergence (KL)',
    'value': 0.09854273045130943},
   {'name': 'KS',
    'description': 'Kolmogorov-Smirnov Distance (KS)',
    'value': 0.22074176351014457},
   {'name': 'LP', 'description': 'L-p Norm (LP)', 'value': 0.3121759957382008},
   {'name': 'TVD',
    'description': 'Total Variation Distance (TVD)',
    'value': 0.22074176351014455}]},
 {'value_or_threshold': 'alimentos',
  'metrics': [{'name': 'CI',
    'description': 'Class Imbalance (CI)',
    'value': 0.9869505494505495},
   {'name': 'DPL',
    'description': 'Difference in Positi

In [None]:
%%html

<!-- Best-effort kernel shutdown: the hidden button carries the command-linker
     attribute `kernelmenu:shutdown`, and the script below clicks it as soon as
     this output is rendered. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
try {
    // Block-scoped declaration: the original assigned to an undeclared `els`,
    // which created an implicit global (and would throw under strict mode).
    const els = document.getElementsByClassName("sm-command-button");
    // Throws if no matching button exists; the catch below deliberately ignores it.
    els[0].click();
}
catch(err) {
    // NoOp
}
</script>