In [1]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Read the prepared CSV using its first column as the index, then discard that
# index in favour of a fresh 0..n-1 range.
# NOTE(review): `data` is not referenced by the later cells shown in this notebook
# (they re-read the file into `df`), and the preview below lacks the `age` column
# that `df.head()` shows -- confirm `index_col=0` is intended.
data = (
    pd.read_csv('adult_income-prepped.csv', index_col=0)
    .reset_index(drop=True)
)
data.head()

Unnamed: 0,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,workclass_Private,226802,education_11th,7,marital-status_Never-married,occupation_Machine-op-inspct,relationship_Own-child,race_Black,gender_Male,0,0,40,native-country_United-States,0
1,workclass_Private,89814,education_HS-grad,9,marital-status_Married-civ-spouse,occupation_Farming-fishing,relationship_Husband,race_White,gender_Male,0,0,50,native-country_United-States,0
2,workclass_Local-gov,336951,education_Assoc-acdm,12,marital-status_Married-civ-spouse,occupation_Protective-serv,relationship_Husband,race_White,gender_Male,0,0,40,native-country_United-States,1
3,workclass_Private,195878,education_HS-grad,9,marital-status_Married-civ-spouse,occupation_Exec-managerial,relationship_Husband,race_White,gender_Male,0,0,24,native-country_Cuba,0
4,workclass_Private,160323,education_Some-college,10,marital-status_Married-civ-spouse,occupation_Machine-op-inspct,relationship_Husband,race_Black,gender_Male,7688,0,40,native-country_United-States,1


In [3]:
%%writefile cat_encoder.py

from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
import numpy as np

class CatEncoder:
    """One-hot encode categorical columns and optionally normalize the others.

    The transformers are fitted once on a DataFrame. Afterwards the instance is
    called with a 2-D array whose columns follow the same order as that
    DataFrame, and it returns a dense array laid out as
    ``[numeric columns..., one-hot columns...]``.

    Args:
        cat_columns: names of the categorical columns in ``data``.
        data: DataFrame used to locate the columns and fit the transformers.
        normalize: when True, the non-categorical columns are passed through
            ``Normalizer`` before being concatenated with the one-hot block.
    """

    def __init__(self, cat_columns, data, normalize: bool=True):
        self.cat_columns = cat_columns
        # Positional indexes are stored because __call__ receives plain arrays,
        # not DataFrames.
        self.cat_indexes = [data.columns.get_loc(name) for name in cat_columns]
        self.num_indexes = [idx for idx in range(len(data.columns)) if idx not in self.cat_indexes]
        self.num_columns = list(data.columns[self.num_indexes])

        self.encoder = preprocessing.OneHotEncoder()
        self.encoder.fit(data[cat_columns])

        # scikit-learn 1.0 added get_feature_names_out and 1.2 removed the older
        # get_feature_names; prefer the new method and fall back only when the
        # installed version does not provide it.
        name_getter = getattr(self.encoder, "get_feature_names_out", None)
        if name_getter is None:
            name_getter = self.encoder.get_feature_names
        cat_transformed_names = name_getter(input_features=self.cat_columns)
        self._transformed_column_names = self.num_columns + list(cat_transformed_names)

        if normalize:
            # NOTE(review): sklearn's Normalizer rescales each row to unit norm
            # rather than scaling each column -- confirm that is the intended
            # normalization.
            self.normalizer = Normalizer()
            self.normalizer.fit(data.iloc[:, self.num_indexes])
        else:
            self.normalizer = None

    def __call__(self, x):
        """Encode ``x`` (rows x original columns) into the model's input layout.

        Returns:
            Dense 2-D array: the (optionally normalized) numeric columns
            followed by the one-hot encoded categorical columns.
        """
        # Accept any array-like (e.g. a DataFrame) so positional slicing works.
        x = np.asarray(x)
        numeric = x[:, self.num_indexes]
        if self.normalizer is not None:
            numeric = self.normalizer.transform(numeric)
        categorical = self.encoder.transform(x[:, self.cat_indexes]).toarray()
        return np.concatenate((numeric, categorical), axis=1)

    @property
    def transformed_features(self):
        """Column names of the encoded output, in output order."""
        return self._transformed_column_names

Overwriting cat_encoder.py


In [4]:
%%writefile decoder.py
import numpy as np

class DecoderCertifai:
    """Turn raw model scores into 0/1 labels using a fixed cut-off.

    Args:
        threshold: scores strictly greater than this value map to 1,
            everything else maps to 0.
    """

    def __init__(self, threshold):
        self.threshold = threshold

    def __call__(self, x):
        """Return an integer array holding 1 where ``x > threshold`` and 0 elsewhere."""
        # Non-array inputs (lists, tuples, ...) are converted; arrays are used as-is.
        scores = x if isinstance(x, np.ndarray) else np.array(x)
        above_cutoff = scores > self.threshold
        return above_cutoff.astype(int)

Overwriting decoder.py


In [5]:
# Load the full prepared dataset (feature columns plus the `income` label) used for modelling.
df = pd.read_csv('adult_income-prepped.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,workclass_Private,226802,education_11th,7,marital-status_Never-married,occupation_Machine-op-inspct,relationship_Own-child,race_Black,gender_Male,0,0,40,native-country_United-States,0
1,38,workclass_Private,89814,education_HS-grad,9,marital-status_Married-civ-spouse,occupation_Farming-fishing,relationship_Husband,race_White,gender_Male,0,0,50,native-country_United-States,0
2,28,workclass_Local-gov,336951,education_Assoc-acdm,12,marital-status_Married-civ-spouse,occupation_Protective-serv,relationship_Husband,race_White,gender_Male,0,0,40,native-country_United-States,1
3,58,workclass_Private,195878,education_HS-grad,9,marital-status_Married-civ-spouse,occupation_Exec-managerial,relationship_Husband,race_White,gender_Male,0,0,24,native-country_Cuba,0
4,44,workclass_Private,160323,education_Some-college,10,marital-status_Married-civ-spouse,occupation_Machine-op-inspct,relationship_Husband,race_Black,gender_Male,7688,0,40,native-country_United-States,1


In [6]:
# Separate outcome
label_column = 'income'
y = df[label_column]
X_raw = df.drop(label_column, axis=1)

# remove some additional columns for demo purposes
rm = ["fnlwgt", "capital-loss"]
# Positions of the removed columns within X_raw, kept so the same columns can be
# deleted from plain arrays later in the notebook.
dropped_indexes_list = [i for i, col in enumerate(X_raw.columns.to_list()) if col in rm]
# DataFrame.drop keeps the remaining columns in their original order and fails
# loudly if a name in `rm` is missing, without leaving a stray loop variable
# behind in the notebook namespace.
X = X_raw.drop(columns=rm)
final_list = X.columns.to_list()


In [7]:
%%writefile demo_encoder.py

import numpy as np
import xgboost as xgb
class DemoEncoder:
    """Wrap an array in an ``xgb.DMatrix``, optionally deleting columns first.

    Args:
        drop_columns: column positions removed from the input when ``drop`` is true.
    """

    def __init__(self, drop_columns):
        self.drop_columns = drop_columns

    def __call__(self, x, drop=True):
        """Build a DMatrix from ``x``; when ``drop`` is true the configured columns (axis 1) are removed first."""
        payload = np.delete(x, self.drop_columns, 1) if drop else x
        return xgb.DMatrix(data=payload)

Overwriting demo_encoder.py


In [16]:
# NOTE(review): this cell relies on `xg_reg`, `encoder` and `X_test`, which are only
# created in later cells of this notebook; run those first (or move this cell below them).
from demo_encoder import DemoEncoder
demo_encoder = DemoEncoder(dropped_indexes_list)
# drop=False: the encoded rows are wrapped in a DMatrix without deleting any columns.
xg_reg.predict(demo_encoder(encoder(X_test[:10].values),drop=False))

array([0.20345506, 0.316487  , 0.710422  , 0.3638665 , 0.40527058,
       0.21702528, 0.258692  , 0.47924864, 0.23106048, 0.19092968],
      dtype=float32)

In [9]:
# Show the feature columns that remain after removing the label and the `rm` columns.
final_list

['age',
 'workclass',
 'education',
 'educational-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'hours-per-week',
 'native-country']

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [11]:
from cat_encoder import CatEncoder
# Columns to one-hot encode; CatEncoder treats every other column of X as numeric.
cat_columns = [
   'workclass', 
   'education', 
   'marital-status', 
   'occupation', 
   'relationship',
   'race',
   'gender',
   'native-country'
           ]
# The encoder is fitted on the full feature frame X, not only on X_train.
encoder = CatEncoder(cat_columns, X)

In [12]:
# Booster hyper-parameters handed to xgb.train.
# NOTE(review): the label is the 0/1 `income` column but the objective is squared
# error, so raw predictions are continuous scores that get thresholded later --
# confirm a regression objective is intended.
params = dict(
    objective="reg:squarederror",
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
)

In [13]:
# Encode the training rows and pair them with their labels in a DMatrix for training.
data_dmatrix = xgb.DMatrix(data=encoder(X_train.values),label=y_train)

In [14]:
# Train the booster on the encoded training data for 10 boosting rounds.
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)

In [15]:
# Score the held-out rows and measure the accuracy of the thresholded predictions.
dtest = xgb.DMatrix(encoder(X_test.values))
preds = xg_reg.predict(dtest)
# The booster uses a squared-error objective, so its scores are not confined to
# [0, 1]; rounding them could produce labels such as -1 or 2. An explicit 0.5
# cut-off always yields 0/1 and matches the threshold used by the decoder that
# is later passed to Certifai.
best_preds = (preds > 0.5).astype(int).tolist()
acc = accuracy_score(y_test, best_preds)
acc

0.8283345275872659

In [16]:
# Raw (un-thresholded) scores for the first ten test rows.
xg_reg.predict(xgb.DMatrix(encoder(X_test[:10].values)))

array([0.20345506, 0.316487  , 0.710422  , 0.3638665 , 0.40527058,
       0.21702528, 0.258692  , 0.47924864, 0.23106048, 0.19092968],
      dtype=float32)

In [None]:
# Scratch check: delete the `rm` column positions from a plain array slice.
# `df` still contains the `income` label column here, so both shapes include it.
a = np.array(df[:10])
print(a.shape)
a_deleted = np.delete(a, dropped_indexes_list,1)
a_deleted.shape

In [None]:
from decoder import DecoderCertifai
# Scores strictly above 0.5 decode to class 1, everything else to class 0.
decoder  = DecoderCertifai(threshold=0.5)

In [20]:
%%writefile predict_transform.py

import numpy as np
import xgboost as xgb

class TransformedPredict:
    """Adapter that lets a booster be queried with a plain array.

    Args:
        model: object whose ``predict`` accepts an ``xgb.DMatrix``.
    """

    def __init__(self, model):
        self.model = model

    def predict(self, arr):
        """Wrap ``arr`` in an ``xgb.DMatrix`` and return the model's predictions for it."""
        return self.model.predict(xgb.DMatrix(data=arr))
        

Overwriting predict_transform.py


In [21]:
from predict_transform import TransformedPredict
# TransformedPredict.__init__ accepts only the model; passing a second positional
# argument raises TypeError once the module is imported in a fresh kernel.
certifai_xgb_model = TransformedPredict(xg_reg)

In [22]:
from certifai.scanner.builder import (CertifaiScanBuilder, CertifaiPredictorWrapper, CertifaiModel, CertifaiModelMetric,
                                      CertifaiDataset, CertifaiGroupingFeature, CertifaiDatasetSource,
                                      CertifaiPredictionTask, CertifaiTaskOutcomes, CertifaiOutcomeValue)
from certifai.scanner.report_utils import scores, construct_scores_dataframe

In [25]:
# Bundle the model with its encoder and decoder for Certifai.
# NOTE(review): presumably the wrapper applies `encoder` to inputs before predict and
# `decoder` to the predictions afterwards -- confirm against the Certifai documentation.
xbg_model_proxy = CertifaiPredictorWrapper(certifai_xgb_model, encoder=encoder, decoder=decoder)

In [26]:
# Evaluation frame for the scan: the selected feature columns only (X_raw has no `income` column).
dataframe_certifai = X_raw[final_list]

In [28]:
# Create the scan object from scratch using the ScanBuilder class

# First define the possible prediction outcomes (1 is marked as the favorable outcome)
task = CertifaiPredictionTask(CertifaiTaskOutcomes.classification(
    [
        CertifaiOutcomeValue(1, name='income > 50K', favorable=True),
        CertifaiOutcomeValue(0, name='income < 50K')
    ]),
    prediction_description='Determine whether income greater than 50K or less')

scan = CertifaiScanBuilder.create('test_user_case',
                                  prediction_task=task)

# Add our local model (the wrapped XGBoost predictor)
first_model = CertifaiModel('XGB',
                            local_predictor=xbg_model_proxy)
scan.add_model(first_model)

# Add the eval dataset
eval_dataset = CertifaiDataset('evaluation',
                               CertifaiDatasetSource.dataframe(dataframe_certifai))
scan.add_dataset(eval_dataset)

# Setup an evaluation for fairness on the above dataset using the model
# We'll look at disparity between groups defined by race and gender
scan.add_fairness_grouping_feature(CertifaiGroupingFeature('race'))
scan.add_fairness_grouping_feature(CertifaiGroupingFeature('gender'))
scan.add_evaluation_type('fairness')
scan.evaluation_dataset_id = 'evaluation'

# Declare `income` as the ground truth outcome column in the dataset schema, since the
# model does not expect to receive it as input.
# NOTE(review): `dataframe_certifai` is built from X_raw[final_list], which does not
# contain an `income` column -- confirm this setting is still required.
scan.dataset_schema.outcome_feature_name = 'income'

# Run the scan.
# By default this will write the results into individual report files (one per model and evaluation
# type) in the 'reports' directory relative to the Jupyter root.  This may be disabled by specifying
# `write_reports=False` as below
# The result is a dictionary of dictionaries of reports.  The top level dict key is the evaluation type
# and the second level key is model id.
# Reports saved as JSON (which `write_reports=True` will do) may be visualized in the console app
result = scan.run(write_reports=False)

[--------------------] 2020-05-22 21:04:57.515139 - 0 of 1 reports (0.0% complete) - Starting scan with model_use_case_id: 'test_user_case' and scan_id: '1b1c7d7df1ce'
[--------------------] 2020-05-22 21:04:57.515357 - 0 of 1 reports (0.0% complete) - Running fairness evaluation for model: XGB
[####################] 2020-05-22 21:15:51.104844 - 1 of 1 reports (100.0% complete) - Completed all evaluations


In [29]:
# Display the nested report dictionary returned by scan.run (evaluation type -> model id -> report).
result

{'fairness': {'XGB': OrderedDict([('scan',
                {'scan_id': '1b1c7d7df1ce',
                 'evaluation_type': 'fairness',
                 'started': '2020-05-22T21:04:57',
                 'ended': '2020-05-22T21:15:51',
                 'name': 'test_user_case'}),
               ('model_use_case',
                {'name': 'test_user_case',
                 'model_use_case_id': 'test_user_case',
                 'task_type': 'binary-classification'}),
               ('model',
                {'model_id': 'XGB',
                 'name': 'XGB',
                 'prediction_value_order': [1, 0]}),
               ('dataset',
                {'dataset_id': 'evaluation',
                 'num_features': 12,
                 'rows': 48842}),
               ('dataset_schema',
                {'outcome_column': 'income',
                 'feature_schemas': [{'type': <FeatureTypeEnum.INT: 'INT'>,
                   'min': 17,
                   'max': 90,
                   'spread