In [14]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [119]:
# Load the prepared adult-income CSV for a first look at its contents.
# NOTE(review): index_col=0 makes the first CSV column the index and
# reset_index(drop=True) then throws that index away, so the first CSV column
# is not present in `data` (the preview below has no `age` column, while the
# plain read_csv later in the notebook does). Confirm this is intended.
data = pd.read_csv('adult_income-prepped.csv',  index_col=0).reset_index(drop=True) 
data.head()

Unnamed: 0,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,workclass_Private,226802,education_11th,7,marital-status_Never-married,occupation_Machine-op-inspct,relationship_Own-child,race_Black,gender_Male,0,0,40,native-country_United-States,0
1,workclass_Private,89814,education_HS-grad,9,marital-status_Married-civ-spouse,occupation_Farming-fishing,relationship_Husband,race_White,gender_Male,0,0,50,native-country_United-States,0
2,workclass_Local-gov,336951,education_Assoc-acdm,12,marital-status_Married-civ-spouse,occupation_Protective-serv,relationship_Husband,race_White,gender_Male,0,0,40,native-country_United-States,1
3,workclass_Private,195878,education_HS-grad,9,marital-status_Married-civ-spouse,occupation_Exec-managerial,relationship_Husband,race_White,gender_Male,0,0,24,native-country_Cuba,0
4,workclass_Private,160323,education_Some-college,10,marital-status_Married-civ-spouse,occupation_Machine-op-inspct,relationship_Husband,race_Black,gender_Male,7688,0,40,native-country_United-States,1


In [120]:
%%writefile cat_encoder.py

from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
import numpy as np

class CatEncoder:
    """Turn raw mixed-type rows into a purely numeric feature matrix.

    Categorical columns are one-hot encoded with an encoder fitted on ``data``.
    Every other column is treated as numeric and, when ``normalize`` is true,
    passed through scikit-learn's ``Normalizer``. The encoded output lists the
    numeric columns first, followed by the one-hot columns.

    Parameters
    ----------
    cat_columns : list of str
        Names of the categorical columns in ``data``.
    data : pandas.DataFrame
        Frame used to fit the one-hot encoder. Its column order also fixes the
        positional layout expected of the arrays passed to ``__call__``.
    normalize : bool, default True
        Whether numeric columns go through ``Normalizer`` when encoding.
    """

    def __init__(self, cat_columns, data, normalize: bool=True):
        # Positions (not names) are stored because __call__ receives plain arrays.
        self.cat_indexes = [data.columns.get_loc(name) for name in cat_columns]
        self.num_indexes = [idx for idx in range(len(data.columns)) if idx not in self.cat_indexes]
        self.encoder = preprocessing.OneHotEncoder()
        self.encoder.fit(data[cat_columns])
        self.num_columns = list(data.columns[self.num_indexes])
        self.cat_columns = cat_columns

        # `get_feature_names` was deprecated and later removed from scikit-learn;
        # `get_feature_names_out` is its replacement. Prefer the new method and
        # fall back to the old one so older installations keep working.
        if hasattr(self.encoder, "get_feature_names_out"):
            cat_transformed_names = self.encoder.get_feature_names_out(self.cat_columns)
        else:
            cat_transformed_names = self.encoder.get_feature_names(input_features=self.cat_columns)
        self._transformed_column_names =  self.num_columns + list(cat_transformed_names)

        if normalize:
            # NOTE(review): per the scikit-learn docs, Normalizer rescales each
            # *row* to unit norm and is stateless, so this fit() learns nothing
            # from `data`. If per-column scaling was intended, a scaler such as
            # StandardScaler would be needed instead -- confirm the intent.
            self.normalizer = Normalizer()
            self.normalizer.fit(data.iloc[:, self.num_indexes])
        else:
            self.normalizer = None

    def __call__(self, x):
        """Encode a 2-D array of raw rows.

        ``x`` must support ``x[:, indexes]`` indexing and have its columns in
        the same order as the frame given to the constructor. Returns a dense
        array with the numeric columns followed by the one-hot columns.
        """
        numeric = x[:, self.num_indexes]
        if self.normalizer is not None:
            numeric = self.normalizer.transform(numeric)
        # OneHotEncoder returns a sparse matrix by default; densify it so it can
        # be concatenated with the numeric block.
        categorical = self.encoder.transform(x[:, self.cat_indexes]).toarray()
        return np.concatenate((numeric, categorical), axis=1)

    @property
    def transformed_features(self):
        """Column names of the encoded output, in output order."""
        return self._transformed_column_names

Overwriting cat_encoder.py


In [121]:
# Load the prepared dataset with every CSV column kept (features plus the
# `income` label); this frame is the source for the train/test data below.
df = pd.read_csv('adult_income-prepped.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,workclass_Private,226802,education_11th,7,marital-status_Never-married,occupation_Machine-op-inspct,relationship_Own-child,race_Black,gender_Male,0,0,40,native-country_United-States,0
1,38,workclass_Private,89814,education_HS-grad,9,marital-status_Married-civ-spouse,occupation_Farming-fishing,relationship_Husband,race_White,gender_Male,0,0,50,native-country_United-States,0
2,28,workclass_Local-gov,336951,education_Assoc-acdm,12,marital-status_Married-civ-spouse,occupation_Protective-serv,relationship_Husband,race_White,gender_Male,0,0,40,native-country_United-States,1
3,58,workclass_Private,195878,education_HS-grad,9,marital-status_Married-civ-spouse,occupation_Exec-managerial,relationship_Husband,race_White,gender_Male,0,0,24,native-country_Cuba,0
4,44,workclass_Private,160323,education_Some-college,10,marital-status_Married-civ-spouse,occupation_Machine-op-inspct,relationship_Husband,race_Black,gender_Male,7688,0,40,native-country_United-States,1


In [118]:
# Separate the outcome (label) column from the feature columns
label_column = 'income'
y = df[label_column]
X_raw = df.drop(label_column, axis=1)

# Remove some additional columns for demo purposes
rm=["fnlwgt", "capital-loss"]
raw_columns = X_raw.columns.to_list()

# Derive both the positions of the removed columns and the list of kept columns
# from the same predicate over the same column snapshot, so the two can never
# disagree about which columns were dropped.
# dropped_indexes_list holds positions within X_raw (label already removed); it
# is used later to delete the same columns from raw numpy rows.
dropped_indexes_list = [i for i, col in enumerate(raw_columns) if col in rm]
final_list = [col for col in raw_columns if col not in rm]

X = X_raw[final_list]
X.head()

[2, 11]


Unnamed: 0,age,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,hours-per-week,native-country
0,25,workclass_Private,education_11th,7,marital-status_Never-married,occupation_Machine-op-inspct,relationship_Own-child,race_Black,gender_Male,0,40,native-country_United-States
1,38,workclass_Private,education_HS-grad,9,marital-status_Married-civ-spouse,occupation_Farming-fishing,relationship_Husband,race_White,gender_Male,0,50,native-country_United-States
2,28,workclass_Local-gov,education_Assoc-acdm,12,marital-status_Married-civ-spouse,occupation_Protective-serv,relationship_Husband,race_White,gender_Male,0,40,native-country_United-States
3,58,workclass_Private,education_HS-grad,9,marital-status_Married-civ-spouse,occupation_Exec-managerial,relationship_Husband,race_White,gender_Male,0,24,native-country_Cuba
4,44,workclass_Private,education_Some-college,10,marital-status_Married-civ-spouse,occupation_Machine-op-inspct,relationship_Husband,race_Black,gender_Male,7688,40,native-country_United-States


In [122]:
# Show the feature columns that remain after the demo columns were removed;
# this is the column order of X.
final_list

['age',
 'workclass',
 'education',
 'educational-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'hours-per-week',
 'native-country']

In [123]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [124]:
from cat_encoder import CatEncoder
# Columns to one-hot encode; CatEncoder treats every other column of X as numeric.
cat_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'gender', 'native-country',
]

# The encoder is fitted on the full feature frame X (train and test rows alike).
encoder = CatEncoder(cat_columns, X)

In [125]:
# XGBoost training parameters.
# NOTE(review): the label is 0/1, yet the objective is squared-error regression
# and predictions are rounded afterwards; confirm this is preferred over a
# classification objective such as "binary:logistic".
params = dict(
    objective="reg:squarederror",
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
)

In [126]:
# Encode the raw training rows with the CatEncoder and bundle them with their
# labels into the DMatrix that xgb.train consumes.
data_dmatrix = xgb.DMatrix(data=encoder(X_train.values),label=y_train)

In [127]:
# Train a booster on the encoded training data for 10 boosting rounds.
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)

In [128]:
# Evaluate on the held-out split: encode and predict, then round each
# continuous prediction to an integer so it can be compared with the 0/1 labels.
dtest = xgb.DMatrix(encoder(X_test.values))
preds = xg_reg.predict(dtest)
best_preds = [int(round(value)) for value in preds]
acc = accuracy_score(y_test, best_preds)
acc

0.8283345275872659

In [129]:
# Demo: predict for the first 10 rows of the raw dataframe, restricted to the
# column list the model was trained on, then encoded like the training data.
demo_rows = df[final_list].head(10).to_numpy()
dtest_demo = xgb.DMatrix(encoder(demo_rows))
preds_demo = xg_reg.predict(dtest_demo)
preds_demo

array([0.20724142, 0.3638665 , 0.37601352, 0.3640377 , 0.73832226,
       0.3376968 , 0.19092968, 0.22121799, 0.2664432 , 0.5365489 ],
      dtype=float32)

In [198]:
# Sanity check on shapes: take the first raw row (all columns, label included)
# and delete the positions in dropped_indexes_list along axis 1; the result has
# two columns fewer than the input.
a = np.array(df[:1])
print(a.shape)
np.delete(a, dropped_indexes_list,1).shape

(1, 15)


(1, 13)

In [199]:
%%writefile predict_transform.py

import numpy as np
import xgboost as xgb

class TransformedPredict:
    """Adapter that strips unused columns from input rows before scoring.

    Parameters
    ----------
    model
        Trained XGBoost model exposing ``predict(DMatrix)``.
    dropped_indexes_list
        Column positions to delete (along axis 1) from every array given to
        ``predict`` before it is turned into a ``DMatrix``.
    """

    def __init__(self,model,dropped_indexes_list):
        self.model = model
        # predict() reads this attribute; without storing it here every
        # predict() call fails with AttributeError.
        self.dropped_indexes_list = dropped_indexes_list

    def predict(self,arr):
        """Delete the configured columns from ``arr`` and return the model's predictions.

        ``arr`` is expected to be a 2-D array; columns are removed along axis 1.
        """
        dtest = xgb.DMatrix(data=np.delete(arr, self.dropped_indexes_list,1))
        return self.model.predict(dtest)
        

Overwriting predict_transform.py


In [140]:
from predict_transform import TransformedPredict
# Wrap the trained booster together with the positions of the removed columns,
# so the wrapper's predict() can delete those columns before scoring.
certifai_xgb_model = TransformedPredict(xg_reg,dropped_indexes_list)

In [139]:
from certifai.scanner.builder import (CertifaiScanBuilder, CertifaiPredictorWrapper, CertifaiModel, CertifaiModelMetric,
                                      CertifaiDataset, CertifaiGroupingFeature, CertifaiDatasetSource,
                                      CertifaiPredictionTask, CertifaiTaskOutcomes, CertifaiOutcomeValue)
from certifai.scanner.report_utils import scores, construct_scores_dataframe

In [141]:
# Wrap the predictor for Certifai, handing it the CatEncoder instance as encoder.
# NOTE(review): presumably Certifai applies `encoder` to each raw row before
# calling predict() -- confirm that the column positions the encoder was fitted
# with match the rows Certifai passes in, and that deleting columns inside
# predict() still targets the intended positions after encoding.
xbg_model_proxy = CertifaiPredictorWrapper(certifai_xgb_model, encoder=encoder)

In [144]:
# Create the scan object from scratch using the CertifaiScanBuilder class

# First define the possible prediction outcomes: 1 (income > 50K) is marked as
# the favorable outcome, 0 (income < 50K) is the other outcome
task = CertifaiPredictionTask(CertifaiTaskOutcomes.classification(
    [
        CertifaiOutcomeValue(1, name='income > 50K', favorable=True),
        CertifaiOutcomeValue(0, name='income < 50K')
    ]),
    prediction_description='Determine whether income greater than 50K or less')

scan = CertifaiScanBuilder.create('test_user_case',
                                  prediction_task=task)

# Add our local model (the wrapped XGBoost predictor)
first_model = CertifaiModel('XGB',
                            local_predictor=xbg_model_proxy)
scan.add_model(first_model)

# Add the evaluation dataset, read from the same CSV used for training
eval_dataset = CertifaiDataset('evaluation',
                               CertifaiDatasetSource.csv('adult_income-prepped.csv'))
scan.add_dataset(eval_dataset)

# Set up a fairness evaluation on the above dataset using the model.
# We'll look at disparity between groups defined by race and gender.
scan.add_fairness_grouping_feature(CertifaiGroupingFeature('race'))
scan.add_fairness_grouping_feature(CertifaiGroupingFeature('gender'))
scan.add_evaluation_type('fairness')
scan.evaluation_dataset_id = 'evaluation'

# Because the dataset contains a ground truth outcome column which the model does not
# expect to receive as input we need to state that in the dataset schema (since it cannot
# be inferred from the CSV)
scan.dataset_schema.outcome_feature_name = 'income'

# Run the scan.
# By default this will write the results into individual report files (one per model and evaluation
# type) in the 'reports' directory relative to the Jupyter root.  This may be disabled by specifying
# `write_reports=False` as below
# The result is a dictionary of dictionaries of reports.  The top level dict key is the evaluation type
# and the second level key is model id.
# Reports saved as JSON (which `write_reports=True` will do) may be visualized in the console app
# NOTE: the run call below is currently commented out, so this cell only configures the scan.
# result = scan.run(write_reports=False)