<a href="https://colab.research.google.com/github/AjeetSingh02/ExplainableAI/blob/master/FairML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>FairML: Auditing Black-Box Predictive Models</h1>

In [5]:
# clone the repository
!git clone https://github.com/adebayoj/fairml.git

Cloning into 'fairml'...
remote: Enumerating objects: 606, done.
remote: Total 606 (delta 0), reused 0 (delta 0), pack-reused 606
Receiving objects: 100% (606/606), 18.98 MiB | 38.04 MiB/s, done.
Resolving deltas: 100% (311/311), done.


In [0]:
# Install using setup.py
!sudo python /content/fairml/setup.py install

In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [0]:
# Plot configuration: ggplot styling and a 6x6 inch default figure size.
# The size is set through rcParams so it applies to figures created later;
# calling plt.figure(figsize=...) here would only open an empty, unused figure
# in this cell.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (6, 6)

In [0]:
"""
Import the two key methods from fairml.
audit_model takes:

- (required) black-box function, which is the model to be audited
- (required) sample_data to be perturbed for querying the function. 
    This has to be a pandas dataframe with no missing data.

- other optional parameters that control the mechanics of the auditing process, for example:
  - number_of_runs : number of iterations to perform
  - interactions : flag to enable checking model dependence on interactions.

audit_model returns an overloaded dictionary where keys are the column names of the input pandas dataframe and values are lists containing the model's dependence on that particular feature. These lists are of size number_of_runs.

"""
from fairml import audit_model
from fairml import plot_dependencies

In [0]:
# Load the example ProPublica dataset that ships inside the cloned FairML repo.
propublica_csv_path = "/content/fairml/doc/example_notebooks/propublica_data_for_fairml.csv"
propublica_data = pd.read_csv(propublica_csv_path)

In [0]:
# Split the loaded frame into the target vector and the feature matrix.
# compas_rating holds the score_factor column as a NumPy array (the label the
# model is fitted against).
compas_rating = propublica_data["score_factor"].values
# Drop the column by keyword: the positional axis form `drop("score_factor", 1)`
# is deprecated since pandas 1.3 and rejected by pandas 2.x.
# NOTE: this rebinds propublica_data, so re-running this cell without reloading
# the CSV fails because score_factor has already been removed.
propublica_data = propublica_data.drop(columns="score_factor")

In [0]:
# this is just for demonstration, any classifier or regressor
# can be used here. fairml only requires a predict function
# to diagnose a black-box model.

In [15]:
# Fit a small L2-penalised logistic regression (C=0.01) to act as the
# "black box" model that FairML audits below.
X_propublica = propublica_data.values
clf = LogisticRegression(C=0.01, penalty='l2')
clf.fit(X_propublica, compas_rating)



LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Now let's use FairML to audit the model we just built.

In [0]:
# Audit the fitted model: FairML is given only the model's predict function and
# the feature DataFrame. audit_model returns a pair; the second element is
# discarded here.
total, _ = audit_model(clf.predict, propublica_data)

In [17]:
# Print the audit result; its text form is one
# "Feature: <name>, Importance: <value>" line per input column.
print(total)

Feature: Two_yr_Recidivism,	 Importance: 0.22893713545042127
Feature: Number_of_Priors,	 Importance: 0.3608230719377835
Feature: Age_Above_FourtyFive,	 Importance: -0.006804925469863901
Feature: Age_Below_TwentyFive,	 Importance: 0.15327284510693454
Feature: African_American,	 Importance: 0.23493195074530135
Feature: Asian,	 Importance: -0.0003240440699935191
Feature: Hispanic,	 Importance: -0.008425145819831496
Feature: Native_American,	 Importance: 0.0004860661049902787
Feature: Other,	 Importance: -0.004860661049902786
Feature: Female,	 Importance: 0.04536616979909268
Feature: Misdemeanor,	 Importance: -0.052657161373946854


In [0]:
# generate feature dependence plot
fig = plot_dependencies(
    total.median(),
    reverse_values=False,
    title="FairML feature dependence"
)

In [0]:
# Save the dependence plot produced by plot_dependencies to a PNG.
# savefig is called on the figure object itself: with the inline notebook
# backend the "current figure" is closed when the plotting cell finishes, so
# plt.savefig(...) from a later cell would write out a new, blank canvas.
# NOTE(review): assumes plot_dependencies returned a matplotlib Figure — confirm.
file_name = "fairml_propublica_linear_direct.png"
fig.savefig(file_name, transparent=False, bbox_inches='tight', dpi=250)

Now let's try FairML on another dataset: the Boston housing data.

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this import only works on older releases — pin the scikit-learn
# version or switch to another dataset.
from sklearn.datasets import load_boston

In [0]:
# Load the Boston housing dataset object; later cells read its .data,
# .feature_names and .target attributes.
data = load_boston()

In [0]:
# Wrap the raw feature matrix in a DataFrame, using the dataset's own feature
# names as column labels.
feature_names = data.feature_names
boston_df = pd.DataFrame(data.data, columns=feature_names)

In [55]:
# Preview the first five rows to check that the columns loaded as expected.
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [62]:
# (rows, columns) of the feature frame.
boston_df.shape

(506, 13)

In [0]:
def convert_to_int(val):
    """Return ``val`` converted with the built-in ``int``.

    Floats are truncated toward zero and numeric strings are parsed; any value
    that ``int`` rejects raises the same exception ``int`` would raise.

    Note: no visible cell in this notebook calls this helper.
    """
    as_integer = int(val)
    return as_integer

In [0]:
# Target values that the classifier below is trained to predict.
y = data.target

In [0]:
# Convert every target value to a Python int (truncating any fractional part)
# so the targets can be used as discrete class labels.
y = [int(target_value) for target_value in y]

In [0]:
# Random forest with a fixed random_state so the fitted model is repeatable.
# Note: this rebinds `clf`, replacing the logistic regression fitted earlier.
clf = RandomForestClassifier(random_state=0)

In [72]:
# Fit on the NumPy feature matrix and the integer-converted targets.
clf.fit(boston_df.values, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [0]:
# Audit the random forest: FairML gets the model's predict function and the
# feature DataFrame. The second element of the returned pair is discarded.
total, _ = audit_model(clf.predict, boston_df)

In [74]:
# Print the per-feature importance lines from the audit result.
print(total)

Feature: CRIM,	 Importance: -43.901185770750985
Feature: ZN,	 Importance: -16.284584980237153
Feature: INDUS,	 Importance: -114.81225296442688
Feature: CHAS,	 Importance: -10.49802371541502
Feature: NOX,	 Importance: -93.22332015810277
Feature: RM,	 Importance: 82.45256916996047
Feature: AGE,	 Importance: -87.8913043478261
Feature: DIS,	 Importance: -80.03754940711462
Feature: RAD,	 Importance: -87.55335968379447
Feature: TAX,	 Importance: -105.0494071146245
Feature: PTRATIO,	 Importance: -82.88735177865613
Feature: B,	 Importance: 83.88932806324111
Feature: LSTAT,	 Importance: -99.45849802371542


In [0]:
# Take the median of the audit result for each feature, then draw the
# feature-dependence chart from those medians (with reverse_values enabled).
median_dependence = total.median()
fig = plot_dependencies(
    median_dependence,
    title="FairML feature dependence",
    reverse_values=True,
)

In [0]:
# Save the dependence plot produced by plot_dependencies to a PNG.
# savefig is called on the figure object itself: with the inline notebook
# backend the "current figure" is closed when the plotting cell finishes, so
# plt.savefig(...) from a later cell would write out a new, blank canvas.
# NOTE(review): assumes plot_dependencies returned a matplotlib Figure — confirm.
file_name = "fairml_linear_direct.png"
fig.savefig(file_name, transparent=False, bbox_inches='tight', dpi=250)