In [1]:
import numpy as np
import pandas as pd
from pprint import pprint

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from lib2 import Predicate, optimize
from models import customXGB
from apriori import preprocessDataset, runApriori, aprioriout2predicateList

In [2]:
DATAFILE = '../adult.data' # location of dataset
random_state = None # change to something for exactly reproducible results
sensitive_attribute = "Sex"
target_name = "label"
positive_label = ">50K"
negative_label = "<=50K"
model_train_fraction = 0.7

In [3]:
feature_names = [
   "Age", "Workclass", "fnlwgt", "Education",
   "Education-Num", "Marital Status", "Occupation",
   "Relationship", "Race", "Sex", "Capital Gain",
   "Capital Loss", "Hours per week", "Country", "label"
]
data = pd.DataFrame(
  np.genfromtxt(DATAFILE, delimiter=', ', dtype=str),
  columns=feature_names
)
cate_columns = ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country']

# Train test split

In [4]:
X = data.drop(target_name, axis=1)
y = data[target_name]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=model_train_fraction, random_state=random_state)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Definition and Training of a black-box model

In [5]:
model = customXGB(n_estimators=300, max_depth=5)
model.fit(X_train, y_train, cate_columns=cate_columns)

<models.customXGB at 0x2a190c0b0d0>

In [6]:
model.predict(X_test.iloc[:100, :])

array([0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1])

# Testing the model

In [7]:
preds = model.predict(X_test)
print(classification_report(y_test.map({negative_label: 0, positive_label: 1}), preds))

              precision    recall  f1-score   support

           0       0.90      0.93      0.91      7413
           1       0.76      0.66      0.71      2356

    accuracy                           0.87      9769
   macro avg       0.83      0.80      0.81      9769
weighted avg       0.86      0.87      0.86      9769



# Finding the affected

In [8]:
X_aff_idxs = np.where(model.predict(X_test) == 0)[0]
print(data.shape)
print(X_aff_idxs.shape)
X_aff = X_test.iloc[X_aff_idxs, :]
print(X_aff.shape)
# X_aff.reset_index(inplace=True, drop=True)

(32561, 15)
(7706,)
(7706, 14)


# Running apriori adaptation

In [9]:
d = X_test.drop([sensitive_attribute], axis=1)
freq_itemsets = runApriori(preprocessDataset(d), min_support=0.03)
print(freq_itemsets)

       support                                           itemsets
0     0.955574                                   (0+Capital Loss)
1     0.914935                                   (0+Capital Gain)
8     0.896816                            (United-States+Country)
54    0.870509                   (0+Capital Loss, 0+Capital Gain)
577   0.856485            (0+Capital Loss, United-States+Country)
...        ...                                                ...
4585  0.030095  (0+Capital Loss, White+Race, Assoc-voc+Educati...
4501  0.030095  (Sales+Occupation, 0+Capital Loss, 40+Hours pe...
2739  0.030095  (Craft-repair+Occupation, Married-civ-spouse+M...
3197  0.030095  (0+Capital Loss, Assoc-acdm+Education, United-...
3198  0.030095  (12+Education-Num, 0+Capital Loss, Assoc-acdm+...

[5806 rows x 2 columns]


In [10]:
print(freq_itemsets.tail(100))
# print(freq_itemsets.tail(100).to_string()) # uncomment for more details

       support                                           itemsets
2381  0.030402  (Not-in-family+Relationship, Adm-clerical+Occu...
2373  0.030402  (Some-college+Education, Adm-clerical+Occupati...
4486  0.030402  (Private+Workclass, Married-civ-spouse+Marital...
906   0.030402  (0+Capital Loss, White+Race, 9+Education-Num, ...
2360  0.030402  (Some-college+Education, Adm-clerical+Occupati...
...        ...                                                ...
4585  0.030095  (0+Capital Loss, White+Race, Assoc-voc+Educati...
4501  0.030095  (Sales+Occupation, 0+Capital Loss, 40+Hours pe...
2739  0.030095  (Craft-repair+Occupation, Married-civ-spouse+M...
3197  0.030095  (0+Capital Loss, Assoc-acdm+Education, United-...
3198  0.030095  (12+Education-Num, 0+Capital Loss, Assoc-acdm+...

[100 rows x 2 columns]


In [11]:
RL = aprioriout2predicateList(freq_itemsets)
pprint(RL[:10])
print(len(RL))

[Predicate(features=['Capital Loss'], values=['0']),
 Predicate(features=['Capital Gain'], values=['0']),
 Predicate(features=['Country'], values=['United-States']),
 Predicate(features=['Capital Loss', 'Capital Gain'], values=['0', '0']),
 Predicate(features=['Capital Loss', 'Country'], values=['0', 'United-States']),
 Predicate(features=['Race'], values=['White']),
 Predicate(features=['Country', 'Capital Gain'], values=['United-States', '0']),
 Predicate(features=['Capital Loss', 'Race'], values=['0', 'White']),
 Predicate(features=['Country', 'Race'], values=['United-States', 'White']),
 Predicate(features=['Race', 'Capital Gain'], values=['White', '0'])]
5806


# Running the optimization procedure

First, get the user-defined SD (subgroup descriptors)

In [12]:
SD = list(map(Predicate.from_dict, [
    {sensitive_attribute: val for val in data[sensitive_attribute].unique()}
]))
print(SD)

[Predicate(features=['Sex'], values=['Female'])]


Now, we run the submodular optimization

In [13]:
%%time

final_rules = optimize(SD, RL[:400], X_aff[:400], model)

Total triples = 90
X_aff shape before: (400, 14)
Calculated incorrect recourse for each triple
Calculated feature costs for each triple
Calculated feature changes for each feature
X_aff shape after: (400, 14)
set()
Calculated covers for each triple
6
CPU times: total: 13.1 s
Wall time: 1.71 s


In [38]:
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Marital Status'], values=['Never-married']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse']))],
 -67,
 72,
 -1,
 -1)


In [19]:
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Marital Status'], values=['Never-married']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse']))],
 -70,
 75,
 -1,
 -1)


In [29]:
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Education-Num'], values=['9']),
   Predicate(features=['Education-Num'], values=['13'])),
  (Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education-Num'], values=['9']),
   Predicate(features=['Education-Num'], values=['13'])),
  (Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education-Num'], values=['10']),
   Predicate(features=['Education-Num'], values=['13']))],
 -159,
 206,
 -3,
 -3)
