In [1]:
import numpy as np
import pandas as pd
from pprint import pprint

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# place lib2.py, models.py and apriori.py in the same folder
import lib2
from lib2 import Predicate, optimize
from models import customXGB
from apriori import preprocessDataset, runApriori, aprioriout2predicateList

In [2]:
DATAFILE = '../adult.data' # path to the raw dataset file, relative to the notebook
random_state = None # forwarded to train_test_split; None leaves the split unseeded, set an int for exactly reproducible results
sensitive_attribute = "Sex" # column left out of itemset mining and used to build the subgroup descriptors
target_name = "label" # name of the target column
positive_label = ">50K" # target value mapped to class 1 when the model is evaluated
negative_label = "<=50K" # target value mapped to class 0 when the model is evaluated
model_train_fraction = 0.7 # fraction of the rows that goes to the training split

In [3]:
# Column names assigned to the loaded dataset, in positional order.
# The last entry is the target column.
feature_names = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "label",
]

# Columns handed to the model as categorical features.
cate_columns = [
    'Workclass',
    'Education',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Country',
]

In [4]:
# featureCosts: feature name -> cost of not keeping that feature constant.
# Any feature that is not listed has its cost set to 1.
featureCosts = {
    "Sex": 100,
}


def age_cost(age1: str, age2: str) -> int:
    """Return the absolute difference between two ages given as integer strings."""
    first, second = int(age1), int(age2)
    return abs(first - second)


# featureChange: feature name -> function that calculates the cost of changing
# that feature from one value to another.
# Any feature change that is not listed costs 1 if the value changes and 0 otherwise.
featureChange = {
    "Age": age_cost,
}

In [5]:
# Hand the two cost mappings to lib2 before running the optimization.
# NOTE(review): presumably lib2 stores these as module-level settings that optimize() reads — TODO confirm in lib2.py
lib2.setFeatureCost(featureCosts)
lib2.setFeatureChange(featureChange)

# Data loading

In [6]:
# Parse the file with every field kept as a string (fields are separated by ', ')
# and label the columns with feature_names.
data = pd.DataFrame(
    data=np.genfromtxt(DATAFILE, dtype=str, delimiter=', '),
    columns=feature_names,
)

# Train/test split

In [7]:
# Separate the target column from the feature columns.
y = data[target_name]
X = data.drop(columns=target_name)

# Split once, then give each of the four parts a fresh 0..n-1 index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        X, y, train_size=model_train_fraction, random_state=random_state
    )
)

# Definition and Training of a black-box model

In [8]:
# Black-box classifier: the customXGB wrapper imported from models.py.
model = customXGB(n_estimators=300, max_depth=5)
# cate_columns names the categorical columns of X_train.
# The call is kept as the cell's last expression, so its return value is displayed below.
model.fit(X_train, y_train, cate_columns=cate_columns)

<models.customXGB at 0x1f204d93250>

In [9]:
# Sanity check: predictions for the first 100 test rows.
model.predict(X_test.head(100))

array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0])

# Testing the model

In [10]:
# Evaluate on the held-out split. The string labels are mapped to 0/1 so they
# can be compared with the model's integer predictions.
preds = model.predict(X_test)
print(
    classification_report(
        y_true=y_test.map({negative_label: 0, positive_label: 1}),
        y_pred=preds,
    )
)

              precision    recall  f1-score   support

           0       0.90      0.93      0.91      7448
           1       0.75      0.66      0.70      2321

    accuracy                           0.87      9769
   macro avg       0.82      0.79      0.81      9769
weighted avg       0.86      0.87      0.86      9769



# Finding the affected individuals (test rows the model predicts as class 0)

In [11]:
# Positions of the test rows for which the model predicts class 0.
X_aff_idxs = np.flatnonzero(model.predict(X_test) == 0)
# X_aff keeps the row labels it had in X_test; the index is not reset here.
X_aff = X_test.iloc[X_aff_idxs]
# Report, one per line: full dataset shape, number of affected rows, shape of X_aff.
print(data.shape, X_aff_idxs.shape, X_aff.shape, sep="\n")

(32561, 15)
(7742,)
(7742, 14)


# Running apriori adaptation

In [12]:
# Mine frequent itemsets from the test features, leaving the sensitive attribute out.
d = X_test.drop(columns=[sensitive_attribute])
freq_itemsets = runApriori(
    preprocessDataset(d),
    min_support=0.03,
)
print(freq_itemsets)

       support                                           itemsets
0     0.954652                                   (0+Capital Loss)
1     0.915037                                   (0+Capital Gain)
2     0.895383                            (United-States+Country)
55    0.869690                   (0+Capital Loss, 0+Capital Gain)
56    0.853619            (0+Capital Loss, United-States+Country)
...        ...                                                ...
5386  0.030095  (White+Race, 7+Education-Num, 0+Capital Loss, ...
1473  0.030095  (Private+Workclass, 0+Capital Loss, Husband+Re...
1476  0.030095  (Husband+Relationship, Private+Workclass, Marr...
1200  0.030095  (Never-married+Marital Status, 40+Hours per we...
4248  0.030095  (Never-married+Marital Status, HS-grad+Educati...

[5664 rows x 2 columns]


In [13]:
# Look at the last 100 itemsets.
# For untruncated itemsets, print freq_itemsets.tail(100).to_string() instead.
print(freq_itemsets.iloc[-100:])

       support                                           itemsets
4399  0.030505  (HS-grad+Education, United-States+Country, Div...
5533  0.030505  (Married-civ-spouse+Marital Status, 0+Capital ...
5366  0.030505  (Married-civ-spouse+Marital Status, Husband+Re...
1318  0.030402  (Never-married+Marital Status, 9+Education-Num...
5592  0.030402  (0+Capital Loss, 0+Capital Gain, United-States...
...        ...                                                ...
5386  0.030095  (White+Race, 7+Education-Num, 0+Capital Loss, ...
1473  0.030095  (Private+Workclass, 0+Capital Loss, Husband+Re...
1476  0.030095  (Husband+Relationship, Private+Workclass, Marr...
1200  0.030095  (Never-married+Marital Status, 40+Hours per we...
4248  0.030095  (Never-married+Marital Status, HS-grad+Educati...

[100 rows x 2 columns]


In [14]:
# Convert the apriori output into a list of Predicate objects.
RL = aprioriout2predicateList(freq_itemsets)
# Preview the first ten predicates, then report how many there are in total.
pprint(RL[:10])
print(len(RL))

[Predicate(features=['Capital Loss'], values=['0']),
 Predicate(features=['Capital Gain'], values=['0']),
 Predicate(features=['Country'], values=['United-States']),
 Predicate(features=['Capital Loss', 'Capital Gain'], values=['0', '0']),
 Predicate(features=['Capital Loss', 'Country'], values=['0', 'United-States']),
 Predicate(features=['Race'], values=['White']),
 Predicate(features=['Capital Gain', 'Country'], values=['0', 'United-States']),
 Predicate(features=['Capital Loss', 'Race'], values=['0', 'White']),
 Predicate(features=['Race', 'Country'], values=['White', 'United-States']),
 Predicate(features=['Capital Loss', 'Capital Gain', 'Country'], values=['0', '0', 'United-States'])]
5664


# Running the optimization procedure

First, get the user-defined SD (subgroup descriptors)

In [28]:
# One single-feature Predicate per distinct value of the sensitive attribute.
SD = [
    Predicate.from_dict({sensitive_attribute: value})
    for value in data[sensitive_attribute].unique()
]
print(SD)

[Predicate(features=['Sex'], values=['Male']), Predicate(features=['Sex'], values=['Female'])]


Now, we run the submodular optimization

In [29]:
%%time

# Only the first 400 predicates of RL are passed to the optimizer.
# NOTE(review): 400 is a hard-coded cutoff, presumably chosen to bound the run time — TODO confirm and consider a named constant
final_rules = optimize(SD, RL[:400], X_aff, model)

Total triples = 180
X_aff shape before: (7742, 14)
Calculated incorrect recourse for each triple
Calculated feature costs for each triple
Calculated feature changes for each feature
X_aff shape after: (7742, 14)
set()
Calculated covers for each triple
21
CPU times: total: 2min 56s
Wall time: 1min 17s


In [30]:
# Show what optimize() returned: a list of Predicate triples followed by four integers.
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education-Num'], values=['10']),
   Predicate(features=['Education-Num'], values=['13'])),
  (Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Marital Status'], values=['Never-married']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse'])),
  (Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education-Num'], values=['9']),
   Predicate(features=['Education-Num'], values=['13'])),
  (Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education'], values=['Bachelors']),
   Predicate(features=['Education'], values=['Some-college']))],
 -4085,
 4845,
 -4,
 -4)


In [19]:
# NOTE(review): this cell's execution count is out of sequence, so the output below
# appears to be kept from a different run of the optimization — confirm before relying on it.
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Marital Status'], values=['Never-married']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse']))],
 -1248,
 1362,
 -1,
 -1)


In [38]:
# NOTE(review): this cell's execution count is out of sequence, so the output below
# appears to be kept from a different run of the optimization — confirm before relying on it.
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Marital Status'], values=['Never-married']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse']))],
 -67,
 72,
 -1,
 -1)


In [19]:
# NOTE(review): this cell's execution count is out of sequence, so the output below
# appears to be kept from a different run of the optimization — confirm before relying on it.
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Marital Status'], values=['Never-married']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse']))],
 -70,
 75,
 -1,
 -1)


In [29]:
# NOTE(review): this cell's execution count is out of sequence, so the output below
# appears to be kept from a different run of the optimization — confirm before relying on it.
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Education-Num'], values=['9']),
   Predicate(features=['Education-Num'], values=['13'])),
  (Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education-Num'], values=['9']),
   Predicate(features=['Education-Num'], values=['13'])),
  (Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education-Num'], values=['10']),
   Predicate(features=['Education-Num'], values=['13']))],
 -159,
 206,
 -3,
 -3)
