In [40]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

In [41]:
from frequent_itemsets import runApriori, aprioriout2predicateList, preprocessDataset
from models import customXGB
from predicate import Predicate, recIsValid
from recourse_sets import TwoLevelRecourseSet
from metrics import incorrectRecoursesSingle, coverSingle

In [42]:
# --- Experiment configuration -------------------------------------------
# Location of the dataset file.
DATAFILE = '../adult.data'
# Share of the rows handed to train_test_split as train_size.
model_train_fraction = 0.7
# None gives a different split on every run; set a value for exactly
# reproducible results.
random_state = None

# Column used to form the subgroups, and the column holding the target.
sensitive_attribute = "Sex"
target_name = "label"

# Names of the two target classes.
positive_label = ">50K"
negative_label = "<=50K"

In [43]:
# Column names applied, in this order, to the loaded dataset; the final
# entry is the target column.
feature_names = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "label",
]
# Columns to be treated as categorical (passed to the model's fit call).
cate_columns = [
    'Workclass',
    'Education',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Country',
]

In [44]:
# Read every field as a string (fields are separated by ", ") and attach
# the column names declared in feature_names.
raw_records = np.genfromtxt(DATAFILE, delimiter=', ', dtype=str)
data = pd.DataFrame(raw_records, columns=feature_names)

In [45]:
# Separate the target column from the feature columns.
y = data[target_name]
X = data.drop(columns=target_name)

# Split once, then give every resulting piece a fresh 0..n-1 index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        X, y, train_size=model_train_fraction, random_state=random_state
    )
)

In [46]:
# Hyper-parameters for the project's XGB wrapper.
xgb_params = dict(n_estimators=300, max_depth=5)
model = customXGB(**xgb_params)
# Left as the last expression so the cell still displays the fit() result.
model.fit(X_train, y_train, cate_columns=cate_columns)

<models.customXGB at 0x20223688880>

In [47]:
# "Affected" individuals: test rows for which the model predicts 0.
is_predicted_zero = model.predict(X_test) == 0
X_aff_idxs = np.nonzero(is_predicted_zero)[0]
X_aff = X_test.iloc[X_aff_idxs, :]
print(f"Test data shape: {X_test.shape}")
print(f"Affected shape: {X_aff.shape}")
# Note: the index is not reset, so X_aff keeps the row labels of X_test.

Test data shape: (9769, 14)
Affected shape: (7704, 14)


In [48]:
# "Unaffected" individuals: test rows for which the model predicts 1.
is_predicted_one = model.predict(X_test) == 1
X_unaff_idxs = np.nonzero(is_predicted_one)[0]
X_unaff = X_test.iloc[X_unaff_idxs, :]
print(f"Unaffected shape: {X_unaff.shape}")
# Note: the index is not reset, so X_unaff keeps the row labels of X_test.

Unaffected shape: (2065, 14)


In [49]:
# Split the affected individuals by the protected attribute, then drop that
# column from each subgroup.
# The column is addressed through `sensitive_attribute` for BOTH the filter
# and the drop (the filter used to hard-code "Sex"), so the cell keeps
# working if the configured column name changes.
is_male = X_aff[sensitive_attribute] == "Male"
is_female = X_aff[sensitive_attribute] == "Female"
males_affected = X_aff[is_male].drop(columns=[sensitive_attribute])
females_affected = X_aff[is_female].drop(columns=[sensitive_attribute])

In [50]:
# Mine frequent itemsets (minimum support 0.03) separately for the affected
# males and the affected females, in that order.
freq_ma, freq_fa = (
    runApriori(preprocessDataset(subgroup), min_support=0.03)
    for subgroup in (males_affected, females_affected)
)

In [51]:
# Convert each Apriori result into a (predicates, supports) pair.
(RL_ma, RL_ma_supports), (RL_fa, RL_fa_supports) = map(
    aprioriout2predicateList, (freq_ma, freq_fa)
)

In [52]:
def _predicate_as_dict(pred):
    """Pair each entry of ``pred.features`` with the matching entry of ``pred.values``."""
    return dict(zip(pred.features, pred.values))


# Dict form of every predicate, used for equality comparison between groups.
RL_ma_dict = list(map(_predicate_as_dict, RL_ma))
RL_fa_dict = list(map(_predicate_as_dict, RL_fa))

# Intersection of the frequent itemsets for affected males and affected females (to be used in the if clauses)

In [53]:
# Index pairs (male position, female position) whose predicates are equal
# as dicts, enumerated male-major exactly like a nested loop would.
matched_positions = [
    (male_pos, female_pos)
    for male_pos, male_pred in enumerate(RL_ma_dict)
    for female_pos, female_pred in enumerate(RL_fa_dict)
    if male_pred == female_pred
]

# One entry per matched pair: the shared predicate and its support in each group.
aff_intersection = [
    Predicate.from_dict(RL_ma_dict[male_pos]) for male_pos, _ in matched_positions
]
supports_male = [RL_ma_supports[male_pos] for male_pos, _ in matched_positions]
supports_female = [RL_fa_supports[female_pos] for _, female_pos in matched_positions]

# Frequent itemsets for the unaffected (to be used in the then clauses)

In [54]:
# Frequent itemsets (minimum support 0.03) of the unaffected individuals;
# only the predicates are kept, the supports are discarded.
unaffected_itemsets = runApriori(preprocessDataset(X_unaff), min_support=0.03)
freq_unaffected, _ = aprioriout2predicateList(unaffected_itemsets)
freq_unaffected_dict = [
    dict(zip(pred.features, pred.values)) for pred in freq_unaffected
]

In [55]:
# One single-feature predicate per distinct value of the sensitive attribute,
# in the order the values are returned by unique().
SD = [
    Predicate.from_dict({sensitive_attribute: val})
    for val in data[sensitive_attribute].unique()
]

# Filter all if-then pairs to keep only the valid ones

In [56]:
# Pair every shared "if" predicate (with its male / female support) with
# every "then" predicate from the unaffected, keeping the pairs that
# recIsValid accepts.
ifthens = [
    (if_clause, then_clause, support_m, support_f)
    for if_clause, support_m, support_f in zip(
        aff_intersection, supports_male, supports_female
    )
    for then_clause in freq_unaffected
    if recIsValid(if_clause, then_clause)
]

In [57]:
# Number of candidate (if, then, male support, female support) tuples that passed recIsValid.
len(ifthens) 

2746

# Keep triples whose incorrect recourse percentage is at most a given threshold

In [58]:
from tqdm import tqdm

In [20]:
# For every candidate rule, compute the incorrect-recourse rate per subgroup:
#   (count returned by incorrectRecoursesSingle) / (support * subgroup size).
#
# The subgroup predicates are built explicitly rather than taken from
# SD[0] / SD[1]: SD follows the order in which the values first appear in
# the data, so positional indexing would silently swap the two groups if
# the dataset's row order changed.
# NOTE(review): the saved output of this cell (2496 iterations) does not match
# the displayed len(ifthens) (2746) -- re-run after any upstream change.
sd_male = Predicate.from_dict({sensitive_attribute: "Male"})
sd_female = Predicate.from_dict({sensitive_attribute: "Female"})

# Loop-invariant subgroup sizes, hoisted out of the loop.
n_males_affected = males_affected.shape[0]
n_females_affected = females_affected.shape[0]

ifthens_inc = []
for h, s, sm, sf in tqdm(ifthens):
    # sm / sf are the rule's supports within each affected subgroup, so
    # support * subgroup size is the number of individuals the rule covers.
    incorrect_recourses_for_males = incorrectRecoursesSingle(sd_male, h, s, X_aff, model)
    covered_males = sm * n_males_affected
    inc_male = incorrect_recourses_for_males / covered_males

    incorrect_recourses_for_females = incorrectRecoursesSingle(sd_female, h, s, X_aff, model)
    covered_females = sf * n_females_affected
    inc_female = incorrect_recourses_for_females / covered_females

    ifthens_inc.append((h, s, sm, sf, inc_male, inc_female))

100%|██████████████████████████████████████████████████████████████████████████████| 2496/2496 [09:16<00:00,  4.49it/s]


In [59]:
# A rule is kept for a subgroup when its incorrect-recourse rate is at most
# threshold_incorrect AND its coverage is at least threshold_coverage.
threshold_incorrect = 0.20
threshold_coverage = 0.70

ifthens_filtered_male = [
    (if_clause, then_clause, cov_m, err_m)
    for if_clause, then_clause, cov_m, _cov_f, err_m, _err_f in ifthens_inc
    if err_m <= threshold_incorrect and cov_m >= threshold_coverage
]
ifthens_filtered_female = [
    (if_clause, then_clause, cov_f, err_f)
    for if_clause, then_clause, _cov_m, cov_f, _err_m, err_f in ifthens_inc
    if err_f <= threshold_incorrect and cov_f >= threshold_coverage
]

In [60]:
# Number of surviving rules: males first, then females.
for kept_rules in (ifthens_filtered_male, ifthens_filtered_female):
    print(len(kept_rules))

6
7


In [61]:
from lib2 import recourse_report_preprocessed

# Print the textual report of the filtered rules for each subgroup.
rules_by_subgroup = {
    "Male": ifthens_filtered_male,
    "Female": ifthens_filtered_female,
}
report_text = recourse_report_preprocessed(["Male", "Female"], rules_by_subgroup)
print(report_text)

For subgroup 'Male':
	If Capital Gain = 0,
	Then Capital Gain = 15024.
		Coverage: 95.739% of those in the subgroup that are affected.
		Incorrect recourses: 4.623% over all individuals covered by this rule.
	If Capital Loss = 0, Capital Gain = 0,
	Then Capital Loss = 0, Capital Gain = 15024.
		Coverage: 92.885% of those in the subgroup that are affected.
		Incorrect recourses: 4.542% over all individuals covered by this rule.
	If Capital Gain = 0, Country = United-States,
	Then Capital Gain = 15024, Country = United-States.
		Coverage: 84.571% of those in the subgroup that are affected.
		Incorrect recourses: 4.696% over all individuals covered by this rule.
	If Capital Gain = 0, Race = White,
	Then Capital Gain = 15024, Race = White.
		Coverage: 83.764% of those in the subgroup that are affected.
		Incorrect recourses: 4.790% over all individuals covered by this rule.
	If Capital Loss = 0, Capital Gain = 0, Country = United-States,
	Then Capital Loss = 0, Capital Gain = 15024, Countr