In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

In [3]:
from models import customXGB
from lib2 import valid_triples_with_coverage_correctness, rules2rulesbyif

In [4]:
# --- Experiment configuration: every value a reader might want to tune ---
DATAFILE = '../adult.data'  # location of dataset
# Seed handed to train_test_split. A fixed integer makes the train/test split
# identical on every "Restart Kernel -> Run All"; set it to None if you
# deliberately want a different random split on each run.
random_state = 42
sensitive_attribute = "Sex"  # column passed to the rule search to define the subgroups
target_name = "label"  # column that is split off from the features as the target
# Values of the target column; presumably the favourable / unfavourable outcome
# (not referenced by the cells visible here -- TODO confirm where they are used).
positive_label = ">50K"
negative_label = "<=50K"
model_train_fraction = 0.7  # used as train_size in train_test_split

In [5]:
# Column names, in file order; the last entry ("label") is the prediction target.
feature_names = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "label",
]
# Subset of the columns above that the model is told to treat as categorical.
cate_columns = [
    "Workclass",
    "Education",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Country",
]

In [6]:
# Read the raw file as an array of strings (fields are separated by ", ")
# and label its columns with the names listed in feature_names.
# Note: because dtype=str is used, every column -- numeric ones included --
# is held as text in the resulting frame.
data = pd.DataFrame(
    data=np.genfromtxt(fname=DATAFILE, dtype=str, delimiter=', '),
    columns=feature_names,
)

In [7]:
# Separate the feature columns from the target column.
y = data[target_name]
X = data.drop(columns=target_name)

# Split once, then give each of the four pieces a fresh 0..n-1 index so that
# later positional and label-based lookups agree (the shuffled split would
# otherwise keep the original row labels).
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        X,
        y,
        train_size=model_train_fraction,
        random_state=random_state,
    )
)

In [8]:
# Build the project's XGBoost wrapper: 300 estimators, maximum depth 5.
model = customXGB(
    max_depth=5,
    n_estimators=300,
)
# Kept as the cell's final expression so the notebook echoes the fitted model;
# the categorical column names are passed through to the wrapper.
model.fit(
    X_train,
    y_train,
    cate_columns=cate_columns,
)

<models.customXGB at 0x1d335cac670>

# Find all valid if-thens, together with their coverage and correctness, for every subgroup

**Caution!** This step is slow: the run recorded below took about 18 minutes.

In [8]:
# Search the held-out test set for valid if-then rules using the fitted model,
# computed per subgroup of the sensitive attribute. This is the expensive step
# of the notebook (a progress bar is shown while it runs).
ifthens_coverage_correctness = valid_triples_with_coverage_correctness(X_test, model, sensitive_attribute)

# Report how many if-then entries were produced.
print(f"Number of if-thens: {len(ifthens_coverage_correctness)}")

100%|██████████████████████████████████████████████████████████████████████████████| 4294/4294 [17:52<00:00,  4.00it/s]

Number of if-thens: 4294





In [9]:
# Reorganise the flat list of if-thens via the project helper. The result is
# used below as a mapping (`.items()`) whose values are indexed by subgroup
# name, e.g. "Female"; presumably it is keyed by the "if" clause -- TODO
# confirm against lib2.rules2rulesbyif.
rules_by_if = rules2rulesbyif(ifthens_coverage_correctness)

In [11]:
K = 10

# Rank the "if" clauses by their coverage on the "Female" subgroup, highest
# first, and keep only the K best; the mapping is rebuilt in ranked order.
# NOTE(review): the sort key reads entry[1]["Female"][0][1]; it is taken to be
# the Female coverage, as the original comment states -- confirm against lib2.
rules_by_if = dict(
    sorted(
        rules_by_if.items(),
        key=lambda if_and_rules: if_and_rules[1]["Female"][0][1],
        reverse=True,
    )[:K]
)

In [1]:
##### Optional: uncomment to save the rules object to disk, so that the slow
##### rule search does not have to be repeated in a later session.

# import dill

# with open("rules.data", "wb") as outf:
#     dill.dump(rules_by_if, outf)

In [9]:
##### Optional: uncomment to load a previously saved rules object from disk.
##### Only load files you created yourself: like pickle, dill can execute
##### arbitrary code while deserialising an untrusted file.

# import dill

# with open("rules.data", "rb") as inf:
#     rules_by_if = dill.load(inf)

In [10]:
from formatting import recourse_report_reverse

In [11]:
# Render the retained rules as text; print() is used (rather than a bare
# expression) so that newlines and tabs in the report are displayed as layout.
print(recourse_report_reverse([*rules_by_if.items()]))

If Capital Gain = 0:
	Subgroup 'Female'
		Make Capital Gain = 15024 with coverage 0.9617726657645467 and correctness 0.8800562785789658.
		Make Capital Gain = 7688 with coverage 0.9617726657645467 and correctness 0.544143510376363.
		Make Capital Gain = 7298 with coverage 0.9617726657645467 and correctness 0.46676046429827645.
	Subgroup 'Male'
		Make Capital Gain = 15024 with coverage 0.9566315789473684 and correctness 0.9335387323943662.
		Make Capital Gain = 7688 with coverage 0.9566315789473684 and correctness 0.7742077464788732.
		Make Capital Gain = 7298 with coverage 0.9566315789473684 and correctness 0.7277728873239437.
If Capital Loss = 0, Capital Gain = 0:
	Subgroup 'Female'
		Make Capital Loss = 0, Capital Gain = 15024 with coverage 0.9336941813261164 and correctness 0.8815217391304347.
		Make Capital Loss = 0, Capital Gain = 7688 with coverage 0.9336941813261164 and correctness 0.5442028985507246.
		Make Capital Loss = 0, Capital Gain = 7298 with coverage 0.9336941813261164 