In [1]:
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import lib
import lib2
from lib import Predicate, optimize
from models import customXGB
from apriori import preprocessDataset, runApriori, aprioriout2predicateList

In [2]:
# Location of the Adult dataset file, relative to this notebook.
ADULT_DATAFILE = '../adult.data'
# Seed handed to train_test_split. A fixed integer makes the split (and
# therefore every result derived from X_train / X_test) repeatable across
# kernel restarts; set it to None to draw a fresh random split on each run.
random_state = 42

In [3]:
# Column names assigned positionally to the parsed records; the last one,
# "label", is the prediction target that is split off later.
feature_names = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "label",
]

# Every field is read as text (dtype=str), so numeric-looking columns such as
# "Age" hold string values in `data`.
data = pd.DataFrame(
    data=np.genfromtxt(fname=ADULT_DATAFILE, dtype=str, delimiter=', '),
    columns=feature_names,
)

# Columns handed to the model as categorical features.
cate_columns = [
    'Workclass',
    'Education',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Country',
]

# Train test split

In [4]:
# Separate the features from the target column.
X = data.drop(columns="label")
y = data["label"]

# Hold out 30% of the rows for testing. Each of the four pieces gets a fresh
# 0..n-1 index, so positional and label-based row access agree afterwards.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.3, random_state=random_state)
)

# Definition and Training of a black-box model

In [5]:
# Black-box classifier under study: the project's customXGB class (imported
# from models), configured with n_estimators=300 and max_depth=5.
# The names of the categorical columns are passed to fit() explicitly.
# NOTE(review): how customXGB encodes those columns and the string labels in
# y_train is defined in models.py and not visible here — confirm there.
model = customXGB(n_estimators=300, max_depth=5)
model.fit(X_train, y_train, cate_columns=cate_columns)

<models.customXGB at 0x2a797dabfd0>

In [6]:
# Spot-check: predictions for the first 100 rows of the test set.
model.predict(X_test.head(100))

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0])

# Testing the model

In [7]:
# Predict on the full test set and compare against the true labels, which are
# mapped from their string form to the 0/1 encoding used in the report.
preds = model.predict(X_test)
print(
    classification_report(
        y_true=y_test.map({'<=50K': 0, '>50K': 1}),
        y_pred=preds,
    )
)

              precision    recall  f1-score   support

           0       0.90      0.93      0.92      7484
           1       0.75      0.66      0.70      2285

    accuracy                           0.87      9769
   macro avg       0.83      0.80      0.81      9769
weighted avg       0.87      0.87      0.87      9769



# Finding the affected individuals (test rows the model predicts as class 0)

In [8]:
# Positions of the test rows that the model assigns to class 0 — presumably
# the '<=50K' class, matching the label mapping used for the classification
# report (TODO confirm against customXGB).
X_aff_idxs = np.flatnonzero(model.predict(X_test) == 0)
print(data.shape, X_aff_idxs.shape, sep="\n")

# The "affected" individuals. X_aff keeps the row labels it had in X_test;
# its index is deliberately not reset here.
X_aff = X_test.iloc[X_aff_idxs]
print(X_aff.shape, type(X_aff), sep="\n")

(32561, 15)
(7743,)
(7743, 14)
<class 'pandas.core.frame.DataFrame'>


# Testing of Rules Implementation (OBSOLETE)

This section also exercises the valuation functions (cost, coverage, and reward metrics) on a hand-written rule set.

In [9]:
# Hand-written recourse rules used to exercise lib.TwoLevelRecourseSet.
# `d` is a five-column projection of the full dataset; `y` is re-bound to the
# same label column it already held after the train/test split cell.
d = data[["Education", "Race", "Age", "Sex", "Country"]]
y = data["label"]

# Each rule is a pair of feature->value dicts. Judging by the recorded output
# of sex.suggest(), the second dict of a pair is what gets suggested, so the
# first is presumably the condition an individual must match — confirm in lib.
RecourseRulesMale = [
    ({"Education":"Bachelors", "Race":"White", "Country":"Greece"},
     {"Education":"Bachelors", "Race":"White", "Country":"United-States"}),
    ({"Race":"White", "Country":"Greece"}, {"Race":"Black", "Country":"Greece"}),
    ({"Sex":"Male", "Country":"Greece"}, {"Sex":"Female", "Country":"Greece"})
]

# Same as the first two male rules; there is no Sex-changing rule for this group.
RecourseRulesFemale = [
    ({"Education":"Bachelors", "Race":"White", "Country":"Greece"},
     {"Education":"Bachelors", "Race":"White", "Country":"United-States"}),
    ({"Race":"White", "Country":"Greece"}, {"Race":"Black", "Country":"Greece"})
]

# Two-level rule set keyed on the "Sex" feature, with one rule list per value.
sex = lib.TwoLevelRecourseSet("Sex", ["Male", "Female"])
sex.addRules("Male", RecourseRulesMale)
sex.addRules("Female", RecourseRulesFemale)

In [7]:
# Collect the suggestions the rule set produces for the row at position 5839
# of `d`. The result of suggest() is wrapped in list(), so it is presumably a
# lazy iterable — confirm in lib.
# NOTE(review): 5839 is an unexplained hand-picked row position.
list(sex.suggest(d.iloc[5839]))

[Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'United-States']),
 Predicate(features=['Race', 'Country'], values=['Black', 'Greece']),
 Predicate(features=['Sex', 'Country'], values=['Female', 'Greece'])]

In [8]:
# Report lib's valuation metrics for the hand-written rule set `sex`.
# incorrectRecourses takes the rule set, the affected individuals and the
# model; cover takes the rule set and the affected individuals; the remaining
# four depend on the rule set only.
print(f"Number of suggested actions (recourses) that have no effect: {lib.incorrectRecourses(sex, X_aff, model)}")
print(f"Number of individuals for which there is a suggestion: {lib.cover(sex, X_aff)}")
print(f"Total cost of changed features, based on change / no change, over all rules: {lib.featureCost(sex)}")
print(f"Total cost of changed features, based on magnitude of changed, over all rules: {lib.featureChange(sex)}")
print(f"Total number of triples / rules in our set: {lib.size(sex)}")
print(f"Maximum number of predicates in a triple / rule: {lib.maxwidth(sex)}")

# reward1..reward4 take the same arguments as the four metrics above, in the
# same order. NOTE(review): presumably the reward terms built from those
# metrics — their exact definitions live in lib; confirm there.
print(lib.reward1(sex, X_aff, model))
print(lib.reward2(sex, X_aff))
print(lib.reward3(sex))
print(lib.reward4(sex))

Number of suggested actions (recourses) that have no effect: 36
Number of individuals for which there is a suggestion: 20
Total cost of changed features, based on change / no change, over all rules: 104
Total cost of changed features, based on magnitude of changed, over all rules: 5
Total number of triples / rules in our set: 5
Maximum number of predicates in a triple / rule: 3
512704
20
13896
135


In [9]:
# Number of affected individuals whose "Sex" value is "Male".
int((X_aff["Sex"] == "Male").sum())

15828

In [10]:
# Shape of the sub-frame of the full dataset whose "Sex" value is "Female".
data.loc[data["Sex"] == "Female"].shape

(10771, 15)

In [11]:
# Plain-text dump of the full dataset; pandas abbreviates the middle rows
# with "..." in this representation, as the recorded output shows.
print(data)

      Age         Workclass  fnlwgt   Education Education-Num  \
0      39         State-gov   77516   Bachelors            13   
1      50  Self-emp-not-inc   83311   Bachelors            13   
2      38           Private  215646     HS-grad             9   
3      53           Private  234721        11th             7   
4      28           Private  338409   Bachelors            13   
...    ..               ...     ...         ...           ...   
32556  27           Private  257302  Assoc-acdm            12   
32557  40           Private  154374     HS-grad             9   
32558  58           Private  151910     HS-grad             9   
32559  22           Private  201490     HS-grad             9   
32560  52      Self-emp-inc  287927     HS-grad             9   

           Marital Status         Occupation   Relationship   Race     Sex  \
0           Never-married       Adm-clerical  Not-in-family  White    Male   
1      Married-civ-spouse    Exec-managerial        Husband  Wh

# Testing the Apriori adaptation and implementation

In [9]:
# Mine frequent itemsets over the test features, leaving "Sex" out of the
# transactions, and keep itemsets with support of at least 0.03.
d = X_test.drop(columns=["Sex"])
freq_itemsets = runApriori(preprocessDataset(d), min_support=0.03)
print(freq_itemsets)

# Turn each mined itemset into a Predicate; show the first ten and the total.
RL = aprioriout2predicateList(freq_itemsets)
pprint(RL[:10])
print(len(RL))

       support                                           itemsets
0     0.950046                                   (0+Capital Loss)
1     0.921077                                   (0+Capital Gain)
2     0.896305                            (United-States+Country)
55    0.871123                   (0+Capital Loss, 0+Capital Gain)
3     0.858532                                       (White+Race)
...        ...                                                ...
1234  0.030095  (Craft-repair+Occupation, Husband+Relationship...
3851  0.030095  (United-States+Country, 0+Capital Gain, 12+Edu...
5427  0.030095  (Own-child+Relationship, 0+Capital Loss, Some-...
3731  0.030095  (0+Capital Loss, Sales+Occupation, 40+Hours pe...
3672  0.030095  (0+Capital Loss, Sales+Occupation, 0+Capital G...

[5709 rows x 2 columns]
[Predicate(features=['Capital Loss'], values=['0']),
 Predicate(features=['Capital Gain'], values=['0']),
 Predicate(features=['Country'], values=['United-States']),
 Predicate(featur

In [10]:
# Last 100 itemsets, first in the default abbreviated form and then via
# to_string(), which renders every row with the itemset column untruncated.
print(
    freq_itemsets.tail(100),
    freq_itemsets.tail(100).to_string(),
    sep="\n",
)

       support                                           itemsets
5634  0.030402  (Assoc-voc+Education, Private+Workclass, 0+Cap...
3710  0.030402  (0+Capital Loss, Husband+Relationship, Sales+O...
5252  0.030402  (HS-grad+Education, 0+Capital Gain, Private+Wo...
4274  0.030402  (Some-college+Education, 10+Education-Num, Not...
5666  0.030402  (11+Education-Num, Private+Workclass, 0+Capita...
...        ...                                                ...
1234  0.030095  (Craft-repair+Occupation, Husband+Relationship...
3851  0.030095  (United-States+Country, 0+Capital Gain, 12+Edu...
5427  0.030095  (Own-child+Relationship, 0+Capital Loss, Some-...
3731  0.030095  (0+Capital Loss, Sales+Occupation, 40+Hours pe...
3672  0.030095  (0+Capital Loss, Sales+Occupation, 0+Capital G...

[100 rows x 2 columns]
       support                                                                                                                                                     itemsets
5634  0.0304

In [11]:
# First ten candidate predicates derived from the frequent itemsets, printed
# on a single line (the same slice is pretty-printed in the apriori cell).
print(RL[:10])

[Predicate(features=['Capital Loss'], values=['0']), Predicate(features=['Capital Gain'], values=['0']), Predicate(features=['Country'], values=['United-States']), Predicate(features=['Capital Loss', 'Capital Gain'], values=['0', '0']), Predicate(features=['Race'], values=['White']), Predicate(features=['Capital Loss', 'Country'], values=['0', 'United-States']), Predicate(features=['Capital Gain', 'Country'], values=['0', 'United-States']), Predicate(features=['Capital Loss', 'Race'], values=['0', 'White']), Predicate(features=['Race', 'Country'], values=['White', 'United-States']), Predicate(features=['Race', 'Capital Gain'], values=['White', '0'])]


## Now with different itemsets for each group

In [16]:
# Split the test set by the "Sex" value and drop that column from the
# features, so it cannot appear inside the mined itemsets.
y_male = y_test.loc[X_test["Sex"] == "Male"]
X_male = X_test.loc[X_test["Sex"] == "Male"].drop(columns=["Sex"])
y_female = y_test.loc[X_test["Sex"] == "Female"]
X_female = X_test.loc[X_test["Sex"] == "Female"].drop(columns=["Sex"])

# Frequent itemsets per group, using the same 0.03 support threshold for both.
freq_itemsets_male, freq_itemsets_female = (
    runApriori(preprocessDataset(group_features), min_support=0.03)
    for group_features in (X_male, X_female)
)
print(freq_itemsets_male, freq_itemsets_female, sep="\n")

# Candidate predicates per group: preview the first ten of each, then the totals.
RL_male, RL_female = map(
    aprioriout2predicateList, (freq_itemsets_male, freq_itemsets_female)
)
pprint(RL_male[:10])
pprint(RL_female[:10])
print(len(RL_male), len(RL_female), sep="\n")

       support                                           itemsets
0     0.949615                                   (0+Capital Loss)
1     0.906626                                   (0+Capital Gain)
2     0.899692                            (United-States+Country)
3     0.879507                                       (White+Race)
51    0.856240                   (0+Capital Gain, 0+Capital Loss)
...        ...                                                ...
4184  0.030046  (13+Education-Num, Husband+Relationship, Unite...
2227  0.030046  (HS-grad+Education, Self-emp-not-inc+Workclass...
2226  0.030046  (0+Capital Loss, HS-grad+Education, Self-emp-n...
222   0.030046  (10+Education-Num, United-States+Country, 0+Ca...
621   0.030046  (13+Education-Num, Bachelors+Education, 0+Capi...

[6183 rows x 2 columns]
       support                                           itemsets
0     0.961269                                   (0+Capital Loss)
1     0.940836                                   (0

In [17]:
# Last 100 itemsets of each group: abbreviated form followed by the full
# to_string() rendering, with one blank line between the two groups.
print(
    freq_itemsets_male.tail(100),
    freq_itemsets_male.tail(100).to_string(),
    "",
    freq_itemsets_female.tail(100),
    freq_itemsets_female.tail(100).to_string(),
    sep="\n",
)

       support                                           itemsets
251   0.030354  (10+Education-Num, 0+Capital Gain, White+Race,...
3820  0.030354  (Own-child+Relationship, United-States+Country...
3156  0.030354  (10+Education-Num, Husband+Relationship, Unite...
592   0.030354  (0+Capital Loss, Private+Workclass, Exec-manag...
48    0.030354                            (Federal-gov+Workclass)
...        ...                                                ...
4184  0.030046  (13+Education-Num, Husband+Relationship, Unite...
2227  0.030046  (HS-grad+Education, Self-emp-not-inc+Workclass...
2226  0.030046  (0+Capital Loss, HS-grad+Education, Self-emp-n...
222   0.030046  (10+Education-Num, United-States+Country, 0+Ca...
621   0.030046  (13+Education-Num, Bachelors+Education, 0+Capi...

[100 rows x 2 columns]
       support                                                                                                                                                                          

# Testing optimization procedure

## Start with hand-picked SD and RL

In [12]:
# Subgroup descriptors: one predicate per value of the "Sex" feature.
SD = [
    Predicate.from_dict_categorical(spec)
    for spec in (
        {"Sex": "Male"},
        {"Sex": "Female"},
    )
]

# Hand-picked candidate predicates. The last four entries repeat the first
# four; the duplicates are kept exactly as listed.
RL_byhand = [
    Predicate.from_dict_categorical(spec)
    for spec in (
        {"Education": "Bachelors", "Race": "White", "Country": "Greece"},
        {"Education": "Bachelors", "Race": "White", "Country": "United-States"},
        {"Race": "White", "Country": "Greece"},
        {"Race": "Black", "Country": "Greece"},
        {"Sex": "Male", "Country": "Greece"},
        {"Sex": "Female", "Country": "Greece"},
        {"Education": "Bachelors", "Race": "White", "Country": "Greece"},
        {"Education": "Bachelors", "Race": "White", "Country": "United-States"},
        {"Race": "White", "Country": "Greece"},
        {"Race": "Black", "Country": "Greece"},
    )
]

print(SD)
print()
pprint(RL_byhand)

[Predicate(features=['Sex'], values=['Male']), Predicate(features=['Sex'], values=['Female'])]

[Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'Greece']),
 Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'United-States']),
 Predicate(features=['Race', 'Country'], values=['White', 'Greece']),
 Predicate(features=['Race', 'Country'], values=['Black', 'Greece']),
 Predicate(features=['Sex', 'Country'], values=['Male', 'Greece']),
 Predicate(features=['Sex', 'Country'], values=['Female', 'Greece']),
 Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'Greece']),
 Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'United-States']),
 Predicate(features=['Race', 'Country'], values=['White', 'Greece']),
 Predicate(features=['Race', 'Country'], values=['Black', 'Greece'])]


## Now using the output of frequent itemsets

In [None]:
# Count, and list, every ordered pair (p1, p2) of candidate predicates that
# lib2.recIsValid accepts. Only the first 400 entries of RL are scanned, i.e.
# 400 * 400 pair checks.
#
# This cell needs the `lib2` module itself in scope (not just the `opt` alias
# imported further down); it relies on `import lib2` in the import cell, so it
# also runs after Restart Kernel -> Run All.
candidates = RL[:400]  # slice once instead of once per outer iteration
total = 0
for p1 in candidates:
    for p2 in candidates:
        if lib2.recIsValid(p1, p2):
            total += 1
            print(p1, p2)
print(total)

Predicate(features=['Marital Status'], values=['Married-civ-spouse']) Predicate(features=['Marital Status'], values=['Never-married'])
Predicate(features=['Marital Status', 'Capital Loss'], values=['Married-civ-spouse', '0']) Predicate(features=['Marital Status', 'Capital Loss'], values=['Never-married', '0'])
Predicate(features=['Marital Status', 'Race'], values=['Married-civ-spouse', 'White']) Predicate(features=['Marital Status', 'Race'], values=['Never-married', 'White'])
Predicate(features=['Marital Status', 'Country'], values=['Married-civ-spouse', 'United-States']) Predicate(features=['Marital Status', 'Country'], values=['Never-married', 'United-States'])
Predicate(features=['Relationship'], values=['Husband']) Predicate(features=['Relationship'], values=['Not-in-family'])
Predicate(features=['Relationship'], values=['Husband']) Predicate(features=['Relationship'], values=['Own-child'])
Predicate(features=['Marital Status', 'Capital Gain'], values=['Married-civ-spouse', '0']) P

In [13]:
# Sanity check: size of the affected set next to the full dataset.
print(X_aff.shape, data.shape, sep="\n")

(7743, 14)
(32561, 15)


In [14]:
%%time

# Run lib's optimize on the subgroup descriptors SD, the first 400 candidate
# predicates in RL and the first 400 rows of X_aff, against the trained model.
# NOTE(review): both inputs are cut to 400 — presumably to keep the runtime
# manageable; confirm before reading the result as representative.
final_rules = optimize(SD, RL[:400], X_aff[:400], model)

CPU times: total: 2min 41s
Wall time: 19 s


In [27]:
# Show the optimisation result. Per the recorded output it is a tuple of a
# list of predicate triples followed by four integers.
# NOTE(review): the meaning of the four integers is defined by optimize in
# lib — confirm there.
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Marital Status'], values=['Never-married']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse'])),
  (Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Marital Status'], values=['Divorced']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse']))],
 -127,
 149,
 -10,
 -10)


## Testing new implementation

In [15]:
from lib2 import optimize as opt

In [16]:
%%time

# Same call as the lib.optimize cell above, routed through `opt`, the alias
# for lib2's optimize. This re-binds final_rules, replacing the earlier result.
final_rules = opt(SD, RL[:400], X_aff[:400], model)

Total triples = 216
X_aff shape before: (400, 14)
Calculated incorrect recourse for each triple
Calculated feature costs for each triple
Calculated feature changes for each feature
X_aff shape after: (400, 14)
set()
Calculated covers for each triple
24
CPU times: total: 31 s
Wall time: 4.13 s


In [17]:
# Show whatever final_rules currently holds; in notebook order that is the
# result of the lib2 run in the cell above.
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Marital Status'], values=['Never-married']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse'])),
  (Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education'], values=['Bachelors']),
   Predicate(features=['Education'], values=['HS-grad'])),
  (Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education-Num'], values=['10']),
   Predicate(features=['Education-Num'], values=['13'])),
  (Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education-Num'], values=['9']),
   Predicate(features=['Education-Num'], values=['13']))],
 -214,
 249,
 -4,
 -4)


In [22]:
# Same display as the previous cell, recorded at a later execution count.
# NOTE(review): the recorded output differs from the previous cell's, so
# final_rules was presumably recomputed in between — confirm, and consider
# dropping this duplicate cell.
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education-Num'], values=['9']),
   Predicate(features=['Education-Num'], values=['13'])),
  (Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education-Num'], values=['10']),
   Predicate(features=['Education-Num'], values=['13'])),
  (Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Marital Status'], values=['Never-married']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse']))],
 -218,
 245,
 -3,
 -3)
