In [1]:
import numpy as np
import pandas as pd
from pprint import pprint

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import lib
from lib import Predicate, optimize
from models import customXGB
from apriori import preprocessDataset, runApriori, aprioriout2predicateList

In [2]:
# Location of adult dataset
ADULT_DATAFILE = '../adult.data'
random_state = None

In [3]:
feature_names = [
   "Age", "Workclass", "fnlwgt", "Education",
   "Education-Num", "Marital Status", "Occupation",
   "Relationship", "Race", "Sex", "Capital Gain",
   "Capital Loss", "Hours per week", "Country", "label"
]
data = pd.DataFrame(
  np.genfromtxt(ADULT_DATAFILE, delimiter=', ', dtype=str),
  columns=feature_names
)
cate_columns = ['Workclass', 'Education', 'Marital Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Country']

# Train test split

In [4]:
X = data.drop("label", axis=1)
y = data["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Definition and Training of a black-box model

In [5]:
model = customXGB(n_estimators=300, max_depth=5)
model.fit(X_train, y_train, cate_columns=cate_columns)

<models.customXGB at 0x25eeb8f8bb0>

In [6]:
model.predict(X_test.iloc[:100, :])

array([0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0])

# Testing the model

In [7]:
preds = model.predict(X_test)
print(classification_report(y_test.map({'<=50K': 0, '>50K': 1}), preds))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91      7430
           1       0.75      0.65      0.70      2339

    accuracy                           0.86      9769
   macro avg       0.82      0.79      0.80      9769
weighted avg       0.86      0.86      0.86      9769



# Finding the affected

In [8]:
X_aff_idxs = np.where(model.predict(X_test) == 0)[0]
print(data.shape)
print(X_aff_idxs.shape)
X_aff = X_test.iloc[X_aff_idxs, :]
print(X_aff.shape)
print(type(X_aff))
# X_aff.reset_index(inplace=True, drop=True)
# print(X_aff.shape)
# print(type(X_aff))

(32561, 15)
(7736,)
(7736, 14)
<class 'pandas.core.frame.DataFrame'>


# Testing of Rules Implementation (OBSOLETE)

Plus valuation functions.

In [6]:
d = data[["Education", "Race", "Age", "Sex", "Country"]]
y = data["label"]

RecourseRulesMale = [
    ({"Education":"Bachelors", "Race":"White", "Country":"Greece"},
     {"Education":"Bachelors", "Race":"White", "Country":"United-States"}),
    ({"Race":"White", "Country":"Greece"}, {"Race":"Black", "Country":"Greece"}),
    ({"Sex":"Male", "Country":"Greece"}, {"Sex":"Female", "Country":"Greece"})
]

RecourseRulesFemale = [
    ({"Education":"Bachelors", "Race":"White", "Country":"Greece"},
     {"Education":"Bachelors", "Race":"White", "Country":"United-States"}),
    ({"Race":"White", "Country":"Greece"}, {"Race":"Black", "Country":"Greece"})
]

sex = lib.TwoLevelRecourseSet("Sex", ["Male", "Female"])
sex.addRules("Male", RecourseRulesMale)
sex.addRules("Female", RecourseRulesFemale)

In [7]:
list(sex.suggest(d.iloc[5839]))

[Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'United-States']),
 Predicate(features=['Race', 'Country'], values=['Black', 'Greece']),
 Predicate(features=['Sex', 'Country'], values=['Female', 'Greece'])]

In [8]:
print(f"Number of suggested actions (recourses) that have no effect: {lib.incorrectRecourses(sex, X_aff, model)}")
print(f"Number of individuals for which there is a suggestion: {lib.cover(sex, X_aff)}")
print(f"Total cost of changed features, based on change / no change, over all rules: {lib.featureCost(sex)}")
print(f"Total cost of changed features, based on magnitude of changed, over all rules: {lib.featureChange(sex)}")
print(f"Total number of triples / rules in our set: {lib.size(sex)}")
print(f"Maximum number of predicates in a triple / rule: {lib.maxwidth(sex)}")

print(lib.reward1(sex, X_aff, model))
print(lib.reward2(sex, X_aff))
print(lib.reward3(sex))
print(lib.reward4(sex))

Number of suggested actions (recourses) that have no effect: 36
Number of individuals for which there is a suggestion: 20
Total cost of changed features, based on change / no change, over all rules: 104
Total cost of changed features, based on magnitude of changed, over all rules: 5
Total number of triples / rules in our set: 5
Maximum number of predicates in a triple / rule: 3
512704
20
13896
135


In [9]:
len(X_aff[X_aff["Sex"] == "Male"])

15828

In [10]:
data[data.Sex == "Female"].shape

(10771, 15)

In [11]:
print(data)

      Age         Workclass  fnlwgt   Education Education-Num  \
0      39         State-gov   77516   Bachelors            13   
1      50  Self-emp-not-inc   83311   Bachelors            13   
2      38           Private  215646     HS-grad             9   
3      53           Private  234721        11th             7   
4      28           Private  338409   Bachelors            13   
...    ..               ...     ...         ...           ...   
32556  27           Private  257302  Assoc-acdm            12   
32557  40           Private  154374     HS-grad             9   
32558  58           Private  151910     HS-grad             9   
32559  22           Private  201490     HS-grad             9   
32560  52      Self-emp-inc  287927     HS-grad             9   

           Marital Status         Occupation   Relationship   Race     Sex  \
0           Never-married       Adm-clerical  Not-in-family  White    Male   
1      Married-civ-spouse    Exec-managerial        Husband  Wh

# Testing apriori adaptation and  implementation

In [13]:
d = X_test.drop(["Sex"], axis=1)
freq_itemsets = runApriori(preprocessDataset(d), min_support=0.03)
print(freq_itemsets)
RL = aprioriout2predicateList(freq_itemsets)
pprint(RL[:10])
print(len(RL))

       support                                           itemsets
0     0.953526                                   (0+Capital Loss)
1     0.918108                                   (0+Capital Gain)
2     0.899068                            (United-States+Country)
53    0.871635                   (0+Capital Gain, 0+Capital Loss)
54    0.856382            (United-States+Country, 0+Capital Loss)
...        ...                                                ...
4752  0.030095  (Bachelors+Education, Not-in-family+Relationsh...
5431  0.030095  (HS-grad+Education, Husband+Relationship, 0+Ca...
5430  0.030095  (Husband+Relationship, 0+Capital Gain, Married...
5742  0.030095    (0+Capital Loss, White+Race, 35+Hours per week)
4768  0.030095  (13+Education-Num, Bachelors+Education, United...

[5794 rows x 2 columns]
[Predicate(features=['Capital Loss'], values=['0']),
 Predicate(features=['Capital Gain'], values=['0']),
 Predicate(features=['Country'], values=['United-States']),
 Predicate(featur

In [14]:
print(freq_itemsets.tail(100))
print(freq_itemsets.tail(100).to_string())

       support                                           itemsets
5370  0.030402  (0+Capital Gain, Married-civ-spouse+Marital St...
4084  0.030402  (United-States+Country, 0+Capital Gain, Privat...
5277  0.030402  (HS-grad+Education, United-States+Country, Cra...
5449  0.030402  (United-States+Country, 9+Education-Num, Craft...
3923  0.030402  (Adm-clerical+Occupation, United-States+Countr...
...        ...                                                ...
4752  0.030095  (Bachelors+Education, Not-in-family+Relationsh...
5431  0.030095  (HS-grad+Education, Husband+Relationship, 0+Ca...
5430  0.030095  (Husband+Relationship, 0+Capital Gain, Married...
5742  0.030095    (0+Capital Loss, White+Race, 35+Hours per week)
4768  0.030095  (13+Education-Num, Bachelors+Education, United...

[100 rows x 2 columns]
       support                                                                                                                                                            itemsets
5370 

In [15]:
print(RL[:10])

[Predicate(features=['Capital Loss'], values=['0']), Predicate(features=['Capital Gain'], values=['0']), Predicate(features=['Country'], values=['United-States']), Predicate(features=['Capital Gain', 'Capital Loss'], values=['0', '0']), Predicate(features=['Country', 'Capital Loss'], values=['United-States', '0']), Predicate(features=['Race'], values=['White']), Predicate(features=['Capital Gain', 'Country'], values=['0', 'United-States']), Predicate(features=['Race', 'Capital Loss'], values=['White', '0']), Predicate(features=['Race', 'Country'], values=['White', 'United-States']), Predicate(features=['Capital Gain', 'Country', 'Capital Loss'], values=['0', 'United-States', '0'])]


## Now with different itemsets for each group

In [16]:
y_male = y_test[X_test.Sex == "Male"]
X_male = X_test.loc[X_test.Sex == "Male"].drop(["Sex"], axis=1)
y_female = y_test.loc[X_test.Sex == "Female"]
X_female = X_test.loc[X_test.Sex == "Female"].drop(["Sex"], axis=1)

# d = X_test.drop(["label", "Sex"], axis=1)

freq_itemsets_male = runApriori(preprocessDataset(X_male), min_support=0.03)
freq_itemsets_female = runApriori(preprocessDataset(X_female), min_support=0.03)
print(freq_itemsets_male)
print(freq_itemsets_female)

RL_male = aprioriout2predicateList(freq_itemsets_male)
RL_female = aprioriout2predicateList(freq_itemsets_female)
pprint(RL_male[:10])
pprint(RL_female[:10])
print(len(RL_male))
print(len(RL_female))

       support                                           itemsets
0     0.949615                                   (0+Capital Loss)
1     0.906626                                   (0+Capital Gain)
2     0.899692                            (United-States+Country)
3     0.879507                                       (White+Race)
51    0.856240                   (0+Capital Gain, 0+Capital Loss)
...        ...                                                ...
4184  0.030046  (13+Education-Num, Husband+Relationship, Unite...
2227  0.030046  (HS-grad+Education, Self-emp-not-inc+Workclass...
2226  0.030046  (0+Capital Loss, HS-grad+Education, Self-emp-n...
222   0.030046  (10+Education-Num, United-States+Country, 0+Ca...
621   0.030046  (13+Education-Num, Bachelors+Education, 0+Capi...

[6183 rows x 2 columns]
       support                                           itemsets
0     0.961269                                   (0+Capital Loss)
1     0.940836                                   (0

In [17]:
print(freq_itemsets_male.tail(100))
print(freq_itemsets_male.tail(100).to_string())
print()
print(freq_itemsets_female.tail(100))
print(freq_itemsets_female.tail(100).to_string())

       support                                           itemsets
251   0.030354  (10+Education-Num, 0+Capital Gain, White+Race,...
3820  0.030354  (Own-child+Relationship, United-States+Country...
3156  0.030354  (10+Education-Num, Husband+Relationship, Unite...
592   0.030354  (0+Capital Loss, Private+Workclass, Exec-manag...
48    0.030354                            (Federal-gov+Workclass)
...        ...                                                ...
4184  0.030046  (13+Education-Num, Husband+Relationship, Unite...
2227  0.030046  (HS-grad+Education, Self-emp-not-inc+Workclass...
2226  0.030046  (0+Capital Loss, HS-grad+Education, Self-emp-n...
222   0.030046  (10+Education-Num, United-States+Country, 0+Ca...
621   0.030046  (13+Education-Num, Bachelors+Education, 0+Capi...

[100 rows x 2 columns]
       support                                                                                                                                                                          

# Testing optimization procedure

## Start with hand-picked SD and RL

In [18]:
SD = list(map(Predicate.from_dict_categorical, [
    {"Sex": "Male"},
    {"Sex": "Female"}
]))

RL_byhand = list(map(Predicate.from_dict_categorical, [
    {"Education":"Bachelors", "Race":"White", "Country":"Greece"},
    {"Education":"Bachelors", "Race":"White", "Country":"United-States"},
    {"Race":"White", "Country":"Greece"},
    {"Race":"Black", "Country":"Greece"},
    {"Sex":"Male", "Country":"Greece"},
    {"Sex":"Female", "Country":"Greece"},
    {"Education":"Bachelors", "Race":"White", "Country":"Greece"},
    {"Education":"Bachelors", "Race":"White", "Country":"United-States"},
    {"Race":"White", "Country":"Greece"},
    {"Race":"Black", "Country":"Greece"}
]))

print(SD)
print()
pprint(RL_byhand)

[Predicate(features=['Sex'], values=['Male']), Predicate(features=['Sex'], values=['Female'])]

[Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'Greece']),
 Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'United-States']),
 Predicate(features=['Race', 'Country'], values=['White', 'Greece']),
 Predicate(features=['Race', 'Country'], values=['Black', 'Greece']),
 Predicate(features=['Sex', 'Country'], values=['Male', 'Greece']),
 Predicate(features=['Sex', 'Country'], values=['Female', 'Greece']),
 Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'Greece']),
 Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'United-States']),
 Predicate(features=['Race', 'Country'], values=['White', 'Greece']),
 Predicate(features=['Race', 'Country'], values=['Black', 'Greece'])]


## Now using the output of frequent itemsets

In [19]:
print(X_aff.shape)
print(data.shape)

(7736, 14)
(32561, 15)


In [26]:
%%time

final_rules = optimize(SD, RL[:400], X_aff[:400], model)

CPU times: total: 3min 54s
Wall time: 30.1 s


In [27]:
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Marital Status'], values=['Never-married']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse'])),
  (Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Marital Status'], values=['Divorced']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse']))],
 -127,
 149,
 -10,
 -10)


In [19]:
pprint(final_rules)

([], 0, 0, 0, 0)


In [20]:
print(X_aff.shape)

(25637, 15)


In [21]:
t = X_aff[X_aff.Sex == "Male"]
t = t[t.Education == "Bachelors"]
t = t[t.Race == "Black"]
t = t[t.Country == "United-States"]
print(t)

      Age     Workclass  fnlwgt  Education Education-Num      Marital Status  \
159    42       Private  228456  Bachelors            13           Separated   
192    41     Local-gov  523910  Bachelors            13  Married-civ-spouse   
505    24       Private  388093  Bachelors            13       Never-married   
907    24       Private  202570  Bachelors            13       Never-married   
1883   27   Federal-gov  508336  Bachelors            13       Never-married   
...    ..           ...     ...        ...           ...                 ...   
30849  42     State-gov  212027  Bachelors            13            Divorced   
31366  25       Private  173062  Bachelors            13       Never-married   
31407  24       Private  493034  Bachelors            13       Never-married   
31495  41       Private  197093  Bachelors            13            Divorced   
32040  42  Self-emp-inc  161532  Bachelors            13  Married-civ-spouse   

              Occupation    Relationshi

## Testing new implementation

In [20]:
from lib2 import optimize as opt

In [23]:
%%time

final_rules = opt(SD, RL[:400], X_aff, model)

Total triples = 264
X_aff shape before: (7736, 14)
Calculated incorrect recourse for each triple
Calculated feature costs for each triple
Calculated feature changes for each feature
X_aff shape after: (7736, 14)
set()
Calculated covers for each triple
17
CPU times: total: 4min 22s
Wall time: 1min 11s


In [22]:
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education-Num'], values=['9']),
   Predicate(features=['Education-Num'], values=['13'])),
  (Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Education-Num'], values=['10']),
   Predicate(features=['Education-Num'], values=['13'])),
  (Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Marital Status'], values=['Never-married']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse']))],
 -218,
 245,
 -3,
 -3)
