In [1]:
import numpy as np
import pandas as pd
# from apriori_python import apriori
from pprint import pprint

import lib
from lib import Predicate, optimize
from models import customXGB
from apriori import runApriori, aprioriout2predicateList

In [2]:
# Column names applied to the loaded file, in order; "label" is the last column.
feature_names = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "label",
]

# Every field is read as a string (dtype=str); fields are separated by ", ".
raw_rows = np.genfromtxt("adult.data", dtype=str, delimiter=", ")
data = pd.DataFrame(raw_rows, columns=feature_names)

# Columns handed to the model as `cate_columns` when fitting.
cate_columns = [
    "Workclass",
    "Education",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Country",
]

# Definition and Training of a black-box model

In [3]:
# Black-box model: the project's customXGB class (imported from `models`).
model = customXGB(
    max_depth=5,
    n_estimators=300,
)
# Fit on the full frame; `cate_columns` lists the columns passed as categorical.
model.fit(data, cate_columns=cate_columns)

<models.customXGB at 0x2393fe08700>

In [4]:
# Inspect the container type that predict returns, using the first 100 rows.
type(model.predict(data.head(100)))

numpy.ndarray

In [None]:
# Exploratory probe: a bare attribute reference only echoes the bound method.
# This cell has no execution count (In [None]), i.e. it was not run in the saved state.
model.predict

In [5]:
# Affected individuals: the rows for which the model predicts 0.
predicted_zero = model.predict(data) == 0
X_aff_idxs = np.nonzero(predicted_zero)[0]
X_aff = data.iloc[X_aff_idxs, :]

# The index is not reset here, so X_aff keeps the row labels of `data`.
for summary in (data.shape, X_aff_idxs.shape, X_aff.shape, type(X_aff)):
    print(summary)

(32561, 15)
(25637,)
(25637, 15)
<class 'pandas.core.frame.DataFrame'>


# Testing of Rules Implementation

This section also exercises the evaluation (metric and reward) functions from `lib`.

In [6]:
# Projection of the data onto the attributes referenced by the hand-written rules.
d = data.loc[:, ["Education", "Race", "Age", "Sex", "Country"]]
y = data.loc[:, "label"]


def _shared_rules():
    """Return fresh copies of the rule pairs that both subgroups have in common.

    Each rule is a pair of dicts mapping feature name to value. A new list with
    new dicts is built on every call, so the two subgroups never share objects.
    """
    return [
        (
            {"Education": "Bachelors", "Race": "White", "Country": "Greece"},
            {"Education": "Bachelors", "Race": "White", "Country": "United-States"},
        ),
        (
            {"Race": "White", "Country": "Greece"},
            {"Race": "Black", "Country": "Greece"},
        ),
    ]


# The male rule set has one extra rule on top of the shared ones.
RecourseRulesMale = _shared_rules() + [
    ({"Sex": "Male", "Country": "Greece"}, {"Sex": "Female", "Country": "Greece"}),
]
RecourseRulesFemale = _shared_rules()

# Two-level rule set keyed on "Sex", with one rule list per value.
sex = lib.TwoLevelRecourseSet("Sex", ["Male", "Female"])
for subgroup, rules in (("Male", RecourseRulesMale), ("Female", RecourseRulesFemale)):
    sex.addRules(subgroup, rules)

In [7]:
# Materialise the suggestions produced for the row at position 5839 of `d`.
[*sex.suggest(d.iloc[5839])]

[Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'United-States']),
 Predicate(features=['Race', 'Country'], values=['Black', 'Greece']),
 Predicate(features=['Sex', 'Country'], values=['Female', 'Greece'])]

In [8]:
# Each metric is computed immediately before its line is printed, so the
# order of the library calls matches the order of the printed lines.
n_ineffective = lib.incorrectRecourses(sex, X_aff, model)
print(f"Number of suggested actions (recourses) that have no effect: {n_ineffective}")

n_covered = lib.cover(sex, X_aff)
print(f"Number of individuals for which there is a suggestion: {n_covered}")

binary_cost = lib.featureCost(sex)
print(f"Total cost of changed features, based on change / no change, over all rules: {binary_cost}")

magnitude_cost = lib.featureChange(sex)
print(f"Total cost of changed features, based on magnitude of changed, over all rules: {magnitude_cost}")

n_rules = lib.size(sex)
print(f"Total number of triples / rules in our set: {n_rules}")

widest_rule = lib.maxwidth(sex)
print(f"Maximum number of predicates in a triple / rule: {widest_rule}")

# The four reward terms, printed one per line in order reward1..reward4.
reward_calls = (
    (lib.reward1, (sex, X_aff, model)),
    (lib.reward2, (sex, X_aff)),
    (lib.reward3, (sex,)),
    (lib.reward4, (sex,)),
)
for reward_fn, reward_args in reward_calls:
    print(reward_fn(*reward_args))

Number of suggested actions (recourses) that have no effect: 36
Number of individuals for which there is a suggestion: 20
Total cost of changed features, based on change / no change, over all rules: 104
Total cost of changed features, based on magnitude of changed, over all rules: 5
Total number of triples / rules in our set: 5
Maximum number of predicates in a triple / rule: 3
512704
20
13896
135


In [9]:
# Number of affected rows whose "Sex" value is "Male".
X_aff.loc[X_aff["Sex"] == "Male"].shape[0]

15828

In [10]:
# Shape of the subset of `data` whose "Sex" value is "Female".
data.loc[data["Sex"] == "Female"].shape

(10771, 15)

In [11]:
# Plain-text dump of the whole frame; pandas abbreviates the middle rows with "...".
print(data)

      Age         Workclass  fnlwgt   Education Education-Num  \
0      39         State-gov   77516   Bachelors            13   
1      50  Self-emp-not-inc   83311   Bachelors            13   
2      38           Private  215646     HS-grad             9   
3      53           Private  234721        11th             7   
4      28           Private  338409   Bachelors            13   
...    ..               ...     ...         ...           ...   
32556  27           Private  257302  Assoc-acdm            12   
32557  40           Private  154374     HS-grad             9   
32558  58           Private  151910     HS-grad             9   
32559  22           Private  201490     HS-grad             9   
32560  52      Self-emp-inc  287927     HS-grad             9   

           Marital Status         Occupation   Relationship   Race     Sex  \
0           Never-married       Adm-clerical  Not-in-family  White    Male   
1      Married-civ-spouse    Exec-managerial        Husband  Wh

# Testing the apriori adaptation and implementation

In [12]:
# Mine frequent itemsets over every column except "label" and "Sex".
# `y` (assigned in the In [6] cell) already holds the label column.
#
# NOTE: this rebinds `data` to the reduced frame, which later cells rely on.
# errors="ignore" keeps the cell re-runnable: on a second run the two columns
# are already gone, and a plain drop would raise KeyError.
data = data.drop(columns=["label", "Sex"], errors="ignore")
print(data.keys())

# Frequent itemsets with a minimum support of 3%.
freq_itemsets = runApriori(data, min_support=0.03)
print(freq_itemsets)

# Convert the mined itemsets into a list of Predicate objects.
RL = aprioriout2predicateList(freq_itemsets, data)
pprint(RL[:10])
print(len(RL))

Index(['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-Num',
       'Marital Status', 'Occupation', 'Relationship', 'Race', 'Capital Gain',
       'Capital Loss', 'Hours per week', 'Country'],
      dtype='object')
       support                                           itemsets
0     1.000000                                                (0)
64    0.895857                                 (0, United-States)
1     0.895857                                    (United-States)
2     0.854274                                            (White)
65    0.854274                                         (0, White)
...        ...                                                ...
2526  0.030005  (United-States, HS-grad, 9, White, Never-marri...
1748  0.030005                                (7, 0, 11th, White)
1747  0.030005                                   (7, 11th, White)
1746  0.030005                                   (0, 11th, White)
1741  0.030005                                      (

In [13]:
# Show the last 100 itemsets in full, without pandas' column truncation.
print(freq_itemsets.iloc[-100:].to_string())

       support                                                                itemsets
2640  0.030435                (Some-college, United-States, White, Private, Own-child)
2642  0.030435            (Some-college, United-States, 10, White, Private, Own-child)
985   0.030435                                     (Exec-managerial, 0, Not-in-family)
2644  0.030435         (Some-college, United-States, 10, White, Private, 0, Own-child)
2734  0.030435                 (Married-civ-spouse, Husband, Sales, White, Private, 0)
2733  0.030435                    (Married-civ-spouse, Husband, Sales, White, Private)
3318  0.030435                                        (Self-emp-inc, 0, United-States)
886   0.030435                                        (Exec-managerial, Not-in-family)
1856  0.030435                                  (Wife, Private, 0, Married-civ-spouse)
1855  0.030435                                     (Wife, Private, Married-civ-spouse)
3316  0.030435                             

In [14]:
# Single-line repr of the first ten predicates in RL.
print(RL[:10])

[Predicate(features=['Capital Loss'], values=['0']), Predicate(features=['Capital Loss', 'Country'], values=['0', 'United-States']), Predicate(features=['Country'], values=['United-States']), Predicate(features=['Race'], values=['White']), Predicate(features=['Capital Loss', 'Race'], values=['0', 'White']), Predicate(features=['Capital Loss', 'Country', 'Race'], values=['0', 'United-States', 'White']), Predicate(features=['Country', 'Race'], values=['United-States', 'White']), Predicate(features=['Workclass'], values=['Private']), Predicate(features=['Workclass', 'Capital Loss'], values=['Private', '0']), Predicate(features=['Workclass', 'Country'], values=['Private', 'United-States'])]


# Testing optimization procedure with hand-picked SD and RL

In [15]:
# Subgroup descriptors: one single-feature predicate per value of "Sex".
subgroup_specs = [
    {"Sex": "Male"},
    {"Sex": "Female"},
]
SD = [Predicate.from_dict_categorical(spec) for spec in subgroup_specs]

# Hand-written predicate pool. The last four entries repeat the first four;
# the repetition is kept exactly as written.
byhand_specs = [
    {"Education": "Bachelors", "Race": "White", "Country": "Greece"},
    {"Education": "Bachelors", "Race": "White", "Country": "United-States"},
    {"Race": "White", "Country": "Greece"},
    {"Race": "Black", "Country": "Greece"},
    {"Sex": "Male", "Country": "Greece"},
    {"Sex": "Female", "Country": "Greece"},
    {"Education": "Bachelors", "Race": "White", "Country": "Greece"},
    {"Education": "Bachelors", "Race": "White", "Country": "United-States"},
    {"Race": "White", "Country": "Greece"},
    {"Race": "Black", "Country": "Greece"},
]
RL_byhand = [Predicate.from_dict_categorical(spec) for spec in byhand_specs]

print(SD)
print()
pprint(RL_byhand)

[Predicate(features=['Sex'], values=['Male']), Predicate(features=['Sex'], values=['Female'])]

[Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'Greece']),
 Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'United-States']),
 Predicate(features=['Race', 'Country'], values=['White', 'Greece']),
 Predicate(features=['Race', 'Country'], values=['Black', 'Greece']),
 Predicate(features=['Sex', 'Country'], values=['Male', 'Greece']),
 Predicate(features=['Sex', 'Country'], values=['Female', 'Greece']),
 Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'Greece']),
 Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'United-States']),
 Predicate(features=['Race', 'Country'], values=['White', 'Greece']),
 Predicate(features=['Race', 'Country'], values=['Black', 'Greece'])]


In [16]:
# Compare the shape of the affected set with that of the (now reduced) data frame.
for frame in (X_aff, data):
    print(frame.shape)

(25637, 15)
(32561, 13)


In [26]:
%%time

# Run the optimisation on the first 400 predicates of RL and the first 400 rows
# of X_aff (presumably truncated to keep the run time manageable; TODO confirm).
# NOTE(review): this passes RL (built from the apriori output), not RL_byhand.
final_rules = optimize(SD, RL[:400], X_aff[:400], model)

CPU times: total: 3min 54s
Wall time: 30.1 s


In [27]:
# Per the recorded output, final_rules is a 5-tuple: a list of predicate triples
# followed by four integers (the meaning of the integers is not visible here).
pprint(final_rules)

([(Predicate(features=['Sex'], values=['Male']),
   Predicate(features=['Marital Status'], values=['Never-married']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse'])),
  (Predicate(features=['Sex'], values=['Female']),
   Predicate(features=['Marital Status'], values=['Divorced']),
   Predicate(features=['Marital Status'], values=['Married-civ-spouse']))],
 -127,
 149,
 -10,
 -10)


In [19]:
# NOTE(review): this cell's execution count (19) is lower than that of the
# optimize cell (26), so its recorded output comes from an earlier run; re-run to refresh.
pprint(final_rules)

([], 0, 0, 0, 0)


In [20]:
# Sanity check: (rows, columns) of the affected set.
print(X_aff.shape)

(25637, 15)


In [21]:
# Affected rows matching all four attribute values at once.
selection = (
    (X_aff["Sex"] == "Male")
    & (X_aff["Education"] == "Bachelors")
    & (X_aff["Race"] == "Black")
    & (X_aff["Country"] == "United-States")
)
t = X_aff[selection]
print(t)

      Age     Workclass  fnlwgt  Education Education-Num      Marital Status  \
159    42       Private  228456  Bachelors            13           Separated   
192    41     Local-gov  523910  Bachelors            13  Married-civ-spouse   
505    24       Private  388093  Bachelors            13       Never-married   
907    24       Private  202570  Bachelors            13       Never-married   
1883   27   Federal-gov  508336  Bachelors            13       Never-married   
...    ..           ...     ...        ...           ...                 ...   
30849  42     State-gov  212027  Bachelors            13            Divorced   
31366  25       Private  173062  Bachelors            13       Never-married   
31407  24       Private  493034  Bachelors            13       Never-married   
31495  41       Private  197093  Bachelors            13            Divorced   
32040  42  Self-emp-inc  161532  Bachelors            13  Married-civ-spouse   

              Occupation    Relationshi