In [1]:
# Path to the Adult dataset file; relative, so it is resolved against the
# notebook's working directory.
ADULT_DATAFILE = "../adult.data"

In [2]:
import numpy as np
import pandas as pd
# from apriori_python import apriori
from pprint import pprint

import lib
from models import customXGB

In [3]:
# Column names assigned to the fields of the data file, in file order.
# The last one, "label", is the column later extracted as the target `y`.
feature_names = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "label",
]

# Every field is parsed as a string (dtype=str), so numeric-looking columns
# such as "Age" hold text values like '39' rather than integers.
data = pd.DataFrame(
    data=np.genfromtxt(ADULT_DATAFILE, dtype=str, delimiter=', '),
    columns=feature_names,
)

# Columns declared as categorical; passed to the model's fit() further down.
cate_columns = [
    'Workclass',
    'Education',
    'Marital Status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Country',
]

In [4]:
# Feature subset used by the recourse-suggestion and Apriori cells, plus the
# target column.
d = data[["Education", "Race", "Age", "Sex", "Country"]]
y = data["label"]

# Each rule is a pair of dicts mapping feature name -> value.
# NOTE(review): judging by the recorded output of sex.suggest() below, the
# first dict is the condition an individual must match and the second is the
# suggested replacement — confirm against lib.TwoLevelRecourseSet.
RecourseRulesMale = [
    (
        {"Education": "Bachelors", "Race": "White", "Country": "Greece"},
        {"Education": "Bachelors", "Race": "White", "Country": "United-States"},
    ),
    (
        {"Race": "White", "Country": "Greece"},
        {"Race": "Black", "Country": "Greece"},
    ),
    (
        {"Sex": "Male", "Country": "Greece"},
        {"Sex": "Female", "Country": "Greece"},
    ),
]

# Identical to the first two "Male" rules; there is no rule changing "Sex".
RecourseRulesFemale = [
    (
        {"Education": "Bachelors", "Race": "White", "Country": "Greece"},
        {"Education": "Bachelors", "Race": "White", "Country": "United-States"},
    ),
    (
        {"Race": "White", "Country": "Greece"},
        {"Race": "Black", "Country": "Greece"},
    ),
]

# Recourse set split on the "Sex" feature, with one rule list per subgroup.
sex = lib.TwoLevelRecourseSet("Sex", ["Male", "Female"])
sex.addRules("Male", RecourseRulesMale)
sex.addRules("Female", RecourseRulesFemale)

In [5]:
# Recourse suggestions for a single individual: positional row 5839 of `d`.
# NOTE(review): the recorded output lists the targets of all three "Male"
# rules, so this row presumably is a White, Bachelors-educated man from
# Greece — confirm, and consider naming the index instead of hard-coding it.
list(sex.suggest(d.iloc[5839]))

[Predicate(features=['Education', 'Race', 'Country'], values=['Bachelors', 'White', 'United-States']),
 Predicate(features=['Race', 'Country'], values=['Black', 'Greece']),
 Predicate(features=['Sex', 'Country'], values=['Female', 'Greece'])]

In [6]:
# Train the project's customXGB model on the full table.
# NOTE(review): `data` still contains the "label" column and only string
# values; presumably customXGB.fit() splits off the target and encodes
# `cate_columns` itself — confirm in models.py.
model = customXGB(n_estimators=300, max_depth=5)
model.fit(data, cate_columns=cate_columns)

<models.customXGB at 0x19cb23bb9a0>

In [7]:
# Sanity check: which container type predict() returns for the first 100 rows.
type(model.predict(data.iloc[:100, :]))

numpy.ndarray

In [8]:
# Affected individuals: rows of `data` for which the model predicts class 0.
X_aff_idxs = np.nonzero(model.predict(data) == 0)[0]
print(data.shape, X_aff_idxs.shape, sep="\n")

# Positional selection; X_aff keeps the row labels of `data` (no index reset).
X_aff = data.iloc[X_aff_idxs, :]
print(X_aff.shape, type(X_aff), sep="\n")

(32561, 15)
(25534,)
(25534, 15)
<class 'pandas.core.frame.DataFrame'>


In [8]:
# Summary metrics of the recourse set `sex`, evaluated on the affected
# individuals where the metric takes data; each value is computed and printed
# in turn.
print(
    "Number of suggested actions (recourses) that have no effect: "
    f"{lib.incorrectRecourses(sex, X_aff, model)}"
)
print(
    "Number of individuals for which there is a suggestion: "
    f"{lib.cover(sex, X_aff)}"
)
print(
    "Total cost of changed features, based on change / no change, over all rules: "
    f"{lib.featureCost(sex)}"
)
print(
    "Total cost of changed features, based on magnitude of changed, over all rules: "
    f"{lib.featureChange(sex)}"
)
print(
    "Total number of triples / rules in our set: "
    f"{lib.size(sex)}"
)
print(
    "Maximum number of predicates in a triple / rule: "
    f"{lib.maxwidth(sex)}"
)

# Raw values of the four reward terms, printed unlabeled in order 1..4.
print(lib.reward1(sex, X_aff, model))
print(lib.reward2(sex, X_aff))
print(lib.reward3(sex))
print(lib.reward4(sex))

Number of suggested actions (recourses) that have no effect: 11
Number of individuals for which there is a suggestion: 7
Total cost of changed features, based on change / no change, over all rules: 104
Total cost of changed features, based on magnitude of changed, over all rules: 5
Total number of triples / rules in our set: 5
Maximum number of predicates in a triple / rule: 3
138929
7
13896
135


In [18]:
# Number of affected individuals whose "Sex" is "Male".
# NOTE(review): the stored output (5979) does not fit the other recorded
# outputs: X_aff has 25534 rows and the full data set has only 10771 "Female"
# rows, so (with "Sex" taking just these two values) at least 14763 rows of
# X_aff must be "Male". The execution counts are out of order, so this value
# was likely produced from a different X_aff — refresh it with
# Restart & Run All.
int((X_aff["Sex"] == "Male").sum())

5979

In [15]:
# Shape (rows, columns) of the sub-table of individuals whose "Sex" is "Female".
data.loc[data["Sex"] == "Female"].shape

(10771, 15)

In [10]:
# Mine frequent itemsets over the feature subset `d`.
# NOTE(review): min_support=0.001 presumably keeps itemsets present in at
# least 0.1% of the rows (the smallest supports in the recorded output are
# just above 0.001) — confirm in lib.runApriori.
freq_itemsets = lib.runApriori(d, min_support=0.001)
print(freq_itemsets)
# Turn the mined itemsets into Predicate objects (features + values), then
# preview the first ten and report how many were produced.
RL = lib.aprioriout2predicateList(freq_itemsets, d)
pprint(RL[:10])
print(len(RL))

       support                             itemsets
0     0.895857                      (United-States)
1     0.854274                              (White)
2     0.669205                               (Male)
3     0.164461                          (Bachelors)
4     0.025061                                 (39)
...        ...                                  ...
3072  0.001290                        (Japan, Male)
3073  0.001167          (Japan, Asian-Pac-Islander)
3074  0.001996        (Vietnam, Asian-Pac-Islander)
3075  0.001351                      (Vietnam, Male)
3076  0.001321  (Vietnam, Male, Asian-Pac-Islander)

[3077 rows x 2 columns]
[Predicate(features=['Country'], values=['United-States']),
 Predicate(features=['Race'], values=['White']),
 Predicate(features=['Sex'], values=['Male']),
 Predicate(features=['Education'], values=['Bachelors']),
 Predicate(features=['Age'], values=['39']),
 Predicate(features=['Age'], values=['50']),
 Predicate(features=['Education'], values=['HS-