In [1]:
import pandas as pd
import numpy as np

import os
import sys
# Put the parent directory of the current working directory on sys.path so the
# `facts` imports below can resolve — presumably the package lives one level
# above this notebook (TODO confirm against the repository layout).
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:  # skip the append if the entry is already there (e.g. on cell re-run)
    sys.path.append(module_path)
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation notices from pandas / scikit-learn.
warnings.filterwarnings('ignore')
import facts
from facts.models import customXGB
from facts.clean import clean_dataset
from facts import valid_ifthens_with_coverage_correctness, rules2rulesbyif
from facts.models import customLogisticRegression
from facts.parameters import ParameterProxy
from facts.formatting import recourse_report_reverse, print_recourse_report, print_recourse_report_cumulative, print_recourse_report_KStest_cumulative
from facts.utils import load_rules_by_if, save_rules_by_if
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Dataset

In [2]:
# Download the Adult census CSV from the FairTest repository and pass it
# through the project's cleaning routine for the "adult" dataset.
urlfile = "https://raw.githubusercontent.com/columbia/fairtest/master/data/adult/adult.csv"
X = pd.read_csv(urlfile)
df = clean_dataset(X, "adult")

# Ascending list of the `.left` attribute of every distinct `age` value
# (the cleaned column holds interval-like bins, e.g. "(34.0, 41.0]").
age = sorted(val.left for val in df.age.unique())

# Preview the cleaned frame.
df.head()

Unnamed: 0,age,Workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,"(34.0, 41.0]",State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,FullTime,United-States,0
1,"(41.0, 50.0]",Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Married,White,Male,0,0,PartTime,United-States,0
2,"(34.0, 41.0]",Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,FullTime,United-States,0
3,"(50.0, 90.0]",Private,7,Married-civ-spouse,Handlers-cleaners,Married,Black,Male,0,0,FullTime,United-States,0
4,"(26.0, 34.0]",Private,13,Married-civ-spouse,Prof-specialty,Married,Black,Female,0,0,FullTime,Cuba,0


In [3]:
# Split the cleaned frame into the target (`income`) and the feature matrix.
y = df['income']
X = df.drop(columns='income')

# Column-name lists that are later handed to the cost/comparator builder.
# Numeric columns are selected through the public `select_dtypes` API instead
# of the private `DataFrame._get_numeric_data()` helper.
# NOTE(review): unlike `_get_numeric_data()`, `include='number'` does not pick
# up bool columns; the cleaned Adult frame shown above has none.
num_features = X.select_dtypes(include='number').columns.to_list()
cate_features = X.select_dtypes(include=['object', 'category']).columns.to_list()
# Declared separately as the ordinal feature for the comparator builder.
ord_features = ['hours-per-week']

In [4]:
# Fixed seed so that Restart & Run All reproduces the same 70/30 split;
# the original `random_state=None` produced a different split on every run.
RANDOM_SEED = 42

# Stratified split on the target, then renumber each part from 0 so that
# positional and label-based indexing agree downstream.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        X, y, train_size=0.7, random_state=RANDOM_SEED, stratify=y
    )
)

## Load rules

In [5]:
# Read the four rule files from disk: plain and cumulative variants for the
# XGBoost and NN models. Files are read in the order listed, and paths are
# resolved relative to the current working directory.
(
    rules_by_if_xgboost,
    rules_with_cumulative_xgboost,
    rules_by_if_nn,
    rules_with_cumulative_nn,
) = (
    load_rules_by_if(rules_file)
    for rules_file in (
        "rulesAdultFairTest-xgboost.data",
        "rulesAdultFairTest-xgboost_cumulative.data",
        "rulesAdultFairTest-NN.data",
        "rulesAdultFairTest-NN_cumulative.data",
    )
)

## Params

In [6]:
# Per-feature weights passed to the feature-change builder; the protected
# attributes (race, sex) carry the largest values.
feature_weights = {
    "race": 100,
    "sex": 100,
    "marital-status": 5,
    "relationship": 5,
    "age": 10,
    "occupation": 4,
    "Workclass": 2,
    "native-country": 4,
    "hours-per-week": 2,
    "capital-gain": 1,
    "capital-loss": 1,
    "education-num": 3,
}

# Categorical columns are passed as `cate_cols`, numeric ones as `num_cols`.
# Both names are aliases of the lists built earlier, not copies.
features_with_binary_cost = cate_features
features_with_proportional_cost = num_features

# Build the per-feature comparators and wrap them in the parameter object
# consumed by the rule-selection helpers.
comparators = facts.feature_change_builder(
    X,
    num_cols=features_with_proportional_cost,
    cate_cols=features_with_binary_cost,
    ord_cols=ord_features,
    feature_weights=feature_weights,
    num_normalization=True,
    feats_to_normalize=["capital-gain", "capital-loss"],
)
params = ParameterProxy(featureChanges=comparators)

# NOTE(review): the cumulative cost update below is left disabled, as in the
# original cell; `rules_with_cumulative_correctness` is not defined in the
# visible part of this notebook.
# facts.update_costs_cumulative(rules_with_cumulative_correctness, params)

## Subgroup from Figure 2 of the paper

In [7]:
# Subgroup under study, expressed as feature -> value conditions.
# The leading spaces inside the marital-status and occupation values are
# significant: the printed reports show the same values with that space.
sg_conditions = {
    "hours-per-week": "FullTime",
    "marital-status": " Married-civ-spouse",
    "occupation": " Adm-clerical",
}
sg = facts.Predicate.from_dict(sg_conditions)

In [8]:
# Sanity check: is the subgroup present in each of the four loaded rule sets?
# Displays one boolean per rule set, in the order listed.
tuple(
    sg in rule_set
    for rule_set in (
        rules_by_if_xgboost,
        rules_with_cumulative_xgboost,
        rules_by_if_nn,
        rules_with_cumulative_nn,
    )
)

(True, True, True, True)

Rules with cumulative effectiveness

In [9]:
# Rule selection for the XGBoost model, restricted to the single subgroup `sg`.
# Options are collected first so the call itself stays short.
xgb_selection_options = dict(
    metric="min-above-corr",
    sort_strategy="generic-sorting-ignore-exists-subgroup-empty",
    top_count=1,
    cor_threshold=0.7,
    # The "remove-fair-rules" filter is not applied here.
    filter_sequence=["remove-contained", "keep-cheap-rules-above-thr-cor"],
    params=params,
)
top_rules, subgroup_costs = facts.select_rules_subset_cumulative(
    {sg: rules_with_cumulative_xgboost[sg]},
    **xgb_selection_options,
)

# Print the selected rules together with subgroup and per-action costs.
print_recourse_report_cumulative(
    top_rules,
    subgroup_costs=subgroup_costs,
    show_subgroup_costs=True,
    show_then_costs=True,
    metric_name='Equal Cost of Effectiveness (Micro) (threshold = 0.7)',
)

If [1mhours-per-week = FullTime, marital-status =  Married-civ-spouse, occupation =  Adm-clerical[0m:
	Protected Subgroup '[1m Male[0m', [34m1.45%[39m covered
		Make [1m[31mhours-per-week = OverTime[39m[0m with effectiveness [32m21.88%[39m and counterfactual cost = 2.0.
		Make [1m[31moccupation =  Craft-repair[39m[0m with effectiveness [32m21.88%[39m and counterfactual cost = 4.0.
		Make [1m[31moccupation =  Protective-serv[39m[0m with effectiveness [32m27.08%[39m and counterfactual cost = 4.0.
		Make [1m[31moccupation =  Sales[39m[0m with effectiveness [32m27.08%[39m and counterfactual cost = 4.0.
		Make [1m[31moccupation =  Tech-support[39m[0m with effectiveness [32m36.46%[39m and counterfactual cost = 4.0.
		Make [1m[31moccupation =  Prof-specialty[39m[0m with effectiveness [32m38.54%[39m and counterfactual cost = 4.0.
		Make [1m[31moccupation =  Exec-managerial[39m[0m with effectiveness [32m39.58%[39m and counterfactual cost = 4.0.
		

In [10]:
# Rule selection for the NN model, restricted to the single subgroup `sg`.
# Uses the same settings as the XGBoost report, for a like-for-like comparison.
nn_selection_options = dict(
    metric="min-above-corr",
    sort_strategy="generic-sorting-ignore-exists-subgroup-empty",
    top_count=1,
    cor_threshold=0.7,
    # The "remove-fair-rules" filter is not applied here.
    filter_sequence=["remove-contained", "keep-cheap-rules-above-thr-cor"],
    params=params,
)
top_rules, subgroup_costs = facts.select_rules_subset_cumulative(
    {sg: rules_with_cumulative_nn[sg]},
    **nn_selection_options,
)

# Print the selected rules together with subgroup and per-action costs.
print_recourse_report_cumulative(
    top_rules,
    subgroup_costs=subgroup_costs,
    show_subgroup_costs=True,
    show_then_costs=True,
    metric_name='Equal Cost of Effectiveness (Micro) (threshold = 0.7)',
)

If [1mhours-per-week = FullTime, marital-status =  Married-civ-spouse, occupation =  Adm-clerical[0m:
	Protected Subgroup '[1m Male[0m', [34m1.20%[39m covered
		Make [1m[31mhours-per-week = OverTime[39m[0m with effectiveness [32m0.00%[39m and counterfactual cost = 2.0.
		Make [1m[31moccupation =  Exec-managerial[39m[0m with effectiveness [32m0.00%[39m and counterfactual cost = 4.0.
		Make [1m[31moccupation =  Prof-specialty[39m[0m with effectiveness [32m0.00%[39m and counterfactual cost = 4.0.
		Make [1m[31moccupation =  Craft-repair[39m[0m with effectiveness [32m0.00%[39m and counterfactual cost = 4.0.
		Make [1m[31moccupation =  Sales[39m[0m with effectiveness [32m0.00%[39m and counterfactual cost = 4.0.
		Make [1m[31moccupation =  Machine-op-inspct[39m[0m with effectiveness [32m0.00%[39m and counterfactual cost = 4.0.
		Make [1m[31moccupation =  Other-service[39m[0m with effectiveness [32m0.00%[39m and counterfactual cost = 4.0.
		Make