Datasets used: 
- `adult (Adult)` :<br>The UCI Adult dataset contains 14 features, which can be divided into two categories: demographic and income-related.<br> The demographic features include:

    age: continuous.<br>
    fnlwgt : continuous, represents final weight, which is the number of units in the target population that the responding unit represents.<br>
    workclass: categorical, with values 'Private', 'Local-gov', 'Self-emp-not-inc', 'Federal-gov', 'State-gov', 'Self-emp-inc', 'Without-pay'.<br>
    education: categorical, with values '11th', 'HS-grad', 'Assoc-acdm', 'Some-college', '10th', ..., '9th', 'Doctorate', '12th', '1st-4th', 'Preschool'.<br>
    education-num: continuous with values 1 to 16, one number assigned to each label of education feature.<br>
    marital-status: categorical, with values 'Never-married', 'Married-civ-spouse', 'Widowed', 'Separated', 'Divorced', 'Married-spouse-absent', 'Married-AF-spouse'.<br>
    occupation: categorical, with values such as 'Machine-op-inspct', 'Farming-fishing', 'Protective-serv'.<br>
    relationship: categorical, with values 'Own-child', 'Husband', 'Not-in-family', 'Unmarried', 'Wife', 'Other-relative'.<br>
    race: categorical, with values such as 'White', 'Black', and 'Asian-Pac-Islander'.<br>
    sex: categorical, with values 'Male' and 'Female'.<br>
    
    The income-related features include:<br>

    hours-per-week: continuous.<br>
    native-country: categorical, with values such as United-States, Mexico, and Germany.<br>
    capital-gain: continuous, represents the amount of money an individual has gained from the sale of investments such as stocks, bonds, or real estate.<br>
    capital-loss: continuous, represents the amount of money an individual has lost from the sale of investments such as stocks, bonds, or real estate.<br>
    The target feature is the income, which is binary:<br>
    income: categorical, with values less than or equal to 50K and greater than 50K.<br>

Reference Links: https://archive.ics.uci.edu/ml/datasets/adult

In [1]:
import pandas as pd
import numpy as np

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import facts
from facts.clean import clean_dataset
from facts import valid_ifthens_with_coverage_correctness, rules2rulesbyif
from facts.models import customLogisticRegression
from facts.parameters import ParameterProxy
from facts.formatting import recourse_report_reverse, print_recourse_report, print_recourse_report_cumulative, print_recourse_report_KStest_cumulative
from facts.utils import load_rules_by_if

In [2]:
# Notebook-wide configuration, referenced by the metric cells below.
# Passed as `cor_threshold` to the threshold-based metrics.
cor_thres = 0.5
# Passed as `cost_threshold` to the "max-upto-cost" metric.
cost_budget = 10
# Passed as `top_count` to every rule-selection call.
top_count = 20
# Passed as `c_inf` to the "fairness-of-mean-recourse-cinf" metric.
c_inf = 5

# Dataset

In [3]:
# Adult data as hosted in the FairTest repository.
urlfile = "https://raw.githubusercontent.com/columbia/fairtest/master/data/adult/adult.csv"

# Download the raw CSV, then apply the project's "adult" cleaning recipe.
X = pd.read_csv(urlfile)
df = clean_dataset(X, "adult")

# Left endpoints of the distinct age intervals, in ascending order.
age = sorted(interval.left for interval in df.age.unique())

df.head()

Unnamed: 0,age,Workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,"(34.0, 41.0]",State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,FullTime,United-States,0
1,"(41.0, 50.0]",Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Married,White,Male,0,0,PartTime,United-States,0
2,"(34.0, 41.0]",Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,FullTime,United-States,0
3,"(50.0, 90.0]",Private,7,Married-civ-spouse,Handlers-cleaners,Married,Black,Male,0,0,FullTime,United-States,0
4,"(26.0, 34.0]",Private,13,Married-civ-spouse,Prof-specialty,Married,Black,Female,0,0,FullTime,Cuba,0


In [4]:
# Separate the target column from the features.
y = df['income']
X = df.drop('income', axis=1)

# Partition the feature names by dtype: numeric columns vs. object/category columns.
# NOTE(review): `_get_numeric_data` is a private pandas method; the public
# `X.select_dtypes(include="number")` may treat boolean columns differently —
# confirm before switching.
num_features = X._get_numeric_data().columns.to_list()
cate_features = X.select_dtypes(include=['object','category']).columns.to_list()

# Rules

In [5]:
# Load two rule sets from local files; both are keyed by the same if-clauses
# (verified further down). The file names suggest they were produced by a
# separate preprocessing step — TODO confirm where they are generated.
rules_with_atomic_correctness = load_rules_by_if("rulesAdultFairTest_preproc.data")
rules_with_cumulative_correctness = load_rules_by_if("rulesAdultFairTest_cumulative.data")

In [6]:
# Per-feature weights used when costing a feature change. "race" and "sex" get
# a weight (100) far above every other feature, which is why recourses that
# change them show up with costs of 100+ in the reports below.
feature_weights = {"race" : 100 , "sex": 100, "marital-status": 5, "relationship":5, "age": 10, "occupation": 4, "Workclass": 2, "native-country": 4,
                    "hours-per-week":2, "capital-gain": 1, "capital-loss":1, "education-num": 3}
# Categorical features are handed over as `cate_cols`, numeric ones as `num_cols`.
features_with_binary_cost = cate_features
features_with_proportional_cost = num_features


# Build the per-feature change comparators; only the two capital columns are
# listed in `feats_to_normalize`.
comparators = facts.feature_change_builder(
    X,
    num_cols=features_with_proportional_cost,
    cate_cols=features_with_binary_cost,
    feature_weights=feature_weights,
    num_normalization=True,
    feats_to_normalize = ["capital-gain","capital-loss"]
)
# Wrap the comparators in the parameter object that every metric call receives.
params = ParameterProxy(featureChanges=comparators)

In [7]:
# Recompute the costs stored with the cumulative rules using the cost model in
# `params`. The return value is discarded, so this presumably updates
# `rules_with_cumulative_correctness` in place — TODO confirm.
facts.update_costs_cumulative(rules_with_cumulative_correctness, params)

# Old Metrics

## Weighted Average

In [8]:
# "Weighted Average" metric on the atomic-correctness rules: select the
# `top_count` subgroups ordered by the "abs-diff-decr" strategy. All filters
# are disabled (the filter sequence is empty).
top_rules, subgroup_costs = facts.select_rules_subset(
    rules_with_atomic_correctness,
    metric = "weighted-average",
    sort_strategy = "abs-diff-decr",
    top_count = top_count,
    filter_sequence = [
        # "remove-contained",
        # "remove-fair-rules",
    ],
    params=params
)

# Print the selected subgroups together with their per-group aggregate cost.
print_recourse_report(
    top_rules,
    subgroup_costs=subgroup_costs,
    show_subgroup_costs=True
)

If [1mage = (50.0, 90.0], capital-gain = 0, capital-loss = 0[0m:
	Protected Subgroup '[1m Male[0m', [34m17.54%[39m covered
		Make [1m[31mcapital-gain = 15024[39m[0m with effectiveness [32m96.94%[39m.
		Make [1m[31mcapital-gain = 7298[39m[0m with effectiveness [32m70.55%[39m.
		[1mAggregate cost[0m of the above recourses = [35m-8.06[39m
	Protected Subgroup '[1m Female[0m', [34m15.83%[39m covered
		Make [1m[31mcapital-gain = 15024[39m[0m with effectiveness [32m87.08%[39m.
		Make [1m[31mcapital-gain = 7298[39m[0m with effectiveness [32m28.72%[39m.
		[1mAggregate cost[0m of the above recourses = [35m-4.87[39m
	[35mBias against  Female. Unfairness measure = 3.19.[39m
If [1mage = (50.0, 90.0], capital-gain = 0[0m:
	Protected Subgroup '[1m Male[0m', [34m18.00%[39m covered
		Make [1m[31mcapital-gain = 15024[39m[0m with effectiveness [32m97.01%[39m.
		Make [1m[31mcapital-gain = 7298[39m[0m with effectiveness [32m71.06%[39m.
		[1mAgg

## Minimum Cost Above Threshold -- Equal Cost of Effectiveness (Macro)

In [9]:
# "Equal Cost of Effectiveness (Macro)": metric "min-above-thr" on the
# atomic-correctness rules, with `cor_thres` as the correctness threshold.
# All filters are disabled (the filter sequence is empty).
top_rules, subgroup_costs = facts.select_rules_subset(
    rules_with_atomic_correctness,
    metric="min-above-thr",
    sort_strategy="generic-sorting-ignore-exists-subgroup-empty",
    top_count=top_count,
    cor_threshold=cor_thres,
    filter_sequence = [
        # "remove-contained",
        # "remove-below-thr",
        # "remove-fair-rules",
        # "keep-only-min-change"
    ],
    params=params
)

# Print the selected subgroups together with their per-group aggregate cost.
print_recourse_report(
    top_rules,
    subgroup_costs=subgroup_costs,
    show_subgroup_costs=True
)

If [1mage = (34.0, 41.0], capital-loss = 0, marital-status =  Never-married, relationship =  Not-in-family[0m:
	Protected Subgroup '[1m Male[0m', [34m2.28%[39m covered
		Make [1m[31mage = (41.0, 50.0][39m, [31mmarital-status =  Married-civ-spouse[39m, [31mrelationship =  Married[39m[0m with effectiveness [32m48.37%[39m.
		Make [1m[31mage = (50.0, 90.0][39m, [31mmarital-status =  Married-civ-spouse[39m, [31mrelationship =  Married[39m[0m with effectiveness [32m43.79%[39m.
		Make [1m[31mmarital-status =  Married-civ-spouse[39m, [31mrelationship =  Married[39m[0m with effectiveness [32m41.83%[39m.
		Make [1m[31mage = (41.0, 50.0][39m, [31mcapital-loss = 1902[39m, [31mmarital-status =  Married-civ-spouse[39m, [31mrelationship =  Married[39m[0m with effectiveness [32m84.97%[39m.
		[1mAggregate cost[0m of the above recourses = [35m20.44[39m
	Protected Subgroup '[1m Female[0m', [34m2.77%[39m covered
		Make [1m[31mage = (41.0, 50.0][39m,

## Number of Rules Above Threshold -- Equal Choice for Recourse

In [10]:
# "Equal Choice for Recourse": metric "num-above-thr" on the atomic-correctness
# rules, with `cor_thres` as the correctness threshold. All filters are
# disabled (the filter sequence is empty).
top_rules, subgroup_costs = facts.select_rules_subset(
    rules_with_atomic_correctness,
    metric="num-above-thr",
    sort_strategy="generic-sorting-ignore-forall-subgroups-empty",
    top_count=top_count,
    cor_threshold=cor_thres,
    filter_sequence = [
        # "remove-contained",
        # "remove-below-thr",
        # "remove-fair-rules"
    ],
    params=params
)

# Print the selected subgroups together with their per-group aggregate cost.
print_recourse_report(
    top_rules,
    subgroup_costs=subgroup_costs,
    show_subgroup_costs=True
)

If [1mWorkclass =  Private, age = (34.0, 41.0], occupation =  Sales[0m:
	Protected Subgroup '[1m Male[0m', [34m1.28%[39m covered
		Make [1m[31mage = (41.0, 50.0][39m, [31moccupation =  Exec-managerial[39m[0m with effectiveness [32m61.63%[39m.
		Make [1m[31moccupation =  Exec-managerial[39m[0m with effectiveness [32m51.16%[39m.
		Make [1m[31mage = (41.0, 50.0][39m, [31moccupation =  Prof-specialty[39m[0m with effectiveness [32m60.47%[39m.
		Make [1m[31mage = (50.0, 90.0][39m, [31moccupation =  Exec-managerial[39m[0m with effectiveness [32m51.16%[39m.
		Make [1m[31moccupation =  Prof-specialty[39m[0m with effectiveness [32m45.35%[39m.
		Make [1m[31mage = (50.0, 90.0][39m, [31moccupation =  Prof-specialty[39m[0m with effectiveness [32m51.16%[39m.
		Make [1m[31mage = (41.0, 50.0][39m[0m with effectiveness [32m44.19%[39m.
		Make [1m[31mWorkclass =  Local-gov[39m, [31mage = (41.0, 50.0][39m, [31moccupation =  Prof-specialty[39m[

# New Metrics

## Fairness of Mean Recourse Cost ($c_\infty$ implementation) -- Equal Mean Recourse

In [11]:
# "Equal Mean Recourse" ($c_\infty$ variant): metric
# "fairness-of-mean-recourse-cinf" on the cumulative-correctness rules, with
# the notebook-level `c_inf` constant. All filters are disabled.
top_rules, subgroup_costs = facts.select_rules_subset_cumulative(
    rules_with_cumulative_correctness,
    metric="fairness-of-mean-recourse-cinf",
    c_inf=c_inf,
    sort_strategy="generic-sorting",
    top_count=top_count,
    filter_sequence = [
        # "remove-contained",
        # "remove-fair-rules"
    ],
    params=params
)

# Cumulative report: also shows the counterfactual cost of each recourse.
print_recourse_report_cumulative(
    top_rules,
    subgroup_costs=subgroup_costs,
    show_subgroup_costs=True,
    show_then_costs=True
)

If [1mcapital-loss = 0, education-num = 11, race =  White[0m:
	Protected Subgroup '[1m Male[0m', [34m3.75%[39m covered
		Make [1m[31meducation-num = 12[39m[0m with effectiveness [32m10.42%[39m and counterfactual cost = 3.0.
		Make [1m[31meducation-num = 13[39m[0m with effectiveness [32m25.83%[39m and counterfactual cost = 6.0.
		Make [1m[31mcapital-loss = 1902[39m, [31meducation-num = 13[39m[0m with effectiveness [32m56.67%[39m and counterfactual cost = 6.44.
		Make [1m[31meducation-num = 13[39m, [31mrace =  Asian-Pac-Islander[39m[0m with effectiveness [32m56.67%[39m and counterfactual cost = 106.0.
		[1mAggregate cost[0m of the above recourses = [35m3.22[39m
	Protected Subgroup '[1m Female[0m', [34m3.73%[39m covered
		Make [1m[31meducation-num = 12[39m[0m with effectiveness [32m0.79%[39m and counterfactual cost = 3.0.
		Make [1m[31meducation-num = 13[39m[0m with effectiveness [32m3.94%[39m and counterfactual cost = 6.0.
		Make [1m

## Fairness of Mean Recourse Cost (conditional expectation implementation) -- Equal Conditional Mean Recourse

In [12]:
# "Equal Conditional Mean Recourse": metric
# "fairness-of-mean-recourse-conditional" on the cumulative-correctness rules.
# All filters are disabled (the filter sequence is empty).
top_rules, subgroup_costs = facts.select_rules_subset_cumulative(
    rules_with_cumulative_correctness,
    metric="fairness-of-mean-recourse-conditional",
    sort_strategy="generic-sorting-ignore-exists-subgroup-empty",
    top_count=top_count,
    filter_sequence = [
        # "remove-contained",
        # "remove-fair-rules"
    ],
    params=params
)

# Cumulative report: also shows the counterfactual cost of each recourse.
print_recourse_report_cumulative(
    top_rules,
    subgroup_costs=subgroup_costs,
    show_subgroup_costs=True,
    show_then_costs=True
)

If [1mrace =  Black, relationship =  Not-in-family[0m:
	Protected Subgroup '[1m Male[0m', [34m2.49%[39m covered
		Make [1m[31mrelationship =  Married[39m[0m with effectiveness [32m0.59%[39m and counterfactual cost = 5.
		Make [1m[31mrace =  White[39m[0m with effectiveness [32m0.59%[39m and counterfactual cost = 100.
		Make [1m[31mrace =  Asian-Pac-Islander[39m, [31mrelationship =  Married[39m[0m with effectiveness [32m1.76%[39m and counterfactual cost = 105.
		Make [1m[31mrace =  White[39m, [31mrelationship =  Married[39m[0m with effectiveness [32m2.35%[39m and counterfactual cost = 105.
		[1mAggregate cost[0m of the above recourses = [35m80.00[39m
	Protected Subgroup '[1m Female[0m', [34m4.26%[39m covered
		Make [1m[31mrelationship =  Married[39m[0m with effectiveness [32m0.69%[39m and counterfactual cost = 5.
		Make [1m[31mrace =  White[39m[0m with effectiveness [32m0.69%[39m and counterfactual cost = 100.
		Make [1m[31mrace =  

## Fairness of Recourse at Effectiveness Level -- Equal Cost of Effectiveness (Micro)

In [13]:
# "Equal Cost of Effectiveness (Micro)": metric "min-above-corr" on the
# cumulative-correctness rules, with `cor_thres` as the correctness threshold.
# All filters are disabled (the filter sequence is empty).
top_rules, subgroup_costs = facts.select_rules_subset_cumulative(
    rules_with_cumulative_correctness,
    metric="min-above-corr",
    sort_strategy="generic-sorting-ignore-exists-subgroup-empty",
    top_count=top_count,
    cor_threshold = cor_thres,
    filter_sequence = [
        # "remove-contained",
        # "remove-fair-rules",
        # "keep-cheap-rules-above-thr-cor",
        # "keep-only-min-change",

    ],
    params=params
)

# Cumulative report: also shows the counterfactual cost of each recourse.
print_recourse_report_cumulative(
    top_rules,
    subgroup_costs=subgroup_costs,
    show_subgroup_costs=True,
    show_then_costs=True
)

If [1mage = (34.0, 41.0], capital-loss = 0, marital-status =  Never-married, relationship =  Not-in-family[0m:
	Protected Subgroup '[1m Male[0m', [34m2.28%[39m covered
		Make [1m[31mmarital-status =  Married-civ-spouse[39m, [31mrelationship =  Married[39m[0m with effectiveness [32m42.29%[39m and counterfactual cost = 10.0.
		Make [1m[31mage = (50.0, 90.0][39m, [31mmarital-status =  Married-civ-spouse[39m, [31mrelationship =  Married[39m[0m with effectiveness [32m42.29%[39m and counterfactual cost = 20.0.
		Make [1m[31mage = (41.0, 50.0][39m, [31mmarital-status =  Married-civ-spouse[39m, [31mrelationship =  Married[39m[0m with effectiveness [32m46.29%[39m and counterfactual cost = 20.0.
		Make [1m[31mage = (41.0, 50.0][39m, [31mcapital-loss = 1902[39m, [31mmarital-status =  Married-civ-spouse[39m, [31mrelationship =  Married[39m[0m with effectiveness [32m81.71%[39m and counterfactual cost = 20.44.
		[1mAggregate cost[0m of the above recour

## Fairness of Effectiveness -- Equal Effectiveness

In [14]:
# "Equal Effectiveness": metric "total-correctness" on the
# cumulative-correctness rules. All filters are disabled.
top_rules, subgroup_costs = facts.select_rules_subset_cumulative(
    rules_with_cumulative_correctness,
    metric="total-correctness",
    sort_strategy="generic-sorting-ignore-forall-subgroups-empty",
    top_count=top_count,
    filter_sequence = [
        # "remove-contained",
        # "remove-fair-rules",
        # "keep-only-min-change"
    ],
    params=params
)

# `correctness_metric=True` is only passed for the effectiveness-based metrics
# in this notebook; here the aggregate printed per group is a fraction (e.g.
# 0.59) rather than a cost.
print_recourse_report_cumulative(
    top_rules,
    subgroup_costs=subgroup_costs,
    show_subgroup_costs=True,
    show_then_costs=True,
    correctness_metric=True
)

If [1mcapital-loss = 0, education-num = 9, hours-per-week = FullTime, native-country =  United-States, occupation =  Machine-op-inspct, race =  White[0m:
	Protected Subgroup '[1m Male[0m', [34m2.55%[39m covered
		Make [1m[31mhours-per-week = OverTime[39m, [31moccupation =  Exec-managerial[39m[0m with effectiveness [32m42.68%[39m and counterfactual cost = 6.0.
		Make [1m[31meducation-num = 10[39m, [31moccupation =  Exec-managerial[39m[0m with effectiveness [32m42.68%[39m and counterfactual cost = 7.0.
		Make [1m[31meducation-num = 10[39m, [31mhours-per-week = OverTime[39m, [31moccupation =  Sales[39m[0m with effectiveness [32m42.68%[39m and counterfactual cost = 9.0.
		Make [1m[31meducation-num = 10[39m, [31mhours-per-week = OverTime[39m, [31moccupation =  Exec-managerial[39m[0m with effectiveness [32m59.15%[39m and counterfactual cost = 9.0.
		[1mAggregate cost[0m of the above recourses = [35m0.59[39m
	Protected Subgroup '[1m Female[0m', 

## Fairness of Recourse (using two-sample Kolmogorov-Smirnov test) -- Fair Effectiveness-Cost Trade-Off

In [15]:
# taken from the other notebooks
affected_pop_sizes = {" Male": 6732, " Female": 4106}

top_rules,unfairness = facts.select_rules_subset_KStest(
    rules_with_cumulative_correctness,
    affected_pop_sizes,
    top_count=top_count
)

print_recourse_report_KStest_cumulative(
    top_rules,
    population_sizes=affected_pop_sizes,
    unfairness = unfairness,
    show_then_costs=True
    # show_cumulative_plots=True
)

If [1mcapital-gain = 0, capital-loss = 0[0m:
	Protected Subgroup '[1m Male[0m', [34m94.15%[39m covered out of 6732
		Make [1m[31mcapital-gain = 5013[39m[0m with effectiveness [32m36.86%[39m and counterfactual cost = 0.05.
		Make [1m[31mcapital-gain = 5178[39m[0m with effectiveness [32m37.67%[39m and counterfactual cost = 0.05.
		Make [1m[31mcapital-gain = 7298[39m[0m with effectiveness [32m49.65%[39m and counterfactual cost = 0.07.
		Make [1m[31mcapital-gain = 7688[39m[0m with effectiveness [32m51.56%[39m and counterfactual cost = 0.08.
		Make [1m[31mcapital-gain = 15024[39m[0m with effectiveness [32m80.51%[39m and counterfactual cost = 0.15.
		Make [1m[31mcapital-loss = 1887[39m[0m with effectiveness [32m80.51%[39m and counterfactual cost = 0.43.
		Make [1m[31mcapital-loss = 1902[39m[0m with effectiveness [32m80.51%[39m and counterfactual cost = 0.44.
		Make [1m[31mcapital-loss = 1977[39m[0m with effectiveness [32m80.51%[39m and co

## Fairness of Effectiveness at Recourse Budget -- Equal Effectiveness within Budget (Micro)

In [16]:
# "Equal Effectiveness within Budget (Micro)": metric "max-upto-cost" on the
# cumulative-correctness rules, with `cost_budget` as the cost threshold.
# All filters are disabled (the filter sequence is empty).
top_rules, subgroup_costs = facts.select_rules_subset_cumulative(
    rules_with_cumulative_correctness,
    metric="max-upto-cost",
    sort_strategy="generic-sorting-ignore-exists-subgroup-empty",
    top_count=top_count,
    cost_threshold = cost_budget,
    filter_sequence = [
        # "remove-contained",
        # "remove-fair-rules",
        # "remove-above-thr-cost"
        #"remove-below-thr",
        #"keep-only-min-change",

    ],
    params=params
)

# As for "total-correctness", the aggregate is reported with
# `correctness_metric=True`.
print_recourse_report_cumulative(
    top_rules,
    subgroup_costs=subgroup_costs,
    show_subgroup_costs=True,
    show_then_costs=True,
    correctness_metric = True
)

If [1mcapital-loss = 0, education-num = 9, hours-per-week = FullTime, native-country =  United-States, occupation =  Machine-op-inspct, race =  White[0m:
	Protected Subgroup '[1m Male[0m', [34m2.55%[39m covered
		Make [1m[31mhours-per-week = OverTime[39m, [31moccupation =  Exec-managerial[39m[0m with effectiveness [32m42.68%[39m and counterfactual cost = 6.0.
		Make [1m[31meducation-num = 10[39m, [31moccupation =  Exec-managerial[39m[0m with effectiveness [32m42.68%[39m and counterfactual cost = 7.0.
		Make [1m[31meducation-num = 10[39m, [31mhours-per-week = OverTime[39m, [31moccupation =  Sales[39m[0m with effectiveness [32m42.68%[39m and counterfactual cost = 9.0.
		Make [1m[31meducation-num = 10[39m, [31mhours-per-week = OverTime[39m, [31moccupation =  Exec-managerial[39m[0m with effectiveness [32m59.15%[39m and counterfactual cost = 9.0.
		[1mAggregate cost[0m of the above recourses = [35m0.59[39m
	Protected Subgroup '[1m Female[0m', 

# Gather all metrics in a table (under construction)

In [17]:
from facts.fairness_metrics_aggr import make_table, auto_budget_calculation

In [18]:
# Sanity check: both rule sets must be keyed by exactly the same if-clauses,
# because they are merged key-by-key below.
rules_with_atomic_correctness.keys() == rules_with_cumulative_correctness.keys()

True

In [19]:
# Reduce both rule sets to {if-clause: {subgroup: (coverage, sorted thens)}} so
# they can be compared regardless of the correctness/cost values they carry.
# Atomic rules store (then, correctness) pairs ...
thens1 = {
    if_clause: {
        subgroup: (coverage, sorted(then for then, _ in then_list))
        for subgroup, (coverage, then_list) in per_subgroup.items()
    }
    for if_clause, per_subgroup in rules_with_atomic_correctness.items()
}
# ... whereas cumulative rules store (then, correctness, cost) triples.
thens2 = {
    if_clause: {
        subgroup: (coverage, sorted(then for then, _, _ in then_list))
        for subgroup, (coverage, then_list) in per_subgroup.items()
    }
    for if_clause, per_subgroup in rules_with_cumulative_correctness.items()
}

thens1 == thens2

True

In [20]:
# Merge the two rule sets: for each if-clause and protected subgroup keep the
# coverage, and give every "then" a triple
# (then, atomic correctness, cumulative correctness). The cumulative cost is dropped.
rules_with_both = {}
for if_clause, cumulative_by_subgroup in rules_with_cumulative_correctness.items():
    merged_by_subgroup = {}
    for subgroup, (coverage, cumulative_thens) in cumulative_by_subgroup.items():
        atomic_thens = rules_with_atomic_correctness[if_clause][subgroup][1]
        # then -> atomic correctness, for constant-time lookup below
        atomic_lookup = dict(atomic_thens)
        merged_thens = [
            (then, atomic_lookup[then], cumulative_cor)
            for then, cumulative_cor, _cost in cumulative_thens
        ]
        merged_by_subgroup[subgroup] = (coverage, merged_thens)
    rules_with_both[if_clause] = merged_by_subgroup

In [21]:
# Derive three cost budgets from the cumulative rules (one per requested
# percentile); they are used as `cost_budgets` for the metrics table.
# NOTE(review): `cor_thres=0.5` repeats the value of the notebook-level
# `cor_thres` constant as a literal — confirm whether it should track it.
budgets = auto_budget_calculation(rules_with_cumulative_correctness, cor_thres=0.5, percentiles=[0.3, 0.6, 0.9])
budgets

[5.076880768807688, 10.0, 18.0]

In [22]:
# Build one table holding every metric for every subgroup, per protected value.
# NOTE: this rebinds `df`, which until now held the cleaned dataset; from here
# on `df` is the metrics table.
df = make_table(
    rules_with_both,
    sensitive_attribute_vals=["Male", "Female"],
    effectiveness_thresholds=[0.3, 0.7],
    cost_budgets=budgets,
    params=params
)

In [23]:
# Preview the metrics table: two-level columns (metric, protected value).
df.head()

Unnamed: 0_level_0,subgroup,weighted-average,weighted-average,"(mincost-above-th, 0.3)","(mincost-above-th, 0.3)","(mincost-above-th, 0.7)","(mincost-above-th, 0.7)","(number-above-th, 0.3)","(number-above-th, 0.3)","(number-above-th, 0.7)",...,"(eff-within-budget, 18.0)","(eff-within-budget, 18.0)","(cost-of-effectiveness, 0.3)","(cost-of-effectiveness, 0.3)","(cost-of-effectiveness, 0.7)","(cost-of-effectiveness, 0.7)",mean-cost-cinf,mean-cost-cinf,mean-cost-conditional,mean-cost-conditional
Unnamed: 0_level_1,subgroup,Male,Female,Male,Female,Male,Female,Male,Female,Male,...,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female
0,capital-loss = 0,-0.678264,-0.19336,0.453857,inf,inf,inf,-1,0,0,...,0.305134,0.093235,0.453857,inf,inf,inf,0.1324,0.232881,0.433907,0.434796
1,capital-gain = 0,-5.905823,-2.94105,0.050131,0.150242,0.150242,0.150242,-6,-2,-2,...,1.0,1.0,0.050131,0.150242,0.150242,0.150242,0.259215,0.354914,0.259215,0.354914
2,"capital-gain = 0, capital-loss = 0",-4.118241,-1.944884,0.050131,0.150242,0.150242,0.150242,-6,-2,-2,...,1.0,1.0,0.050131,0.150242,0.150242,0.150242,0.262978,0.360717,0.262978,0.360717
3,"capital-loss = 0, native-country = United-States",-0.702256,-0.195815,0.433196,inf,inf,inf,-3,0,0,...,0.316954,0.095106,0.433196,inf,inf,inf,0.137527,0.242736,0.433902,0.434907
4,"capital-gain = 0, native-country = United-States",-5.708898,-3.038852,0.051781,0.150242,0.150242,0.150242,-5,-2,-2,...,1.0,1.0,0.051781,0.150242,0.150242,0.150242,0.250856,0.349534,0.250856,0.349534


## 2-level index: Examples of usage

In [24]:
# Selecting by a first-level label alone returns every column under it.
df["subgroup"]

Unnamed: 0,subgroup
0,capital-loss = 0
1,capital-gain = 0
2,"capital-gain = 0, capital-loss = 0"
3,"capital-loss = 0, native-country = United-States"
4,"capital-gain = 0, native-country = United-States"
...,...
12875,"Workclass = Private, capital-loss = 0, marita..."
12876,"Workclass = Private, age = (50.0, 90.0], race..."
12877,"capital-gain = 0, hours-per-week = OverTime, n..."
12878,"capital-gain = 0, capital-loss = 0, hours-per-..."


In [25]:
# A full (metric, protected value) key returns a single Series.
df["weighted-average", "Male"]

0       -0.678264
1       -5.905823
2       -4.118241
3       -0.702256
4       -5.708898
           ...   
12875   -0.002538
12876   -0.000514
12877   -0.004076
12878   -0.003880
12879   -0.004146
Name: (weighted-average, Male), Length: 12880, dtype: float64

In [26]:
# Metrics parameterised by a threshold use a tuple as their first-level label,
# so the full key is ((metric, threshold), protected value).
df[("mincost-above-th", 0.3), "Female"]

0               inf
1          0.150242
2          0.150242
3               inf
4          0.150242
            ...    
12875    110.433196
12876           inf
12877    100.072981
12878    100.072981
12879           inf
Name: ((mincost-above-th, 0.3), Female), Length: 12880, dtype: float64

In [27]:
# Distribution of the Male - Female gap for this metric. Rows where both groups
# are `inf` give inf - inf = NaN and are removed by `dropna()` before counting.
(df[("mincost-above-th", 0.3), "Male"] - df[("mincost-above-th", 0.3), "Female"]).dropna().value_counts()

 0.000000     3041
-inf          1288
 inf           385
 2.000000       94
 10.000000      86
              ... 
 0.003900        1
 0.003900        1
 0.073361        1
 0.001650        1
-0.022850        1
Length: 78, dtype: int64

## 3- Get ranking of subgroups based on metrics

In [28]:
def get_diff_table(
        df,
        sensitive_attribute_vals=("Male", "Female")
    ):
    """Return the absolute gap of every metric between two protected groups.

    Parameters
    ----------
    df : pd.DataFrame
        Metrics table with two-level columns ``(metric, protected value)`` and
        a ``('subgroup', 'subgroup')`` column describing each row. It is not
        modified.
    sensitive_attribute_vals : sequence of two labels
        Second-level column labels of the two groups to compare; only the
        first two entries are used.

    Returns
    -------
    pd.DataFrame
        Same rows as ``df``. The first column is ``'subgroup'``, followed by
        one column per metric (in order of first appearance) holding
        ``|metric[first group] - metric[second group]|``. Undefined gaps
        (NaN, e.g. ``inf - inf`` when the metric is infinite for both groups)
        are reported as 0.
    """
    first_val, second_val = sensitive_attribute_vals[0], sensitive_attribute_vals[1]

    # `drop` already returns a new frame, so the caller's table stays untouched.
    metrics = df.drop(columns=[('subgroup', 'subgroup')])

    # Level 0 repeats each metric once per protected value; keep one entry per
    # metric (first-seen order) so that every gap is computed exactly once.
    metric_names = list(dict.fromkeys(metrics.columns.get_level_values(0)))

    # Start from the subgroup description so it ends up as the first column.
    diff = pd.DataFrame({'subgroup': df['subgroup', 'subgroup']})
    for metric in metric_names:
        diff[metric] = (metrics[metric, first_val] - metrics[metric, second_val]).abs()

    # NaN means the gap is undefined (both sides infinite or missing): treat as no gap.
    return diff.fillna(0)

In [29]:
# Absolute Male/Female gap of every metric, one row per subgroup.
diff = get_diff_table(df)
diff.head()

Unnamed: 0,subgroup,weighted-average,"(mincost-above-th, 0.3)","(mincost-above-th, 0.7)","(number-above-th, 0.3)","(number-above-th, 0.7)",total-effectiveness,"(eff-within-budget, 5.076880768807688)","(eff-within-budget, 10.0)","(eff-within-budget, 18.0)","(cost-of-effectiveness, 0.3)","(cost-of-effectiveness, 0.7)",mean-cost-cinf,mean-cost-conditional
0,capital-loss = 0,0.484904,inf,0.0,1,0,0.211898,0.211898,0.211898,0.211898,inf,0.0,0.100482,0.000889
1,capital-gain = 0,2.964773,0.100111,0.0,4,0,0.0,0.0,0.0,0.0,0.100111,0.0,0.0957,0.0957
2,"capital-gain = 0, capital-loss = 0",2.173357,0.100111,0.0,4,0,0.0,0.0,0.0,0.0,0.100111,0.0,0.097738,0.097738
3,"capital-loss = 0, native-country = United-States",0.50644,inf,0.0,3,0,0.221847,0.221847,0.221847,0.221847,inf,0.0,0.105209,0.001005
4,"capital-gain = 0, native-country = United-States",2.670047,0.098461,0.0,3,0,0.0,0.0,0.0,0.0,0.098461,0.0,0.098678,0.098678


In [30]:
# Index the gaps by subgroup so that only metric columns take part in ranking.
# Guarded so the cell can be re-run: once 'subgroup' is the index it is no
# longer a column, and an unconditional `set_index` would raise a KeyError.
if 'subgroup' in diff.columns:
    diff = diff.set_index('subgroup')

# Rank subgroups per metric, largest gap first (dense ranks, so ties share a
# rank). Zero gaps are masked out beforehand and labelled "Fair" instead of
# receiving a rank.
ranked = diff.mask(diff == 0).rank(ascending=False, axis=0, method='dense')
ranked = ranked.replace(np.nan, "Fair")

In [32]:
# Turn the 'subgroup' index back into a regular column so rows can be addressed
# by position. Guarded so re-running the cell does not insert an extra 'index'
# column once the index has already been reset.
if ranked.index.name == 'subgroup':
    ranked = ranked.reset_index()
ranked.head()

Unnamed: 0,subgroup,weighted-average,"(mincost-above-th, 0.3)","(mincost-above-th, 0.7)","(number-above-th, 0.3)","(number-above-th, 0.7)",total-effectiveness,"(eff-within-budget, 5.076880768807688)","(eff-within-budget, 10.0)","(eff-within-budget, 18.0)","(cost-of-effectiveness, 0.3)","(cost-of-effectiveness, 0.7)",mean-cost-cinf,mean-cost-conditional
0,capital-loss = 0,69.0,1.0,Fair,13.0,Fair,1211.0,440.0,993.0,1183.0,1.0,Fair,9563.0,7428.0
1,capital-gain = 0,3.0,21.0,Fair,10.0,Fair,Fair,Fair,Fair,Fair,17.0,Fair,9626.0,4736.0
2,"capital-gain = 0, capital-loss = 0",7.0,21.0,Fair,10.0,Fair,Fair,Fair,Fair,Fair,17.0,Fair,9597.0,4713.0
3,"capital-loss = 0, native-country = United-States",66.0,1.0,Fair,11.0,Fair,1104.0,368.0,903.0,1087.0,1.0,Fair,9493.0,7409.0
4,"capital-gain = 0, native-country = United-States",4.0,23.0,Fair,11.0,Fair,Fair,Fair,Fair,Fair,19.0,Fair,9583.0,4701.0


In [33]:
# Inspect the per-metric ranks of one subgroup (row label 11831), displayed as
# a single-column frame so that all metrics are listed vertically.
ranked.loc[11831].to_frame()

Unnamed: 0,11831
subgroup,"Workclass = Private, age = (50.0, 90.0], nati..."
weighted-average,239.0
"(mincost-above-th, 0.3)",1.0
"(mincost-above-th, 0.7)",Fair
"(number-above-th, 0.3)",9.0
"(number-above-th, 0.7)",Fair
total-effectiveness,12.0
"(eff-within-budget, 5.076880768807688)",46.0
"(eff-within-budget, 10.0)",8.0
"(eff-within-budget, 18.0)",12.0
