In [176]:
import json
import pickle
from concurrent.futures import ThreadPoolExecutor

import dill
import numpy as np
import optuna
import pandas as pd
import tqdm as notebook_tqdm
from fairlearn.datasets import fetch_adult
from fairlearn.metrics import demographic_parity_difference
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    f1_score,
    confusion_matrix,
    make_scorer,
    accuracy_score,
    recall_score,
    matthews_corrcoef,
    precision_score
)
from sklearn.model_selection import cross_val_score, train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils import resample

from metrics import (
    equality_opportunity_difference,
    predictive_equality_difference,
    predictive_parity_difference,
    metrics,
    average_absolute_odds_difference,
    metric_evaluation,
    get_metric_evaluation,
)

In [178]:
def _build_classifier(trial, classifier_name):
    """Instantiate the estimator named ``classifier_name`` with hyper-parameters read from ``trial``.

    Raises
    ------
    ValueError
        If ``classifier_name`` is not one of "logit", "RF", "LGBM" or "GBM".
    """
    if classifier_name == "logit":
        params = {
            "penalty" : trial.suggest_categorical('logit_penalty', ['l1','l2']),
            "C" : trial.suggest_float('logit_c', 0.001, 10),
            "max_iter": 2000,
            "solver" : 'saga'
            }
        return LogisticRegression(**params)

    if classifier_name == "RF":
        params = {
            'n_estimators': trial.suggest_int("rf_n_estimators", 100, 1000),
            'criterion': trial.suggest_categorical("rf_criterion", ['gini', 'entropy']),
            'max_depth': trial.suggest_int("rf_max_depth", 1, 4),
            'min_samples_split': trial.suggest_float("rf_min_samples_split", 0.01, 1),
            }
        return RandomForestClassifier(**params)

    if classifier_name == "LGBM":
        params = {
            'n_estimators': trial.suggest_int("lgbm_n_estimators", 20, 10000),
            'num_leaves': trial.suggest_int("lgbm_num_leaves", 10, 1000),
            'max_depth': trial.suggest_int("lgbm_max_depth", 2, 20),
            'min_child_samples': trial.suggest_int("lgbm_min_child_samples", 5, 300),
            'learning_rate': trial.suggest_float('lgbm_learning_rate', 1e-5, 1e-2),
            'boosting_type': trial.suggest_categorical("lgbm_boosting_type", ['goss', 'gbdt'])
            }
        return LGBMClassifier(**params)

    if classifier_name == "GBM":
        params = {
            'n_estimators': trial.suggest_int("gbm_n_estimators", 100, 1000),
            'criterion': trial.suggest_categorical("gbm_criterion", ['squared_error', 'friedman_mse']),
            'max_depth': trial.suggest_int("gbm_max_depth", 1, 4),
            'min_samples_split': trial.suggest_int("gbm_min_samples_split", 5, 300),
            }
        return GradientBoostingClassifier(**params)

    # Previously this case fell through with `classifier` unbound and failed
    # later with an unrelated error; fail here with the offending name instead.
    raise ValueError(
        f"Unknown classifier {classifier_name!r}; expected one of 'logit', 'RF', 'LGBM', 'GBM'"
    )


def detailed_objective(trial, data_dict, sensitive_col, models, preprocessor):
    """Rebuild the classifier described by ``trial``, fit it, and evaluate it on the test split.

    Parameters
    ----------
    trial
        Object exposing ``suggest_categorical`` / ``suggest_int`` / ``suggest_float``
        (``get_metrics`` passes entries of ``study.best_trials``).
    data_dict : dict
        Must contain 'X_train', 'y_train', 'X_test' and 'y_test'.
    sensitive_col : str
        Column of ``data_dict['X_test']`` forwarded as ``sensitive_features``.
    models : list of str
        Candidate classifier names offered to ``trial.suggest_categorical("classifier", ...)``.
    preprocessor
        First pipeline step; it is fitted together with the classifier.

    Returns
    -------
    tuple
        ``(classifier_name, metric_frame)`` where ``metric_frame`` is the value
        returned by ``metric_evaluation``.

    Raises
    ------
    ValueError
        If the trial selects a classifier name this function cannot build.
    """
    classifier_name = trial.suggest_categorical("classifier", models)
    classifier = _build_classifier(trial, classifier_name)

    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", classifier),
        ]
    )

    pipeline.fit(data_dict['X_train'], data_dict['y_train'])
    y_pred = pipeline.predict(data_dict['X_test'])
    # Named `metric_frame` so the imported `metrics` name is not shadowed locally.
    metric_frame = metric_evaluation(
        y_true=data_dict['y_test'],
        y_pred=y_pred,
        sensitive_features=data_dict['X_test'][sensitive_col],
    )
    return classifier_name, metric_frame


def get_default_metrics(metrics, data_dict, sensitive_col, preprocessor):
    """Fit default-hyper-parameter baselines and add their scores to ``metrics``.

    One baseline is fitted for every distinct value of
    ``metrics['overall']['model_name']``.

    Parameters
    ----------
    metrics : dict
        Results dict holding an 'overall' DataFrame with a 'model_name' column.
        It is updated in place and also returned.
    data_dict : dict
        Must contain 'X_train', 'y_train', 'X_test' and 'y_test'.
    sensitive_col : str
        Column of ``data_dict['X_test']`` forwarded as ``sensitive_features``.
    preprocessor
        First pipeline step, refitted for every baseline.

    Returns
    -------
    dict
        The same ``metrics`` object with two new keys, 'default_overall' and
        'default_bygroup'. Both label the estimator in a 'model' column
        (the optimised table uses 'model_name').

    Raises
    ------
    KeyError
        If a model name has no default estimator; raised before anything is fitted.
    """
    models = metrics['overall']['model_name'].unique()
    # Keep classes rather than instances so only the requested estimators are
    # constructed, each one fresh for its own pipeline.
    default_estimators = {
        'logit' : LogisticRegression,
        'GBM' : GradientBoostingClassifier,
        'LGBM' : LGBMClassifier,
        'RF' : RandomForestClassifier,
    }
    unknown = [name for name in models if name not in default_estimators]
    if unknown:
        raise KeyError(f"No default estimator defined for model(s): {unknown}")

    # Collect per-model frames and concatenate once instead of growing a
    # DataFrame inside the loop.
    overall_frames = []
    bygroup_frames = []
    for model in models:
        pipeline = Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("classifier", default_estimators[model]()),
            ]
        )

        pipeline.fit(data_dict['X_train'], data_dict['y_train'])
        y_pred = pipeline.predict(data_dict['X_test'])
        metric_frame = metric_evaluation(
            y_true=data_dict['y_test'],
            y_pred=y_pred,
            sensitive_features=data_dict['X_test'][sensitive_col],
        )

        # Overall: fairness summary next to the overall scores, in one row.
        fair_records = pd.DataFrame.from_records([get_metric_evaluation(metric_frame)])
        overall_row = pd.concat([fair_records, pd.DataFrame(metric_frame.overall).T], axis=1)
        overall_row['model'] = model
        overall_frames.append(overall_row)

        # By group: one row per sensitive-feature value.
        group_rows = metric_frame.by_group.reset_index()
        group_rows['model'] = model
        bygroup_frames.append(group_rows)

    # pd.concat rejects an empty list, so fall back to an empty frame.
    metrics['default_overall'] = pd.concat(overall_frames) if overall_frames else pd.DataFrame()
    metrics['default_bygroup'] = pd.concat(bygroup_frames) if bygroup_frames else pd.DataFrame()
    return metrics

def get_metrics(study, data_dict, sensitive_col, models, preprocessor):
    """Re-fit every best trial of ``study`` and collect its overall and per-group metrics.

    Parameters
    ----------
    study
        Object exposing ``best_trials`` and, optionally, ``user_attrs`` with
        the keys 'fair_metric' and 'model_metric'.
    data_dict, sensitive_col, models, preprocessor
        Forwarded unchanged to ``detailed_objective``.

    Returns
    -------
    dict
        'overall' : one row per kept trial, with the fairness summary, the
            overall scores, and the columns 'best_trial' (1-based rank among
            kept trials), 'fair_metric', 'model_metric' and 'model_name'.
        'bygroup' : per-group rows tagged with the same 'best_trial' rank.
        'fair_metric' / 'model_metric' : copied from ``study.user_attrs`` when present.
    """
    # Named `results` so the imported `metrics` name is not shadowed locally.
    results = {}
    results['overall'] = pd.DataFrame()
    results['bygroup'] = pd.DataFrame()
    try:
        results['fair_metric'] = study.user_attrs['fair_metric']
        results['model_metric'] = study.user_attrs['model_metric']
    except (AttributeError, KeyError, TypeError):
        # Best effort: a study without these attributes is still usable.
        # Only lookup failures are caught, so real errors are not hidden.
        print('User attributes not found')

    # Skip trials whose two objective values are both zero.
    kept_trials = (trial for trial in study.best_trials if trial.values != [0,0])

    overall_frames = []
    bygroup_frames = []
    for rank, best_trial in enumerate(kept_trials, start=1):
        fair_value, model_value = best_trial.values
        clf_name, metric = detailed_objective(best_trial, data_dict, sensitive_col, models, preprocessor)

        # Overall
        fair_records = pd.DataFrame.from_records([get_metric_evaluation(metric)])
        overall_row = pd.concat([fair_records, pd.DataFrame(metric.overall).T], axis=1)
        overall_row['best_trial'] = rank
        overall_row['fair_metric'] = fair_value
        overall_row['model_metric'] = model_value
        overall_row['model_name'] = clf_name
        overall_frames.append(overall_row)

        # By groups
        group_rows = metric.by_group.reset_index()
        group_rows['best_trial'] = rank
        bygroup_frames.append(group_rows)

    # Concatenate once; pd.concat rejects an empty list, so the empty frames
    # assigned above are kept when no trial qualified.
    if overall_frames:
        results['overall'] = pd.concat(overall_frames)
        results['bygroup'] = pd.concat(bygroup_frames)
    return results

In [181]:
# Numeric branch: impute with SimpleImputer's default settings, then standardise.
numeric_steps = [
    ("impute", SimpleImputer()),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Columns are routed by dtype: "category" columns go to the categorical branch
# and every other dtype goes to the numeric branch.
# NOTE(review): that includes object/string columns, if any — confirm the frame has none.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ]
)


# --- Run configuration -------------------------------------------------------
n_sim = 123
sensitive_col = 'sex'
sensitive_attribute = 'sex'
models = ["GBM", "LGBM", "RF"]
file_name = 'results/recall_score_demographic_parity_difference_100_20230724151855.pkl'

# --- Load the stored study ---------------------------------------------------
# NOTE(review): dill.load can execute arbitrary code; only open trusted files.
with open(file_name, 'rb') as in_strm:
    study = dill.load(in_strm)

# --- Load the data and build the binary target -------------------------------
data = fetch_adult(as_frame=True)
X_raw = data.data
y = (data.target == ">50K") * 1

if sensitive_attribute == 'race':
    # Collapse the race levels into white / black / others.
    mapping = {
        'White': 'white',
        'Black': 'black',
        'Asian-Pac-Islander': 'others',
        'Amer-Indian-Eskimo': 'others',
        'Other': 'others',
    }
    X_raw.loc[:, 'race'] = X_raw['race'].map(mapping).astype("category")

# --- Subsample and split -----------------------------------------------------
perc = .5
n_rows = int(perc * X_raw.shape[0])
# NOTE(review): resample is called without `replace=`; confirm whether sampling
# with replacement is intended. Duplicated rows could then land on both sides
# of the train/test split.
X_raw, y = resample(X_raw, y, n_samples=n_rows, random_state=n_sim)

# NOTE(review): test_size=0.8 puts 80% of the rows in the test split — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.8, stratify=y, random_state=n_sim
)

data_dict = {
    'X_train': X_train.reset_index(drop=True),
    'X_test': X_test.reset_index(drop=True),
    'y_train': y_train.reset_index(drop=True),
    'y_test': y_test.reset_index(drop=True),
}

# --- Evaluate the best trials, then the default baselines --------------------
# This rebinds `metrics`, which until now named the object imported from the
# `metrics` module.
metrics = get_metrics(study, data_dict, sensitive_col, models, preprocessor)
metrics = get_default_metrics(metrics, data_dict, sensitive_col, preprocessor)
metrics['file_name'] = file_name

  warn(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [182]:
# Display the whole results dictionary built by the previous cell.
# Every DataFrame it holds is printed in full.
metrics

{'overall':    demographic parity  predictive parity  equality opportunity   
 0            0.152969           0.067696              0.105239  \
 0            0.187880           0.008934              0.075670   
 0            0.187071           0.001063              0.078167   
 0            0.033137           0.001903              0.032304   
 0            0.144394           0.173891              0.337491   
 0            0.153196           0.067009              0.107707   
 0            0.161669           0.057531              0.085767   
 0            0.160157           0.061351              0.095634   
 0            0.190324           0.020519              0.090863   
 0            0.158695           0.062478              0.089621   
 0            0.165751           0.055439              0.100390   
 0            0.140366           0.167883              0.336126   
 0            0.020599           0.996255              0.067359   
 0            0.025073           0.996923          

In [184]:
# Derive the output path of the metrics pickle from the study pickle's path by
# replacing the trailing '.pkl' with '-metrics.pkl'.
file_name = 'results/recall_score_demographic_parity_difference_100_20230724151855.pkl'
stem = file_name[:-len('.pkl')]
file_name = f'{stem}-metrics.pkl'
file_name

'results/recall_score_demographic_parity_difference_100_20230724151855-metrics.pkl'

In [192]:
        # Per-group rows that belong to best trial 5.
        trial_rows = metrics['bygroup'][metrics['bygroup'].best_trial == 5]
        # Absolute difference between consecutive group rows; keep the last one.
        # Non-numeric cells are coerced to NaN first.
        dif = trial_rows.apply(pd.to_numeric, errors='coerce').diff().abs().iloc[-1,:]
        # Append the difference as an extra row beneath the group rows.
        df_groups_m = pd.concat([trial_rows.T, dif], axis = 1).T

In [198]:
fair_col = 'demographic parity'
model_col = 'recall'

# Align the baseline table's 'model' column with the optimised table's 'model_name'.
df_default_overall = metrics['default_overall'].rename(columns={'model': 'model_name'})
df_optimized_overall = metrics['overall']

# Stack optimised and baseline rows, order them by the fairness column, and
# renumber the rows.
df_overall = (
    pd.concat([df_optimized_overall, df_default_overall])
    .sort_values([fair_col])
    .reset_index(drop=True)
)


In [207]:
train_col = 'fair_metric'
# Scratch check: show the name with underscores replaced by spaces.
# The result is only displayed; train_col itself is not reassigned.
train_col.replace('_',' ')

'fair metric'

In [185]:
# Persist the results dictionary with dill.
# NOTE(review): `file_name` is rebound by several cells in this notebook;
# confirm it holds the '-metrics.pkl' path before running this cell.
with open(file_name, 'wb') as file:
    dill.dump(metrics, file)
# Report success only after the `with` block has closed (and flushed) the file.
print(f'Object successfully saved to "{file_name}"')

Object successfully saved to "results/recall_score_demographic_parity_difference_100_20230724151855-metrics.pkl"


In [73]:
# Model name(s) of the optimised rows whose index label is 5.
overall = metrics['overall']
model = overall.loc[overall.index == 5, 'model_name']
# Boolean mask over the baseline per-group rows for those model name(s).
n_model = metrics['default_bygroup']['model'].isin(model)

In [216]:
# List the entries available in the results dictionary.
metrics.keys()

dict_keys(['overall', 'bygroup', 'fair_metric', 'model_metric', 'default_overall', 'default_bygroup', 'file_name'])

In [218]:
# Preview the optimised table with 'precision' shown as 'prec'.
# The renamed frame is only displayed; metrics['overall'] is not reassigned.
metrics['overall'].rename(columns = {'precision':'prec'})

Unnamed: 0,demographic parity,predictive parity,equality opportunity,predictive equality,average absolute odds,accuracy,prec,recall,f1 score,mcc,selection rate,false positive rate,true positive rate,false negative rate,true negative rate,count,best_trial,fair_metric,model_metric,model_name
0,0.152969,0.067696,0.105239,0.05455,0.079895,0.854686,0.797293,0.528181,0.635418,0.567466,0.158827,0.042348,0.528181,0.471819,0.957652,19537.0,1,0.155004,0.546969,GBM
0,0.18788,0.008934,0.07567,0.084355,0.080012,0.852741,0.713442,0.644748,0.677358,0.583526,0.216666,0.081667,0.644748,0.355252,0.918333,19537.0,2,0.183918,0.730572,LGBM
0,0.187071,0.001063,0.078167,0.083414,0.080791,0.853611,0.7186,0.640051,0.677055,0.58445,0.213544,0.079041,0.640051,0.359949,0.920959,19537.0,3,0.17823,0.706446,LGBM
0,0.033137,0.001903,0.032304,0.000716,0.01651,0.794697,0.986975,0.145602,0.253767,0.335138,0.035369,0.000606,0.145602,0.854398,0.999394,19537.0,4,0.047478,0.248936,LGBM
0,0.144394,0.173891,0.337491,0.038278,0.187884,0.829401,0.830965,0.362084,0.504387,0.472977,0.104468,0.023228,0.362084,0.637916,0.976772,19537.0,5,0.110999,0.305293,RF
0,0.153196,0.067009,0.107707,0.05455,0.081129,0.854532,0.797097,0.527541,0.634892,0.56694,0.158673,0.042348,0.527541,0.472459,0.957652,19537.0,6,0.154452,0.545688,GBM
0,0.161669,0.057531,0.085767,0.058144,0.071956,0.862927,0.795,0.577071,0.668728,0.596979,0.174029,0.046927,0.577071,0.422929,0.953073,19537.0,7,0.166155,0.589882,GBM
0,0.160157,0.061351,0.095634,0.057447,0.076541,0.860675,0.79566,0.563621,0.659835,0.588942,0.169832,0.045647,0.563621,0.436379,0.954353,19537.0,8,0.162997,0.574723,GBM
0,0.190324,0.020519,0.090863,0.079028,0.084946,0.864258,0.746124,0.657558,0.699047,0.613898,0.211291,0.070558,0.657558,0.342442,0.929442,19537.0,9,0.186306,0.731855,LGBM
0,0.158695,0.062478,0.089621,0.057225,0.073423,0.860214,0.795461,0.561272,0.658155,0.587342,0.169166,0.045513,0.561272,0.438728,0.954487,19537.0,10,0.162373,0.572801,GBM


In [86]:
# Baseline per-group rows selected by the `n_model` mask.
default_groups = metrics['default_bygroup'][n_model]
# Non-numeric cells become NaN so the row-wise difference can be taken.
numeric_groups = default_groups.apply(pd.to_numeric, errors='coerce')
# Absolute difference between consecutive rows; keep the last one.
df = numeric_groups.diff().abs().iloc[-1, :]


sex                            NaN
accuracy                  0.092950
precision                 0.014980
recall                    0.139323
f1 score                  0.076544
mcc                       0.008755
selection rate            0.202506
false positive rate       0.080465
true positive rate        0.139323
false negative rate       0.139323
true negative rate        0.080465
count                  6461.000000
model                          NaN
Name: 1, dtype: float64

In [110]:
def _by_group_with_difference(metric_frame):
    """Return ``metric_frame.by_group`` with an extra 'Difference' row from ``metric_frame.difference()``."""
    by_group_t = metric_frame.by_group.T
    gap = metric_frame.difference()
    gap.name = 'Difference'
    return pd.concat([by_group_t, gap], axis=1).T


def create_df_groups_metrics(n, results_dict, model_mapping):
    """Build a side-by-side per-group table for optimised trial ``n`` and its matching baseline.

    Uses only its arguments. The earlier body read the notebook-global
    ``metrics`` dict, hard-coded row 5 and indexed the baseline list with a
    boolean Series; the lookup below is the one preserved in that body's
    commented-out lines.

    Parameters
    ----------
    n : int
        Position of the optimised entry in ``results_dict['models_sim'][0]``
        and ``results_dict['metrics_sim'][0]``.
    results_dict : dict
        Must provide 'models_sim', 'metrics_sim', 'models_sim_u' and
        'metrics_sim_u'. Only element ``[0]`` of each is read. The metric
        entries must expose ``by_group`` and ``difference()``.
    model_mapping : dict
        Maps the names in ``results_dict['models_sim_u'][0]`` to the naming
        used in ``results_dict['models_sim'][0]``.

    Returns
    -------
    pandas.DataFrame
        One row per group plus a 'Difference' row. Baseline columns carry the
        suffix ' u'; the optimised trial's columns are unsuffixed.

    Raises
    ------
    ValueError
        If the optimised model's name is not among the mapped baseline names.
    """
    # Find the baseline fitted with the same model type as trial n.
    baseline_models = list(map(model_mapping.get, results_dict['models_sim_u'][0]))
    n_model = baseline_models.index(results_dict['models_sim'][0][n])

    df_groups_u = _by_group_with_difference(results_dict['metrics_sim_u'][0][n_model])
    df_groups_u.columns = df_groups_u.columns + ' u'

    df_groups_m = _by_group_with_difference(results_dict['metrics_sim'][0][n])

    df_groups = pd.concat([df_groups_u, df_groups_m], axis=1).reset_index()
    return df_groups

In [130]:
# Load the metric metadata from JSON.
# The path gets its own variable instead of rebinding `file_name`, which the
# dill save cell writes to; a stale `file_name` could overwrite this JSON file
# with a pickle.
metrics_info_path = '../notebooks/metrics.json'
with open(metrics_info_path, 'rb') as f:
    metrics_info = json.load(f)

In [136]:
# Display the optimised-trials table (one row per kept best trial).
metrics['overall']


Unnamed: 0,demographic parity,predictive parity,equality opportunity,predictive equality,average absolute odds,accuracy,precision,recall,f1 score,mcc,selection rate,false positive rate,true positive rate,false negative rate,true negative rate,count,best_trial,fair_metric,model_metric,model_name
0,0.011385,0.993243,0.037206,0.000111,0.018658,0.767723,0.993243,0.031383,0.060844,0.154197,0.007575,6.7e-05,0.031383,0.968617,0.999933,19537.0,2,0.0,0.052761,RF
1,0.02431,0.996835,0.079727,0.000111,0.039919,0.776322,0.996835,0.06725,0.126,0.227375,0.016174,6.7e-05,0.06725,0.93275,0.999933,19537.0,1,0.000111,0.129615,RF
2,0.038655,0.006873,0.091722,0.000442,0.046082,0.791677,0.993569,0.131939,0.232944,0.320185,0.031837,0.000269,0.131939,0.868061,0.999731,19537.0,3,0.000219,0.211406,RF
3,0.021673,0.0,0.017922,0.0,0.008961,0.784153,1.0,0.099701,0.181324,0.278665,0.023903,0.0,0.099701,0.900299,1.0,19537.0,17,0.000328,0.298484,LGBM
4,0.028683,0.005814,0.031614,0.000332,0.015973,0.790039,0.994898,0.124893,0.221927,0.31158,0.030097,0.000202,0.124893,0.875107,0.999798,19537.0,4,0.000726,0.392168,LGBM
5,0.036005,0.002134,0.043393,0.000491,0.021942,0.796796,0.990385,0.153928,0.266445,0.345902,0.037263,0.000471,0.153928,0.846072,0.999529,19537.0,5,0.000812,0.398623,LGBM
6,0.057346,0.00696,0.008146,0.003117,0.005632,0.825869,0.969941,0.282451,0.4375,0.468568,0.069816,0.00276,0.282451,0.717549,0.99724,19537.0,19,0.000948,0.417735,LGBM
7,0.054103,0.001775,0.010189,0.002454,0.006322,0.822337,0.97272,0.266439,0.418301,0.455159,0.06567,0.002356,0.266439,0.733561,0.997644,19537.0,20,0.001751,0.427776,LGBM
8,0.068367,0.026821,0.017659,0.007795,0.012727,0.831448,0.942149,0.316396,0.473709,0.486843,0.080514,0.006127,0.316396,0.683604,0.993873,19537.0,18,0.003929,0.47181,LGBM
9,0.042162,0.00625,0.010208,0.001349,0.005778,0.81251,0.976657,0.223313,0.36351,0.415684,0.054819,0.001683,0.223313,0.776687,0.998317,19537.0,7,0.004248,0.472184,LGBM


In [135]:
# Display the baseline table (one row per default-parameter model).
metrics['default_overall']

Unnamed: 0,demographic parity,predictive parity,equality opportunity,predictive equality,average absolute odds,accuracy,precision,recall,f1 score,mcc,selection rate,false positive rate,true positive rate,false negative rate,true negative rate,count,model
0,0.20477,0.056846,0.147274,0.086014,0.116644,0.866561,0.751149,0.663108,0.704388,0.620656,0.21165,0.069279,0.663108,0.336892,0.930721,19537.0,RF
0,0.202506,0.01498,0.139323,0.080465,0.109894,0.869939,0.752177,0.682323,0.715549,0.632765,0.217485,0.070895,0.682323,0.317677,0.929105,19537.0,LGBM


In [154]:
# Model name used by the filter in the next cell.
# NOTE(review): an earlier cell binds `model` to a Series of model names;
# this rebinds it to a plain string.
model = 'LGBM'

In [155]:
# Rows for the selected model that came from an optimised trial
# (baseline rows have no 'best_trial' value).
(df_overall['model_name'] == model) & df_overall['best_trial'].notna()

0     False
1     False
2     False
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22    False
23    False
dtype: bool

In [150]:
# Align the baseline table's 'model' column with the optimised table's 'model_name'.
df_default_overall = metrics['default_overall'].rename(columns={'model': 'model_name'})
df_optimized_overall = metrics['overall']

# Optimised rows first, baselines after, renumbered from zero.
df_overall = (
    pd.concat([df_optimized_overall, df_default_overall])
    .reset_index(drop=True)
)

In [151]:
# Display the combined optimised + baseline table.
df_overall

Unnamed: 0,demographic parity,predictive parity,equality opportunity,predictive equality,average absolute odds,accuracy,precision,recall,f1 score,mcc,selection rate,false positive rate,true positive rate,false negative rate,true negative rate,count,best_trial,fair_metric,model_metric,model_name
0,0.011385,0.993243,0.037206,0.000111,0.018658,0.767723,0.993243,0.031383,0.060844,0.154197,0.007575,6.7e-05,0.031383,0.968617,0.999933,19537.0,2.0,0.0,0.052761,RF
1,0.02431,0.996835,0.079727,0.000111,0.039919,0.776322,0.996835,0.06725,0.126,0.227375,0.016174,6.7e-05,0.06725,0.93275,0.999933,19537.0,1.0,0.000111,0.129615,RF
2,0.038655,0.006873,0.091722,0.000442,0.046082,0.791677,0.993569,0.131939,0.232944,0.320185,0.031837,0.000269,0.131939,0.868061,0.999731,19537.0,3.0,0.000219,0.211406,RF
3,0.021673,0.0,0.017922,0.0,0.008961,0.784153,1.0,0.099701,0.181324,0.278665,0.023903,0.0,0.099701,0.900299,1.0,19537.0,17.0,0.000328,0.298484,LGBM
4,0.028683,0.005814,0.031614,0.000332,0.015973,0.790039,0.994898,0.124893,0.221927,0.31158,0.030097,0.000202,0.124893,0.875107,0.999798,19537.0,4.0,0.000726,0.392168,LGBM
5,0.036005,0.002134,0.043393,0.000491,0.021942,0.796796,0.990385,0.153928,0.266445,0.345902,0.037263,0.000471,0.153928,0.846072,0.999529,19537.0,5.0,0.000812,0.398623,LGBM
6,0.057346,0.00696,0.008146,0.003117,0.005632,0.825869,0.969941,0.282451,0.4375,0.468568,0.069816,0.00276,0.282451,0.717549,0.99724,19537.0,19.0,0.000948,0.417735,LGBM
7,0.054103,0.001775,0.010189,0.002454,0.006322,0.822337,0.97272,0.266439,0.418301,0.455159,0.06567,0.002356,0.266439,0.733561,0.997644,19537.0,20.0,0.001751,0.427776,LGBM
8,0.068367,0.026821,0.017659,0.007795,0.012727,0.831448,0.942149,0.316396,0.473709,0.486843,0.080514,0.006127,0.316396,0.683604,0.993873,19537.0,18.0,0.003929,0.47181,LGBM
9,0.042162,0.00625,0.010208,0.001349,0.005778,0.81251,0.976657,0.223313,0.36351,0.415684,0.054819,0.001683,0.223313,0.776687,0.998317,19537.0,7.0,0.004248,0.472184,LGBM


In [158]:
# 'best_trial' value of the row labelled 23.
row_mask = df_overall.index == 23
n_best_trial = df_overall.loc[row_mask, 'best_trial'].values[0]

In [162]:
# `np` is already imported in the notebook's import cell, so the duplicate
# mid-notebook import is dropped.
# True when the selected row has no 'best_trial' value.
np.isnan(n_best_trial)

True

In [165]:
# Does the combined table have a column named 'o'?
'o' in df_overall.columns

False

In [170]:
file_name = 'f1-ppv-models-motpe-succesivehalving-parallel-150trials-4sim.pkl'

# Replace the trailing 8 characters ('4sim.pkl') with the current simulation
# number and prefix the name with 'metrics-'.
# NOTE(review): `sim_n` is not defined in any visible cell; it must come from
# earlier kernel state.
f'metrics-{file_name[:-8]}{sim_n!s}sim.pkl'

'metrics-f1-ppv-models-motpe-succesivehalving-parallel-150trials-1sim.pkl'

In [168]:
# Display the 'fair_metric' column of the combined table.
df_overall.loc[:,'fair_metric']

0     0.000000
1     0.000111
2     0.000219
3     0.000328
4     0.000726
5     0.000812
6     0.000948
7     0.001751
8     0.003929
9     0.004248
10    0.006572
11    0.006675
12    0.011271
13    0.016921
14    0.021963
15    0.039348
16    0.039683
17    0.041186
18    0.042791
19    0.049580
20    0.052571
21    0.055012
22         NaN
23         NaN
Name: fair_metric, dtype: float64