In [1]:
#! pip install fairlearn
#! pip install lightgbm
#! pip install optuna

In [2]:
from sklearn.metrics import (
    f1_score, 
    confusion_matrix, 
    make_scorer, 
    accuracy_score, 
    recall_score, 
    matthews_corrcoef
)
from sklearn.model_selection import cross_val_score, train_test_split, cross_validate
from fairlearn.datasets import fetch_adult
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

from concurrent.futures import ThreadPoolExecutor
import tqdm as notebook_tqdm
from metrics import (
    equality_opportunity_difference,
    predictive_equality_difference,
    predictive_parity_difference,
    metrics,
    average_absolute_odds_difference
    
)
from fairlearn.metrics import demographic_parity_difference
from sklearn.utils import resample

import numpy as np
import optuna
import dill
import pickle


In [3]:
sensitive_attribute = 'sex'
data = fetch_adult(as_frame=True)
X_raw = data.data
y = (data.target == ">50K") * 1

if sensitive_attribute == 'race':
    mapping = {'White':'white','Black':'black','Asian-Pac-Islander':'others','Amer-Indian-Eskimo':'others','Other':'others'}
    X_raw.loc[:,'race'] = X_raw['race'].map(mapping).astype("category")

A = X_raw[sensitive_attribute]



perc = .5
X_raw, y, A = resample(X_raw, y, A, n_samples=int(perc*X_raw.shape[0]), random_state = 123)

numeric_transformer = Pipeline(
    steps=[
        ("impute", SimpleImputer()),
        ("scaler", StandardScaler()),
    ]
)
categorical_transformer = Pipeline(
    [
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="category")),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ]
)


  warn(


In [4]:
results = []


In [None]:
n_trials = 100
num_sim = 4
directions = ['minimize', 'maximize']
metric_scorer_decorated =  metrics(f1_score, 
                                   predictive_parity_difference, 
                                   sensitive_col = sensitive_attribute)
for sim in range(num_sim):
    print(sim)
    study_name = f'recall-demographic-models-motpe-succesivehalving-parallel-{n_trials}trials-{sim+1}sim'
    def objective(trial):
        (X_train, X_test, y_train, y_test, A_train, A_test) = train_test_split(
        X_raw, y, A, test_size=0.2, random_state=sim, stratify=y
        )

        X_train = X_train.reset_index(drop=True)
        X_test = X_test.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        y_test = y_test.reset_index(drop=True)
        A_train = A_train.reset_index(drop=True)
        A_test = A_test.reset_index(drop=True)

        classifier_name = trial.suggest_categorical("classifier", ["RF", 'GBM','LGBM'])

        if classifier_name == "logit":        
            params = {
                "penalty" : trial.suggest_categorical('logit_penalty', ['l1','l2']),
                "C" : trial.suggest_float('logit_c', 0.001, 10),
                "max_iter": 2000,
                "solver" : 'saga'
                }
            classifier = LogisticRegression(**params)
        
        elif classifier_name =="RF":
            params = {
                'n_estimators': trial.suggest_int("rf_n_estimators", 100, 1000),
                'criterion': trial.suggest_categorical("rf_criterion", ['gini', 'entropy']),
                'max_depth': trial.suggest_int("rf_max_depth", 1, 4),
                'min_samples_split': trial.suggest_float("rf_min_samples_split", 0.01, 1),
                }
            classifier = RandomForestClassifier(**params)

        elif classifier_name =="LGBM":
            params = {
                'n_estimators': trial.suggest_int("lgbm_n_estimators", 20, 10000),
                'num_leaves': trial.suggest_int("lgbm_num_leaves", 10, 1000),
                'max_depth': trial.suggest_int("lgbm_max_depth", 2, 20),
                'min_child_samples': trial.suggest_int("lgbm_min_child_samples", 5, 300),
                'learning_rate': trial.suggest_float('lgbm_learning_rate', 1e-5, 1e-2),
                'boosting_type': trial.suggest_categorical("lgbm_boosting_type", ['goss', 'gbdt'])
                }
            classifier = LGBMClassifier(**params)  
        
        elif classifier_name =="GBM":
            params = {
                'n_estimators': trial.suggest_int("gbm_n_estimators", 100, 1000), 
                'criterion': trial.suggest_categorical("gbm_criterion", ['squared_error', 'friedman_mse']),
                'max_depth': trial.suggest_int("gbm_max_depth", 1, 4),
                'min_samples_split': trial.suggest_int("gbm_min_samples_split", 5, 300),
                }
            classifier = GradientBoostingClassifier(**params)            
        
        else:
            None
        

        pipeline = Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("classifier", classifier),
            ]
        )
                
        scores = cross_validate(
                pipeline, 
                X_train,
                y_train, 
                cv=5,
                scoring = metric_scorer_decorated,
                return_train_score=True)

        fair_metric = scores['test_fairness'].mean()
        model_metric = scores['test_model'].mean()

        return fair_metric, model_metric
    
    storage_name = "sqlite:///{}.db".format(study_name)
    
    study = optuna.create_study(
        directions = directions, 
        pruner = optuna.pruners.SuccessiveHalvingPruner(), 
        sampler = optuna.samplers.TPESampler(),
        storage = storage_name
        )
  
    study.optimize(objective, n_trials=n_trials, n_jobs=1)
    
    print("Number of finished trials: ", len(study.trials))
    results.append(study)
    
    file_name = study_name +'.pkl'
    with open(file_name, 'wb') as file:
        dill.dump(results, file)
        print(f'Object successfully saved to "{file_name}"')

0


[I 2023-07-10 15:52:22,115] A new study created in RDB with name: no-name-1c60fb52-345e-455a-b1d5-c55bb6e7816b
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_p

In [None]:
file_name = study_name +'.pkl'
with open(file_name, 'wb') as file:
    dill.dump(results, file)
    print(f'Object successfully saved to "{file_name}"')