In [1]:
#! pip install fairlearn
#! pip install lightgbm
#! pip install optuna

In [7]:
from sklearn.metrics import f1_score, confusion_matrix, make_scorer, accuracy_score, recall_score
from sklearn.model_selection import cross_val_score, train_test_split, cross_validate
from fairlearn.datasets import fetch_adult
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

import numpy as np
import optuna
import tqdm as notebook_tqdm
from metrics import (
    equality_opportunity_difference,
    predictive_equality_difference,
    metrics
)

from fairlearn.metrics import demographic_parity_difference


In [3]:
# Load the Adult dataset as pandas objects.
data = fetch_adult(as_frame=True)
X_raw = data.data
# Binary target: 1 where the label equals ">50K", 0 otherwise.
y = (data.target == ">50K") * 1
# The "sex" column, kept separately from the feature frame.
A = X_raw["sex"]

# Non-categorical columns: impute with SimpleImputer's default strategy,
# then standardise.
numeric_transformer = Pipeline(
    steps=[("impute", SimpleImputer()), ("scaler", StandardScaler())]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Dispatch each column to one of the two sub-pipelines based on its dtype.
is_categorical = selector(dtype_include="category")
is_not_categorical = selector(dtype_exclude="category")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, is_not_categorical),
        ("cat", categorical_transformer, is_categorical),
    ]
)

# End-to-end estimator: preprocessing followed by a LightGBM classifier.
# The step name "classifier" is how later cells reach the model to set
# its hyper-parameters.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LGBMClassifier(n_jobs=-1)),
    ]
)

  warn(


In [4]:
# Collector for the finished optuna studies (one is appended per simulation).
results = list()

In [5]:

# Objective 0 (fairness gap) is minimised and objective 1 (model score) is
# maximised; the order matches the tuple returned by `objective` below.
directions = ['minimize', 'maximize']
# Multi-metric scorer built by the project's `metrics` helper. Its results are
# read back from cross_validate under the "test_fairness" / "test_model" keys.
metric_scorer_decorated =  metrics(recall_score, predictive_equality_difference, sensitive_col = 'sex')

for sim in [0,1,2,3]:
    print(sim)

    # The split depends only on `sim`, so it is computed once per simulation
    # instead of once per trial. The sensitive column is not split separately
    # because it was never used after the split; the partition of X and y is
    # unchanged since it is determined by `stratify` and `random_state` alone.
    # NOTE(review): test_size=0.8 means tuning only sees 20% of the data —
    # confirm this is intentional.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X_raw, y, test_size=0.8, random_state=sim, stratify=y
    )
    X_train = X_train.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    # Held-out part is not used during tuning; it is only re-indexed so it
    # stays consistent with the training part.
    X_holdout = X_holdout.reset_index(drop=True)
    y_holdout = y_holdout.reset_index(drop=True)

    def objective(trial):
        """Evaluate one LightGBM configuration with 5-fold cross-validation.

        Parameters
        ----------
        trial : optuna.trial.Trial
            Trial used to sample the classifier hyper-parameters.

        Returns
        -------
        tuple of float
            ``(fair_metric, model_metric)``: the fold-averaged "fairness" and
            "model" scores produced by ``metric_scorer_decorated``, in the
            same order as ``directions``.
        """
        params = {
            'n_estimators': trial.suggest_int("n_estimators", 20, 10000),
            'num_leaves': trial.suggest_int("num_leaves", 10, 1000),
            'max_depth': trial.suggest_int("max_depth", 2, 20),
            'min_child_samples': trial.suggest_int("min_child_samples", 5, 300),
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-2),
            'boosting_type': trial.suggest_categorical("boosting_type", ['goss', 'gbdt'])
            }
        # Updates the shared pipeline in place; after the loop it therefore
        # holds the parameters of the last trial that ran.
        pipeline['classifier'].set_params(**params)

        # Train-set scores were never read, so they are no longer requested
        # (saves one scoring pass per fold).
        scores = cross_validate(
                pipeline,
                X_train,
                y_train,
                cv=5,
                scoring = metric_scorer_decorated)

        fair_metric = scores['test_fairness'].mean()
        model_metric = scores['test_model'].mean()

        return fair_metric, model_metric

    # NOTE(review): the objective never reports intermediate values and Optuna
    # does not support pruning for multi-objective studies, so the pruner has
    # no effect here; it is kept only to leave the study configuration as-is.
    study = optuna.create_study(
        directions = directions,
        pruner = optuna.pruners.SuccessiveHalvingPruner(),
        # Seeded with the simulation index so each run's proposals are repeatable.
        sampler = optuna.samplers.TPESampler(seed=sim)
        )
    study.optimize(objective, n_trials=100)
    print("Number of finished trials: ", len(study.trials))
    results.append(study)

[32m[I 2023-05-15 09:43:08,183][0m A new study created in memory with name: no-name-735d9eab-ceb0-4086-8710-ec9540b40bdb[0m


0


[32m[I 2023-05-15 09:43:20,303][0m Trial 0 finished with values: [0.06523645275832397, 0.5990464686396164] and parameters: {'n_estimators': 1834, 'num_leaves': 505, 'max_depth': 5, 'min_child_samples': 111, 'learning_rate': 0.004084613385254694, 'boosting_type': 'goss'}. [0m
[32m[I 2023-05-15 09:44:37,085][0m Trial 1 finished with values: [0.08100287008851163, 0.6315745163710902] and parameters: {'n_estimators': 8901, 'num_leaves': 246, 'max_depth': 10, 'min_child_samples': 167, 'learning_rate': 0.007421868515780198, 'boosting_type': 'gbdt'}. [0m
[32m[I 2023-05-15 09:47:44,462][0m Trial 2 finished with values: [0.06444618532595413, 0.6345751203352916] and parameters: {'n_estimators': 8827, 'num_leaves': 664, 'max_depth': 20, 'min_child_samples': 100, 'learning_rate': 0.0011897051316088038, 'boosting_type': 'gbdt'}. [0m
[32m[I 2023-05-15 09:47:51,176][0m Trial 3 finished with values: [0.07036414649604118, 0.6191722030051795] and parameters: {'n_estimators': 459, 'num_leaves':

Number of finished trials:  100
1


[32m[I 2023-05-15 10:52:45,597][0m Trial 0 finished with values: [0.08349466155755599, 0.6302942952835886] and parameters: {'n_estimators': 6468, 'num_leaves': 811, 'max_depth': 4, 'min_child_samples': 151, 'learning_rate': 0.00668890232386543, 'boosting_type': 'gbdt'}. [0m
[32m[I 2023-05-15 10:53:46,544][0m Trial 1 finished with values: [0.09811034201778757, 0.6277256172331118] and parameters: {'n_estimators': 5137, 'num_leaves': 971, 'max_depth': 13, 'min_child_samples': 240, 'learning_rate': 0.009861946043767443, 'boosting_type': 'gbdt'}. [0m
[32m[I 2023-05-15 10:54:35,947][0m Trial 2 finished with values: [0.054038982731674276, 0.4719742308607405] and parameters: {'n_estimators': 4183, 'num_leaves': 566, 'max_depth': 11, 'min_child_samples': 263, 'learning_rate': 0.0003494389351289893, 'boosting_type': 'gbdt'}. [0m
[32m[I 2023-05-15 10:54:45,683][0m Trial 3 finished with values: [0.07991337575886587, 0.6058978019363459] and parameters: {'n_estimators': 1117, 'num_leaves'

Number of finished trials:  100
2


[32m[I 2023-05-15 11:52:11,364][0m Trial 0 finished with values: [0.07933137715981317, 0.6328538223613169] and parameters: {'n_estimators': 9095, 'num_leaves': 789, 'max_depth': 17, 'min_child_samples': 7, 'learning_rate': 0.0033770598592845905, 'boosting_type': 'gbdt'}. [0m
[32m[I 2023-05-15 11:54:45,825][0m Trial 1 finished with values: [0.08181124557715733, 0.6495424513625798] and parameters: {'n_estimators': 8289, 'num_leaves': 514, 'max_depth': 11, 'min_child_samples': 31, 'learning_rate': 0.005424441957398881, 'boosting_type': 'goss'}. [0m
[32m[I 2023-05-15 11:55:22,031][0m Trial 2 finished with values: [0.07526054913718676, 0.6320046120902651] and parameters: {'n_estimators': 6588, 'num_leaves': 877, 'max_depth': 2, 'min_child_samples': 75, 'learning_rate': 0.0035420502143955765, 'boosting_type': 'goss'}. [0m
[32m[I 2023-05-15 11:57:15,384][0m Trial 3 finished with values: [0.07893929184324908, 0.6401361664744962] and parameters: {'n_estimators': 7767, 'num_leaves': 2

Number of finished trials:  100
3


[32m[I 2023-05-15 13:55:54,806][0m Trial 0 finished with values: [0.08469921152409457, 0.6401691099763904] and parameters: {'n_estimators': 8909, 'num_leaves': 156, 'max_depth': 11, 'min_child_samples': 105, 'learning_rate': 0.00684861902437049, 'boosting_type': 'gbdt'}. [0m
[32m[I 2023-05-15 13:56:02,517][0m Trial 1 finished with values: [0.0929872072070013, 0.5866652025110269] and parameters: {'n_estimators': 1073, 'num_leaves': 570, 'max_depth': 8, 'min_child_samples': 228, 'learning_rate': 0.0068947582925262116, 'boosting_type': 'goss'}. [0m
[32m[I 2023-05-15 13:56:19,063][0m Trial 2 finished with values: [0.08146890654788493, 0.6264563773129084] and parameters: {'n_estimators': 1902, 'num_leaves': 49, 'max_depth': 20, 'min_child_samples': 290, 'learning_rate': 0.007660583009013946, 'boosting_type': 'gbdt'}. [0m
[32m[I 2023-05-15 13:57:32,746][0m Trial 3 finished with values: [0.0795421821857093, 0.6307362872673365] and parameters: {'n_estimators': 5852, 'num_leaves': 54

Number of finished trials:  100


In [6]:
import dill
# Destination for the pickled list of studies collected in `results`.
file_name = 'recall-fpr-lgbm-motpe-succesivehalving-100trials-4sim.pkl'
#f1-eod-lgbm-succesivehalving-30trails.pkl  (presumably the file name of an earlier run — confirm)
with open(file_name, 'wb') as file:
    dill.dump(results, file)
# Report success only once the with-block has flushed and closed the file,
# so the message is never shown for a write that failed on close.
print(f'Object successfully saved to "{file_name}"')

Object successfully saved to "recall-fpr-lgbm-motpe-succesivehalving-100trials-4sim.pkl"


In [7]:
# Pareto front for `study`, i.e. the study left over from the last iteration of
# the tuning loop. The second objective is the recall_score-based model metric,
# so the axis is labelled "recall" (it was previously mislabelled "f1_score").
optuna.visualization.plot_pareto_front(study, target_names=["fair_metric", "recall"], include_dominated_trials = True)