In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import optuna
import plotly.express as px

# Load and prepare data
DATA_PATH = 'german_credit_data_cleaned.csv'
df = pd.read_csv(DATA_PATH)
y = df['Risk_good'].values
X_full = df.drop(columns=['Risk_good'])
sensitive = df['Sex_male'].values

# Split data
X_train_full, X_test_full, y_train, y_test, sens_train, sens_test = train_test_split(
    X_full, y, sensitive, test_size=0.2, random_state=200
)

# Remove sensitive attribute from features
X_train = X_train_full.drop(columns=['Sex_male']).values
X_test = X_test_full.drop(columns=['Sex_male']).values



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define the custom objective function factory
def make_custom_obj(sens_array, alpha=0.5):
    def custom_obj(preds, dmatrix):
        labels = dmatrix.get_label()
        preds_prob = 1.0 / (1.0 + np.exp(-preds))
        grad_log = preds_prob - labels
        hess_log = preds_prob * (1.0 - preds_prob)
        mask_p = (sens_array == 1)
        mask_u = (sens_array == 0)
        n_p, n_u = mask_p.sum(), mask_u.sum()
        mean_p = preds_prob[mask_p].mean() if n_p > 0 else 0
        mean_u = preds_prob[mask_u].mean() if n_u > 0 else 0
        diff = mean_p - mean_u
        sign = np.sign(diff)
        sigmoid_deriv = preds_prob * (1.0 - preds_prob)
        grad_fair = np.zeros_like(preds)
        grad_fair[mask_p] = sign * (1.0 / n_p) * sigmoid_deriv[mask_p] if n_p > 0 else 0
        grad_fair[mask_u] = -sign * (1.0 / n_u) * sigmoid_deriv[mask_u] if n_u > 0 else 0
        hess_fair = np.zeros_like(preds)
        grad = alpha * grad_log + (1 - alpha) * grad_fair
        hess = alpha * hess_log + (1 - alpha) * hess_fair
        return grad, hess
    return custom_obj

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    alpha = trial.suggest_float('alpha', 0.0, 1.0)
    max_depth = trial.suggest_int('max_depth', 3, 7)
    eta = trial.suggest_float('eta', 0.01, 0.3)
    subsample = trial.suggest_float('subsample', 0.6, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0)
    lambda_penalty = trial.suggest_float('lambda_penalty', 0.0, 1.0)  # Trade-off parameter

    # Create custom objective
    train_obj = make_custom_obj(sens_train, alpha=alpha)

    # Set up DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)

    # Train the model
    params = {
        'max_depth': max_depth,
        'eta': eta,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'verbosity': 0
    }
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=100,
        obj=train_obj,
        evals=[(dtrain, 'train'), (dtest, 'test')],
        early_stopping_rounds=10,
        verbose_eval=False
    )

    # Predict on test set
    pred_prob = bst.predict(dtest)
    pred = (pred_prob > 0.5).astype(int)

    # Compute accuracy and DPD
    acc = accuracy_score(y_test, pred)
    dpd_val = abs(pred[sens_test == 1].mean() - pred[sens_test == 0].mean())

    # Combined objective: maximize accuracy, minimize DPD
    objective_value = acc - lambda_penalty * dpd_val

    # Store additional information
    trial.set_user_attr('accuracy', acc)
    trial.set_user_attr('dpd', dpd_val)

    return objective_value


In [3]:

# Create and run Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=2000)

# Collect results
results = []
for trial in study.trials:
    if trial.value is not None:
        results.append({
            'alpha': trial.params['alpha'],
            'max_depth': trial.params['max_depth'],
            'eta': trial.params['eta'],
            'subsample': trial.params['subsample'],
            'colsample_bytree': trial.params['colsample_bytree'],
            'lambda_penalty': trial.params['lambda_penalty'],
            'accuracy': trial.user_attrs['accuracy'],
            'dpd': trial.user_attrs['dpd']
        })

# Print summary of results
print("Summary of Optuna optimization results:")
for res in results:
    print(f"Params: {res}, Accuracy: {res['accuracy']:.4f}, DPD: {res['dpd']:.4f}")

# Prepare data for plotting
acc_vals = [res['accuracy'] for res in results]
dpd_vals = [res['dpd'] for res in results]
alpha_vals = [res['alpha'] for res in results]



[I 2025-05-14 23:25:24,548] A new study created in memory with name: no-name-3f38b8dc-7425-4407-894b-d975f274e117
[I 2025-05-14 23:25:24,952] Trial 0 finished with value: 0.6863186170471824 and parameters: {'alpha': 0.4592953729570479, 'max_depth': 4, 'eta': 0.1192532755520472, 'subsample': 0.6683221260921223, 'colsample_bytree': 0.7793229867616808, 'lambda_penalty': 0.6627416286429715}. Best is trial 0 with value: 0.6863186170471824.
[I 2025-05-14 23:25:25,024] Trial 1 finished with value: 0.7061907199122462 and parameters: {'alpha': 0.1599471226960153, 'max_depth': 7, 'eta': 0.2730501914706804, 'subsample': 0.7667283204264608, 'colsample_bytree': 0.8878639133226968, 'lambda_penalty': 0.4445760470244461}. Best is trial 1 with value: 0.7061907199122462.
[I 2025-05-14 23:25:25,095] Trial 2 finished with value: 0.7274242306120136 and parameters: {'alpha': 0.3394135029462937, 'max_depth': 6, 'eta': 0.29550010499355817, 'subsample': 0.8651167453625013, 'colsample_bytree': 0.961296059748703

Summary of Optuna optimization results:
Params: {'alpha': 0.4592953729570479, 'max_depth': 4, 'eta': 0.1192532755520472, 'subsample': 0.6683221260921223, 'colsample_bytree': 0.7793229867616808, 'lambda_penalty': 0.6627416286429715, 'accuracy': 0.6892857142857143, 'dpd': np.float64(0.004477004477004498)}, Accuracy: 0.6893, DPD: 0.0045
Params: {'alpha': 0.1599471226960153, 'max_depth': 7, 'eta': 0.2730501914706804, 'subsample': 0.7667283204264608, 'colsample_bytree': 0.8878639133226968, 'lambda_penalty': 0.4445760470244461, 'accuracy': 0.7107142857142857, 'dpd': np.float64(0.010175010175010157)}, Accuracy: 0.7107, DPD: 0.0102
Params: {'alpha': 0.3394135029462937, 'max_depth': 6, 'eta': 0.29550010499355817, 'subsample': 0.8651167453625013, 'colsample_bytree': 0.9612960597487031, 'lambda_penalty': 0.1084102071647115, 'accuracy': 0.7285714285714285, 'dpd': np.float64(0.010582010582010581)}, Accuracy: 0.7286, DPD: 0.0106
Params: {'alpha': 0.5860185064798528, 'max_depth': 6, 'eta': 0.19327343

In [None]:
# 6. Plot only non-dominated (Pareto-optimal) points
import pandas as pd
import plotly.express as px

def is_pareto_efficient(acc, dpd):
    n = len(acc)
    mask = [True]*n
    for i in range(n):
        for j in range(n):
            if (acc[j]>=acc[i] and dpd[j]<=dpd[i]) and (acc[j]>acc[i] or dpd[j]<dpd[i]):
                mask[i] = False
                break
    return mask

# compute mask and build DataFrame
pareto_mask = is_pareto_efficient(acc_vals, dpd_vals)
plot_df = pd.DataFrame({
    'DPD':       dpd_vals,
    'Accuracy':  acc_vals,
    'alpha':     alpha_vals,
    'Pareto':    ['Yes' if m else 'No' for m in pareto_mask]
})

# filter to only Pareto-optimal
pareto_df = plot_df[plot_df['Pareto']=='Yes']

# scatter
fig = px.scatter(
    pareto_df,
    x='DPD',
    y='Accuracy',
    color='alpha',
    color_continuous_scale='Viridis',
    hover_data=['alpha','DPD','Accuracy'],
    title='Pareto-optimal Points: DPD vs Accuracy'
)
fig.show()