In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import optuna
import plotly.express as px

# Load and prepare data
DATA_PATH = 'german_credit_data_cleaned.csv'
df = pd.read_csv(DATA_PATH)
y = df['Risk_good'].values
X_full = df.drop(columns=['Risk_good'])
sensitive = df['Sex_male'].values



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define the custom objective function factory
def make_custom_obj(sens_array, alpha=0.5):
    def custom_obj(preds, dmatrix):
        labels = dmatrix.get_label()
        preds_prob = 1.0 / (1.0 + np.exp(-preds))
        grad_log = preds_prob - labels
        hess_log = preds_prob * (1.0 - preds_prob)
        mask_p = (sens_array == 1)
        mask_u = (sens_array == 0)
        n_p, n_u = mask_p.sum(), mask_u.sum()
        mean_p = preds_prob[mask_p].mean() if n_p > 0 else 0
        mean_u = preds_prob[mask_u].mean() if n_u > 0 else 0
        diff = mean_p - mean_u
        sign = np.sign(diff)
        sigmoid_deriv = preds_prob * (1.0 - preds_prob)
        grad_fair = np.zeros_like(preds)
        grad_fair[mask_p] = sign * (1.0 / n_p) * sigmoid_deriv[mask_p] if n_p > 0 else 0
        grad_fair[mask_u] = -sign * (1.0 / n_u) * sigmoid_deriv[mask_u] if n_u > 0 else 0
        hess_fair = np.zeros_like(preds)
        grad = alpha * grad_log + (1 - alpha) * grad_fair
        hess = alpha * hess_log + (1 - alpha) * hess_fair
        return grad, hess
    return custom_obj

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    alpha = trial.suggest_float('alpha', 0.0, 1.0)
    max_depth = trial.suggest_int('max_depth', 3, 7)
    eta = trial.suggest_float('eta', 0.01, 0.3)
    subsample = trial.suggest_float('subsample', 0.6, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0)

    # Initialize cross-validation
    kf = KFold(n_splits=3, shuffle=True, random_state=42)
    acc_scores = []
    dpd_scores = []

    # Perform 3-fold cross-validation
    for train_idx, val_idx in kf.split(X_full):
        # Split data
        X_train = X_full.iloc[train_idx].drop(columns=['Sex_male']).values
        y_train = y[train_idx]
        sens_train = sensitive[train_idx]
        X_val = X_full.iloc[val_idx].drop(columns=['Sex_male']).values
        y_val = y[val_idx]
        sens_val = sensitive[val_idx]

        # Create custom objective
        train_obj = make_custom_obj(sens_train, alpha=alpha)

        # Set up DMatrix
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        # Train the model
        params = {
            'max_depth': max_depth,
            'eta': eta,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'verbosity': 0
        }
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=100,
            obj=train_obj,
            evals=[(dtrain, 'train'), (dval, 'val')],
            early_stopping_rounds=10,
            verbose_eval=False
        )

        # Predict on validation set
        pred_prob = bst.predict(dval)
        pred = (pred_prob > 0.5).astype(int)

        # Compute accuracy and DPD
        acc = accuracy_score(y_val, pred)
        dpd_val = abs(pred[sens_val == 1].mean() - pred[sens_val == 0].mean())
        acc_scores.append(acc)
        dpd_scores.append(dpd_val)

    # Compute mean metrics across folds
    mean_acc = np.mean(acc_scores)
    mean_dpd = np.mean(dpd_scores)

    # Store metrics for analysis
    trial.set_user_attr('accuracy', mean_acc)
    trial.set_user_attr('dpd', mean_dpd)
    trial.set_user_attr('params', {
        'alpha': alpha,
        'max_depth': max_depth,
        'eta': eta,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree
    })

    return mean_acc, mean_dpd


In [3]:

# Create and run Optuna study for multi-objective optimization
study = optuna.create_study(directions=['maximize', 'minimize'])
study.optimize(objective, n_trials=2000)

# Collect Pareto front results
results = []
for trial in study.best_trials:  # Best trials are the Pareto-optimal ones
    results.append({
        'accuracy': trial.values[0],
        'dpd': trial.values[1],
        'alpha': trial.user_attrs['params']['alpha'],
        'params': trial.user_attrs['params']
    })

# Print summary of Pareto-optimal results
print("Pareto-optimal solutions:")
for res in results:
    print(f"Params: {res['params']}, Accuracy: {res['accuracy']:.4f}, DPD: {res['dpd']:.4f}")

# Prepare data for plotting
plot_df = pd.DataFrame({
    'DPD': [res['dpd'] for res in results],
    'Accuracy': [res['accuracy'] for res in results],
    'alpha': [res['alpha'] for res in results]
})



[I 2025-05-15 00:13:05,003] A new study created in memory with name: no-name-e7eefbff-a6ad-4701-9d90-5c9ca90f2058
[I 2025-05-15 00:13:05,696] Trial 0 finished with values: [0.6842889658827387, 0.05078741226053076] and parameters: {'alpha': 0.7322567767359832, 'max_depth': 4, 'eta': 0.24780138596990386, 'subsample': 0.9604171906491878, 'colsample_bytree': 0.9395369071061972}.
[I 2025-05-15 00:13:05,944] Trial 1 finished with values: [0.6678629305247936, 0.03779013877147597] and parameters: {'alpha': 0.18424743440551838, 'max_depth': 5, 'eta': 0.11405322925341065, 'subsample': 0.9857638680246539, 'colsample_bytree': 0.6794095077602501}.
[I 2025-05-15 00:13:06,217] Trial 2 finished with values: [0.6950109210772196, 0.057206087778508115] and parameters: {'alpha': 0.9374283216392799, 'max_depth': 4, 'eta': 0.21901686836526663, 'subsample': 0.8376771934629543, 'colsample_bytree': 0.686670225449568}.
[I 2025-05-15 00:13:06,564] Trial 3 finished with values: [0.6821399797201876, 0.065948581765

Pareto-optimal solutions:
Params: {'alpha': 0.012915419123790906, 'max_depth': 7, 'eta': 0.22739873665679933, 'subsample': 0.871229825323997, 'colsample_bytree': 0.9273609939577232}, Accuracy: 0.5000, DPD: 0.0000
Params: {'alpha': 0.006398238570445525, 'max_depth': 4, 'eta': 0.1835864240964316, 'subsample': 0.872143282534935, 'colsample_bytree': 0.6846054564616372}, Accuracy: 0.5000, DPD: 0.0000
Params: {'alpha': 0.0018713778605690123, 'max_depth': 3, 'eta': 0.07253350513913152, 'subsample': 0.8088010652239656, 'colsample_bytree': 0.8778999079756078}, Accuracy: 0.5000, DPD: 0.0000
Params: {'alpha': 0.006398238570445525, 'max_depth': 4, 'eta': 0.1835864240964316, 'subsample': 0.6500142472469429, 'colsample_bytree': 0.7719975241381819}, Accuracy: 0.5000, DPD: 0.0000
Params: {'alpha': 0.006398238570445525, 'max_depth': 6, 'eta': 0.1835864240964316, 'subsample': 0.7126649411379313, 'colsample_bytree': 0.6846054564616372}, Accuracy: 0.5000, DPD: 0.0000
Params: {'alpha': 0.006398238570445525

In [4]:
# Create interactive Plotly plot
fig = px.scatter(
    plot_df,
    x='DPD',
    y='Accuracy',
    color='alpha',
    color_continuous_scale='Viridis',
    size_max=15,
    hover_data={'alpha': True, 'DPD': True, 'Accuracy': True},
    title='Pareto Front: DPD vs. Accuracy'
)
fig.update_traces(marker=dict(line_width=0))
fig.show()

In [7]:

# Create and run Optuna study for multi-objective optimization
study = optuna.create_study(directions=['maximize', 'minimize'])
study.optimize(objective, n_trials=100)

# Collect Pareto front results
results = []
for trial in study.best_trials:  # Best trials are the Pareto-optimal ones
    results.append({
        'accuracy': trial.values[0],
        'dpd': trial.values[1],
        'alpha': trial.user_attrs['params']['alpha'],
        'params': trial.user_attrs['params']
    })

# Print summary of Pareto-optimal results
print("Pareto-optimal solutions:")
for res in results:
    print(f"Params: {res['params']}, Accuracy: {res['accuracy']:.4f}, DPD: {res['dpd']:.4f}")

# Prepare data for plotting
plot_df = pd.DataFrame({
    'DPD': [res['dpd'] for res in results],
    'Accuracy': [res['accuracy'] for res in results],
    'alpha': [res['alpha'] for res in results]
})



[I 2025-05-15 00:24:54,034] A new study created in memory with name: no-name-4340bd0a-84cc-4cc8-9175-a0e8ebca6ecf
[I 2025-05-15 00:24:54,209] Trial 0 finished with values: [0.668579769814939, 0.04705854807698547] and parameters: {'alpha': 0.7173529644595884, 'max_depth': 4, 'eta': 0.040674076986809145, 'subsample': 0.9932367703957148, 'colsample_bytree': 0.7909352143832696}.
[I 2025-05-15 00:24:54,528] Trial 1 finished with values: [0.704999188194821, 0.09235785660236438] and parameters: {'alpha': 0.8617201466817577, 'max_depth': 6, 'eta': 0.24832927606538044, 'subsample': 0.6199735034178627, 'colsample_bytree': 0.9865146806199016}.
[I 2025-05-15 00:24:54,824] Trial 2 finished with values: [0.6700134483952297, 0.03622823029083646] and parameters: {'alpha': 0.7725592387321162, 'max_depth': 3, 'eta': 0.09994010638179107, 'subsample': 0.7983146992934836, 'colsample_bytree': 0.7896876756478979}.
[I 2025-05-15 00:24:55,291] Trial 3 finished with values: [0.7200007352197848, 0.06911685486847

Pareto-optimal solutions:
Params: {'alpha': 0.949771839688359, 'max_depth': 7, 'eta': 0.28976540337251927, 'subsample': 0.6237175940845682, 'colsample_bytree': 0.9117198391851572}, Accuracy: 0.7200, DPD: 0.0691
Params: {'alpha': 0.6205813998381142, 'max_depth': 5, 'eta': 0.22305705682721003, 'subsample': 0.8208929507370241, 'colsample_bytree': 0.7151684145313815}, Accuracy: 0.6871, DPD: 0.0509
Params: {'alpha': 0.32246385915926223, 'max_depth': 7, 'eta': 0.25373919752102353, 'subsample': 0.6142096021547871, 'colsample_bytree': 0.6128323138461458}, Accuracy: 0.6814, DPD: 0.0438
Params: {'alpha': 0.03417285297720973, 'max_depth': 7, 'eta': 0.22559053945727284, 'subsample': 0.6925253673518488, 'colsample_bytree': 0.6482599930718969}, Accuracy: 0.6707, DPD: 0.0130
Params: {'alpha': 0.9646708794923932, 'max_depth': 5, 'eta': 0.19039189345893598, 'subsample': 0.8438088653110281, 'colsample_bytree': 0.611858816055013}, Accuracy: 0.7007, DPD: 0.0586
Params: {'alpha': 0.24748086673576375, 'max_

In [8]:
# Create interactive Plotly plot
fig = px.scatter(
    plot_df,
    x='DPD',
    y='Accuracy',
    color='alpha',
    color_continuous_scale='Viridis',
    size_max=15,
    hover_data={'alpha': True, 'DPD': True, 'Accuracy': True},
    title='Pareto Front: DPD vs. Accuracy'
)
fig.update_traces(marker=dict(line_width=0))
fig.show()