In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import optuna
import plotly.express as px

# Load and prepare data
DATA_PATH = 'german_credit_data_cleaned.csv'
df = pd.read_csv(DATA_PATH)
y = df['Risk_good'].values
X_full = df.drop(columns=['Risk_good'])
sensitive = df['Sex_male'].values



  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Define the custom objective function factory
def make_custom_obj(sens_array, alpha=0.5):
    def custom_obj(preds, dmatrix):
        labels = dmatrix.get_label()
        preds_prob = 1.0 / (1.0 + np.exp(-preds))
        grad_log = preds_prob - labels
        hess_log = preds_prob * (1.0 - preds_prob)
        mask_p = (sens_array == 1)
        mask_u = (sens_array == 0)
        n_p, n_u = mask_p.sum(), mask_u.sum()
        mean_p = preds_prob[mask_p].mean() if n_p > 0 else 0
        mean_u = preds_prob[mask_u].mean() if n_u > 0 else 0
        diff = mean_p - mean_u
        sign = np.sign(diff)
        sigmoid_deriv = preds_prob * (1.0 - preds_prob)
        grad_fair = np.zeros_like(preds)
        grad_fair[mask_p] = sign * (1.0 / n_p) * sigmoid_deriv[mask_p] if n_p > 0 else 0
        grad_fair[mask_u] = -sign * (1.0 / n_u) * sigmoid_deriv[mask_u] if n_u > 0 else 0
        hess_fair = np.zeros_like(preds)
        grad = alpha * grad_log + (1 - alpha) * grad_fair
        hess = alpha * hess_log + (1 - alpha) * hess_fair
        return grad, hess
    return custom_obj

# Define the objective function for Optuna

def create_objective_function(alpha):
    def objective(trial):
        # Suggest hyperparameters
        # alpha = trial.suggest_float('alpha', 0.0, 1.0)
        max_depth = trial.suggest_int('max_depth', 3, 7)
        eta = trial.suggest_float('eta', 0.01, 0.3)
        subsample = trial.suggest_float('subsample', 0.6, 1.0)
        colsample_bytree = trial.suggest_float('colsample_bytree', 0.6, 1.0)

        # Initialize cross-validation
        kf = KFold(n_splits=3, shuffle=True, random_state=42)
        acc_scores = []
        dpd_scores = []

        # Perform 3-fold cross-validation
        for train_idx, val_idx in kf.split(X_full):
            # Split data
            X_train = X_full.iloc[train_idx].drop(columns=['Sex_male']).values
            y_train = y[train_idx]
            sens_train = sensitive[train_idx]
            X_val = X_full.iloc[val_idx].drop(columns=['Sex_male']).values
            y_val = y[val_idx]
            sens_val = sensitive[val_idx]

            # Create custom objective
            train_obj = make_custom_obj(sens_train, alpha=alpha)

            # Set up DMatrix
            dtrain = xgb.DMatrix(X_train, label=y_train)
            dval = xgb.DMatrix(X_val, label=y_val)

            # Train the model
            params = {
                'max_depth': max_depth,
                'eta': eta,
                'subsample': subsample,
                'colsample_bytree': colsample_bytree,
                'verbosity': 0
            }
            bst = xgb.train(
                params,
                dtrain,
                num_boost_round=100,
                obj=train_obj,
                evals=[(dtrain, 'train'), (dval, 'val')],
                early_stopping_rounds=10,
                verbose_eval=False
            )

            # Predict on validation set
            pred_prob = bst.predict(dval)
            pred = (pred_prob > 0.5).astype(int)

            # Compute accuracy and DPD
            acc = accuracy_score(y_val, pred)
            dpd_val = abs(pred[sens_val == 1].mean() - pred[sens_val == 0].mean())
            acc_scores.append(acc)
            dpd_scores.append(dpd_val)

        # Compute mean metrics across folds
        mean_acc = np.mean(acc_scores)
        mean_dpd = np.mean(dpd_scores)

        # Store metrics for analysis
        trial.set_user_attr('accuracy', mean_acc)
        trial.set_user_attr('dpd', mean_dpd)
        trial.set_user_attr('params', {
            'alpha': alpha,
            'max_depth': max_depth,
            'eta': eta,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree
        })

        return mean_acc, mean_dpd
    return objective


In [28]:

# Create and run Optuna study for multi-objective optimization
study = optuna.create_study(directions=['maximize', 'minimize'])

objective = create_objective_function(alpha=1)  # Example alpha value
study.optimize(objective, n_trials=1)
objective = create_objective_function(alpha=0)  # Example alpha value
study.optimize(objective, n_trials=1)


#adaptive weights based on distance

for i in range(250):
    max_dist = 0
    max_dist_alpha = None
    for trial_i in study.best_trials:
        for trial_j in study.best_trials:
            if trial_i != trial_j:
                # Calculate distance between trials
                dist = np.sqrt((trial_i.values[0] - trial_j.values[0])**2 + (trial_i.values[1] - trial_j.values[1])**2)
                if dist > max_dist:
                    max_dist = dist
                    max_dist_alpha = (trial_i.user_attrs['params']['alpha'] + trial_j.user_attrs['params']['alpha']) / 2
    
    # print(max_dist_alpha)
    # max_dist_alpha = np.random.uniform(0, 1)
    objective = create_objective_function(alpha=max_dist_alpha)  # Example alpha value
    study.optimize(objective, n_trials=1)

# Collect Pareto front results
results = []
for trial in study.best_trials:  # Best trials are the Pareto-optimal ones
    results.append({
        'accuracy': trial.values[0],
        'dpd': trial.values[1],
        'alpha': trial.user_attrs['params']['alpha'],
        'params': trial.user_attrs['params']
    })

# Print summary of Pareto-optimal results
print("Pareto-optimal solutions:")
for res in results:
    print(f"Params: {res['params']}, Accuracy: {res['accuracy']:.4f}, DPD: {res['dpd']:.4f}")

# Prepare data for plotting
plot_df = pd.DataFrame({
    'DPD': [res['dpd'] for res in results],
    'Accuracy': [res['accuracy'] for res in results],
    'alpha': [res['alpha'] for res in results]
})



[I 2025-05-15 04:44:06,657] A new study created in memory with name: no-name-db0960c3-b8ac-4f70-b5e8-f76b1da2b646
[I 2025-05-15 04:44:06,831] Trial 0 finished with values: [0.6978507074958721, 0.07921754396821991] and parameters: {'max_depth': 7, 'eta': 0.0140276496136848, 'subsample': 0.9730734246835414, 'colsample_bytree': 0.7493996073000908}.
[I 2025-05-15 04:44:06,885] Trial 1 finished with values: [0.5000061268315397, 0.0] and parameters: {'max_depth': 3, 'eta': 0.18417884835987844, 'subsample': 0.9090853732103509, 'colsample_bytree': 0.7901593604495847}.
[I 2025-05-15 04:44:06,964] Trial 2 finished with values: [0.6778711098448992, 0.046038774173198425] and parameters: {'max_depth': 7, 'eta': 0.14682206582840512, 'subsample': 0.6057373313664154, 'colsample_bytree': 0.633825626213555}.
[I 2025-05-15 04:44:07,025] Trial 3 finished with values: [0.6735853911828767, 0.04017633670270396] and parameters: {'max_depth': 3, 'eta': 0.23858962210197768, 'subsample': 0.9174944015497418, 'col

Pareto-optimal solutions:
Params: {'alpha': 1, 'max_depth': 7, 'eta': 0.0140276496136848, 'subsample': 0.9730734246835414, 'colsample_bytree': 0.7493996073000908}, Accuracy: 0.6979, DPD: 0.0792
Params: {'alpha': 0, 'max_depth': 3, 'eta': 0.18417884835987844, 'subsample': 0.9090853732103509, 'colsample_bytree': 0.7901593604495847}, Accuracy: 0.5000, DPD: 0.0000
Params: {'alpha': 0.5, 'max_depth': 4, 'eta': 0.28764117605408523, 'subsample': 0.9448369784642336, 'colsample_bytree': 0.9126395520111392}, Accuracy: 0.6929, DPD: 0.0614
Params: {'alpha': 0.5, 'max_depth': 6, 'eta': 0.09251777714733435, 'subsample': 0.6029080603388755, 'colsample_bytree': 0.9005279355380043}, Accuracy: 0.6893, DPD: 0.0556
Params: {'alpha': 0.5, 'max_depth': 7, 'eta': 0.2304400249507259, 'subsample': 0.8486194975103121, 'colsample_bytree': 0.8345402117370833}, Accuracy: 0.6979, DPD: 0.0813
Params: {'alpha': 0.25, 'max_depth': 7, 'eta': 0.06573246467881211, 'subsample': 0.6922279318152806, 'colsample_bytree': 0.88

In [29]:
# Create interactive Plotly plot
fig = px.scatter(
    plot_df,
    x='DPD',
    y='Accuracy',
    color='alpha',
    color_continuous_scale='Viridis',
    size_max=15,
    hover_data={'alpha': True, 'DPD': True, 'Accuracy': True},
    title='Pareto Front: DPD vs. Accuracy'
)
fig.update_traces(marker=dict(line_width=0))
fig.show()

In [30]:

# Create and run Optuna study for multi-objective optimization
study = optuna.create_study(directions=['maximize', 'minimize'])
objective = create_objective_function(alpha=np.random.uniform(0,1))  # Example alpha value
study.optimize(objective, n_trials=100)

# Collect Pareto front results
results = []
for trial in study.best_trials:  # Best trials are the Pareto-optimal ones
    results.append({
        'accuracy': trial.values[0],
        'dpd': trial.values[1],
        'alpha': trial.user_attrs['params']['alpha'],
        'params': trial.user_attrs['params']
    })

# Print summary of Pareto-optimal results
print("Pareto-optimal solutions:")
for res in results:
    print(f"Params: {res['params']}, Accuracy: {res['accuracy']:.4f}, DPD: {res['dpd']:.4f}")

# Prepare data for plotting
plot_df = pd.DataFrame({
    'DPD': [res['dpd'] for res in results],
    'Accuracy': [res['accuracy'] for res in results],
    'alpha': [res['alpha'] for res in results]
})



[I 2025-05-15 04:47:08,768] A new study created in memory with name: no-name-81c9934b-2059-41fa-9a1d-ffa5aeeee5de
[I 2025-05-15 04:47:08,897] Trial 0 finished with values: [0.6792971298857652, 0.08215217241285708] and parameters: {'max_depth': 5, 'eta': 0.14045260605383383, 'subsample': 0.6059859634768344, 'colsample_bytree': 0.9142953795430282}.
[I 2025-05-15 04:47:08,995] Trial 1 finished with values: [0.6600175227382036, 0.015081357718660824] and parameters: {'max_depth': 3, 'eta': 0.05033127668007191, 'subsample': 0.7887078508400488, 'colsample_bytree': 0.8895195187557907}.
[I 2025-05-15 04:47:09,066] Trial 2 finished with values: [0.6764435580961484, 0.04444123543631445] and parameters: {'max_depth': 3, 'eta': 0.2939283612519244, 'subsample': 0.8063484386283546, 'colsample_bytree': 0.951394948420871}.
[I 2025-05-15 04:47:09,139] Trial 3 finished with values: [0.667858335401139, 0.05062834891783413] and parameters: {'max_depth': 4, 'eta': 0.11946224856843993, 'subsample': 0.8938040

Pareto-optimal solutions:
Params: {'alpha': 0.41383153948093454, 'max_depth': 7, 'eta': 0.2804130195047165, 'subsample': 0.8838936125401613, 'colsample_bytree': 0.7188154735379857}, Accuracy: 0.7021, DPD: 0.0689
Params: {'alpha': 0.41383153948093454, 'max_depth': 5, 'eta': 0.2789019660094398, 'subsample': 0.8824040017757933, 'colsample_bytree': 0.6075094742604826}, Accuracy: 0.6914, DPD: 0.0536
Params: {'alpha': 0.41383153948093454, 'max_depth': 4, 'eta': 0.2714574989051898, 'subsample': 0.9284415700920252, 'colsample_bytree': 0.8076193158434031}, Accuracy: 0.6886, DPD: 0.0439
Params: {'alpha': 0.41383153948093454, 'max_depth': 6, 'eta': 0.20020253145881808, 'subsample': 0.8267274788350784, 'colsample_bytree': 0.6612102025456387}, Accuracy: 0.6843, DPD: 0.0384
Params: {'alpha': 0.41383153948093454, 'max_depth': 5, 'eta': 0.2939283612519244, 'subsample': 0.6818951506364179, 'colsample_bytree': 0.6589253591052271}, Accuracy: 0.6922, DPD: 0.0571
Params: {'alpha': 0.41383153948093454, 'max

In [31]:
# Create interactive Plotly plot
fig = px.scatter(
    plot_df,
    x='DPD',
    y='Accuracy',
    color='alpha',
    color_continuous_scale='Viridis',
    size_max=15,
    hover_data={'alpha': True, 'DPD': True, 'Accuracy': True},
    title='Pareto Front: DPD vs. Accuracy'
)
fig.update_traces(marker=dict(line_width=0))
fig.show()