# Run different models (Baseline LGBM, Fair models: FGBM, FTU)

In [None]:
import pandas as pd
import numpy as np
import os
import pickle

import metrics

import skops.io as sio
import lightgbm as lgb
import seaborn as sns
from scipy import stats
from scipy.stats import norm, chi2_contingency
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_percentage_error, roc_auc_score, auc, f1_score, precision_recall_curve, precision_recall_fscore_support
from sklearn.metrics import make_scorer, ConfusionMatrixDisplay, PrecisionRecallDisplay    
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_curve, accuracy_score
from fairgbm import FairGBMClassifier

import warnings
warnings.filterwarnings("ignore")

In [1]:
# Execution switches. True: fit the corresponding model in this run.
# False: load its previously pickled predictions instead.
Run_Baseline, Run_LGBM_nosen, Run_FGBM = True, False, False

# Model tag; alternative values noted by the author: "lgbm", "lgbm_nosen".
# NOTE(review): `model` is not read by any of the visible cells -- confirm it is still needed.
model = "fgbm"


In [None]:
# Local folders. Both values are placeholders and must be replaced before running;
# file names are appended directly, so each path needs a trailing separator.
output = "xxx"  # add customized local output path
input_path = "xxx"  # add customized local input path

# Labels that are later passed to the models' fit calls / used for evaluation.
# NOTE(review): X_train and X_test are used by the following cells but are not
# loaded in any visible cell -- confirm where they are created.
y_train, y_test = (
    pd.read_pickle(f"{input_path}{stem}.pkl") for stem in ("y_train", "y_test")
)

In [None]:
# Stack the training and test feature frames row-wise; the original row labels are kept.
X_full = pd.concat((X_train, X_test), axis=0)

In [None]:
# Collapse the three gender indicator columns (presumably one-hot -- confirm)
# into one code per row: idxmax returns the name of the column holding the row
# maximum, which is then translated to 0 / 1 / 2.
gender_codes = {
    'VN_GESCHLECHT_1m_1.0': 0,
    'VN_GESCHLECHT_1m_2.0': 1,
    'VN_GESCHLECHT_1m_nan': 2,
}
S_full = X_full[list(gender_codes)].idxmax(axis=1).map(gender_codes)

In [None]:
# Pickled gender group codes for the training rows and the test rows.
S_gender, S_gender_test = (
    pd.read_pickle(input_path + fname)
    for fname in ("S_gender.pkl", "S_gender_test.pkl")
)

In [None]:
# Pickled nationality group codes for the training rows and the test rows.
S_nat, S_nat_test = (
    pd.read_pickle(input_path + fname)
    for fname in ("S_nat.pkl", "S_nat_test.pkl")
)

In [None]:
# Switch for rebuilding the sensitive-attribute series (gender, nationality)
# from the feature frames. With False the whole block below is skipped and
# none of the names assigned inside it are created.
PPC = False

if PPC:
    # reset_index without drop=True turns the old index into a regular column.
    for frame in (X_train, X_test):
        frame.reset_index(inplace=True)

    # Gender: name of the max-valued indicator column per row -> code 0 / 1 / 2.
    gender_cols = ['VN_GESCHLECHT_1m_1.0', 'VN_GESCHLECHT_1m_2.0', 'VN_GESCHLECHT_1m_nan']
    gender_codes = {'VN_GESCHLECHT_1m_1.0': 0, 'VN_GESCHLECHT_1m_2.0': 1, 'VN_GESCHLECHT_1m_nan': 2}
    S_gender = X_train[gender_cols].idxmax(axis=1).map(gender_codes)
    S_gender_test = X_test[gender_cols].idxmax(axis=1).map(gender_codes)

    # Nationality column groups.
    # NOTE(review): positions 154..252 are hard-coded; presumably they cover the
    # non-Austrian nationality indicators after the reset_index above -- confirm.
    nat_nan = ['VN_NATION_1m_None', 'VN_NATION_1m_<U>']
    nat_other = X_train.columns[154:253].tolist()
    sensitive_columns = nat_other + nat_nan + ['VN_NATION_1m_A'] + gender_cols
    X_train_nats = X_train.iloc[:, 154:253]
    X_test_nats = X_test.iloc[:, 154:253]

    # Temporary helper columns: "any column of the positional slice is set" and
    # "one of the two missing-nationality columns is set".
    for frame, nats in ((X_train, X_train_nats), (X_test, X_test_nats)):
        frame["Nat_other"] = np.logical_or.reduce(nats, axis = 1).astype(int)
        frame["Nat_nan"] = np.logical_or(frame["VN_NATION_1m_<U>"], frame["VN_NATION_1m_None"]).astype(int)

    # Nationality: 0 = 'VN_NATION_1m_A', 1 = "Nat_other", 2 = "Nat_nan".
    nat_cols = ['VN_NATION_1m_A', "Nat_other", "Nat_nan"]
    nat_codes = {'VN_NATION_1m_A': 0, "Nat_other": 1, "Nat_nan": 2}
    S_nat = X_train[nat_cols].idxmax(axis=1).map(nat_codes)
    S_nat_test = X_test[nat_cols].idxmax(axis=1).map(nat_codes)

    # The helper columns are not features; remove them again.
    for frame in (X_train, X_test):
        frame.drop(columns=["Nat_other", "Nat_nan"], inplace=True)

### EDA

In [None]:
# Number of rows per nationality code in each split, ordered by code.
S_nat_counts, S_nat_test_counts = (
    codes.value_counts().sort_index() for codes in (S_nat, S_nat_test)
)

In [None]:
# Convert the per-code counts into shares of the respective split.
rel_train = S_nat_counts.div(len(S_nat))
rel_test = S_nat_test_counts.div(len(S_nat_test))

In [None]:
# Display names for the sorted nationality codes. They are assigned by
# position, so both series must contain exactly three entries.
index = ['Austrian', 'Non-Austrian', 'Unknown']
rel_test.index = rel_train.index = index

In [None]:
# Two side-by-side bar charts: nationality shares in the training and test split.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
fig.suptitle("Claims per Nationality")

# Bar annotations: each share rendered as a percentage with two decimals.
train_labels = [f"{val:.2%}" for val in rel_train]
test_labels = [f"{val:.2%}" for val in rel_test]

panels = (
    (axes[0], "Training", rel_train, train_labels),
    (axes[1], "Test", rel_test, test_labels),
)
for ax, panel_title, shares, bar_texts in panels:
    sns.barplot(x=shares.index, y=shares.values, ax=ax, color='C0')
    ax.set_title(panel_title)
    # containers[0] holds the bars that barplot just drew on this axis
    ax.bar_label(ax.containers[0], labels=bar_texts)
    ax.set_xlabel('Nationality')
    ax.set_ylabel('Relative Frequency')

# Write the figure to the output folder before showing it.
plt.tight_layout()
plt.savefig(output+"EDA_Nat.png")
plt.show()

## LGBM

In [None]:
# Hyperparameters handed to lgb.LGBMClassifier(**params) for the baseline model.
# NOTE(review): reg_alpha / reg_lambda and lambda_l1 / lambda_l2 are both given
# with different values; they look like LightGBM aliases of each other --
# confirm which value is actually in effect.
params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=0.8400000000000001,
    importance_type='split',
    learning_rate=0.15100000000000002,
    max_depth=5,
    min_child_samples=4020,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=4,
    objective='binary',
    random_state=None,
    reg_alpha=0.0,
    reg_lambda=0.0,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
    verbosity=-1,
    boost_from_average=False,
    feature_pre_filter=False,
    lambda_l1=3.0000000000000004,
    lambda_l2=7.6,
    scale_pos_weight=131,
)

In [None]:
# Baseline LightGBM: either fit it now and persist the test-set scores, or
# reload the scores persisted by an earlier run.
# Both branches define the same two names so that later cells work either way:
#   y_pred_baseline -- one-column DataFrame (the pickled representation)
#   y_pred_lgbm     -- 1-D array with the positive-class probabilities
if Run_Baseline:
    lgbm = lgb.LGBMClassifier(**params)

    lgbm.fit(X_train, y_train)
    # column 1 of predict_proba is the probability of the positive class
    y_pred_lgbm = lgbm.predict_proba(X_test)[:, 1]
    y_pred_baseline = pd.DataFrame(y_pred_lgbm)
    y_pred_baseline.to_pickle(input_path + "y_pred_te_baseline.pkl")

else:
    y_pred_baseline = pd.read_pickle(input_path + "y_pred_te_baseline.pkl")
    # the frame was written from a 1-D array, so its only column is labelled 0
    y_pred_lgbm = y_pred_baseline[0].to_numpy()

In [None]:
# Disabled cell: the code below is wrapped in a bare string literal, so none of
# it is executed. If re-enabled it would load a persisted classifier from
# "LGBM_baseline.skops" through skops and score X_test with it.
# NOTE(review): re-enabling it would rebind `model`, which an earlier cell
# sets to a model tag string.
'''
model = "LGBM_baseline.skops"
unknown_types = sio.get_untrusted_types(file = model)
clf=sio.load(model,trusted=unknown_types)

test_data_pred = clf.predict_proba(X_test)[:,1].reshape(-1,1)
y_pred_lgbm_nosen = clf.predict_proba(X_test)[:,1] 
'''

In [None]:
# Horizontal bar chart of the baseline model's most important features.
# Only possible when the baseline was fitted in this run (`lgbm` exists).
if Run_Baseline:
    # One row per feature; the row label is the feature's position in X_train.columns.
    feat_imp = pd.DataFrame(lgbm.feature_importances_, columns=['imp'])
    feat_imp['Label'] = X_train.columns
    feat_imp_sign = feat_imp.sort_values(by='imp', ascending=False).head(20)

    # Hard-coded feature positions that are left out of the plot.
    # errors="ignore": which features reach the top 20 depends on the fitted
    # model, so a listed position may be absent; without it drop() raises KeyError.
    # NOTE(review): presumably these positions are the sensitive / uninformative
    # features of one particular fit -- confirm they still match X_train.columns.
    bow = feat_imp_sign.drop([1, 6, 8, 0, 14, 12, 20, 4, 16, 3], errors="ignore")

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.barh(bow['Label'].values, bow['imp'])
    plt.title('Feature importances of LGBM Baseline Model')
    plt.show()

In [None]:
# Switch for persisting the sensitive-attribute series and the labels.
Save = False

if Save:
    # File names are relative, i.e. resolved against the current working directory.
    # NOTE(review): the loading cells read the same file names from
    # `input_path` -- confirm that both locations are meant to coincide.
    to_store = {
        "S_nat.pkl": S_nat,
        "S_nat_test.pkl": S_nat_test,
        "S_gender.pkl": S_gender,
        "S_gender_test.pkl": S_gender_test,
        "y_train.pkl": y_train,
        "y_test.pkl": y_test,
    }
    for fname, obj in to_store.items():
        obj.to_pickle(fname)

# LGBM Nosen (LGBM trained without the sensitive columns)

In [None]:
# Feature frames without the nationality and gender columns.
# NOTE(review): nat_other and nat_nan are only assigned inside the `if PPC:`
# block; with PPC = False this cell raises NameError -- confirm how the two
# lists should be obtained in that case.
gender_columns = ['VN_GESCHLECHT_1m_1.0', 'VN_GESCHLECHT_1m_2.0', 'VN_GESCHLECHT_1m_nan']
sensitive_columns = [*nat_other, *nat_nan, 'VN_NATION_1m_A', *gender_columns]

X_train_nos = X_train.drop(sensitive_columns, axis=1)
X_test_nos = X_test.drop(sensitive_columns, axis=1)

In [None]:
# LightGBM on the features without sensitive columns: either fit it now and
# persist its test-set scores, or reload scores persisted by an earlier run.
if Run_LGBM_nosen:

    # From here on X_train / X_test refer to the frames without sensitive columns.
    X_train = X_train_nos.copy()
    X_test = X_test_nos.copy()

    lgbm = lgb.LGBMClassifier(**params)

    lgbm.fit(X_train, y_train)
    # column 1 of predict_proba is the probability of the positive class
    y_pred_lgbm_nosen = lgbm.predict_proba(X_test)[:, 1]
    # Persist the scores of THIS model, not the baseline array y_pred_lgbm
    # (which may not even exist when Run_Baseline is False).
    pd.DataFrame(y_pred_lgbm_nosen).to_pickle(input_path + "y_pred_lgbm_nosen.pkl")

else:
    # NOTE(review): this is the pickled one-column DataFrame, whereas the branch
    # above leaves a 1-D array in y_pred_lgbm_nosen -- confirm what later cells expect.
    y_pred_lgbm_nosen = pd.read_pickle(input_path + "y_pred_lgbm_nosen.pkl")

# FairGBM - with and without HPT

In [None]:
# Hyperparameters for the FairGBM models; replaces the baseline `params` dict.
# Unlike the baseline dict there is no 'objective' entry here.
# 'scale_pos_weight' used to appear twice (1000, then 131); a dict literal keeps
# only the last value, so the dead 1000 entry was removed and 131 stays in effect.
# NOTE(review): LightGBM documents is_unbalance and scale_pos_weight as
# mutually exclusive -- confirm how FairGBM treats having both.
params = {
    'boosting_type': 'goss',
    'class_weight': None,
    'colsample_bytree': 0.8400000000000001,
    'importance_type': 'split',
    'is_unbalance': True,
    'learning_rate': 0.1,
    'max_depth': -1,
    'metric': 'auc',
    'min_child_samples': 4020,
    'min_child_weight': 1,
    'min_split_gain': 0.0,
    'n_estimators': 100,
    'n_jobs': -1,
    'num_leaves': 64,
    'random_state': None,
    'reg_alpha': 0.0,
    'reg_lambda': 0.0,
    'subsample': 1.0,
    'subsample_for_bin': 200000,
    'subsample_freq': 0,
    'verbosity': -1,
    'boost_from_average': False,
    'feature_pre_filter': False,
    'lambda_l1': 3.0000000000000004,
    'lambda_l2': 7.6,
    'scale_pos_weight': 131,
}

In [None]:
# Randomly split the test data into a validation part (used for hyperparameter
# tuning) and a remaining test part for the FairGBM experiments.
# Despite its name, VALIDATION_SIZE_N is the size of the *test* part (the first
# N shuffled positions); the validation part receives the remaining rows.
VALIDATION_SIZE_N = int(0.5 * len(X_test))

# NOTE(review): the permutation is not seeded, so the split differs between runs.
indices = np.random.permutation(len(X_test))
val_indices = indices[VALIDATION_SIZE_N:]
test_indices = indices[:VALIDATION_SIZE_N]

X_val = X_test.iloc[val_indices]
X_test_fgbm = X_test.iloc[test_indices]

# `indices` are row positions, so every object is sliced with .iloc. Plain
# [] on a Series looks the values up as index labels, which only matches the
# rows picked from X_test when the Series happens to carry a 0..n-1 index.
S_gen_val = S_gender_test.iloc[val_indices]
S_nat_val = S_nat_test.iloc[val_indices]

S_gen_test_fgbm = S_gender_test.iloc[test_indices]
S_nat_test_fgbm = S_nat_test.iloc[test_indices]

# y_test is re-indexed to 0..n-1 for the cells that follow.
y_test = y_test.reset_index(drop=True)

y_val = y_test.iloc[val_indices]
y_test_fgbm = y_test.iloc[test_indices]

In [None]:
from hpt.tuner import ObjectiveFunction, OptunaTuner

# Switch for setting up the hyperparameter search. `obj_func` and `tuner` only
# exist when it is True; the following cells use them unconditionally.
RN_HP_TUNING = True

if RN_HP_TUNING:
    # YAML file describing the search space, expected in the input folder.
    HYPERPARAM_SPACE_PATH = input_path + "fgbm_params.yaml"

    # Train on the training split, score each trial on the validation split,
    # with gender as the sensitive attribute.
    objective_kwargs = dict(
        X_train=X_train,
        y_train=y_train,
        s_train=S_gender,
        X_val=X_val,
        y_val=y_val,
        s_val=S_gen_val,
        hyperparameter_space=HYPERPARAM_SPACE_PATH,
        eval_metric="accuracy",
        other_eval_metric="equalized_odds_ratio",
        threshold=0.50,
        alpha=0.50,  # relative weight of `eval_metric` vs `other_eval_metric`
    )
    obj_func = ObjectiveFunction(**objective_kwargs)

    tuner = OptunaTuner(objective_function=obj_func, direction="maximize", seed=42)

In [None]:
%%time
# Then just run optimize as you would for an optuna.Study object
tuner.optimize(n_trials=20, n_jobs=max(2, os.cpu_count()), show_progress_bar=True)


In [None]:
 # Plot produced by the objective function (presumably the evaluated trials -- see the hpt docs).
 # NOTE(review): this line carries one leading space; that is an
 # IndentationError when the file is run as a plain Python script.
 obj_func.plot()

In [None]:
# The five trials with the highest equalized-odds ratio, reduced to the
# accuracy and fairness columns. The bare last expression is what the notebook displays.
shown_metrics = ['accuracy', "equalized_odds_ratio", "equalized_odds_diff"]
top_by_fairness = tuner.results.sort_values(by="equalized_odds_ratio", ascending=False).head()
top_by_fairness[shown_metrics]

In [None]:
# Report the identifier of the trial that the objective function ranks best.
best_trial_id = obj_func.best_trial.id
print(f"Best trial was #{best_trial_id}.")


In [None]:
%%time
fairgbm_clf = obj_func.reconstruct_model(obj_func.best_trial)

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError. Presumably it is used
# on purpose to stop "Run All" before the FairGBM cell below -- confirm, and
# consider an explicit exception with a message instead.
break

In [None]:
# FairGBM with an FPR constraint per group, once for gender and once for
# nationality: either fit both models now and persist their test-set scores,
# or reload scores persisted earlier.
if Run_FGBM:
    # From here on X_train / X_test refer to the frames without sensitive columns.
    X_train = X_train_nos.copy()
    X_test = X_test_nos.copy()

    # Remove a LightGBM 'objective' entry if one is present. pop() with a
    # default is used because the FairGBM `params` dict defines no such key,
    # so `del params["objective"]` would raise KeyError.
    params.pop("objective", None)

    # Gender
    fgbm = FairGBMClassifier(constraint_type="FPR", global_target_fnr=0.7, global_constraint_type="FNR", **params)
    # Train using features (X), labels (Y), and sensitive attributes (S)
    fgbm.fit(X_train, y_train, constraint_group=S_gender)
    y_pred_fgbm_gender = fgbm.predict_proba(X_test)[:, 1]
    pd.DataFrame(y_pred_fgbm_gender).to_pickle(output+"Predictions/y_pred_fgbm_gender.pkl")

    # Nationality
    fgbm = FairGBMClassifier(constraint_type="FPR", global_target_fnr=0.7, global_constraint_type="FNR", **params)
    # Train using features (X), labels (Y), and sensitive attributes (S)
    fgbm.fit(X_train, y_train, constraint_group=S_nat)
    y_pred_fgbm_nat = fgbm.predict_proba(X_test)[:, 1]
    pd.DataFrame(y_pred_fgbm_nat).to_pickle(output+"Predictions/y_pred_fgbm_nat.pkl")

else:
    # NOTE(review): the scores are written to output+"Predictions/..." above but
    # read from input_path (gender) and output (nationality) here -- confirm
    # where the files are expected to live.
    # The frames were written from 1-D arrays, so their only column is labelled 0.
    y_pred_fgbm_gender = pd.read_pickle(input_path+"y_pred_fgbm_gender.pkl")
    y_pred_fgbm_gender = y_pred_fgbm_gender[0].to_numpy()

    y_pred_fgbm_nat = pd.read_pickle(output+"y_pred_fgbm_nat.pkl")
    y_pred_fgbm_nat = y_pred_fgbm_nat[0].to_numpy()
