# Experiment 7: SVM + Fairlearn ExponentiatedGradient (EqualizedOdds)
#### Full pipeline: retrain, validation (GC & GMSC), metrics, SHAP, saving

In [1]:


# Step 0: Setup
# Installs the third-party packages this notebook imports, via a shell call to pip.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases;
# consider pinning, and consider the `%pip` magic so the install targets the kernel's
# own environment — confirm against the runtime this notebook is executed in.
!pip install fairlearn shap scikit-learn aif360 pandas matplotlib seaborn --quiet

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import shap
import matplotlib.pyplot as plt

from fairlearn.reductions import ExponentiatedGradient, EqualizedOdds
from aif360.datasets import StandardDataset
from aif360.metrics import ClassificationMetric

# Directory that later cells write the results CSV and the SHAP PNG plots into.
# NOTE(review): the path is under /content/drive, so Google Drive presumably has to be
# mounted before running; no mount call appears in the visible cells — confirm.
RESULTS_DIR = '/content/drive/MyDrive/Research_Thesis_Implementation/Validation files & results/Validation Results _germanCredit &GivemesomeCredit'


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.7/259.7 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25h

pip install 'aif360[inFairness]'


In [2]:

# -----------------------------
# Step 1: Load & preprocess LendingClub
# -----------------------------
# Target column first, then the six model features in the order the model will see them.
selected_cols = ['loan_status','annual_inc','term','grade','home_ownership','purpose','zip_code']
categorical_cols = ['term','grade','home_ownership','purpose','zip_code']

# Parse only the columns this experiment uses; every other column of the CSV is
# discarded immediately afterwards, so reading it only costs time and memory.
df = pd.read_csv(
    '/content/drive/MyDrive/Research_Thesis_Implementation/data_final/lendingclub_data.csv',
    usecols=selected_cols,
)
# `usecols` keeps the file's own column order, so re-select to pin the feature order,
# then drop every row with a missing value in any selected column.
df = df[selected_cols].dropna()

# Binary target: 1 for 'Fully Paid', 0 for every other loan status.
df['loan_status'] = (df['loan_status'] == 'Fully Paid').astype(int)

# Encode categoricals as integer codes (values are stringified first).
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Normalize income (zero mean, unit variance); flatten the (n, 1) result to a column.
# NOTE(review): the scaler and the encoders are fitted on the full frame, i.e. before
# the train/test split performed in the next step, so test rows influence the
# preprocessing. Consider fitting on the training split only.
df['annual_inc'] = StandardScaler().fit_transform(df[['annual_inc']]).ravel()

# AIF360 dataset
# NOTE(review): the privileged class is the single label-encoded zip_code value 1;
# presumably AIF360 then represents the attribute as 1 (privileged) vs 0 (everything
# else). That makes the privileged group one arbitrary zip code — confirm this is the
# intended group definition (the recorded test-set SPD/DI/EOD/AOD are NaN).
privileged_groups = [{'zip_code': 1}]
unprivileged_groups = [{'zip_code': 0}]
aif_data = StandardDataset(df,
                           label_name='loan_status',
                           favorable_classes=[1],
                           protected_attribute_names=['zip_code'],
                           privileged_classes=[[1]])

# Plain arrays for scikit-learn / Fairlearn.
X = aif_data.features
y = aif_data.labels.ravel()
prot_attr = aif_data.protected_attributes.ravel()

  df = pd.read_csv('/content/drive/MyDrive/Research_Thesis_Implementation/data_final/lendingclub_data.csv')


In [3]:


# -----------------------------
# Step 2: Train SVM + Fairlearn (train/test split)
# -----------------------------
# 70/30 split, stratified on the label; the protected attribute is split alongside
# so it stays row-aligned with the features.
X_train, X_test, y_train, y_test, prot_train, prot_test = train_test_split(
    X, y, prot_attr, test_size=0.3, random_state=42, stratify=y
)

# RBF SVM wrapped in the exponentiated-gradient reduction under an
# equalized-odds constraint.
base_estimator = SVC(kernel='rbf', C=1.0, probability=True, random_state=42, cache_size=500)
constraint = EqualizedOdds()
mitigator = ExponentiatedGradient(base_estimator, constraints=constraint, max_iter=20)
mitigator.fit(X_train, y_train, sensitive_features=prot_train)

# ExponentiatedGradient is a randomized classifier: without a seed, repeated calls to
# predict() can return different labels for the same rows. Seed it so the reported
# metrics are reproducible (42 matches the split / SVC seed above).
y_pred = mitigator.predict(X_test, random_state=42)

# Try to get probabilities robustly: walk the candidate sources in order and keep the
# first one that works.
# NOTE(review): `_pm` and `_pmf_predict` are private attributes; which of them exists
# depends on the installed Fairlearn version — confirm.
proba_sources = (
    lambda data: mitigator.predict_proba(data),
    lambda data: mitigator._pm.predict_proba(data),
    lambda data: mitigator._pmf_predict(data),
)
y_prob = None
for proba_source in proba_sources:
    try:
        y_prob = proba_source(X_test)[:, 1]
        break
    except Exception:
        # Best-effort by design: an unavailable source just means "try the next one".
        continue

if y_prob is None:
    # Last resort: hard labels cast to float. Say so, because an AUC computed from
    # hard labels is not comparable to one computed from real scores.
    print("WARNING: no probability output available; falling back to hard labels for AUC.")
    y_prob = y_pred.astype(float)


In [4]:

# -----------------------------
# Step 3: Performance + fairness on LendingClub (test set)
# -----------------------------
print("=== LendingClub (SVM + Fairlearn, test set) ===")

# Label-based scores, printed in a fixed order.
performance_report = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
)
for score_label, score_fn in performance_report:
    print(f"{score_label}:", score_fn(y_test, y_pred))

# AUC is best-effort: any failure is reported as 'N/A' and stored as None.
try:
    auc_val = roc_auc_score(y_test, y_prob)
except Exception:
    auc_val = None
print("AUC:", 'N/A' if auc_val is None else auc_val)

# Fairness metrics on test set: rebuild an AIF360 dataset from the test features plus
# the true labels, then clone it and overwrite the labels with the predictions.
test_columns = list(aif_data.feature_names) + ['loan_status']
test_frame = pd.DataFrame(np.column_stack([X_test, y_test]), columns=test_columns)
aif_test = StandardDataset(
    test_frame,
    label_name='loan_status',
    favorable_classes=[1],
    protected_attribute_names=['zip_code'],
    privileged_classes=[[1]]
)

pred_dataset = aif_test.copy()
pred_dataset.labels = y_pred.reshape(-1, 1)

metric = ClassificationMetric(aif_test, pred_dataset,
                              unprivileged_groups=unprivileged_groups,
                              privileged_groups=privileged_groups)

# NOTE(review): the value printed as "BiasAmp" is the between-group generalized
# entropy index — confirm that this is the quantity the label is meant to denote.
fairness_report = (
    ("SPD", metric.statistical_parity_difference),
    ("DI", metric.disparate_impact),
    ("EOD", metric.equal_opportunity_difference),
    ("AOD", metric.average_odds_difference),
    ("BiasAmp", metric.between_group_generalized_entropy_index),
    ("Theil", metric.theil_index),
)
for fairness_label, compute_fairness in fairness_report:
    print(f"{fairness_label}:", compute_fairness())




=== LendingClub (SVM + Fairlearn, test set) ===
Accuracy: 0.7273333333333334
Precision: 0.7273333333333334
Recall: 1.0
F1: 0.8421458896179081
AUC: 0.5
SPD: nan
DI: nan
EOD: nan
AOD: nan
BiasAmp: 149.5
Theil: 0.055896770120989404


  return (self.num_pred_positives(privileged=privileged)
  TPR=TP / P, TNR=TN / N, FPR=FP / N, FNR=FN / P,
  GTPR=GTP / P, GTNR=GTN / N, GFPR=GFP / N, GFNR=GFN / P,


In [5]:

# -----------------------------
# Step 4: SHAP for LendingClub (KernelExplainer on sampled sets)
# -----------------------------
# Seed NumPy's global generator so the background / explained rows sampled below are
# the same on every run (42 matches the seed used for the train/test split).
# NOTE(review): mitigator.predict is a randomized predictor and is called here without
# its own seed; presumably it also draws from this global generator — confirm.
np.random.seed(42)

X_test_df = pd.DataFrame(X_test, columns=aif_data.feature_names)

# Background sample (at most 100 training rows) that KernelExplainer integrates over.
background_size = min(100, X_train.shape[0])
background_idx = np.random.choice(X_train.shape[0], background_size, replace=False)
X_background = X_train[background_idx]

# Rows to explain (at most 200 test rows); KernelExplainer is slow, hence the cap.
explain_size = min(200, X_test.shape[0])
explain_idx = np.random.choice(X_test.shape[0], explain_size, replace=False)
X_explain = X_test[explain_idx]
X_explain_df = pd.DataFrame(X_explain, columns=aif_data.feature_names)

explainer = shap.KernelExplainer(mitigator.predict, X_background)

print("Computing SHAP values for LendingClub...")
shap_values = explainer.shap_values(X_explain_df, nsamples=50)

# Save SHAP plots: global summary first.
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values, X_explain_df, show=False)
plt.tight_layout()
plt.savefig(f'{RESULTS_DIR}/exp7_shap_global.png', dpi=150, bbox_inches='tight')
plt.close()

# Per-group summaries: positions of the explained rows in each protected group.
priv_mask = X_explain_df['zip_code'] == 1
unpriv_mask = X_explain_df['zip_code'] == 0
priv_idx = np.where(priv_mask)[0]
unpriv_idx = np.where(unpriv_mask)[0]

shap_matrix = np.array(shap_values)
group_plots = (
    (priv_idx, 'SHAP Values - Privileged Group (zip_code=1)', 'exp7_shap_privileged.png'),
    (unpriv_idx, 'SHAP Values - Unprivileged Group (zip_code=0)', 'exp7_shap_unprivileged.png'),
)
for group_idx, group_title, group_file in group_plots:
    if len(group_idx) == 0:
        # Make the missing plot visible instead of skipping it silently.
        print(f"Skipping '{group_title}': no explained rows belong to this group.")
        continue
    plt.figure(figsize=(10, 6))
    shap.summary_plot(shap_matrix[group_idx], X_explain_df.iloc[group_idx], show=False)
    plt.title(group_title)
    plt.tight_layout()
    plt.savefig(f'{RESULTS_DIR}/{group_file}', dpi=150, bbox_inches='tight')
    plt.close()



Computing SHAP values for LendingClub...


  0%|          | 0/200 [00:00<?, ?it/s]

In [6]:
# -----------------------------
# Step 5: Validation on GermanCredit (schema-aligned)
# -----------------------------
GC_PATH = '/content/drive/MyDrive/Research_Thesis_Implementation/Validation files & results/Validation dataset/german_credit_data.csv'
df_gc = pd.read_csv(GC_PATH)

# Target mapping: accept any of the three known target column names.
if 'Risk' in df_gc.columns:
    df_gc['loan_status'] = df_gc['Risk'].map({'good':1,'bad':0})
elif 'Creditability' in df_gc.columns:
    df_gc['loan_status'] = df_gc['Creditability']
elif 'class' in df_gc.columns:
    df_gc['loan_status'] = df_gc['class'].map({'good':1,'bad':0})
else:
    raise ValueError("Target column not found in GermanCredit.")

# `.map` turns any label other than 'good'/'bad' into NaN; fail here with a clear
# message rather than letting NaN targets reach the metric functions.
if df_gc['loan_status'].isna().any():
    raise ValueError("GermanCredit target contains missing or unmapped values.")

# Encode strings
for col in df_gc.columns:
    if df_gc[col].dtype == 'object':
        df_gc[col] = LabelEncoder().fit_transform(df_gc[col].astype(str))

# Align to LendingClub schema: each LendingClub feature name is filled from a
# GermanCredit column.
# NOTE(review): 'grade' and 'purpose' are both taken from 'Purpose', and the encoders
# below are refit on this dataset, so the integer codes are not the ones the model
# was trained on — confirm this proxy mapping is intended.
gc_feature_cols = ['annual_inc','term','grade','home_ownership','purpose','zip_code']
common_gc = pd.DataFrame()
common_gc['annual_inc']     = df_gc['Credit amount']
common_gc['term']           = df_gc['Duration']
common_gc['grade']          = df_gc['Purpose']
common_gc['home_ownership'] = df_gc['Housing']
common_gc['purpose']        = df_gc['Purpose']
common_gc['zip_code']       = df_gc['Checking account']
common_gc['loan_status']    = df_gc['loan_status']

for col in ['term','grade','home_ownership','purpose','zip_code']:
    common_gc[col] = LabelEncoder().fit_transform(common_gc[col].astype(str))
common_gc['annual_inc'] = StandardScaler().fit_transform(common_gc[['annual_inc']])

X_gc = common_gc.drop(columns=['loan_status']).values
y_gc = common_gc['loan_status'].values

# The mitigator is a randomized classifier; seed the prediction so the validation
# metrics are reproducible.
y_pred_gc = mitigator.predict(X_gc, random_state=42)

# Probability attempts: same ordered candidates as the LendingClub training step, so
# AUC is computed from the same kind of score on every dataset.
# NOTE(review): `_pm` and `_pmf_predict` are private attributes — confirm against the
# installed Fairlearn version.
gc_proba_sources = (
    lambda data: mitigator.predict_proba(data),
    lambda data: mitigator._pm.predict_proba(data),
    lambda data: mitigator._pmf_predict(data),
)
y_prob_gc = None
for gc_proba_source in gc_proba_sources:
    try:
        y_prob_gc = gc_proba_source(X_gc)[:, 1]
        break
    except Exception:
        # Best-effort by design: move on to the next candidate.
        continue
if y_prob_gc is None:
    print("WARNING: no probability output available; falling back to hard labels for AUC.")
    y_prob_gc = y_pred_gc.astype(float)

print("\n=== GermanCredit Validation (SVM + Fairlearn) ===")
print("Accuracy:", accuracy_score(y_gc, y_pred_gc))
print("Precision:", precision_score(y_gc, y_pred_gc))
print("Recall:", recall_score(y_gc, y_pred_gc))
print("F1:", f1_score(y_gc, y_pred_gc))
try:
    print("AUC:", roc_auc_score(y_gc, y_prob_gc))
except Exception:
    print("AUC: N/A")

# SHAP for GC (KernelExplainer on a sample)
# Seed NumPy's global generator so the sampled rows are the same on every run.
np.random.seed(42)

gc_bg_size = min(100, X_gc.shape[0])
gc_bg_idx = np.random.choice(X_gc.shape[0], gc_bg_size, replace=False)
X_gc_background = X_gc[gc_bg_idx]

gc_explain_size = min(200, X_gc.shape[0])
gc_explain_idx = np.random.choice(X_gc.shape[0], gc_explain_size, replace=False)
X_gc_explain = X_gc[gc_explain_idx]
X_gc_explain_df = pd.DataFrame(X_gc_explain, columns=gc_feature_cols)

explainer_gc = shap.KernelExplainer(mitigator.predict, X_gc_background)
gc_shap_values = explainer_gc.shap_values(X_gc_explain_df, nsamples=50)

plt.figure(figsize=(10, 6))
shap.summary_plot(gc_shap_values, X_gc_explain_df, show=False)
plt.tight_layout()
plt.savefig(f'{RESULTS_DIR}/exp7_gc_shap_global.png', dpi=150, bbox_inches='tight')
plt.close()



=== GermanCredit Validation (SVM + Fairlearn) ===
Accuracy: 0.7
Precision: 0.7
Recall: 1.0
F1: 0.8235294117647058
AUC: 0.5


  0%|          | 0/200 [00:00<?, ?it/s]

In [7]:

# -----------------------------
# Step 6: Validation on GiveMeSomeCredit (schema-aligned)
# -----------------------------
GMSC_PATH = '/content/drive/MyDrive/Research_Thesis_Implementation/Validation files & results/Validation dataset/GiveMeSomeCredit.csv'
df_gmsc = pd.read_csv(GMSC_PATH)

# The source column is inverted so that 1 is the favourable outcome, matching the
# 1/0 convention of `loan_status` used for the other datasets.
df_gmsc['loan_status'] = 1 - df_gmsc['SeriousDlqin2yrs']

# Encode any string columns as integer codes.
for col in df_gmsc.columns:
    if df_gmsc[col].dtype == 'object':
        df_gmsc[col] = LabelEncoder().fit_transform(df_gmsc[col].astype(str))

# Align to LendingClub schema: each LendingClub feature name is filled from a
# GiveMeSomeCredit column. Missing income is imputed with the median, missing
# dependents with 0.
# NOTE(review): these are proxy mappings and the encoders below are refit on this
# dataset, so the integer codes are not the ones the model was trained on — confirm
# the mapping is intended.
gmsc_feature_cols = ['annual_inc','term','grade','home_ownership','purpose','zip_code']
common_gmsc = pd.DataFrame()
common_gmsc['annual_inc']     = df_gmsc['MonthlyIncome'].fillna(df_gmsc['MonthlyIncome'].median())
common_gmsc['term']           = df_gmsc['NumberOfOpenCreditLinesAndLoans']
common_gmsc['grade']          = df_gmsc['NumberOfTimes90DaysLate']
common_gmsc['home_ownership'] = df_gmsc['NumberRealEstateLoansOrLines']
common_gmsc['purpose']        = df_gmsc['NumberOfTime30-59DaysPastDueNotWorse']
common_gmsc['zip_code']       = df_gmsc['NumberOfDependents'].fillna(0)
common_gmsc['loan_status']    = df_gmsc['loan_status']

for col in ['term','grade','home_ownership','purpose','zip_code']:
    common_gmsc[col] = LabelEncoder().fit_transform(common_gmsc[col].astype(str))
common_gmsc['annual_inc'] = StandardScaler().fit_transform(common_gmsc[['annual_inc']])

X_gmsc = common_gmsc.drop(columns=['loan_status']).values
y_gmsc = common_gmsc['loan_status'].values

# The mitigator is a randomized classifier; seed the prediction so the validation
# metrics are reproducible.
y_pred_gmsc = mitigator.predict(X_gmsc, random_state=42)

# Probability attempts: same ordered candidates as the LendingClub training step, so
# AUC is computed from the same kind of score on every dataset.
# NOTE(review): `_pm` and `_pmf_predict` are private attributes — confirm against the
# installed Fairlearn version.
gmsc_proba_sources = (
    lambda data: mitigator.predict_proba(data),
    lambda data: mitigator._pm.predict_proba(data),
    lambda data: mitigator._pmf_predict(data),
)
y_prob_gmsc = None
for gmsc_proba_source in gmsc_proba_sources:
    try:
        y_prob_gmsc = gmsc_proba_source(X_gmsc)[:, 1]
        break
    except Exception:
        # Best-effort by design: move on to the next candidate.
        continue
if y_prob_gmsc is None:
    print("WARNING: no probability output available; falling back to hard labels for AUC.")
    y_prob_gmsc = y_pred_gmsc.astype(float)

print("\n=== GiveMeSomeCredit Validation (SVM + Fairlearn) ===")
print("Accuracy:", accuracy_score(y_gmsc, y_pred_gmsc))
print("Precision:", precision_score(y_gmsc, y_pred_gmsc))
print("Recall:", recall_score(y_gmsc, y_pred_gmsc))
print("F1:", f1_score(y_gmsc, y_pred_gmsc))
try:
    print("AUC:", roc_auc_score(y_gmsc, y_prob_gmsc))
except Exception:
    print("AUC: N/A")

# SHAP for GMSC (KernelExplainer on a sample)
# Seed NumPy's global generator so the sampled rows are the same on every run.
np.random.seed(42)

gmsc_bg_size = min(100, X_gmsc.shape[0])
gmsc_bg_idx = np.random.choice(X_gmsc.shape[0], gmsc_bg_size, replace=False)
X_gmsc_background = X_gmsc[gmsc_bg_idx]

gmsc_explain_size = min(200, X_gmsc.shape[0])
gmsc_explain_idx = np.random.choice(X_gmsc.shape[0], gmsc_explain_size, replace=False)
X_gmsc_explain = X_gmsc[gmsc_explain_idx]
X_gmsc_explain_df = pd.DataFrame(X_gmsc_explain, columns=gmsc_feature_cols)

explainer_gmsc = shap.KernelExplainer(mitigator.predict, X_gmsc_background)
gmsc_shap_values = explainer_gmsc.shap_values(X_gmsc_explain_df, nsamples=50)

plt.figure(figsize=(10, 6))
shap.summary_plot(gmsc_shap_values, X_gmsc_explain_df, show=False)
plt.tight_layout()
plt.savefig(f'{RESULTS_DIR}/exp7_gmsc_shap_global.png', dpi=150, bbox_inches='tight')
plt.close()



=== GiveMeSomeCredit Validation (SVM + Fairlearn) ===
Accuracy: 0.93316
Precision: 0.93316
Recall: 1.0
F1: 0.9654244863332575
AUC: 0.5


  0%|          | 0/200 [00:00<?, ?it/s]

In [8]:

# -----------------------------
# Step 7: Save combined results to CSV
# -----------------------------
def _performance_scores(y_true, y_hat):
    """Return the four label-based scores shared by every row of the results table."""
    return {
        'Accuracy': accuracy_score(y_true, y_hat),
        'Precision': precision_score(y_true, y_hat),
        'Recall': recall_score(y_true, y_hat),
        'F1': f1_score(y_true, y_hat),
    }


def _auc_unless_constant(y_true, scores):
    """Return ROC AUC, or NaN when the scores take only a single distinct value."""
    if len(np.unique(scores)) > 1:
        return roc_auc_score(y_true, scores)
    return np.nan


# LendingClub row: performance, the AUC computed earlier, then the fairness metrics.
lendingclub_row = {'Dataset': 'LendingClub(SVM+Fairlearn,test)'}
lendingclub_row.update(_performance_scores(y_test, y_pred))
lendingclub_row['AUC'] = np.nan if auc_val is None else auc_val
lendingclub_row['SPD'] = metric.statistical_parity_difference()
lendingclub_row['DI'] = metric.disparate_impact()
lendingclub_row['EOD'] = metric.equal_opportunity_difference()
lendingclub_row['AOD'] = metric.average_odds_difference()
lendingclub_row['BiasAmp'] = metric.between_group_generalized_entropy_index()
lendingclub_row['Theil'] = metric.theil_index()

# Validation rows carry performance and AUC only; their fairness columns stay empty.
german_credit_row = {'Dataset': 'GermanCredit(SVM+Fairlearn)'}
german_credit_row.update(_performance_scores(y_gc, y_pred_gc))
german_credit_row['AUC'] = _auc_unless_constant(y_gc, y_prob_gc)

gmsc_row = {'Dataset': 'GiveMeSomeCredit(SVM+Fairlearn)'}
gmsc_row.update(_performance_scores(y_gmsc, y_pred_gmsc))
gmsc_row['AUC'] = _auc_unless_constant(y_gmsc, y_prob_gmsc)

results_all = pd.DataFrame([lendingclub_row, german_credit_row, gmsc_row])

results_csv_path = f"{RESULTS_DIR}/exp7_svm_fairlearn_validation_results.csv"
results_all.to_csv(results_csv_path, index=False)
print(f"Validation results saved to {results_csv_path}")
print(f"SHAP plots saved to {RESULTS_DIR}/exp7_*.png")

Validation results saved to /content/drive/MyDrive/Research_Thesis_Implementation/Validation files & results/Validation Results _germanCredit &GivemesomeCredit/exp7_svm_fairlearn_validation_results.csv
SHAP plots saved to /content/drive/MyDrive/Research_Thesis_Implementation/Validation files & results/Validation Results _germanCredit &GivemesomeCredit/exp7_*.png
