# Experiment 4: Random Forest + Disparate Impact Remover with Validation

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install aif360 shap scikit-learn pandas matplotlib seaborn BlackBoxAuditing --quiet

In [3]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import shap
import matplotlib.pyplot as plt

from aif360.datasets import StandardDataset
from aif360.algorithms.preprocessing import DisparateImpactRemover
from aif360.metrics import ClassificationMetric

RESULTS_DIR = '/content/drive/MyDrive/Research_Thesis_Implementation/Validation files & results/Validation Results _germanCredit &GivemesomeCredit'


pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


In [4]:

## Step 1: Load LendingClub
df = pd.read_csv('/content/drive/MyDrive/Research_Thesis_Implementation/data_final/lendingclub_data.csv')
selected_cols = ['loan_status','annual_inc','term','grade','home_ownership','purpose','zip_code']
df = df[selected_cols].dropna()
df['loan_status'] = df['loan_status'].apply(lambda x: 1 if x == 'Fully Paid' else 0)
for col in ['term','grade','home_ownership','purpose','zip_code']:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
df['annual_inc'] = StandardScaler().fit_transform(df[['annual_inc']])

  df = pd.read_csv('/content/drive/MyDrive/Research_Thesis_Implementation/data_final/lendingclub_data.csv')


In [5]:


## Step 2: AIF360 Dataset + DIR
privileged_groups = [{'zip_code': 1}]
unprivileged_groups = [{'zip_code': 0}]
aif_data = StandardDataset(df,
                           label_name='loan_status',
                           favorable_classes=[1],
                           protected_attribute_names=['zip_code'],
                           privileged_classes=[[1]])
DIR = DisparateImpactRemover(repair_level=1.0)
aif_data_transf = DIR.fit_transform(aif_data)

In [6]:


## Step 3: Train Random Forest
X = aif_data_transf.features
y = aif_data_transf.labels.ravel()
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X, y)

y_pred = clf.predict(X)
y_prob = clf.predict_proba(X)[:,1]

print("=== LendingClub (DIR Applied) ===")
print("Accuracy:", accuracy_score(y,y_pred))
print("Precision:", precision_score(y,y_pred))
print("Recall:", recall_score(y,y_pred))
print("F1:", f1_score(y,y_pred))
print("AUC:", roc_auc_score(y,y_prob))

pred_dataset = aif_data.copy()
pred_dataset.labels = y_pred.reshape(-1,1)
metric = ClassificationMetric(aif_data, pred_dataset,
                              unprivileged_groups=unprivileged_groups,
                              privileged_groups=privileged_groups)

print("SPD:", metric.statistical_parity_difference())
print("DI:", metric.disparate_impact())
print("EOD:", metric.equal_opportunity_difference())
print("AOD:", metric.average_odds_difference())
print("BiasAmp:", metric.between_group_generalized_entropy_index())
print("Theil:", metric.theil_index())



=== LendingClub (DIR Applied) ===
Accuracy: 0.9944
Precision: 0.9939775526964139
Recall: 0.9983502886994776
F1: 0.996159122085048
AUC: 0.999864843901767
SPD: 0.42307692307692313
DI: 1.8461538461538463
EOD: 0.0
AOD: 0.0
BiasAmp: 166.16666666666666
Theil: 0.002885347529445146


In [7]:
## Step 4: SHAP (LendingClub)
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X)
X_df = pd.DataFrame(X, columns=aif_data.feature_names)
if isinstance(shap_values, list):
    shap.summary_plot(shap_values[1], X_df)

In [8]:
## Step 5: Validation on GermanCredit
GC_PATH = '/content/drive/MyDrive/Research_Thesis_Implementation/Validation files & results/Validation dataset/german_credit_data.csv'
df_gc = pd.read_csv(GC_PATH)
if 'Risk' in df_gc.columns:
    df_gc['loan_status'] = df_gc['Risk'].map({'good':1,'bad':0})
elif 'Creditability' in df_gc.columns:
    df_gc['loan_status'] = df_gc['Creditability']
elif 'class' in df_gc.columns:
    df_gc['loan_status'] = df_gc['class'].map({'good':1,'bad':0})
for col in df_gc.columns:
    if df_gc[col].dtype == 'object':
        df_gc[col] = LabelEncoder().fit_transform(df_gc[col].astype(str))

common_gc = pd.DataFrame()
common_gc['annual_inc']     = df_gc['Credit amount']
common_gc['term']           = df_gc['Duration']
common_gc['grade']          = df_gc['Purpose']
common_gc['home_ownership'] = df_gc['Housing']
common_gc['purpose']        = df_gc['Purpose']
common_gc['zip_code']       = df_gc['Checking account']
common_gc['loan_status']    = df_gc['loan_status']
for col in ['term','grade','home_ownership','purpose','zip_code']:
    common_gc[col] = LabelEncoder().fit_transform(common_gc[col].astype(str))
common_gc['annual_inc'] = StandardScaler().fit_transform(common_gc[['annual_inc']])

X_gc = common_gc.drop(columns=['loan_status']).values
y_gc = common_gc['loan_status'].values
y_pred_gc = clf.predict(X_gc)
y_prob_gc = clf.predict_proba(X_gc)[:,1]

print("\n=== GermanCredit Validation (DIR Applied) ===")
print("Accuracy:", accuracy_score(y_gc,y_pred_gc))
print("Precision:", precision_score(y_gc,y_pred_gc))
print("Recall:", recall_score(y_gc,y_pred_gc))
print("F1:", f1_score(y_gc,y_pred_gc))
print("AUC:", roc_auc_score(y_gc,y_prob_gc))

explainer_gc = shap.TreeExplainer(clf)
shap_values_gc = explainer_gc.shap_values(X_gc)
X_gc_df = pd.DataFrame(X_gc, columns=['annual_inc','term','grade','home_ownership','purpose','zip_code'])
if isinstance(shap_values_gc, list):
    shap.summary_plot(shap_values_gc[1], X_gc_df)






=== GermanCredit Validation (DIR Applied) ===
Accuracy: 0.475
Precision: 0.714987714987715
Recall: 0.4157142857142857
F1: 0.5257452574525745
AUC: 0.5470404761904762


In [None]:
## Step 6: Validation on GiveMeSomeCredit
GMSC_PATH = '/content/drive/MyDrive/Research_Thesis_Implementation/Validation files & results/Validation dataset/GiveMeSomeCredit.csv'
df_gmsc = pd.read_csv(GMSC_PATH)
df_gmsc['loan_status'] = 1 - df_gmsc['SeriousDlqin2yrs']
for col in df_gmsc.columns:
    if df_gmsc[col].dtype == 'object':
        df_gmsc[col] = LabelEncoder().fit_transform(df_gmsc[col].astype(str))

common_gmsc = pd.DataFrame()
common_gmsc['annual_inc']     = df_gmsc['MonthlyIncome'].fillna(df_gmsc['MonthlyIncome'].median())
common_gmsc['term']           = df_gmsc['NumberOfOpenCreditLinesAndLoans']
common_gmsc['grade']          = df_gmsc['NumberOfTimes90DaysLate']
common_gmsc['home_ownership'] = df_gmsc['NumberRealEstateLoansOrLines']
common_gmsc['purpose']        = df_gmsc['NumberOfTime30-59DaysPastDueNotWorse']
common_gmsc['zip_code']       = df_gmsc['NumberOfDependents'].fillna(0)
common_gmsc['loan_status']    = df_gmsc['loan_status']
for col in ['term','grade','home_ownership','purpose','zip_code']:
    common_gmsc[col] = LabelEncoder().fit_transform(common_gmsc[col].astype(str))
common_gmsc['annual_inc'] = StandardScaler().fit_transform(common_gmsc[['annual_inc']])

X_gmsc = common_gmsc.drop(columns=['loan_status']).values
y_gmsc = common_gmsc['loan_status'].values
y_pred_gmsc = clf.predict(X_gmsc)
y_prob_gmsc = clf.predict_proba(X_gmsc)[:,1]

print("\n=== GiveMeSomeCredit Validation (DIR Applied) ===")
print("Accuracy:", accuracy_score(y_gmsc,y_pred_gmsc))
print("Precision:", precision_score(y_gmsc,y_pred_gmsc))
print("Recall:", recall_score(y_gmsc,y_pred_gmsc))
print("F1:", f1_score(y_gmsc,y_pred_gmsc))
print("AUC:", roc_auc_score(y_gmsc,y_prob_gmsc))

explainer_gmsc = shap.TreeExplainer(clf)
shap_values_gmsc = explainer_gmsc.shap_values(X_gmsc)
X_gmsc_df = pd.DataFrame(X_gmsc, columns=['annual_inc','term','grade','home_ownership','purpose','zip_code'])
if isinstance(shap_values_gmsc, list):
    shap.summary_plot(shap_values_gmsc[1], X_gmsc_df)

In [None]:
## Step 7: Save Combined Results
results_all = pd.DataFrame([{
    'Dataset': 'LendingClub(DIR)',
    'Accuracy': accuracy_score(y, y_pred),
    'Precision': precision_score(y, y_pred, zero_division=0),
    'Recall': recall_score(y, y_pred, zero_division=0),
    'F1': f1_score(y, y_pred, zero_division=0),
    'AUC': roc_auc_score(y, y_prob) if len(np.unique(y)) > 1 else 0.0
}])

output_path = f"{RESULTS_DIR}/combined_results_experiment.csv"
results_all.to_csv(output_path, index=False)

print(f"\nCombined results saved to: {output_path}")
