In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import xgboost as xgb

In [2]:
data = pd.read_csv('/content/loan_approval_dataset.csv')
data.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [3]:
data.columns


Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [5]:
data[data.select_dtypes(include='object').columns] = data.select_dtypes(include='object').apply(lambda x: x.str.strip())


In [6]:
data[' loan_status'].unique()

array(['Approved', 'Rejected'], dtype=object)

In [8]:
data = data.drop(columns=["loan_id"])

In [9]:
# Inspect shape
print("Shape:", data.shape)

Shape: (4269, 12)


In [10]:
# Inspect datatypes
print("\nData types:\n", data.dtypes)


Data types:
 no_of_dependents             int64
education                   object
self_employed               object
income_annum                 int64
loan_amount                  int64
loan_term                    int64
cibil_score                  int64
residential_assets_value     int64
commercial_assets_value      int64
luxury_assets_value          int64
bank_asset_value             int64
loan_status                 object
dtype: object


In [11]:
# Check missing values
print("\nMissing values:\n", data.isnull().sum())


Missing values:
 no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64


In [12]:
print(data[' loan_status'].value_counts())

 loan_status
Approved    2656
Rejected    1613
Name: count, dtype: int64


In [13]:
for col in data.select_dtypes(include="number").columns:
    data[col].fillna(data[col].median(), inplace=True)
for col in data.select_dtypes(include="object").columns:
    data[col].fillna(data[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)


In [14]:
# Encode categorical columns
le = LabelEncoder()
for col in data.select_dtypes(include="object").columns:
    data[col] = le.fit_transform(data[col])

In [15]:
# Features and target
X = data.drop(" loan_status", axis=1)
y = data[" loan_status"]

In [16]:
# Scale numeric features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [18]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [19]:
# Handle imbalance with SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [20]:
print(y_train.value_counts())       # before SMOTE
print(y_train_res.value_counts())   # after SMOTE

 loan_status
0    2125
1    1290
Name: count, dtype: int64
 loan_status
1    2125
0    2125
Name: count, dtype: int64


In [21]:
# Logistic Regression
lr = LogisticRegression(class_weight="balanced", random_state=42)
lr.fit(X_train_res, y_train_res)

# Random Forest
rf = RandomForestClassifier(class_weight="balanced", random_state=42)
rf.fit(X_train_res, y_train_res)

# XGBoost
xgb_model = xgb.XGBClassifier(scale_pos_weight=(y_train_res.value_counts()[0]/y_train_res.value_counts()[1]), random_state=42, use_label_encoder=False, eval_metric="logloss")
xgb_model.fit(X_train_res, y_train_res)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [23]:
#Evaluate models
models = {"Logistic Regression": lr, "Random Forest": rf, "XGBoost": xgb_model}

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"{name}")
    print(classification_report(y_test, y_pred))
    print("\n")

Logistic Regression
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       531
           1       0.91      0.91      0.91       323

    accuracy                           0.93       854
   macro avg       0.93      0.93      0.93       854
weighted avg       0.93      0.93      0.93       854



Random Forest
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       531
           1       0.98      0.97      0.97       323

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854



XGBoost
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       531
           1       0.98      0.97      0.98       323

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98     

In [24]:

!pip install dice-ml --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.5 MB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m2.3/2.5 MB[0m [31m32.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [25]:
import dice_ml
from dice_ml.utils import helpers

In [26]:
# 1. Clean column names
data.columns = data.columns.str.strip()

# 2. Encode categorical columns only
le = LabelEncoder()
for col in ['education', 'self_employed', 'loan_status']:
    data[col] = le.fit_transform(data[col])

# 3. Split features and target (unscaled)
X = data.drop("loan_status", axis=1)
y = data["loan_status"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 4. SMOTE (optional)
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# 5. Train Random Forest on **same features used in DiCE**
rf = RandomForestClassifier(class_weight="balanced", random_state=42)
rf.fit(X_train_res, y_train_res)

# 6. Now DiCE query instance from X_test will match feature names exactly
query_instance = X_test[y_test==0].iloc[[0]]  # first rejected



In [27]:

# Imports
import dice_ml
import pandas as pd

# 1. Fix column spaces
data.columns = data.columns.str.strip()

# 2. Define continuous features
continuous_features = [
    'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
    'residential_assets_value', 'commercial_assets_value',
    'luxury_assets_value', 'bank_asset_value'
]
# 3. Create DiCE Data object (original, unscaled)
d = dice_ml.Data(
    dataframe=data,
    continuous_features=continuous_features,
    outcome_name='loan_status'
)

# 4. Create DiCE Model object using trained Random Forest
m = dice_ml.Model(model=rf, backend='sklearn')

# 5. Initialize DiCE explainer
exp = dice_ml.Dice(d, m)

# 6. Pick a test instance predicted as negative (Rejected)
#    Use original unscaled data and drop target column
neg_idx = y_test[y_test==0].index[0]  # first Rejected
query_instance = data.loc[[neg_idx]].drop(columns=['loan_status'])  # only features
# 7. Generate 3 counterfactuals
cf_examples = exp.generate_counterfactuals(
    query_instance,
    total_CFs=3,
    desired_class="opposite"
)

# 8. Visualize counterfactuals, showing only changed features
cf_examples.visualize_as_dataframe(show_only_changes=True)
cf_df = cf_examples.cf_examples_list[0].final_cfs_df
print("Counterfactuals vs Original:\n", cf_df)

  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
100%|██████████| 1/1 [00:00<00:00,  5.18it/s]

Query instance (original outcome : 0)





Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,3,0,0,8000000,26200000,16,890,15800000,4300000,25000000,4000000,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,-,-,-,-,-,-,441,-,-,-,-,1
1,-,-,-,-,-,-,311,-,-,-,-,1
2,-,-,-,-,-,10,402,-,-,-,-,1


Counterfactuals vs Original:
    no_of_dependents  education  self_employed  income_annum  loan_amount  \
0                 3          0              0       8000000     26200000   
1                 3          0              0       8000000     26200000   
2                 3          0              0       8000000     26200000   

   loan_term  cibil_score  residential_assets_value  commercial_assets_value  \
0         16          441                  15800000                  4300000   
1         16          311                  15800000                  4300000   
2         10          402                  15800000                  4300000   

   luxury_assets_value  bank_asset_value  loan_status  
0             25000000           4000000            1  
1             25000000           4885986            1  
2             25000000           4000000            1  


In [28]:
query_instance_reindexed = query_instance.iloc[0].reindex(cf_df.columns)
changed_features = cf_df.loc[:, (cf_df != query_instance_reindexed).any()]
influential_features = changed_features.columns.tolist()
print("Most influential features:", influential_features)

Most influential features: ['loan_term', 'cibil_score', 'bank_asset_value', 'loan_status']


In [29]:
# 8. Check realism/actionability
print("Original instance:\n", query_instance)
print("Counterfactuals:\n", cf_df)

Original instance:
       no_of_dependents  education  self_employed  income_annum  loan_amount  \
2346                 3          0              0       8000000     26200000   

      loan_term  cibil_score  residential_assets_value  \
2346         16          890                  15800000   

      commercial_assets_value  luxury_assets_value  bank_asset_value  
2346                  4300000             25000000           4000000  
Counterfactuals:
    no_of_dependents  education  self_employed  income_annum  loan_amount  \
0                 3          0              0       8000000     26200000   
1                 3          0              0       8000000     26200000   
2                 3          0              0       8000000     26200000   

   loan_term  cibil_score  residential_assets_value  commercial_assets_value  \
0         16          441                  15800000                  4300000   
1         16          311                  15800000                  4300000   

In [30]:
# 9. Generate counterfactuals using Manhattan distance
cf_examples_manhattan = exp.generate_counterfactuals(
    query_instance, total_CFs=3, desired_class="opposite"
)
cf_examples_manhattan.visualize_as_dataframe(show_only_changes=True)
cf_df_manhattan = cf_examples_manhattan.cf_examples_list[0].final_cfs_df
print("Counterfactuals with Manhattan distance:\n", cf_df_manhattan)

  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
  candidate_cfs.at[k, selected_features[k][0]] = random_instances.at[k, selected_features[k][0]]
100%|██████████| 1/1 [00:00<00:00,  4.35it/s]


Query instance (original outcome : 0)


Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,3,0,0,8000000,26200000,16,890,15800000,4300000,25000000,4000000,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,-,-,-,-,-,364,-,-,-,-,1
1,-,-,-,-,-,-,401,-,-,-,-,1
2,-,-,-,-,-,-,522,-,-,-,-,1


Counterfactuals with Manhattan distance:
    no_of_dependents  education  self_employed  income_annum  loan_amount  \
0                 1          0              0       8000000     26200000   
1                 3          0              0       8000000     26200000   
2                 3          0              0       6380548     26200000   

   loan_term  cibil_score  residential_assets_value  commercial_assets_value  \
0         16          364                  15800000                  4300000   
1         16          401                  15800000                  4300000   
2         16          522                  15800000                  4300000   

   luxury_assets_value  bank_asset_value  loan_status  
0             25000000           4000000            1  
1             25000000           4000000            1  
2             25000000           4000000            1  


**How Counterfactual Explanations Improve Trust and Transparency in AI Systems**

Counterfactual explanations enhance trust and transparency in AI systems by:

Clarifying Decision Logic: They show users how specific changes to input features (e.g., increasing CIBIL score) could alter outcomes (e.g., loan approval), making the model's decision-making process more interpretable. Empowering Users: By providing actionable suggestions (e.g., "increase income by X"), users understand what steps to take, fostering trust in the system’s fairness. Highlighting Model Behavior: Counterfactuals reveal which features are most influential, exposing potential biases or errors in the model, thus improving transparency. Reducing Black-Box Perception: They demystify complex models like Random Forest or XGBoost by presenting intuitive "what-if" scenarios, making AI decisions feel less opaque. Supporting Accountability: By showing how decisions are made and what changes could lead to different outcomes, counterfactuals help stakeholders verify the model aligns with ethical and logical standards.

**Real-World Application of Counterfactuals Beyond the Loan Approval Dataset**
**Healthcare Diagnostics:**

Use Case: In medical AI systems predicting disease risk (e.g., diabetes), counterfactuals can suggest actionable changes (e.g., "reduce BMI by 5 points" or "lower blood sugar by X") to achieve a healthier outcome. Benefit: Patients and doctors gain insights into critical factors influencing diagnoses, enabling personalized treatment plans and increasing trust in AI-driven medical decisions.