<a href="https://colab.research.google.com/github/2303A52450/b39-Explainable-ai/blob/main/assignment_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install dice-ml
!pip install xgboost
!pip install shap

Collecting dice-ml
  Downloading dice_ml-0.12-py3-none-any.whl.metadata (20 kB)
Collecting raiutils>=0.4.0 (from dice-ml)
  Downloading raiutils-0.4.2-py3-none-any.whl.metadata (1.4 kB)
Downloading dice_ml-0.12-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading raiutils-0.4.2-py3-none-any.whl (17 kB)
Installing collected packages: raiutils, dice-ml
Successfully installed dice-ml-0.12 raiutils-0.4.2


In [2]:
# -----------------------------
# Step 1: Imports
# -----------------------------
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from numpy.linalg import norm

# DiCE
import dice_ml
from dice_ml import Dice

# -----------------------------
# Step 2: Load dataset
# -----------------------------
raw_path = "loan_approval.csv"
df_raw = pd.read_csv(raw_path)
print("Original shape:", df_raw.shape)
print(df_raw.columns.tolist())
# display(df_raw.head()) # Not supported in all environments, will comment out

# -----------------------------
# Step 3: Clean names & target
# -----------------------------
df_raw.columns = df_raw.columns.str.lower().str.strip()
possible_targets = [c for c in df_raw.columns if ('loan' in c and 'status' in c)]
if not possible_targets:
    fallback = [c for c in df_raw.columns if c in ['target', 'status', 'label', 'approved']]
    possible_targets = fallback
if not possible_targets and 'loan_approved' in df_raw.columns: # Add explicit check for 'loan_approved'
    possible_targets = ['loan_approved']
if not possible_targets:
    raise ValueError("Could not find target column automatically.")
target_col = possible_targets[0]
print("Using target column:", target_col)
id_cols = [c for c in df_raw.columns if 'id' in c]
if id_cols:
    df_raw = df_raw.drop(columns=id_cols)

# -----------------------------
# Step 4: Missing values
# -----------------------------
for c in df_raw.columns:
    if c != target_col:
        df_raw[c] = pd.to_numeric(df_raw[c], errors='ignore')
feature_cols = [c for c in df_raw.columns if c != target_col]
num_cols = df_raw[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in feature_cols if c not in num_cols]
df_imputed = df_raw.copy()
for c in num_cols:
    df_imputed[c] = df_imputed[c].fillna(df_imputed[c].median())
for c in cat_cols:
    mode_val = df_imputed[c].mode(dropna=True)
    df_imputed[c] = df_imputed[c].fillna(mode_val[0] if not mode_val.empty else "missing")

# -----------------------------
# Step 5: Encode target (for model training)
# -----------------------------
le_target = LabelEncoder()
y_raw = df_imputed[target_col].astype(str)
y_encoded = le_target.fit_transform(y_raw)
print("Target classes mapping:", dict(zip(le_target.classes_, range(len(le_target.classes_)))))

# -----------------------------
# Step 6: Preprocessing pipeline
# -----------------------------
numeric_transformer = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols)
])

# -----------------------------
# Step 7: Train-test split
# -----------------------------
X = df_imputed[feature_cols].copy()
y = y_encoded
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# -----------------------------
# Step 8: Train classifiers
# -----------------------------
pipe_lr = Pipeline(steps=[('preprocessor', preprocessor),
                            ('clf', LogisticRegression(max_iter=2000, random_state=42))])
pipe_rf = Pipeline(steps=[('preprocessor', preprocessor),
                            ('clf', RandomForestClassifier(n_estimators=200, random_state=42))])
pipe_lr.fit(X_train, y_train)
pipe_rf.fit(X_train, y_train) # Corrected from y_test to y_train
models = {'Logistic Regression': pipe_lr, 'Random Forest': pipe_rf}
for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"\n{name} Performance:")
    print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
    print("Precision:", round(precision_score(y_test, y_pred, zero_division=0), 4))
    print("Recall:", round(recall_score(y_test, y_pred, zero_division=0), 4))
    print("F1-score:", round(f1_score(y_test, y_pred, zero_division=0), 4))
    print(classification_report(y_test, y_pred, zero_division=0))
bb_pipeline = pipe_rf  # final model for DiCE

# -----------------------------
# Step 9: Wrap data + model (Using df_imputed for DiCE Data)
# -----------------------------
# Ensure the target column is in a suitable format for DiCE
df_imputed[target_col] = df_imputed[target_col].astype('category')
df_imputed[target_col] = df_imputed[target_col].fillna(df_imputed[target_col].mode()[0] if not df_imputed[target_col].mode().empty else df_imputed[target_col].iloc[0])


d = dice_ml.Data(dataframe=df_imputed, # Use df_imputed which includes the target column
                  continuous_features=num_cols, # Use num_cols which are the continuous features
                  outcome_name=target_col) # Specify the target column name
m = dice_ml.Model(model=bb_pipeline, backend="sklearn", model_type='classifier')
exp = Dice(d, m, method="random")

# -----------------------------
# Step 10: Pick rejected instance
# -----------------------------
neg_label_encoded = 0  # The encoded label for "Rejected"
query_instance = None
query_instance_original = None # Keep the original query instance for display
for idx in X_test.index:
    pred = bb_pipeline.predict(X_test.loc[[idx]])[0]
    if pred == neg_label_encoded:
        query_instance = X_test.loc[[idx]] # Use the instance from X_test (features only)
        query_instance_original = df_imputed.loc[[idx]] # Get the original instance from df_imputed
        chosen_index = idx
        break
print("\nSelected instance (Rejected):")
# display(query_instance_original)
print(query_instance_original)
print("Predicted label:", neg_label_encoded, "==>", le_target.inverse_transform([neg_label_encoded])[0])

# -----------------------------
# Step 11: Generate CFs → Approved
# -----------------------------
pos_label_encoded = 1 if neg_label_encoded == 0 else 0
dice_exp = exp.generate_counterfactuals(query_instance, # Use query_instance (features only) for CF generation
                                        total_CFs=3,
                                        desired_class=pos_label_encoded,
                                        features_to_vary="all")
cf_df = dice_exp.cf_examples_list[0].final_cfs_df.reset_index(drop=True)
print("\nCounterfactuals generated:")
# display(cf_df)
print(cf_df)

# -----------------------------
# Step 12: Show BEFORE vs AFTER
# -----------------------------
orig_pred = bb_pipeline.predict(query_instance)[0]
cf_preds = bb_pipeline.predict(cf_df[feature_cols])
print("\n=== Loan Decision Status ===")
print("Original Instance:", le_target.inverse_transform([orig_pred])[0], "(Rejected)")
for i, p in enumerate(cf_preds):
    print(f"CF_{i+1}:", le_target.inverse_transform([p])[0], "(Approved)")

# Build comparison table (using original query instance)
compare_table = pd.concat([
    query_instance_original.assign(example="Original (Rejected)"),
    cf_df.assign(example=[f"CF_{i+1} (Approved)" for i in range(len(cf_df))])
], ignore_index=True)
cols = ['example'] + [c for c in compare_table.columns if c != 'example']
compare_table = compare_table[cols]
# display(compare_table)
print("\nComparison Table:")
print(compare_table)

# -----------------------------
# Step 12b: Compute Euclidean and Manhattan distance
# -----------------------------
# We need to apply the same preprocessor to the original and counterfactual instances
X_orig_scaled = preprocessor.transform(query_instance) # Use query_instance (features only)
X_cf_scaled = preprocessor.transform(cf_df[feature_cols]) # Use cf_df[feature_cols] (features only)

# Compute Euclidean distance (L2 norm) for each counterfactual
euclidean_distances = [norm(X_cf_scaled[i] - X_orig_scaled[0]) for i in range(len(cf_df))]
cf_df['euclidean_distance'] = euclidean_distances

# Compute Manhattan distance (L1 norm) for each counterfactual
manhattan_distances = [norm(X_cf_scaled[i] - X_orig_scaled[0], ord=1) for i in range(len(cf_df))]
cf_df['manhattan_distance'] = manhattan_distances

print("\nCounterfactuals with Euclidean and Manhattan distances:")
# Select the desired columns explicitly from cf_df
display_cols = [target_col, 'euclidean_distance', 'manhattan_distance'] + feature_cols
# Ensure unique column names in case of any unexpected overlap (though feature_cols should not contain target_col)
display_cols = list(dict.fromkeys(display_cols))
print(cf_df[display_cols]) # Use the combined and unique list of columns


# -----------------------------
# Step 13: Reflections
# -----------------------------
print("\n--- REFLECTIONS ---")
print("✔ Original instance was REJECTED.")
print("✔ Counterfactuals flipped decision to APPROVED with minimal changes.")
print("✔ This shows how small, actionable changes (like income, loan amount, credit history) can alter outcomes.")
print("✔ Counterfactuals increase trust by answering 'what-if' questions for end-users.")

Original shape: (2000, 8)
['name', 'city', 'income', 'credit_score', 'loan_amount', 'years_employed', 'points', 'loan_approved']
Using target column: loan_approved
Target classes mapping: {'False': 0, 'True': 1}

Logistic Regression Performance:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       224
           1       1.00      1.00      1.00       176

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400


Random Forest Performance:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       224
           1       1.00      1.00      1.00       176

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg     

100%|██████████| 1/1 [00:01<00:00,  1.37s/it]



Counterfactuals generated:
             name          city  income  credit_score  loan_amount  \
0  James Thompson  Davisborough   62662           466        26460   
1  James Thompson  Davisborough   62662           466        44184   
2  Nathan Freeman  Davisborough   62662           466        26460   

   years_employed  points  loan_approved  
0              25    67.2              1  
1              25    62.3              1  
2              25    62.0              1  

=== Loan Decision Status ===
Original Instance: False (Rejected)
CF_1: True (Approved)
CF_2: True (Approved)
CF_3: True (Approved)

Comparison Table:
               example            name          city  income  credit_score  \
0  Original (Rejected)  James Thompson  Davisborough   62662           466   
1      CF_1 (Approved)  James Thompson  Davisborough   62662           466   
2      CF_2 (Approved)  James Thompson  Davisborough   62662           466   
3      CF_3 (Approved)  Nathan Freeman  Davisborough   6