<a href="https://colab.research.google.com/github/2303A52430/explainable-AI-LAB/blob/main/ex_ai_ass_07_2430.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import matplotlib.pyplot as plt

In [2]:

%pip install dice-ml

Collecting dice-ml
  Downloading dice_ml-0.12-py3-none-any.whl.metadata (20 kB)
Collecting raiutils>=0.4.0 (from dice-ml)
  Downloading raiutils-0.4.2-py3-none-any.whl.metadata (1.4 kB)
Downloading dice_ml-0.12-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading raiutils-0.4.2-py3-none-any.whl (17 kB)
Installing collected packages: raiutils, dice-ml
Successfully installed dice-ml-0.12 raiutils-0.4.2


In [4]:

df = sns.load_dataset('titanic')

In [5]:
print("Initial shape:", df.shape)
print(df.head())

Initial shape: (891, 15)
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [6]:
df = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']].copy()
df = df.rename(columns={'survived': 'Survived'})

In [7]:
numeric_features = ['age', 'sibsp', 'parch', 'fare']
categorical_features = ['pclass', 'sex', 'embarked']

In [8]:
df = df[~df['Survived'].isna()]

In [9]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Survived'])

In [10]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [11]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [12]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder='drop', verbose_feature_names_out=False)

In [13]:
preprocessor.fit(train_df)

In [14]:
X_train = pd.DataFrame(preprocessor.transform(train_df),
                       columns=preprocessor.get_feature_names_out(),
                       index=train_df.index)

In [15]:
X_test = pd.DataFrame(preprocessor.transform(test_df),
                      columns=preprocessor.get_feature_names_out(),
                      index=test_df.index)

In [16]:
y_train = train_df['Survived'].astype(int)
y_test = test_df['Survived'].astype(int)

In [17]:
print("X_train shape:", X_train.shape, "X_test shape:", X_test.shape)

X_train shape: (712, 12) X_test shape: (179, 12)


In [18]:
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train, y_train)

In [19]:
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

In [20]:
def evaluate(model, X, y, name):
    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred, zero_division=0)
    rec = recall_score(y, y_pred, zero_division=0)
    f1 = f1_score(y, y_pred, zero_division=0)
    print(f"--- {name} ---")
    print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")
    print(classification_report(y, y_pred, zero_division=0))
    return {'model': name, 'accuracy': acc, 'precision': prec, 'recall': rec, 'f1': f1}

In [21]:
results = []
results.append(evaluate(lr, X_test, y_test, "Logistic Regression"))
results.append(evaluate(rf, X_test, y_test, "Random Forest"))

--- Logistic Regression ---
Accuracy: 0.8045 | Precision: 0.7931 | Recall: 0.6667 | F1: 0.7244
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       110
           1       0.79      0.67      0.72        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

--- Random Forest ---
Accuracy: 0.8156 | Precision: 0.8103 | Recall: 0.6812 | F1: 0.7402
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       110
           1       0.81      0.68      0.74        69

    accuracy                           0.82       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.82      0.82      0.81       179



In [22]:
metrics_df = pd.DataFrame(results)
print(metrics_df)

                 model  accuracy  precision    recall        f1
0  Logistic Regression  0.804469   0.793103  0.666667  0.724409
1        Random Forest  0.815642   0.810345  0.681159  0.740157


In [23]:
test_preds = rf.predict(X_test)
test_indices = X_test.index


In [24]:
neg_indices = [i for i, p in zip(test_indices, test_preds) if p == 0]
if len(neg_indices) == 0:
    pass

In [25]:
# Since there are no negative predictions, choose an index from positive predictions
pos_indices = [i for i, p in zip(test_indices, test_preds) if p == 1]
if len(pos_indices) > 0:
    chosen_idx = pos_indices[0] # choose first positive predicted example
    print("Chosen test index:", chosen_idx)
    original_row = test_df.loc[chosen_idx]
    print("Original raw row:\n", original_row)
else:
    print("No positive predictions found in test set.")

Chosen test index: 241
Original raw row:
 Survived         1
pclass           3
sex         female
age            NaN
sibsp            1
parch            0
fare          15.5
embarked         Q
Name: 241, dtype: object


In [26]:
import dice_ml
from dice_ml.data import Data
from dice_ml.model import Model

In [27]:
# DiCE
import dice_ml
from dice_ml.data import Data
from dice_ml.model import Model
from dice_ml import Dice  # Corrected import path

In [28]:
dice_data = Data(dataframe=train_df.reset_index(drop=True),
                 continuous_features=['age', 'sibsp', 'parch', 'fare'],
                 outcome_name='Survived')

In [29]:
import dice_ml
from dice_ml.data import Data
from dice_ml.model import Model
from dice_ml import Dice

# Get feature names after preprocessing
processed_feature_names = preprocessor.get_feature_names_out()

# Separate continuous and categorical feature names from the processed data
processed_continuous_features = [f for f in processed_feature_names if f in numeric_features]
# Note: This is a simplification; in a real scenario, you'd need a more robust way to
# identify the one-hot encoded categorical features from the processed names.
# For this dataset and preprocessor setup, features not in original numeric_features are likely the encoded categoricals.
processed_categorical_features = [f for f in processed_feature_names if f not in processed_continuous_features]


dice_data = Data(dataframe=X_train,
                 continuous_features=processed_continuous_features,
                 categorical_features=processed_categorical_features,
                 outcome_name='Survived')

dice_model = Model(model=rf, backend='sklearn')

exp = Dice(dice_data, dice_model, method='random')

In [33]:
dice_data = Data(dataframe=train_df.reset_index(drop=True),
                 continuous_features=['age', 'sibsp', 'parch', 'fare'],
                 outcome_name='Survived')

In [34]:
dice_model = Model(model=rf, backend='sklearn')

In [35]:
exp = Dice(dice_data, dice_model, method='random')

In [36]:
instance_for_dice = original_row[['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']].to_frame().T.reset_index(drop=True)

In [37]:
print("Instance for DiCE (raw):")
print(instance_for_dice)

Instance for DiCE (raw):
  pclass     sex  age sibsp parch  fare embarked
0      3  female  NaN     1     0  15.5        Q


In [38]:
try:
    cf_df = dice_cf.final_cfs_df.copy()
except Exception:
    pass

In [43]:
instance_for_dice = test_df.loc[[chosen_idx]].reset_index(drop=True)

# Impute missing values in the instance
instance_for_dice_processed = pd.DataFrame(preprocessor.transform(instance_for_dice),
                                         columns=preprocessor.get_feature_names_out(),
                                         index=instance_for_dice.index)

dice_cf = exp.generate_counterfactuals(instance_for_dice_processed,
                                       total_CFs=3,
                                       desired_class="opposite")

cf_df = dice_cf.cf_examples_list[0].final_cfs_df.copy()
print("\nCounterfactual Examples:")
print(cf_df)

  0%|          | 0/1 [00:00<?, ?it/s]


ValueError: ('Feature', 'pclass_1', 'not present in training data!')