<a href="https://colab.research.google.com/github/2303A52102/2303A52102-EXAI_lab/blob/main/Ass_07_2102.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [8]:
df = pd.read_csv("/content/Titanic-Dataset.csv")
print("Original shape:", df.shape)
print(df.head())


Original shape: (891, 12)
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   

In [11]:
df = df.rename(columns=lambda x: x.strip())  # clean column names
df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"], errors='ignore')


In [12]:
df["Age"].fillna(df["Age"].median(), inplace=True)
df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)

In [13]:
X = df.drop("Survived", axis=1)
y = df["Survived"]

In [14]:
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

In [15]:
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y)

In [17]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42)
}

results = {}
for name, model in models.items():
    clf = Pipeline(steps=[("preprocessor", preprocessor),
                          ("classifier", model)])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    results[name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "report": classification_report(y_test, y_pred)
    }

In [18]:
print("\n=== MODEL PERFORMANCE ===")
for k,v in results.items():
    print(f"\n{k}")
    for metric, val in v.items():
        if metric != "report":
            print(f"{metric:10s}: {val:.4f}")
    print(v["report"])



=== MODEL PERFORMANCE ===

LogisticRegression
accuracy  : 0.8045
precision : 0.7931
recall    : 0.6667
f1        : 0.7244
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       110
           1       0.79      0.67      0.72        69

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179


RandomForest
accuracy  : 0.8212
precision : 0.8136
recall    : 0.6957
f1        : 0.7500
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       110
           1       0.81      0.70      0.75        69

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.81       179
weighted avg       0.82      0.82      0.82       179



In [19]:
print("\n=== COUNTERFACTUAL GENERATION ===")

# Try importing DiCE
dice_available = True
try:
    import dice_ml
except ImportError:
    dice_available = False
    print("⚠️ DiCE not installed. Using fallback nearest neighbor method.")
base_model = models["LogisticRegression"]
pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                           ("classifier", base_model)])
pipeline.fit(X_train, y_train)



=== COUNTERFACTUAL GENERATION ===
⚠️ DiCE not installed. Using fallback nearest neighbor method.


In [20]:
X_test_reset = X_test.reset_index(drop=True)
y_pred = pipeline.predict(X_test_reset)
negative_indices = np.where(y_pred == 0)[0]

test_index = int(negative_indices[0])
test_instance = X_test_reset.iloc[[test_index]]
print("\nChosen test instance index:", test_index)
print(test_instance)


Chosen test instance index: 0
   Pclass   Sex   Age  SibSp  Parch   Fare Embarked
0       3  male  24.0      2      0  24.15        S


In [22]:
if dice_available:
    import dice_ml
    from dice_ml.utils import helpers

    # Prepare data for DiCE
    d = dice_ml.Data(dataframe=df, continuous_features=num_cols, outcome_name='Survived')
    m = dice_ml.Model(model=pipeline, backend="sklearn")
    exp = dice_ml.Dice(d, m, method="random")

    # Generate 3 counterfactuals that flip outcome
    dice_cf = exp.generate_counterfactuals(test_instance, total_CFs=3, desired_class="opposite")
    dice_cf.visualize_as_dataframe(show_only_changes=True)
else:
    # =============================
    # Fallback: Nearest-neighbor Counterfactuals
    # =============================
    from sklearn.neighbors import NearestNeighbors

    X_train_proc = preprocessor.fit_transform(X_train)
    X_test_proc = preprocessor.transform(X_test_reset)

    survived_idx = y_train[y_train == 1].index
    X_train_survived = preprocessor.transform(X_train.loc[survived_idx])

    # Use Euclidean distance
    nbrs = NearestNeighbors(n_neighbors=3, metric='euclidean')
    nbrs.fit(X_train_survived)

    distances, indices = nbrs.kneighbors(X_test_proc[[test_index]])
    cf_indices = survived_idx[indices[0]]

    print("\nGenerated Counterfactuals (Euclidean NN method):")
    for i, idx in enumerate(cf_indices):
        cf = X_train.loc[idx]
        changes = [col for col in X.columns if cf[col] != test_instance.iloc[0][col]]
        print(f"\nCF #{i+1}:\n{cf}\nChanged: {', '.join(changes)}")


Generated Counterfactuals (Euclidean NN method):

CF #1:
Pclass          3
Sex          male
Age          25.0
SibSp           1
Parch           0
Fare        7.775
Embarked        S
Name: 267, dtype: object
Changed: Age, SibSp, Fare

CF #2:
Pclass          3
Sex          male
Age          20.0
SibSp           1
Parch           0
Fare        7.925
Embarked        S
Name: 664, dtype: object
Changed: Age, SibSp, Fare

CF #3:
Pclass          3
Sex          male
Age          28.0
SibSp           2
Parch           0
Fare        23.25
Embarked        Q
Name: 301, dtype: object
Changed: Age, Fare, Embarked


In [23]:
print("\n=== FEATURE INFLUENCE ANALYSIS ===")
print("Most commonly changed features likely influencing survival: Age, Fare, SibSp, Embarked")

print("\n=== DISTANCE METRIC COMPARISON ===")
print("Compare Euclidean vs Manhattan nearest neighbors to observe different but similar CFs.")



=== FEATURE INFLUENCE ANALYSIS ===
Most commonly changed features likely influencing survival: Age, Fare, SibSp, Embarked

=== DISTANCE METRIC COMPARISON ===
Compare Euclidean vs Manhattan nearest neighbors to observe different but similar CFs.


In [24]:
nbrs_manhattan = NearestNeighbors(n_neighbors=3, metric='manhattan')
nbrs_manhattan.fit(X_train_survived)
_, indices_manhattan = nbrs_manhattan.kneighbors(X_test_proc[[test_index]])
cf_indices_manhattan = survived_idx[indices_manhattan[0]]
print("Manhattan-based CF indices:", list(cf_indices_manhattan))

Manhattan-based CF indices: [267, 664, 127]
