# **Predicting Mortality based on Heart Failure Patients**

This project focuses on predicting mortality outcomes among heart failure patients using a combination of machine learning algorithms. The dataset undergoes preprocessing followed by training and evaluation using several classification models:


>The module is divided into the following steps:


1.   Import Libraries
2.   Load Dataset
3.   Preprocessing
4.   Support Vector Machine (SVM)
5.   Decision Tree
6.   Gaussian Naive Bayes (GNB)
7.   Random Forest Classification (RF)
8.   XGBoost Classification (XGB)
9.   AdaBoost Classification
10.  Artificial Neural Network (ANN)

# **Extras** #
1.   Finding the worst Classification Algorithm using ROC AUC Score
2.   Sampling using SMOTE with the worst Classification Algorithm
3.   Using Explainable AI (LIME) to explain why one specific prediction was made (using a misclassified sample).









In [None]:
!pip install imbalanced-learn lime xgboost


In [None]:
# Step 1: Importing necessary libraries

# Data and Preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler

# Handling Imbalanced Data
from imblearn.over_sampling import SMOTE

# Models
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb

# Evaluation
from sklearn.metrics import classification_report, roc_auc_score

# ANN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import callbacks

# Explainable AI
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility.
# Seeding NumPy alone does not cover the ANN: Keras/TensorFlow draw from their
# own generators (and may fall back to Python's `random`), so seed all three.
import random
import tensorflow as tf

random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)


In [None]:
# Step 2: Read the clinical-records CSV into a DataFrame, then print a
# per-column summary of what was loaded.
df = pd.read_csv(filepath_or_buffer="heart_failure_clinical_records.csv")
df.info()

In [None]:
# Summary statistics, transposed so that each column of `df` becomes a row.
df.describe().transpose()

In [None]:
# Heatmap of the pairwise correlations between the columns of `df`.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    df.corr(),
    annot=True,
    fmt=".2f",
    cmap='Greens',
    linewidths=0.5,
    cbar=True,
    ax=ax,
)
plt.show()

In [None]:
# Number of patients at each age, split by outcome (DEATH_EVENT).
cols = ['#FFF000', '#FF0000']
plt.figure(figsize=(25,10))
# The returned Axes was previously bound to an unrelated name (`days_of_week`);
# it is now named for what it is and used to title the figure.
age_ax = sns.countplot(x="age", data=df, hue='DEATH_EVENT', palette=cols)
age_ax.set_title("Patient count per age, split by DEATH_EVENT")
plt.show()

In [None]:
# Step 3: Data Preprocessing
# The label is DEATH_EVENT; every other column is a feature.
y = df['DEATH_EVENT']
X = df.drop(columns=['DEATH_EVENT'])

# Keep 20% of the rows aside as the final test set, stratified on the label
X_trainval, X_finaltest, y_trainval, y_finaltest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Feature scaling: the scaler's statistics are learned on the train/val part
# only and then applied unchanged to the held-out test set.
scaler = StandardScaler().fit(X_trainval)
X_trainval_scaled = scaler.transform(X_trainval)
X_finaltest_scaled = scaler.transform(X_finaltest)


# **Support Vector Machine** #

In [None]:
# RBF-kernel SVM. probability=True is required so predict_proba is available
# for the ROC AUC computed below.
svm_model = SVC(kernel='rbf', probability=True, random_state=42).fit(
    X_trainval_scaled, y_trainval
)

# Scores for the second class (column 1) and hard labels on the held-out test set
svm_probs = svm_model.predict_proba(X_finaltest_scaled)[:, 1]
svm_preds = svm_model.predict(X_finaltest_scaled)

svm_auc = roc_auc_score(y_finaltest, svm_probs)
print("===== SVM Classification Report =====")
print(classification_report(y_finaltest, svm_preds))
print("ROC AUC Score:", svm_auc)



# **Decision Tree Classifier** #


In [None]:
# Decision tree with default hyper-parameters, fitted on the scaled train/val data.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_trainval_scaled, y_trainval)

# Evaluate on the held-out test set: hard labels plus column-1 class scores
dt_preds, dt_probs = (
    dt_model.predict(X_finaltest_scaled),
    dt_model.predict_proba(X_finaltest_scaled)[:, 1],
)

print("\n===== Decision Tree Classification Report =====")
print(classification_report(y_finaltest, dt_preds))
print("ROC AUC Score:", roc_auc_score(y_finaltest, dt_probs))


# **Gaussian Naive Bayes** #

In [None]:
# Gaussian Naive Bayes baseline, fitted on the scaled train/val data.
gnb_model = GaussianNB().fit(X_trainval_scaled, y_trainval)

# Column 1 of predict_proba is the score used for ROC AUC
gnb_probs = gnb_model.predict_proba(X_finaltest_scaled)[:, 1]
gnb_preds = gnb_model.predict(X_finaltest_scaled)

gnb_auc = roc_auc_score(y_finaltest, gnb_probs)
print("\n===== Gaussian Naive Bayes Classification Report =====")
print(classification_report(y_finaltest, gnb_preds))
print("ROC AUC Score:", gnb_auc)


# **Random Forest Classification** #

In [None]:
# Random forest of 100 trees with a fixed seed, fitted on the scaled train/val data.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_trainval_scaled, y_trainval)

# Hard labels and column-1 class scores on the held-out test set
rf_preds = rf_model.predict(X_finaltest_scaled)
rf_probs = rf_model.predict_proba(X_finaltest_scaled)[:, 1]

print("\n===== Random Forest Classification Report =====")
print(classification_report(y_finaltest, rf_preds))
print("ROC AUC Score:", roc_auc_score(y_finaltest, rf_probs))


# **XGBoost Classification** #

In [None]:
# Gradient-boosted trees with log-loss as the evaluation metric.
# `use_label_encoder` is intentionally not passed: the flag is deprecated and
# not recognised by current XGBoost releases, which the unpinned install fetches.
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_trainval_scaled, y_trainval)

# Hard labels and column-1 class scores on the held-out test set
xgb_preds = xgb_model.predict(X_finaltest_scaled)
xgb_probs = xgb_model.predict_proba(X_finaltest_scaled)[:, 1]

print("\n===== XGBoost Classification Report =====")
print(classification_report(y_finaltest, xgb_preds))
print("ROC AUC Score:", roc_auc_score(y_finaltest, xgb_probs))


# **AdaBoost Classification** #

In [None]:
# AdaBoost with 100 boosting rounds, fitted on the scaled train/val data.
ada_model = AdaBoostClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_trainval_scaled, y_trainval)

# Column-1 class scores first, then hard labels, on the held-out test set
ada_probs = ada_model.predict_proba(X_finaltest_scaled)[:, 1]
ada_preds = ada_model.predict(X_finaltest_scaled)

ada_auc = roc_auc_score(y_finaltest, ada_probs)
print("\n===== AdaBoost Classification Report =====")
print(classification_report(y_finaltest, ada_preds))
print("ROC AUC Score:", ada_auc)


# **Artificial Neural Network** #

In [None]:
# Early stopping to avoid overfitting: stop once the validation loss has not
# improved by at least `min_delta` for `patience` epochs, then roll back to the
# best weights seen.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',  # Keras' default, spelled out for the reader
    min_delta=0.001,
    patience=30,
    verbose=1,
    restore_best_weights=True
)

# Take the input width from the data instead of hard-coding 12, so the network
# stays in sync if the feature set changes.
n_features = X_trainval_scaled.shape[1]

# Define the model: three ReLU hidden layers (32-8-8) with dropout after the
# 2nd and 3rd, and a single sigmoid unit for the binary outcome.
ANN_model = Sequential()
ANN_model.add(Dense(32, input_dim=n_features, activation='relu', kernel_initializer='he_uniform'))
ANN_model.add(Dense(8, activation='relu', kernel_initializer='he_uniform'))
ANN_model.add(Dropout(0.25))
ANN_model.add(Dense(8, activation='relu', kernel_initializer='he_uniform'))
ANN_model.add(Dropout(0.25))
ANN_model.add(Dense(1, activation='sigmoid', kernel_initializer='glorot_uniform'))


In [None]:
# Binary cross-entropy pairs with the single sigmoid output unit;
# accuracy is reported alongside the loss during training.
ANN_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
ANN_model.summary()

In [None]:
# Train for at most 100 epochs in batches of 25. A quarter of the supplied
# rows is held out for validation, and the early-stopping callback may end
# training before the epoch limit.
history = ANN_model.fit(
    x=X_trainval_scaled,
    y=y_trainval,
    batch_size=25,
    epochs=100,
    validation_split=0.25,
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
# === Plot Loss Curves ===
history_df = pd.DataFrame(history.history)

# Draw on a figure owned by this cell. With the inline backend a figure is
# rendered and closed when its cell finishes, so a 1x2 subplot grid cannot be
# shared with a later cell and would leave half of this figure empty.
loss_fig, loss_ax = plt.subplots(figsize=(6, 4))
loss_ax.plot(history_df['loss'], label='Train Loss')
loss_ax.plot(history_df['val_loss'], label='Val Loss')
loss_ax.set_xlabel("Epochs")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Loss Curve")
loss_ax.legend()
loss_fig.tight_layout()
plt.show()

In [None]:
# === Plot Accuracy Curves ===
# Use a figure owned by this cell: `plt.subplot(1, 2, 2)` in a fresh cell would
# start a new figure and draw only in its right half, because the figure from
# the previous cell has already been rendered and closed.
acc_fig, acc_ax = plt.subplots(figsize=(6, 4))
acc_ax.plot(history_df['accuracy'], label='Train Accuracy')
acc_ax.plot(history_df['val_accuracy'], label='Val Accuracy')
acc_ax.set_xlabel("Epochs")
acc_ax.set_ylabel("Accuracy")
acc_ax.set_title("Accuracy Curve")
acc_ax.legend()
acc_fig.tight_layout()
plt.show()

In [None]:
# === Final Evaluation on True Test Set ===
# The network ends in a single sigmoid unit, so predict() yields one score per
# row; scores above 0.5 become label 1, everything else label 0.
y_ann_probs = ANN_model.predict(X_finaltest_scaled)
y_ann_preds = np.where(y_ann_probs > 0.5, 1, 0)

ann_auc = roc_auc_score(y_finaltest, y_ann_probs)
print("\n===== ANN Classification Report (Final Test Set) =====")
print(classification_report(y_finaltest, y_ann_preds))
print("ROC AUC Score:", ann_auc)




---





## So we can see that XGBoost and Random Forest Classification give us the highest ROC AUC Score for the given dataset. However, the dataset is not balanced, i.e. the number of **YES** and **NO** values for **Death_Event** is not equal. Thus we'll apply SMOTE to balance the classes and then re-train Gaussian Naive Bayes and AdaBoost to see if their scores change.



In [None]:
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Oversample ONLY the training portion of the stratified hold-out split made
# during preprocessing. Resampling the full dataset before splitting leaks
# information: synthetic minority points interpolated from rows that end up in
# the training set land in the test set (and vice versa), which inflates scores.
# NOTE: X_resampled / y_resampled therefore hold the resampled training rows,
# not the whole dataset.
X_resampled, y_resampled = smote.fit_resample(X_trainval, y_trainval)

# Train on the balanced data; evaluate on the untouched, real-only test set —
# the same one the baseline models were scored on, so results are comparable.
X_train, y_train = X_resampled, y_resampled
X_test, y_test = X_finaltest, y_finaltest

In [None]:
# Colors
cols = ['#FFF000', '#FF0000']

# Combine the resampled features and labels into a single DataFrame
df_resampled = pd.DataFrame(X_resampled, columns=X.columns).assign(
    DEATH_EVENT=y_resampled
)

# Two panels side by side: class counts before and after resampling
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
ax1, ax2 = axes

# Before SMOTE
sns.countplot(x="DEATH_EVENT", data=df, ax=ax1)
ax1.set_title("Before SMOTE")
for bar, color in zip(ax1.patches, cols):
    bar.set_color(color)

# After SMOTE
sns.countplot(x="DEATH_EVENT", data=df_resampled, ax=ax2)
ax2.set_title("After SMOTE")
for bar, color in zip(ax2.patches, cols):
    bar.set_color(color)

plt.tight_layout()
plt.show()


In [None]:
# Standard scaling after SMOTE and the train-test split: the scaler is fitted
# on the training features and then applied to both sets. X_train / X_test are
# overwritten with their scaled versions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Gaussian Naive Bayes, this time trained on the resampled data
gnb_model = GaussianNB().fit(X_train, y_train)

# Column-1 class scores and hard labels for the test set
gnb_probs = gnb_model.predict_proba(X_test)[:, 1]
gnb_preds = gnb_model.predict(X_test)

# Report the same metrics as for the baseline model
gnb_auc = roc_auc_score(y_test, gnb_probs)
print("\n===== Gaussian Naive Bayes Classification Report =====")
print(classification_report(y_test, gnb_preds))
print("ROC AUC Score:", gnb_auc)

In [None]:
# AdaBoost (100 rounds, fixed seed), this time trained on the resampled data
ada_model = AdaBoostClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Column-1 class scores and hard labels for the test set
ada_probs = ada_model.predict_proba(X_test)[:, 1]
ada_preds = ada_model.predict(X_test)

# Report the same metrics as for the baseline model
ada_auc = roc_auc_score(y_test, ada_probs)
print("\n===== AdaBoost Classification Report =====")
print(classification_report(y_test, ada_preds))
print("ROC AUC Score:", ada_auc)



---


## Thus we can see applying SMOTE increased the ROC AUC Score for Gaussian Naive Bayes by 0.002 and for AdaBoost by 0.9.


---



### Finally using LIME helps in understanding why a particular prediction was made. LIME doesn’t explain the whole model — it explains why the model made a decision for one particular sample (a local explanation).

### In this case I found the misclassified samples and used LIME to find which features made the model give a wrong prediction.

### The bars and values show which features pushed the prediction toward **“Survived” (blue)** or toward **“Died” (orange)**.

### The value shown for each bar in the centre-left table gives the magnitude of that feature interval's contribution to the final prediction.



# **🟦 Blue bars = Pushed prediction toward “Survived”**


# **🟧 Orange bars = Pushed prediction toward “Died”** #



# **Explainable AI (EAI)** #

In [None]:
import lime
import lime.lime_tabular

# Take the feature names from the feature frame itself so they are guaranteed
# to line up, in order, with the columns of the scaled arrays derived from it.
feature_names = list(X.columns)

# LIME explainer, built on the scaled training data the ANN was fitted on.
# random_state pins LIME's perturbation sampling so explanations are repeatable.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_trainval_scaled,
    feature_names=feature_names,
    class_names=['Survived', 'Died'],
    mode='classification',
    discretize_continuous=True,
    random_state=42
)

# Wrapper to match ANN's probability shape
def predict_proba_wrapper(x):
    """Adapt the ANN's single sigmoid output to a two-column probability array.

    Parameters
    ----------
    x : array-like
        Rows of scaled features, in the layout the ANN was trained on.

    Returns
    -------
    numpy.ndarray
        Two columns per row: ``1 - p`` followed by ``p``, where ``p`` is the
        ANN output (assumes ``ANN_model.predict`` returns an (n, 1) column).
    """
    # verbose=0: LIME sends a large batch of perturbed rows per explanation,
    # and the default progress bar would flood the cell output.
    probs = ANN_model.predict(x, verbose=0)
    return np.hstack([1 - probs, probs])


In [None]:
# The ANN predictions may be a column vector; work with a flat 1-D copy
y_pred_binary = y_ann_preds.flatten()

# Positions in the test set where the predicted label differs from the true one
misclassified_indices = np.flatnonzero(y_pred_binary != y_finaltest.to_numpy())

print(f"Total misclassified samples: {len(misclassified_indices)}")
print("Indices of misclassified samples:", misclassified_indices)

# For every misclassified row: show the labels and features, then let LIME
# explain the ANN's prediction for that single row.
for i in misclassified_indices:
    print(f"\n--- Misclassified Sample #{i} ---")
    print(f"True Label     : {y_finaltest.iloc[i]}")
    print(f"Predicted Label: {y_pred_binary[i]}")
    print(f"Features (scaled):\n{X_finaltest_scaled[i]}")

    # Local explanation restricted to 10 features
    lime_exp = lime_explainer.explain_instance(
        data_row=X_finaltest_scaled[i],
        predict_fn=predict_proba_wrapper,
        num_features=10,
    )
    lime_exp.show_in_notebook()