In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# kept on Drive can be opened through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Applying SMOTE-ENN to the combined heart disease dataset and comparing model performance before and after applying SMOTE-ENN.


1.   Please download the combined heart disease dataset from https://www.kaggle.com/datasets/mfarhaannazirkhan/heart-dataset/data




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, jaccard_score, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import shap
from itertools import combinations

# Read the cleaned, merged heart-disease table from the mounted Drive folder.
cleaned_df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/heart_disease/cleaned_merged_heart_dataset.csv'
)

# Split the table into predictor columns and the label column.
features = cleaned_df.drop(columns=['target'])
target = cleaned_df['target']

# Standardize every predictor column and wrap the result in a DataFrame that
# keeps the original column names.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed further down, so test-set statistics feed into the scaling.
scaler = StandardScaler()
normalized_features = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
)

# Put the label column back next to the standardized predictors; the label
# index is reset so that both parts line up positionally.
cleaned_df = pd.concat(
    [normalized_features, target.reset_index(drop=True)],
    axis=1,
)

total_rows = len(cleaned_df)
print("Total number of rows:", total_rows)

# Keep only the columns that are not listed below (the remaining predictors
# plus 'target').
temp_df = cleaned_df.drop(
    columns=['sex', 'trestbps', 'chol', 'fbs', 'exang', 'slope', 'ca', 'restecg', 'age']
).copy()

# Independent working copy used by the rest of this cell.
cleaned_df_copy = temp_df.copy()

# Function to calculate derived features
def calculate_derived_features(df, feature_cols=None):
    """Build pairwise derived features from a set of numeric columns.

    For every unordered pair of columns in ``feature_cols`` the row-wise
    maximum and minimum of the two columns are taken and six features are
    derived from them (``idx`` is the 1-based number of the pair):

    * ``S1_P{idx}``: log10 of the maximum (non-positive values become 1e-6)
    * ``S2_P{idx}``: log10 of the minimum (non-positive values become 1e-6)
    * ``S3_P{idx}``: maximum - minimum
    * ``S4_P{idx}``: maximum * minimum
    * ``S5_P{idx}``: maximum + minimum
    * ``S6_P{idx}``: maximum / (minimum + 1e-6)

    Args:
        df: DataFrame containing the columns named in ``feature_cols``.
        feature_cols: Column names to combine pairwise.  Defaults to
            ``['cp', 'thalachh', 'oldpeak', 'thal']``.

    Returns:
        A DataFrame with six columns per pair that shares ``df``'s index, so
        it can be concatenated to ``df`` column-wise whatever index ``df``
        carries.
    """
    if feature_cols is None:
        feature_cols = ['cp', 'thalachh', 'oldpeak', 'thal']

    derived = {}
    for idx, (n1, n2) in enumerate(combinations(feature_cols, 2), 1):
        pair = df[[n1, n2]]
        # Work on plain arrays: mixing index-less arrays with indexed Series
        # in one frame would re-align the Series columns and yield NaN rows
        # whenever ``df`` does not have a default 0..n-1 index.
        hi = pair.max(axis=1).to_numpy()
        lo = pair.min(axis=1).to_numpy()
        # log10 is undefined for values <= 0, so those are replaced by a
        # small positive constant first.
        derived[f'S1_P{idx}'] = np.log10(np.where(hi > 0, hi, 1e-6))
        derived[f'S2_P{idx}'] = np.log10(np.where(lo > 0, lo, 1e-6))
        derived[f'S3_P{idx}'] = hi - lo
        derived[f'S4_P{idx}'] = hi * lo
        derived[f'S5_P{idx}'] = hi + lo
        # The small offset keeps the ratio finite when the minimum is 0.
        derived[f'S6_P{idx}'] = hi / (lo + 1e-6)

    # Attach the caller's index explicitly so every derived row stays paired
    # with the row it was computed from.
    return pd.DataFrame(derived, index=df.index)

# Compute the pairwise derived features and append them as extra columns.
combined_features = calculate_derived_features(cleaned_df_copy)
cleaned_df_copy = pd.concat([cleaned_df_copy, combined_features], axis=1)

# The base columns are kept alongside the derived ones in this cell.
# Optional variant (disabled): keep only the derived features.
#cleaned_df_copy = cleaned_df_copy.drop(['cp', 'thal', 'thalachh', 'oldpeak'], axis=1).copy()

# Feature matrix (every column except the label) and label vector.
y = cleaned_df_copy['target'].to_numpy()
X = cleaned_df_copy.drop(columns=['target']).to_numpy()

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize using statistics of the training part only.
# (Remark from the author: not needed if the data was already standardized
# earlier.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report how the classes are distributed in the training set before
# any resampling.
print("\nClass Distribution Before SMOTE-ENN (Training Set):")
class_counts_before = pd.Series(y_train).value_counts()
class_proportions_before = pd.Series(y_train).value_counts(normalize=True)
print("Counts:\n", class_counts_before)
print("Proportions:\n", class_proportions_before)

# Classifiers to compare, as (short name, estimator) pairs.
models = [
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('DT', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('RF', RandomForestClassifier(n_estimators=100)),
    ('ABC', AdaBoostClassifier(n_estimators=100)),
    ('GB', GradientBoostingClassifier()),
    ('LR', LogisticRegression()),
    ('SVM', SVC(probability=True)),
]

# Function to evaluate models and return metrics
def evaluate_models(models, X_train, X_test, y_train, y_test, title_prefix=""):
    """Fit every classifier plus a soft-voting ensemble and report test metrics.

    Each estimator in ``models`` is cloned and the clone is fitted, so the
    estimators passed in are left untouched.  Fitting them in place would
    make a later call that reuses the same ``models`` list silently re-train
    the models returned by an earlier call.

    Besides computing the metrics table, the function prints it and draws and
    saves a ROC plot and a precision-recall plot covering all models.

    Args:
        models: List of ``(name, estimator)`` pairs.  Every estimator must
            provide ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training features.
        X_test: Test features used for all metrics and plots.
        y_train: Training labels.
        y_test: Test labels.
        title_prefix: Text used in the printed header, in the plot titles
            and (lower-cased, spaces replaced by underscores) in the names
            of the saved PNG files.

    Returns:
        tuple: ``(fitted_models, metrics_df)``.  ``fitted_models`` maps each
        name to its fitted clone (the voting ensemble is not included);
        ``metrics_df`` has one row per model plus a final
        "Voting Classifier (Soft Voting)" row.
    """
    # Local import: ``clone`` is not among this file's top-level imports.
    from sklearn.base import clone

    fitted_models = {}
    rows = []
    # (legend label, predict_proba column 1) for every curve to be drawn.
    curve_inputs = []

    # Train and evaluate individual models
    for name, model in models:
        estimator = clone(model)
        estimator.fit(X_train, y_train)
        fitted_models[name] = estimator
        y_prob = estimator.predict_proba(X_test)[:, 1]
        rows.append(_score_row(name, y_test, estimator.predict(X_test), y_prob))
        curve_inputs.append((name, y_prob))

    # Soft-voting ensemble over the same estimators
    voting_clf = VotingClassifier(estimators=models, voting='soft')
    voting_clf.fit(X_train, y_train)
    y_prob_voting = voting_clf.predict_proba(X_test)[:, 1]
    rows.append(
        _score_row(
            "Voting Classifier (Soft Voting)",
            y_test,
            voting_clf.predict(X_test),
            y_prob_voting,
        )
    )
    curve_inputs.append(("Voting Classifier", y_prob_voting))

    metrics_df = pd.DataFrame(rows)
    print(f"\n{title_prefix} Metrics:")
    print(metrics_df)

    file_stem = title_prefix.lower().replace(" ", "_")

    # Plot ROC Curves
    plt.figure(figsize=(10, 8))
    for label, y_prob in curve_inputs:
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.plot(fpr, tpr, label=f'{label} (AUC = {auc(fpr, tpr):.2f})')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.title(f'{title_prefix} ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.savefig(f'{file_stem}_roc_curve.png', dpi=300)
    plt.show()

    # Plot Precision-Recall Curves
    plt.figure(figsize=(12, 8))
    for label, y_prob in curve_inputs:
        precision, recall, _ = precision_recall_curve(y_test, y_prob)
        ap_score = average_precision_score(y_test, y_prob)
        plt.plot(recall, precision, label=f'{label} (AP = {ap_score:.2f})')
    plt.title(f'{title_prefix} Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')
    plt.grid(True)
    plt.savefig(f'{file_stem}_pr_curve.png', dpi=300)
    plt.show()

    return fitted_models, metrics_df


def _score_row(name, y_true, y_pred, y_prob):
    """Return one metrics-table row for a model's test-set predictions."""
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1 Score": f1_score(y_true, y_pred, zero_division=0),
        "AUC-ROC": roc_auc_score(y_true, y_prob),
        "Jaccard Score": jaccard_score(y_true, y_pred),
    }

# Baseline: train and evaluate on the training data as-is.
print("Evaluating models on original data...")
original_models, original_metrics = evaluate_models(models, X_train_scaled, X_test_scaled, y_train, y_test, "Original Data")

# Resample the training set only with SMOTE-ENN; the test set is left
# untouched.
smote_enn = SMOTEENN(random_state=100)
X_train_smote_enn, y_train_smote_enn = smote_enn.fit_resample(X_train_scaled, y_train)

# Print class distribution after SMOTE-ENN
print("\nClass Distribution After SMOTE-ENN (Training Set):")
class_counts_after = pd.Series(y_train_smote_enn).value_counts()
class_proportions_after = pd.Series(y_train_smote_enn).value_counts(normalize=True)
print("Counts:\n", class_counts_after)
print("Proportions:\n", class_proportions_after)

# Same models, trained on the resampled data and scored on the same test set.
print("\nEvaluating models on SMOTE-ENN balanced data...")
smote_enn_models, smote_enn_metrics = evaluate_models(models, X_train_smote_enn, X_test_scaled, y_train_smote_enn, y_test, "SMOTE-ENN Balanced Data")

# Feature Importance and SHAP Analysis (for Random Forest on SMOTE-ENN data)
rf_model = smote_enn_models['RF']
feature_names = cleaned_df_copy.drop(['target'], axis=1).columns
importances = rf_model.feature_importances_
plt.figure(figsize=(10, 10))
plt.barh(feature_names, importances, color='green')
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Feature Importance (Random Forest - SMOTE-ENN)")
plt.savefig('feature_importance_smote_enn.png', dpi=300)
plt.show()

# SHAP Analysis
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test_scaled)
# Some SHAP releases return a list with one (samples, features) array per
# class instead of a single (samples, features, classes) array; stack the
# list so the per-class slicing below works for both.
if isinstance(shap_values, list):
    shap_values = np.stack(shap_values, axis=-1)
# Build the colormap once instead of on every loop iteration.
custom_cmap = LinearSegmentedColormap.from_list("custom_cmap", ["green", "blue"])
for class_idx in range(shap_values.shape[2]):
    print(f"SHAP summary for Class {class_idx} (SMOTE-ENN)")
    # The bar plot is coloured through `color`; `cmap` is not used by it.
    shap.summary_plot(shap_values[:, :, class_idx], X_test_scaled, feature_names=feature_names, plot_type="bar", color=custom_cmap(0.5))


Applying SMOTE to the combined heart disease dataset and comparing model performance before and after applying SMOTE for data balancing.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, jaccard_score, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import shap
from itertools import combinations

# Read the cleaned, merged heart-disease table from the mounted Drive folder.
cleaned_df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/heart_disease/cleaned_merged_heart_dataset.csv'
)

# Split the table into predictor columns and the label column.
features = cleaned_df.drop(columns=['target'])
target = cleaned_df['target']

# Standardize every predictor column and wrap the result in a DataFrame that
# keeps the original column names.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed further down, so test-set statistics feed into the scaling.
scaler = StandardScaler()
normalized_features = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
)

# Put the label column back next to the standardized predictors; the label
# index is reset so that both parts line up positionally.
cleaned_df = pd.concat(
    [normalized_features, target.reset_index(drop=True)],
    axis=1,
)

# Keep only the columns that are not listed below (the remaining predictors
# plus 'target').
temp_df = cleaned_df.drop(
    columns=['sex', 'trestbps', 'chol', 'fbs', 'exang', 'slope', 'ca', 'restecg', 'age']
).copy()

# Independent working copy used by the rest of this cell.
cleaned_df_copy = temp_df.copy()

# Function to calculate derived features
def calculate_derived_features(df, feature_cols=None):
    """Build pairwise derived features from a set of numeric columns.

    For every unordered pair of columns in ``feature_cols`` the row-wise
    maximum and minimum of the two columns are taken and six features are
    derived from them (``idx`` is the 1-based number of the pair):

    * ``S1_P{idx}``: log10 of the maximum (non-positive values become 1e-6)
    * ``S2_P{idx}``: log10 of the minimum (non-positive values become 1e-6)
    * ``S3_P{idx}``: maximum - minimum
    * ``S4_P{idx}``: maximum * minimum
    * ``S5_P{idx}``: maximum + minimum
    * ``S6_P{idx}``: maximum / (minimum + 1e-6)

    Args:
        df: DataFrame containing the columns named in ``feature_cols``.
        feature_cols: Column names to combine pairwise.  Defaults to
            ``['cp', 'thalachh', 'oldpeak', 'thal']``.

    Returns:
        A DataFrame with six columns per pair that shares ``df``'s index, so
        it can be concatenated to ``df`` column-wise whatever index ``df``
        carries.
    """
    if feature_cols is None:
        feature_cols = ['cp', 'thalachh', 'oldpeak', 'thal']

    derived = {}
    for idx, (n1, n2) in enumerate(combinations(feature_cols, 2), 1):
        pair = df[[n1, n2]]
        # Work on plain arrays: mixing index-less arrays with indexed Series
        # in one frame would re-align the Series columns and yield NaN rows
        # whenever ``df`` does not have a default 0..n-1 index.
        hi = pair.max(axis=1).to_numpy()
        lo = pair.min(axis=1).to_numpy()
        # log10 is undefined for values <= 0, so those are replaced by a
        # small positive constant first.
        derived[f'S1_P{idx}'] = np.log10(np.where(hi > 0, hi, 1e-6))
        derived[f'S2_P{idx}'] = np.log10(np.where(lo > 0, lo, 1e-6))
        derived[f'S3_P{idx}'] = hi - lo
        derived[f'S4_P{idx}'] = hi * lo
        derived[f'S5_P{idx}'] = hi + lo
        # The small offset keeps the ratio finite when the minimum is 0.
        derived[f'S6_P{idx}'] = hi / (lo + 1e-6)

    # Attach the caller's index explicitly so every derived row stays paired
    # with the row it was computed from.
    return pd.DataFrame(derived, index=df.index)

# Compute the pairwise derived features and append them as extra columns.
combined_features = calculate_derived_features(cleaned_df_copy)
cleaned_df_copy = pd.concat([cleaned_df_copy, combined_features], axis=1)

# Remove the four base columns so that only the derived features (and the
# label) remain.
cleaned_df_copy = cleaned_df_copy.drop(columns=['cp', 'thal', 'thalachh', 'oldpeak']).copy()

# Feature matrix (every column except the label) and label vector.
y = cleaned_df_copy['target'].to_numpy()
X = cleaned_df_copy.drop(columns=['target']).to_numpy()

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize using statistics of the training part only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report how the classes are distributed in the training set before
# any resampling.
print("\nClass Distribution Before SMOTE (Training Set):")
class_counts_before = pd.Series(y_train).value_counts()
class_proportions_before = pd.Series(y_train).value_counts(normalize=True)
print("Counts:\n", class_counts_before)
print("Proportions:\n", class_proportions_before)

# Classifiers to compare, as (short name, estimator) pairs.
models = [
    ('KNN', KNeighborsClassifier(n_neighbors=3)),
    ('DT', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('RF', RandomForestClassifier(n_estimators=100)),
    ('ABC', AdaBoostClassifier(n_estimators=100)),
    ('GB', GradientBoostingClassifier()),
    ('LR', LogisticRegression()),
    ('SVM', SVC(probability=True)),
]

# Function to evaluate models and return metrics
def evaluate_models(models, X_train, X_test, y_train, y_test, title_prefix=""):
    """Fit every classifier plus a soft-voting ensemble and report test metrics.

    Each estimator in ``models`` is cloned and the clone is fitted, so the
    estimators passed in are left untouched.  Fitting them in place would
    make a later call that reuses the same ``models`` list silently re-train
    the models returned by an earlier call.

    Besides computing the metrics table, the function prints it and draws and
    saves a ROC plot and a precision-recall plot covering all models.

    Args:
        models: List of ``(name, estimator)`` pairs.  Every estimator must
            provide ``fit``, ``predict`` and ``predict_proba``.
        X_train: Training features.
        X_test: Test features used for all metrics and plots.
        y_train: Training labels.
        y_test: Test labels.
        title_prefix: Text used in the printed header, in the plot titles
            and (lower-cased, spaces replaced by underscores) in the names
            of the saved PNG files.

    Returns:
        tuple: ``(fitted_models, metrics_df)``.  ``fitted_models`` maps each
        name to its fitted clone (the voting ensemble is not included);
        ``metrics_df`` has one row per model plus a final
        "Voting Classifier (Soft Voting)" row.
    """
    # Local import: ``clone`` is not among this file's top-level imports.
    from sklearn.base import clone

    fitted_models = {}
    rows = []
    # (legend label, predict_proba column 1) for every curve to be drawn.
    curve_inputs = []

    # Train and evaluate individual models
    for name, model in models:
        estimator = clone(model)
        estimator.fit(X_train, y_train)
        fitted_models[name] = estimator
        y_prob = estimator.predict_proba(X_test)[:, 1]
        rows.append(_score_row(name, y_test, estimator.predict(X_test), y_prob))
        curve_inputs.append((name, y_prob))

    # Soft-voting ensemble over the same estimators
    voting_clf = VotingClassifier(estimators=models, voting='soft')
    voting_clf.fit(X_train, y_train)
    y_prob_voting = voting_clf.predict_proba(X_test)[:, 1]
    rows.append(
        _score_row(
            "Voting Classifier (Soft Voting)",
            y_test,
            voting_clf.predict(X_test),
            y_prob_voting,
        )
    )
    curve_inputs.append(("Voting Classifier", y_prob_voting))

    metrics_df = pd.DataFrame(rows)
    print(f"\n{title_prefix} Metrics:")
    print(metrics_df)

    file_stem = title_prefix.lower().replace(" ", "_")

    # Plot ROC Curves
    plt.figure(figsize=(10, 8))
    for label, y_prob in curve_inputs:
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.plot(fpr, tpr, label=f'{label} (AUC = {auc(fpr, tpr):.2f})')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.title(f'{title_prefix} ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.savefig(f'{file_stem}_roc_curve.png', dpi=300)
    plt.show()

    # Plot Precision-Recall Curves
    plt.figure(figsize=(12, 8))
    for label, y_prob in curve_inputs:
        precision, recall, _ = precision_recall_curve(y_test, y_prob)
        ap_score = average_precision_score(y_test, y_prob)
        plt.plot(recall, precision, label=f'{label} (AP = {ap_score:.2f})')
    plt.title(f'{title_prefix} Precision-Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')
    plt.grid(True)
    plt.savefig(f'{file_stem}_pr_curve.png', dpi=300)
    plt.show()

    return fitted_models, metrics_df


def _score_row(name, y_true, y_pred, y_prob):
    """Return one metrics-table row for a model's test-set predictions."""
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1 Score": f1_score(y_true, y_pred, zero_division=0),
        "AUC-ROC": roc_auc_score(y_true, y_prob),
        "Jaccard Score": jaccard_score(y_true, y_pred),
    }

# Baseline: train and evaluate on the training data as-is.
print("Evaluating models on original data...")
original_models, original_metrics = evaluate_models(models, X_train_scaled, X_test_scaled, y_train, y_test, "Original Data")

# Oversample the training set only with SMOTE; the test set is left
# untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Print class distribution after SMOTE
print("\nClass Distribution After SMOTE (Training Set):")
class_counts_after = pd.Series(y_train_smote).value_counts()
class_proportions_after = pd.Series(y_train_smote).value_counts(normalize=True)
print("Counts:\n", class_counts_after)
print("Proportions:\n", class_proportions_after)

# Same models, trained on the resampled data and scored on the same test set.
print("\nEvaluating models on SMOTE balanced data...")
smote_models, smote_metrics = evaluate_models(models, X_train_smote, X_test_scaled, y_train_smote, y_test, "SMOTE Balanced Data")

# Feature Importance and SHAP Analysis (for Random Forest on SMOTE data)
rf_model = smote_models['RF']
feature_names = cleaned_df_copy.drop(['target'], axis=1).columns
importances = rf_model.feature_importances_
plt.figure(figsize=(10, 10))
plt.barh(feature_names, importances, color='green')
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Feature Importance (Random Forest - SMOTE)")
plt.savefig('feature_importance_smote.png', dpi=300)
plt.show()

# SHAP Analysis
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test_scaled)
# Some SHAP releases return a list with one (samples, features) array per
# class instead of a single (samples, features, classes) array; stack the
# list so the per-class slicing below works for both.
if isinstance(shap_values, list):
    shap_values = np.stack(shap_values, axis=-1)
# Build the colormap once instead of on every loop iteration.
custom_cmap = LinearSegmentedColormap.from_list("custom_cmap", ["green", "blue"])
for class_idx in range(shap_values.shape[2]):
    print(f"SHAP summary for Class {class_idx} (SMOTE)")
    # The bar plot is coloured through `color`; `cmap` is not used by it.
    shap.summary_plot(shap_values[:, :, class_idx], X_test_scaled, feature_names=feature_names, plot_type="bar", color=custom_cmap(0.5))
