# 1. Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as  sns

In [None]:
# Read the TPA dataset from the working directory, then preview its last five rows.
viscoe_df = pd.read_csv(filepath_or_buffer='TPA_Datasets.csv')
viscoe_df.tail(n=5)

In [None]:
# Bar chart of how many rows carry each TPA_Label value.
sns.countplot(x='TPA_Label', data=viscoe_df)

In [None]:
viscoe_df.columns  # bare expression in the middle of the cell: evaluated but not displayed

# Model inputs. The trailing space in 'LDL ' is part of the column key used for selection.
selected_columns = [
    'beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose',
]
feature_df = viscoe_df[selected_columns]

# Target (dependent variable) and feature matrix (independent variables) as NumPy arrays.
y = np.asarray(viscoe_df['TPA_Label'])
X = np.asarray(feature_df)

print(X)


In [None]:
import numpy as np
from sklearn.impute import KNNImputer

# Fill missing feature values with a 6-neighbour KNN imputer (uniform weights).
# NOTE(review): the imputer is fitted on the whole dataset before any CV split, so rows
# that later end up in a test fold contribute to the imputed values - confirm this is acceptable.
imputer = KNNImputer(n_neighbors=6, weights="uniform")
X_imputed = imputer.fit_transform(X)

# Only the feature matrix X is rebound. feature_df is deliberately left as the DataFrame
# selected earlier, so the name keeps one type and its column labels stay available.
X = X_imputed

print("X values after KNN imputation:")
print(X)

In [None]:
# Class distribution of the target before any oversampling.
unique_labels, label_counts = np.unique(y, return_counts=True)
print("Label counts before resampling:")
for position in range(len(unique_labels)):
    print("Label {}: {}".format(unique_labels[position], label_counts[position]))

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE (fixed seed) to obtain the arrays used by every model below.
# NOTE(review): resampling happens before the cross-validation splits are drawn - confirm
# that synthetic samples appearing in test folds is intended.
smote = SMOTE(random_state=6)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Class distribution of the target after oversampling.
unique_labels_resampled, label_counts_resampled = np.unique(y_resampled, return_counts=True)
print("\nLabel counts after resampling:")
for position in range(len(unique_labels_resampled)):
    print("Label {}: {}".format(unique_labels_resampled[position],
                                label_counts_resampled[position]))

# 2. Train and test with 8 classifiers

1. Stratified K-fold CV  &  RANDOM FOREST CLASSIFIER 

In [None]:
from statistics import mean, stdev
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation of a random forest on the min-max scaled,
# SMOTE-resampled data.
# NOTE(review): the scaler is fitted on all rows before splitting - confirm this is acceptable.
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)

# Create RF classifier object.
rf_classifier = RandomForestClassifier(random_state=42)

# Create StratifiedKFold object.
skf_rf_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []

# The splitter must see the same arrays that are indexed inside the loop. Splitting the
# pre-SMOTE X / y while indexing X_scaled would score the forest on a different sample
# set from the other classifiers and from the ROC loop further down.
for train_index, test_index in skf_rf_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    rf_classifier.fit(X_train_fold, y_train_fold)
    lst_accu_stratified.append(rf_classifier.score(X_test_fold, y_test_fold))

# Print the output.
print('List of possible accuracy:', lst_accu_stratified)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(lst_accu_stratified)*100, '%')
print('\nMinimum Accuracy:',
      min(lst_accu_stratified)*100, '%')
print('\nOverall Accuracy:',
      mean(lst_accu_stratified)*100, '%')
print('\nStandard Deviation is:', stdev(lst_accu_stratified))

# RANDOM FOREST - FEATURE IMPORTANCE

# Importances reported by rf_classifier in its current fitted state
# (the fit left over from the last cross-validation fold above).
importance_rf_classifier = rf_classifier.feature_importances_

# One line per feature index with its score.
for position in range(len(importance_rf_classifier)):
    print('Feature: %0d, Score: %.5f' % (position, importance_rf_classifier[position]))

feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']

# Bar chart: one bar per feature, labelled with the feature names.
bar_positions = range(len(importance_rf_classifier))
plt.figure(figsize=(8, 6))
plt.bar(bar_positions, importance_rf_classifier)
plt.xticks(bar_positions, feature_names, rotation='vertical')
plt.title("RF - Feature Importance")
plt.ylabel('Importance')
plt.xlabel('Features')
plt.show()


# ROC CURVE for Random Forest Classifier

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import auc, RocCurveDisplay

# ROC analysis for the random forest: refit on each CV fold, draw that fold's curve,
# then summarise the folds with a mean curve and a +/- 1 standard deviation band.
mean_fpr_rf = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train_rows, test_rows) in enumerate(skf_rf_classifier.split(X_scaled, y_resampled)):
    rf_classifier.fit(X_scaled[train_rows], y_resampled[train_rows])
    # The chance-level diagonal is requested only together with the final fold.
    is_last_fold = fold == skf_rf_classifier.n_splits - 1
    fold_display = RocCurveDisplay.from_estimator(
        rf_classifier, X_scaled[test_rows], y_resampled[test_rows],
        name=f"Fold {fold + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=is_last_fold,
    )
    tpr_on_grid = np.interp(mean_fpr_rf, fold_display.fpr, fold_display.tpr)
    tpr_on_grid[0] = 0.0  # force the interpolated curve through (0, 0)
    tprs.append(tpr_on_grid)
    aucs.append(fold_display.roc_auc)

tpr_matrix = np.asarray(tprs)  # one row per fold
mean_tpr_rf = tpr_matrix.mean(axis=0)
mean_tpr_rf[-1] = 1.0  # force the mean curve through (1, 1)
mean_auc_rf = auc(mean_fpr_rf, mean_tpr_rf)
std_auc_rf = np.std(aucs)

mean_curve_label = r"Mean ROC - RF (AUC_RF = %0.2f $\pm$ %0.2f)" % (mean_auc_rf, std_auc_rf)
ax.plot(mean_fpr_rf, mean_tpr_rf, color="b", label=mean_curve_label, lw=2, alpha=0.8)

# Shaded band: mean TPR +/- one standard deviation across folds, clipped to [0, 1].
std_tpr = tpr_matrix.std(axis=0)
tprs_upper = np.minimum(mean_tpr_rf + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_rf - std_tpr, 0)
ax.fill_between(mean_fpr_rf, tprs_lower, tprs_upper,
                color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.")

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Mean ROC curve - RF")
ax.axis("square")
ax.legend(loc="lower right")
plt.show()

2. Stratified K-fold CV  &  K-nearest Neighbor CLASSIFIER 

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from statistics import mean, stdev
import matplotlib.pyplot as plt
import numpy as np

# 5-fold stratified cross-validation of a 6-nearest-neighbour classifier on the min-max
# scaled, SMOTE-resampled data, plus a permutation-based feature importance estimate:
# for each fold, every column of the test fold is shuffled in turn and the resulting drop
# in accuracy is accumulated per feature.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
knn_classifier = KNeighborsClassifier(n_neighbors=6)
skf_knn_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
feature_importance_scores = {}
# Seeded generator so the column shuffles, and therefore the importances, are repeatable.
perm_rng = np.random.default_rng(42)

for train_index, test_index in skf_knn_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    knn_classifier.fit(X_train_fold, y_train_fold)
    accu_score = knn_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    for i in range(X_scaled.shape[1]):
        perturbed_X_test = X_test_fold.copy()
        perturbed_X_test[:, i] = perm_rng.permutation(perturbed_X_test[:, i])
        perturbed_accu_score = knn_classifier.score(perturbed_X_test, y_test_fold)
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + (accu_score - perturbed_accu_score)

print('List of possible accuracy:', lst_accu_stratified)
if lst_accu_stratified:
    print('\nMaximum Accuracy That can be obtained from this model is:', max(lst_accu_stratified)*100, '%')
    print('\nMinimum Accuracy:', min(lst_accu_stratified)*100, '%')
    print('\nOverall Accuracy:', mean(lst_accu_stratified)*100, '%')
    print('\nStandard Deviation is:', stdev(lst_accu_stratified))

    # Normalise once, by the sum of absolute scores. Accuracy drops can be negative, so a
    # signed total could be zero (division error) or negative (flipping every sign).
    # When all scores are non-negative this equals dividing by the plain sum.
    total_score = sum(abs(value) for value in feature_importance_scores.values())
    if total_score:
        feature_importance_scores = {key: value / total_score
                                     for key, value in feature_importance_scores.items()}

    # Report this model's scores (not the random forest importances from the earlier cell).
    print("\nFeature Importance Scores:")
    for i, v in feature_importance_scores.items():
        print('Feature: %0d, Score: %.5f' % (i, v))

    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(list(feature_importance_scores.keys()), list(feature_importance_scores.values()))
    plt.title("KNN - Feature Importance")
    plt.xticks(range(len(feature_names)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# ROC-AUC
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import auc, RocCurveDisplay

# ROC analysis for KNN: refit on each CV fold, draw that fold's curve, then summarise
# the folds with a mean curve and a +/- 1 standard deviation band.
mean_fpr_knn = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train_rows, test_rows) in enumerate(skf_knn_classifier.split(X_scaled, y_resampled)):
    knn_classifier.fit(X_scaled[train_rows], y_resampled[train_rows])
    # The chance-level diagonal is requested only together with the final fold.
    is_last_fold = fold == skf_knn_classifier.n_splits - 1
    fold_display = RocCurveDisplay.from_estimator(
        knn_classifier, X_scaled[test_rows], y_resampled[test_rows],
        name=f"Fold {fold + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=is_last_fold,
    )
    tpr_on_grid = np.interp(mean_fpr_knn, fold_display.fpr, fold_display.tpr)
    tpr_on_grid[0] = 0.0  # force the interpolated curve through (0, 0)
    tprs.append(tpr_on_grid)
    aucs.append(fold_display.roc_auc)

tpr_matrix = np.asarray(tprs)  # one row per fold
mean_tpr_knn = tpr_matrix.mean(axis=0)
mean_tpr_knn[-1] = 1.0  # force the mean curve through (1, 1)
mean_auc_knn = auc(mean_fpr_knn, mean_tpr_knn)
std_auc_knn = np.std(aucs)

mean_curve_label = r"Mean ROC - KNN (AUC_KNN = %0.2f $\pm$ %0.2f)" % (mean_auc_knn, std_auc_knn)
ax.plot(mean_fpr_knn, mean_tpr_knn, color="b", label=mean_curve_label, lw=2, alpha=0.8)

# Shaded band: mean TPR +/- one standard deviation across folds, clipped to [0, 1].
std_tpr = tpr_matrix.std(axis=0)
tprs_upper = np.minimum(mean_tpr_knn + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_knn - std_tpr, 0)
ax.fill_between(mean_fpr_knn, tprs_lower, tprs_upper,
                color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.")

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Mean ROC curve - KNN")
ax.axis("square")
ax.legend(loc="lower right")
plt.show()


3. Stratified K-fold CV  &  Support Vector Machine CLASSIFIER 

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from statistics import mean, stdev
# 5-fold stratified cross-validation of a linear-kernel SVM (C=1) on the min-max scaled,
# SMOTE-resampled data. Feature importance is derived from the fitted linear coefficients.
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
svm_classifier = SVC(kernel='linear', C=1)
skf_svm_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
feature_importance_scores = {}

for train_index, test_index in skf_svm_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    svm_classifier.fit(X_train_fold, y_train_fold)
    accu_score = svm_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    # Accumulate the coefficients of every fold; rebinding the dict here would keep
    # only the final fold's values.
    # NOTE(review): coef_[0] is the first row only - with more than two classes that is a
    # single pairwise classifier; confirm the target is binary.
    for i, coefficient in enumerate(svm_classifier.coef_[0]):
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + coefficient

# Coefficients are signed, so scale by the sum of absolute values: a signed total can be
# near zero or negative, which would inflate the bars or flip their signs.
total_score = sum(abs(value) for value in feature_importance_scores.values())
if total_score:
    feature_importance_scores = {key: value / total_score
                                 for key, value in feature_importance_scores.items()}

print('List of possible accuracy:', lst_accu_stratified)
if lst_accu_stratified:
    print('\nMaximum Accuracy That can be obtained from this model is:', max(lst_accu_stratified) * 100, '%')
    print('\nMinimum Accuracy:', min(lst_accu_stratified) * 100, '%')
    print('\nOverall Accuracy:', mean(lst_accu_stratified) * 100, '%')
    print('\nStandard Deviation is:', stdev(lst_accu_stratified))

    # Report this model's scores (not the random forest importances from the earlier cell).
    print("\nFeature Importance Scores:")
    for i, v in feature_importance_scores.items():
        print('Feature: %0d, Score: %.5f' % (i, v))

    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(list(feature_importance_scores.keys()), list(feature_importance_scores.values()))
    plt.title("SVM - Feature Importance")
    plt.xticks(range(len(feature_names)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# ROC-AUC

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import auc, RocCurveDisplay
# ROC analysis for the SVM: refit on each CV fold, draw that fold's curve, then summarise
# the folds with a mean curve and a +/- 1 standard deviation band.
mean_fpr_svm = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train_rows, test_rows) in enumerate(skf_svm_classifier.split(X_scaled, y_resampled)):
    svm_classifier.fit(X_scaled[train_rows], y_resampled[train_rows])
    # The chance-level diagonal is requested only together with the final fold.
    is_last_fold = fold == skf_svm_classifier.n_splits - 1
    fold_display = RocCurveDisplay.from_estimator(
        svm_classifier, X_scaled[test_rows], y_resampled[test_rows],
        name=f"Fold {fold + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=is_last_fold,
    )
    tpr_on_grid = np.interp(mean_fpr_svm, fold_display.fpr, fold_display.tpr)
    tpr_on_grid[0] = 0.0  # force the interpolated curve through (0, 0)
    tprs.append(tpr_on_grid)
    aucs.append(fold_display.roc_auc)

tpr_matrix = np.asarray(tprs)  # one row per fold
mean_tpr_svm = tpr_matrix.mean(axis=0)
mean_tpr_svm[-1] = 1.0  # force the mean curve through (1, 1)
mean_auc_svm = auc(mean_fpr_svm, mean_tpr_svm)
std_auc_svm = np.std(aucs)

mean_curve_label = r"Mean ROC - SVM (AUC_SVM = %0.2f $\pm$ %0.2f)" % (mean_auc_svm, std_auc_svm)
ax.plot(mean_fpr_svm, mean_tpr_svm, color="b", label=mean_curve_label, lw=2, alpha=0.8)

# Shaded band: mean TPR +/- one standard deviation across folds, clipped to [0, 1].
std_tpr = tpr_matrix.std(axis=0)
tprs_upper = np.minimum(mean_tpr_svm + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_svm - std_tpr, 0)
ax.fill_between(mean_fpr_svm, tprs_lower, tprs_upper,
                color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.")

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Mean ROC curve - SVM")
ax.axis("square")
ax.legend(loc="lower right")
plt.show()


4. Stratified K-fold CV  &  Decision Tree CLASSIFIER 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from statistics import mean, stdev
# 5-fold stratified cross-validation of a decision tree on the min-max scaled,
# SMOTE-resampled data. The tree's feature_importances_ are summed over the folds.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
dt_classifier = DecisionTreeClassifier(random_state=42)
skf_dt_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
feature_importance_scores = {}

# Train and score on X_scaled, the same matrix the ROC loop below uses, so that the
# accuracy figures and the ROC curves describe the same inputs.
for train_index, test_index in skf_dt_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    dt_classifier.fit(X_train_fold, y_train_fold)
    accu_score = dt_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    feature_importance_scores_fold = dt_classifier.feature_importances_
    for i, score in enumerate(feature_importance_scores_fold):
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + score

# Rescale the summed importances to total 1; skipped when the total is zero so the
# division cannot fail.
total_score = sum(feature_importance_scores.values())
if total_score:
    feature_importance_scores = {key: value / total_score
                                 for key, value in feature_importance_scores.items()}

print('List of possible accuracy:', lst_accu_stratified)
if lst_accu_stratified:
    print('Maximum Accuracy That can be obtained from this model is:', max(lst_accu_stratified)*100, '%')
    print('Minimum Accuracy:', min(lst_accu_stratified)*100, '%')
    print('Overall Accuracy:', mean(lst_accu_stratified)*100, '%')
    print('Standard Deviation is:', stdev(lst_accu_stratified))

    # Report this model's scores (not the random forest importances from the earlier cell).
    print("Feature Importance Scores:")
    for i, v in feature_importance_scores.items():
        print('Feature: %0d, Score: %.5f' % (i, v))

    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(list(feature_importance_scores.keys()), list(feature_importance_scores.values()))
    plt.title("DT - Feature Importance")
    plt.xticks(range(len(feature_names)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# ROC-AUC

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import auc, RocCurveDisplay
# ROC analysis for the decision tree: refit on each CV fold, draw that fold's curve, then
# summarise the folds with a mean curve and a +/- 1 standard deviation band.
mean_fpr_dt = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train_rows, test_rows) in enumerate(skf_dt_classifier.split(X_scaled, y_resampled)):
    dt_classifier.fit(X_scaled[train_rows], y_resampled[train_rows])
    # The chance-level diagonal is requested only together with the final fold.
    is_last_fold = fold == skf_dt_classifier.n_splits - 1
    fold_display = RocCurveDisplay.from_estimator(
        dt_classifier, X_scaled[test_rows], y_resampled[test_rows],
        name=f"Fold {fold + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=is_last_fold,
    )
    tpr_on_grid = np.interp(mean_fpr_dt, fold_display.fpr, fold_display.tpr)
    tpr_on_grid[0] = 0.0  # force the interpolated curve through (0, 0)
    tprs.append(tpr_on_grid)
    aucs.append(fold_display.roc_auc)

tpr_matrix = np.asarray(tprs)  # one row per fold
mean_tpr_dt = tpr_matrix.mean(axis=0)
mean_tpr_dt[-1] = 1.0  # force the mean curve through (1, 1)
mean_auc_dt = auc(mean_fpr_dt, mean_tpr_dt)
std_auc_dt = np.std(aucs)

mean_curve_label = r"Mean ROC - DT (AUC_DT = %0.2f $\pm$ %0.2f)" % (mean_auc_dt, std_auc_dt)
ax.plot(mean_fpr_dt, mean_tpr_dt, color="b", label=mean_curve_label, lw=2, alpha=0.8)

# Shaded band: mean TPR +/- one standard deviation across folds, clipped to [0, 1].
std_tpr = tpr_matrix.std(axis=0)
tprs_upper = np.minimum(mean_tpr_dt + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_dt - std_tpr, 0)
ax.fill_between(mean_fpr_dt, tprs_lower, tprs_upper,
                color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.")

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Mean ROC curve - DT")
ax.axis("square")
ax.legend(loc="lower right")
plt.show()


5. Stratified K-fold CV  &  RF CLASSIFIER + Bagging Classifier (RFBM)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.model_selection import StratifiedKFold
from statistics import mean, stdev
# 5-fold stratified cross-validation of a bagging ensemble (10 estimators) of random
# forests on the min-max scaled, SMOTE-resampled data, plus a permutation-based feature
# importance estimate: each test-fold column is shuffled in turn and the resulting drop
# in accuracy is accumulated per feature.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
base_rf_classifier = RandomForestClassifier(random_state=42)
rfbm_classifier = BaggingClassifier(base_rf_classifier, n_estimators=10, random_state=42)
skf_rfbm_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
feature_importance_scores = {}
# Seeded generator so the column shuffles, and therefore the importances, are repeatable.
perm_rng = np.random.default_rng(42)

for train_index, test_index in skf_rfbm_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    rfbm_classifier.fit(X_train_fold, y_train_fold)
    accu_score = rfbm_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    for i in range(X_scaled.shape[1]):
        perturbed_X_test = X_test_fold.copy()
        perturbed_X_test[:, i] = perm_rng.permutation(perturbed_X_test[:, i])
        perturbed_accu_score = rfbm_classifier.score(perturbed_X_test, y_test_fold)
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + (accu_score - perturbed_accu_score)

print('List of possible accuracy:', lst_accu_stratified)
if lst_accu_stratified:
    print('\nMaximum Accuracy That can be obtained from this model is:', max(lst_accu_stratified)*100, '%')
    print('\nMinimum Accuracy:', min(lst_accu_stratified)*100, '%')
    print('\nOverall Accuracy:', mean(lst_accu_stratified)*100, '%')
    print('\nStandard Deviation is:', stdev(lst_accu_stratified))

    # Normalise by the sum of absolute scores. Accuracy drops can be negative, so a signed
    # total could be zero (division error) or negative (flipping every sign).
    total_score = sum(abs(value) for value in feature_importance_scores.values())
    if total_score:
        feature_importance_scores = {key: value / total_score
                                     for key, value in feature_importance_scores.items()}

    # Report this model's scores (not the random forest importances from the earlier cell).
    print("\nFeature Importance Scores:")
    for i, v in feature_importance_scores.items():
        print('Feature: %0d, Score: %.5f' % (i, v))

    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(list(feature_importance_scores.keys()), list(feature_importance_scores.values()))
    plt.title("RFBM - Feature Importance")
    plt.xticks(range(len(feature_names)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# ROC-AUC
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import auc, RocCurveDisplay
# ROC analysis for the bagged random forest: refit on each CV fold, draw that fold's curve,
# then summarise the folds with a mean curve and a +/- 1 standard deviation band.
mean_fpr_rfbm = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train_rows, test_rows) in enumerate(skf_rfbm_classifier.split(X_scaled, y_resampled)):
    rfbm_classifier.fit(X_scaled[train_rows], y_resampled[train_rows])
    # The chance-level diagonal is requested only together with the final fold.
    is_last_fold = fold == skf_rfbm_classifier.n_splits - 1
    fold_display = RocCurveDisplay.from_estimator(
        rfbm_classifier, X_scaled[test_rows], y_resampled[test_rows],
        name=f"Fold {fold + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=is_last_fold,
    )
    tpr_on_grid = np.interp(mean_fpr_rfbm, fold_display.fpr, fold_display.tpr)
    tpr_on_grid[0] = 0.0  # force the interpolated curve through (0, 0)
    tprs.append(tpr_on_grid)
    aucs.append(fold_display.roc_auc)

tpr_matrix = np.asarray(tprs)  # one row per fold
mean_tpr_rfbm = tpr_matrix.mean(axis=0)
mean_tpr_rfbm[-1] = 1.0  # force the mean curve through (1, 1)
mean_auc_rfbm = auc(mean_fpr_rfbm, mean_tpr_rfbm)
std_auc_rfbm = np.std(aucs)

mean_curve_label = r"Mean ROC - RFBM (AUC_RFBM = %0.2f $\pm$ %0.2f)" % (mean_auc_rfbm, std_auc_rfbm)
ax.plot(mean_fpr_rfbm, mean_tpr_rfbm, color="b", label=mean_curve_label, lw=2, alpha=0.8)

# Shaded band: mean TPR +/- one standard deviation across folds, clipped to [0, 1].
std_tpr = tpr_matrix.std(axis=0)
tprs_upper = np.minimum(mean_tpr_rfbm + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_rfbm - std_tpr, 0)
ax.fill_between(mean_fpr_rfbm, tprs_lower, tprs_upper,
                color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.")

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Mean ROC curve - RFBM")
ax.axis("square")
ax.legend(loc="lower right")
plt.show()


6. Stratified K-fold CV  &  KNN CLASSIFIER + Bagging Classifier (KNNBM)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import StratifiedKFold
from statistics import mean, stdev

# 5-fold stratified cross-validation of a bagging ensemble (10 estimators) of
# 6-nearest-neighbour classifiers on the min-max scaled, SMOTE-resampled data, plus a
# permutation-based feature importance estimate: each test-fold column is shuffled in
# turn and the resulting drop in accuracy is accumulated per feature.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
base_knn_classifier = KNeighborsClassifier(n_neighbors=6)
knnbm_classifier = BaggingClassifier(base_knn_classifier, n_estimators=10, random_state=42)
skf_knnbm_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
feature_importance_scores = {}
# Seeded generator so the column shuffles, and therefore the importances, are repeatable.
perm_rng = np.random.default_rng(42)

for train_index, test_index in skf_knnbm_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    knnbm_classifier.fit(X_train_fold, y_train_fold)
    accu_score = knnbm_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    for i in range(X_scaled.shape[1]):
        perturbed_X_test = X_test_fold.copy()
        perturbed_X_test[:, i] = perm_rng.permutation(perturbed_X_test[:, i])
        perturbed_accu_score = knnbm_classifier.score(perturbed_X_test, y_test_fold)
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + (accu_score - perturbed_accu_score)

print('List of possible accuracy:', lst_accu_stratified)
if lst_accu_stratified:
    print('\nMaximum Accuracy That can be obtained from this model is:',
          max(lst_accu_stratified)*100, '%')
    print('\nMinimum Accuracy:',
          min(lst_accu_stratified)*100, '%')
    print('\nOverall Accuracy:',
          mean(lst_accu_stratified)*100, '%')
    print('\nStandard Deviation is:', stdev(lst_accu_stratified))

    # Normalise by the sum of absolute scores. Accuracy drops can be negative, so a signed
    # total could be zero (division error) or negative (flipping every sign).
    total_score = sum(abs(value) for value in feature_importance_scores.values())
    if total_score:
        feature_importance_scores = {key: value / total_score
                                     for key, value in feature_importance_scores.items()}

    # Report this model's scores (not the random forest importances from the earlier cell).
    print("\nFeature Importance Scores:")
    for i, v in feature_importance_scores.items():
        print('Feature: %0d, Score: %.5f' % (i, v))

    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(list(feature_importance_scores.keys()), list(feature_importance_scores.values()))
    plt.title("KNNBM - Feature Importance")
    plt.xticks(range(len(feature_names)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# ROC-AUC

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import auc, RocCurveDisplay

# ROC analysis for the bagged KNN: refit on each CV fold, draw that fold's curve, then
# summarise the folds with a mean curve and a +/- 1 standard deviation band.
mean_fpr_knnbm = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train_rows, test_rows) in enumerate(skf_knnbm_classifier.split(X_scaled, y_resampled)):
    knnbm_classifier.fit(X_scaled[train_rows], y_resampled[train_rows])
    # The chance-level diagonal is requested only together with the final fold.
    is_last_fold = fold == skf_knnbm_classifier.n_splits - 1
    fold_display = RocCurveDisplay.from_estimator(
        knnbm_classifier, X_scaled[test_rows], y_resampled[test_rows],
        name=f"Fold {fold + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=is_last_fold,
    )
    tpr_on_grid = np.interp(mean_fpr_knnbm, fold_display.fpr, fold_display.tpr)
    tpr_on_grid[0] = 0.0  # force the interpolated curve through (0, 0)
    tprs.append(tpr_on_grid)
    aucs.append(fold_display.roc_auc)

tpr_matrix = np.asarray(tprs)  # one row per fold
mean_tpr_knnbm = tpr_matrix.mean(axis=0)
mean_tpr_knnbm[-1] = 1.0  # force the mean curve through (1, 1)
mean_auc_knnbm = auc(mean_fpr_knnbm, mean_tpr_knnbm)
std_auc_knnbm = np.std(aucs)

mean_curve_label = r"Mean ROC - KNNBM (AUC_KNNBM = %0.2f $\pm$ %0.2f)" % (mean_auc_knnbm, std_auc_knnbm)
ax.plot(mean_fpr_knnbm, mean_tpr_knnbm, color="b", label=mean_curve_label, lw=2, alpha=0.8)

# Shaded band: mean TPR +/- one standard deviation across folds, clipped to [0, 1].
std_tpr = tpr_matrix.std(axis=0)
tprs_upper = np.minimum(mean_tpr_knnbm + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_knnbm - std_tpr, 0)
ax.fill_between(mean_fpr_knnbm, tprs_lower, tprs_upper,
                color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.")

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Mean ROC curve - KNNBM")
ax.axis("square")
ax.legend(loc="lower right")
plt.show()

7. Stratified K-fold CV  &  SVM CLASSIFIER + Bagging Classifier (SVMBM)

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from statistics import mean, stdev
# 5-fold stratified cross-validation of a bagging ensemble (10 estimators) of linear-kernel
# SVMs on the min-max scaled, SMOTE-resampled data, plus a permutation-based feature
# importance estimate: each test-fold column is shuffled in turn and the resulting drop
# in accuracy is accumulated per feature.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
base_svm_classifier = SVC(kernel='linear', random_state=42)
svmbm_classifier = BaggingClassifier(base_svm_classifier, n_estimators=10, random_state=42)
skf_svmbm_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
feature_importance_scores = {}
# Seeded generator so the column shuffles, and therefore the importances, are repeatable.
perm_rng = np.random.default_rng(42)

for train_index, test_index in skf_svmbm_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    svmbm_classifier.fit(X_train_fold, y_train_fold)
    accu_score = svmbm_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    for i in range(X_scaled.shape[1]):
        perturbed_X_test = X_test_fold.copy()
        perturbed_X_test[:, i] = perm_rng.permutation(perturbed_X_test[:, i])
        perturbed_accu_score = svmbm_classifier.score(perturbed_X_test, y_test_fold)
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + (accu_score - perturbed_accu_score)

print('List of possible accuracy:', lst_accu_stratified)
if lst_accu_stratified:
    print('\nMaximum Accuracy That can be obtained from this model is:', max(lst_accu_stratified)*100, '%')
    print('\nMinimum Accuracy:', min(lst_accu_stratified)*100, '%')
    print('\nOverall Accuracy:', mean(lst_accu_stratified)*100, '%')
    print('\nStandard Deviation is:', stdev(lst_accu_stratified))

    # Normalise by the sum of absolute scores. Accuracy drops can be negative, so a signed
    # total could be zero (division error) or negative (flipping every sign).
    total_score = sum(abs(value) for value in feature_importance_scores.values())
    if total_score:
        feature_importance_scores = {key: value / total_score
                                     for key, value in feature_importance_scores.items()}

    print("\nFeature Importance Scores:")
    for i, v in feature_importance_scores.items():
        print('Feature: %0d, Score: %.5f' % (i, v))

    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(list(feature_importance_scores.keys()), list(feature_importance_scores.values()))
    plt.title("SVMBM - Feature Importance")
    plt.xticks(range(len(feature_names)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# ROC-AUC

from statistics import mean, stdev
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import auc, RocCurveDisplay
# ROC analysis for the bagged SVM: refit on each CV fold, draw that fold's curve, then
# summarise the folds with a mean curve and a +/- 1 standard deviation band.
mean_fpr_svmbm = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train_rows, test_rows) in enumerate(skf_svmbm_classifier.split(X_scaled, y_resampled)):
    svmbm_classifier.fit(X_scaled[train_rows], y_resampled[train_rows])
    # The chance-level diagonal is requested only together with the final fold.
    is_last_fold = fold == skf_svmbm_classifier.n_splits - 1
    fold_display = RocCurveDisplay.from_estimator(
        svmbm_classifier, X_scaled[test_rows], y_resampled[test_rows],
        name=f"Fold {fold + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=is_last_fold,
    )
    tpr_on_grid = np.interp(mean_fpr_svmbm, fold_display.fpr, fold_display.tpr)
    tpr_on_grid[0] = 0.0  # force the interpolated curve through (0, 0)
    tprs.append(tpr_on_grid)
    aucs.append(fold_display.roc_auc)

tpr_matrix = np.asarray(tprs)  # one row per fold
mean_tpr_svmbm = tpr_matrix.mean(axis=0)
mean_tpr_svmbm[-1] = 1.0  # force the mean curve through (1, 1)
mean_auc_svmbm = auc(mean_fpr_svmbm, mean_tpr_svmbm)
std_auc_svmbm = np.std(aucs)

mean_curve_label = r"Mean ROC - SVMBM (AUC_SVMBM = %0.2f $\pm$ %0.2f)" % (mean_auc_svmbm, std_auc_svmbm)
ax.plot(mean_fpr_svmbm, mean_tpr_svmbm, color="b", label=mean_curve_label, lw=2, alpha=0.8)

# Shaded band: mean TPR +/- one standard deviation across folds, clipped to [0, 1].
std_tpr = tpr_matrix.std(axis=0)
tprs_upper = np.minimum(mean_tpr_svmbm + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_svmbm - std_tpr, 0)
ax.fill_between(mean_fpr_svmbm, tprs_lower, tprs_upper,
                color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.")

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Mean ROC curve - SVMBM")
ax.axis("square")
ax.legend(loc="lower right")
plt.show()


8. Stratified K-fold CV  &  Decision Tree CLASSIFIER + Bagging Classifier (DTBM)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from statistics import mean, stdev
# 5-fold stratified cross-validation of a bagging ensemble (10 estimators) of decision
# trees on the min-max scaled, SMOTE-resampled data, plus a permutation-based feature
# importance estimate: each test-fold column is shuffled in turn and the resulting drop
# in accuracy is accumulated per feature.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
base_dt_classifier = DecisionTreeClassifier(random_state=42)
dtbm_classifier = BaggingClassifier(base_dt_classifier, n_estimators=10, random_state=42)
skf_dtbm_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
feature_importance_scores = {}
# Seeded generator so the column shuffles, and therefore the importances, are repeatable.
perm_rng = np.random.default_rng(42)

for train_index, test_index in skf_dtbm_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    dtbm_classifier.fit(X_train_fold, y_train_fold)
    accu_score = dtbm_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    for i in range(X_scaled.shape[1]):
        perturbed_X_test = X_test_fold.copy()
        perturbed_X_test[:, i] = perm_rng.permutation(perturbed_X_test[:, i])
        perturbed_accu_score = dtbm_classifier.score(perturbed_X_test, y_test_fold)
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + (accu_score - perturbed_accu_score)

# Report the fold accuracies in the same format as the other classifier sections;
# they are collected above but were otherwise never shown for this model.
print('List of possible accuracy:', lst_accu_stratified)
if lst_accu_stratified:
    print('\nMaximum Accuracy That can be obtained from this model is:', max(lst_accu_stratified)*100, '%')
    print('\nMinimum Accuracy:', min(lst_accu_stratified)*100, '%')
    print('\nOverall Accuracy:', mean(lst_accu_stratified)*100, '%')
    print('\nStandard Deviation is:', stdev(lst_accu_stratified))

    # Normalise by the sum of absolute scores. Accuracy drops can be negative, so a signed
    # total could be zero (division error) or negative (flipping every sign).
    total_score = sum(abs(value) for value in feature_importance_scores.values())
    if total_score:
        feature_importance_scores = {key: value / total_score
                                     for key, value in feature_importance_scores.items()}

    print("\nFeature Importance Scores:")
    for i, v in feature_importance_scores.items():
        print('Feature: %0d, Score: %.5f' % (i, v))

    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(list(feature_importance_scores.keys()), list(feature_importance_scores.values()))
    plt.title("DTBM - Feature Importance")
    plt.xticks(range(len(feature_names)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# ROC-AUC (DTBM)

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import RocCurveDisplay
# ROC analysis for the bagged decision tree: refit on each CV fold, draw that fold's curve,
# then summarise the folds with a mean curve and a +/- 1 standard deviation band.
mean_fpr_dtbm = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train_rows, test_rows) in enumerate(skf_dtbm_classifier.split(X_scaled, y_resampled)):
    dtbm_classifier.fit(X_scaled[train_rows], y_resampled[train_rows])
    # The chance-level diagonal is requested only together with the final fold.
    is_last_fold = fold == skf_dtbm_classifier.n_splits - 1
    fold_display = RocCurveDisplay.from_estimator(
        dtbm_classifier, X_scaled[test_rows], y_resampled[test_rows],
        name=f"Fold {fold + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=is_last_fold,
    )
    tpr_on_grid = np.interp(mean_fpr_dtbm, fold_display.fpr, fold_display.tpr)
    tpr_on_grid[0] = 0.0  # force the interpolated curve through (0, 0)
    tprs.append(tpr_on_grid)
    aucs.append(fold_display.roc_auc)

tpr_matrix = np.asarray(tprs)  # one row per fold
mean_tpr_dtbm = tpr_matrix.mean(axis=0)
mean_tpr_dtbm[-1] = 1.0  # force the mean curve through (1, 1)
mean_auc_dtbm = auc(mean_fpr_dtbm, mean_tpr_dtbm)
std_auc_dtbm = np.std(aucs)

mean_curve_label = r"Mean ROC - DTBM (AUC_DTBM = %0.2f $\pm$ %0.2f)" % (mean_auc_dtbm, std_auc_dtbm)
ax.plot(mean_fpr_dtbm, mean_tpr_dtbm, color="b", label=mean_curve_label, lw=2, alpha=0.8)

# Shaded band: mean TPR +/- one standard deviation across folds, clipped to [0, 1].
std_tpr = tpr_matrix.std(axis=0)
tprs_upper = np.minimum(mean_tpr_dtbm + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_dtbm - std_tpr, 0)
ax.fill_between(mean_fpr_dtbm, tprs_lower, tprs_upper,
                color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.")

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Mean ROC curve - DTBM")
ax.axis("square")
ax.legend(loc="lower right")
plt.show()


# 3. ROC-AUC comparison of all 8 classifiers

In [None]:

import matplotlib.pyplot as plt
# Overlay the mean ROC curves of all eight models on one figure. Base models are drawn
# dashed; their bagged counterparts reuse the same colour with the default line style.
plt.rcParams["font.family"] = "Times New Roman"

# (FPR grid, mean TPR, mean AUC, AUC std, legend name, colour, extra line kwargs)
roc_summaries = [
    (mean_fpr_rf, mean_tpr_rf, mean_auc_rf, std_auc_rf, "RF", "cornflowerblue", {"linestyle": "dashed"}),
    (mean_fpr_knn, mean_tpr_knn, mean_auc_knn, std_auc_knn, "KNN", "lightcoral", {"linestyle": "dashed"}),
    (mean_fpr_svm, mean_tpr_svm, mean_auc_svm, std_auc_svm, "SVM", "limegreen", {"linestyle": "dashed"}),
    (mean_fpr_dt, mean_tpr_dt, mean_auc_dt, std_auc_dt, "DT", "dimgrey", {"linestyle": "dashed"}),
    (mean_fpr_rfbm, mean_tpr_rfbm, mean_auc_rfbm, std_auc_rfbm, "RFBM", "cornflowerblue", {}),
    (mean_fpr_knnbm, mean_tpr_knnbm, mean_auc_knnbm, std_auc_knnbm, "KNNBM", "lightcoral", {}),
    (mean_fpr_svmbm, mean_tpr_svmbm, mean_auc_svmbm, std_auc_svmbm, "SVMBM", "limegreen", {}),
    (mean_fpr_dtbm, mean_tpr_dtbm, mean_auc_dtbm, std_auc_dtbm, "DTBM", "dimgrey", {}),
]

# Plot the mean ROC curve of every model.
plt.figure(figsize=(6, 5))
for fpr_grid, mean_tpr, mean_auc, std_auc, model_name, colour, line_kwargs in roc_summaries:
    plt.plot(
        fpr_grid,
        mean_tpr,
        color=colour,
        # Raw f-string: "\p" in a plain f-string is an invalid escape sequence and
        # triggers a warning; the raw prefix keeps the mathtext "$\pm$" intact.
        label=rf"ROC - {model_name} (AUC = {mean_auc:.2f} $\pm$ {std_auc:.2f})",
        **line_kwargs,
    )

# Reference lines: the diagonal of a random classifier and the top-left corner path
# of a perfect classifier.
plt.plot([0, 1], [0, 1], linestyle="-.", color="gray", label="Random Classifier")
plt.plot([0, 1], [1, 1], color='black', lw=1, linestyle=':', label='Perfect Classifier')
plt.plot([0, 0], [0, 1], color='black', lw=1, linestyle=':')

# Set graph properties
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False Positive Rate", fontsize=13)
plt.ylabel("True Positive Rate", fontsize=13)
plt.legend(loc="lower right", fontsize=9.8)

# Display the plot
plt.show()
