# 1. Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from imblearn.over_sampling import SMOTE
from statistics import mean, stdev
from sklearn import preprocessing
from sklearn.preprocessing import label_binarize, MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, auc, RocCurveDisplay

In [None]:
# Read the dataset from 'TPA_Datasets.csv' (path is relative to the working directory).
viscoe_df = pd.read_csv('TPA_Datasets.csv')
# Display the last rows as a quick check that the file was parsed.
viscoe_df.tail()

In [None]:
# Bar chart of the number of rows per TPA_Label value (class-balance check).
sns.countplot(data=viscoe_df, x='TPA_Label')  # one bar per label value

In [None]:
viscoe_df.columns  # not the last expression of the cell, so nothing is displayed

# Columns used as model inputs. 'LDL ' is spelled with a trailing space here;
# presumably that matches the CSV header — verify against the file.
feature_columns = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
feature_df = viscoe_df[feature_columns]

# Feature matrix (independent variables) and target vector (dependent variable)
# as NumPy arrays.
X = np.asarray(feature_df)
y = np.asarray(viscoe_df['TPA_Label'])

print(X)


In [None]:
# Fill missing feature values with KNNImputer (6 neighbours, uniform weights).
# NOTE(review): the imputer is fitted on every row of X; confirm that is intended
# given the cross-validation performed later.
nan = np.nan
imputer = KNNImputer(weights="uniform", n_neighbors=6)
X_imputed = imputer.fit_transform(X)

# From here on both X and feature_df refer to the imputed NumPy array
# (feature_df is no longer a DataFrame).
feature_df = X_imputed
X = feature_df

print("X values after KNN imputation:", X, sep="\n")

In [None]:
# Tally how many samples carry each label before any resampling is applied.
unique_labels, label_counts = np.unique(y, return_counts=True)
print("Label counts before resampling:")
for idx in range(len(unique_labels)):
    label, count = unique_labels[idx], label_counts[idx]
    print("Label {}: {}".format(label, count))

In [None]:
# Resample the imputed data with SMOTE (5 neighbours, fixed seed).
# NOTE(review): resampling is applied to all of X/y before any train/test split,
# so synthetic samples can end up in held-out folds later — confirm this is intended.
smote = SMOTE(k_neighbors=5, random_state=6)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Tally how many samples carry each label after resampling.
unique_labels_resampled, label_counts_resampled = np.unique(y_resampled, return_counts=True)
print("\nLabel counts after resampling:")
for idx in range(len(unique_labels_resampled)):
    label, count = unique_labels_resampled[idx], label_counts_resampled[idx]
    print("Label {}: {}".format(label, count))

# Train and test with 8 classifiers

## 1. Stratified k-fold CV & Random Forest classifier

In [None]:
# Stratified 5-fold cross-validation of a random forest on the MinMax-scaled,
# SMOTE-resampled data. Collects accuracy, weighted F1, precision and recall
# per fold and prints their summary statistics.
# NOTE(review): the scaler is fitted on every row before the folds are split, so
# each test fold influences the scaling applied to its training fold.
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)

# Create RF classifier object.
rf_classifier = RandomForestClassifier(random_state=42, criterion= 'gini', max_depth=None, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators= 100)

# Create StratifiedKFold object and the per-fold metric accumulators.
skf_rf_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
lst_f1_rf_stratified = []
lst_precision_rf_stratified = []
lst_recall_rf_stratified = []

# The fold indices must come from the same arrays that are indexed inside the
# loop. Splitting on the pre-SMOTE (X, y) while indexing X_scaled would silently
# restrict training and testing to the first len(X) rows and ignore every
# resampled row, making these metrics incomparable with the ROC analysis and
# with the other classifiers, which all use (X_scaled, y_resampled).
for train_index, test_index in skf_rf_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    rf_classifier.fit(X_train_fold, y_train_fold)
    lst_accu_stratified.append(rf_classifier.score(X_test_fold, y_test_fold))

    # Predict on the test fold
    y_pred_fold = rf_classifier.predict(X_test_fold)

    # Weighted-average F1, precision and recall for the fold
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted')
    precision = precision_score(y_test_fold, y_pred_fold, average='weighted')
    recall = recall_score(y_test_fold, y_pred_fold, average='weighted')

    lst_f1_rf_stratified.append(f1)
    lst_precision_rf_stratified.append(precision)
    lst_recall_rf_stratified.append(recall)

# Print evaluation metrics
print('List of F1-scores for RF each fold:', lst_f1_rf_stratified)
print('Mean F1-score for RF:', mean(lst_f1_rf_stratified), '±', stdev(lst_f1_rf_stratified))

print('List of Precision for RF each fold:', lst_precision_rf_stratified)
print('Mean Precision for RF:', mean(lst_precision_rf_stratified), '±', stdev(lst_precision_rf_stratified))

print('List of Recall for RF each fold:', lst_recall_rf_stratified)
print('Mean Recall for RF:', mean(lst_recall_rf_stratified), '±', stdev(lst_recall_rf_stratified))

print('List of possible accuracy:', lst_accu_stratified)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(lst_accu_stratified)*100, '%')
print('\nMinimum Accuracy:',
      min(lst_accu_stratified)*100, '%')
print('\nOverall Accuracy:',
      mean(lst_accu_stratified)*100, '%')
print('\nStandard Deviation is:', stdev(lst_accu_stratified))

# RANDOM FOREST - IMPORTANT FEATURE

# Feature importances of rf_classifier as it was most recently fitted.
importance_rf_classifier = rf_classifier.feature_importances_
# Display names, in the same order as the feature columns.
feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
bar_positions = range(len(importance_rf_classifier))

# Text summary: one line per feature index.
for position in bar_positions:
    print('Feature: %0d, Score: %.5f' % (position, importance_rf_classifier[position]))

# Bar chart with the feature names as vertical tick labels.
plt.figure(figsize=(8, 6))
plt.bar(bar_positions, importance_rf_classifier)
plt.xticks(bar_positions, feature_names, rotation='vertical')
plt.title("RF - Feature Importance")
plt.xlabel('Features')
plt.ylabel('Importance')
plt.show()


# ROC CURVE for Random Forest Classifier
# Evaluate model using k-fold CV and plot ROC curve
# Per-fold ROC curves for the random forest, their mean curve and a ±1 std band.
mean_fpr_rf = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
last_fold = skf_rf_classifier.n_splits - 1  # chance line is drawn only with the final fold
for fold_idx, (train_idx, test_idx) in enumerate(skf_rf_classifier.split(X_scaled, y_resampled)):
    rf_classifier.fit(X_scaled[train_idx], y_resampled[train_idx])

    # Draw this fold's ROC curve on the shared axes
    roc_display = RocCurveDisplay.from_estimator(
        rf_classifier, X_scaled[test_idx], y_resampled[test_idx],
        name=f"Fold {fold_idx + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=(fold_idx == last_fold),
    )

    fold_tpr = np.interp(mean_fpr_rf, roc_display.fpr, roc_display.tpr)
    fold_tpr[0] = 0.0  # pin the curve to the origin
    tprs.append(fold_tpr)
    aucs.append(roc_display.roc_auc)

# Mean ROC curve across folds, pinned to (1, 1) at the right edge
mean_tpr_rf = np.mean(tprs, axis=0)
mean_tpr_rf[-1] = 1.0
mean_auc_rf = auc(mean_fpr_rf, mean_tpr_rf)
std_auc_rf = np.std(aucs)
ax.plot(
    mean_fpr_rf, mean_tpr_rf, color="b", lw=2, alpha=0.8,
    label=r"Mean ROC - RF (AUC_RF = %0.2f $\pm$ %0.2f)" % (mean_auc_rf, std_auc_rf),
)

# Shaded band: mean TPR ± one standard deviation, clipped to [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr_rf + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_rf - std_tpr, 0)
ax.fill_between(
    mean_fpr_rf, tprs_lower, tprs_upper,
    color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
    xlabel="False Positive Rate", ylabel="True Positive Rate",
    title="Mean ROC curve - RF",
)
ax.axis("square")
ax.legend(loc="lower right")
plt.show()

## 2. Stratified k-fold CV & K-Nearest Neighbors classifier

In [None]:
# Stratified 5-fold cross-validation of a KNN classifier on the MinMax-scaled,
# SMOTE-resampled data, plus a permutation-based feature importance.
# NOTE(review): the scaler is fitted on every row before the folds are split, so
# each test fold influences the scaling applied to its training fold.
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
knn_classifier = KNeighborsClassifier(n_neighbors=4, algorithm='auto', leaf_size=10, metric='minkowski', p= 4, weights='distance')
skf_knn_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
lst_f1_knn_stratified = []
lst_precision_knn_stratified = []
lst_recall_knn_stratified = []
feature_importance_scores = {}
# Seeded generator so the column shuffles below are repeatable across runs.
permutation_rng = np.random.default_rng(42)

for train_index, test_index in skf_knn_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]

    knn_classifier.fit(X_train_fold, y_train_fold)
    accu_score = knn_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    y_pred_fold_knn = knn_classifier.predict(X_test_fold)
    f1_knn = f1_score(y_test_fold, y_pred_fold_knn, average='weighted')
    precision_knn = precision_score(y_test_fold, y_pred_fold_knn, average='weighted')
    recall_knn = recall_score(y_test_fold, y_pred_fold_knn, average='weighted')
    lst_f1_knn_stratified.append(f1_knn)
    lst_precision_knn_stratified.append(precision_knn)
    lst_recall_knn_stratified.append(recall_knn)

    # Permutation importance: shuffle one test-fold column at a time and
    # accumulate the resulting drop in accuracy over all folds.
    for i in range(X_scaled.shape[1]):
        perturbed_X_test = X_test_fold.copy()
        perturbed_X_test[:, i] = permutation_rng.permutation(perturbed_X_test[:, i])
        perturbed_accu_score = knn_classifier.score(perturbed_X_test, y_test_fold)
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + (accu_score - perturbed_accu_score)

# Rescale the accumulated drops so they sum to 1. This is done exactly once and
# skipped when the total is zero (no feature changed the accuracy), which would
# otherwise raise ZeroDivisionError.
# NOTE(review): individual drops can be negative, so the shares are not
# guaranteed to lie in [0, 1].
total_score = sum(feature_importance_scores.values())
if total_score != 0:
    feature_importance_scores = {key: value / total_score for key, value in feature_importance_scores.items()}

print('List of F1-scores for KNN each fold:', lst_f1_knn_stratified)
print('Mean F1-score for KNN:', mean(lst_f1_knn_stratified), '±', stdev(lst_f1_knn_stratified))
print('List of Precision for KNN each fold:', lst_precision_knn_stratified)
print('Mean Precision for KNN:', mean(lst_precision_knn_stratified), '±', stdev(lst_precision_knn_stratified))
print('List of Recall for KNN each fold:', lst_recall_knn_stratified)
print('Mean Recall for KNN:', mean(lst_recall_knn_stratified), '±', stdev(lst_recall_knn_stratified))
if lst_accu_stratified:
    print('List of possible accuracy:', lst_accu_stratified)
    print('Mean Accuracy for KNN:', mean(lst_accu_stratified) * 100, '±', stdev(lst_accu_stratified) * 100)

    print("\nFeature Importance Scores:")
    # Iterate over (index, score) pairs; enumerating the dict itself would
    # yield its keys and print the feature index in place of the score.
    for i, v in feature_importance_scores.items():
        print('Feature: %0d, Score: %.5f' % (i, v))
    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(list(feature_importance_scores.keys()), list(feature_importance_scores.values()))
    plt.title("KNN - Feature Importance")
    plt.xticks(range(len(feature_importance_scores)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# ROC CURVE for KNN Classifier
# Per-fold ROC curves for the KNN classifier, their mean curve and a ±1 std band.
tprs = []
aucs = []
mean_fpr_knn = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto

fig, ax = plt.subplots(figsize=(6, 6))
for fold, (train, test) in enumerate(skf_knn_classifier.split(X_scaled, y_resampled)):
    # No trailing backslash here: a line continuation would glue this call to
    # the assignment below and make the whole cell a SyntaxError.
    knn_classifier.fit(X_scaled[train], y_resampled[train])
    viz = RocCurveDisplay.from_estimator(
        knn_classifier,
        X_scaled[test],
        y_resampled[test],
        name=f"Fold {fold + 1}",
        alpha=0.3,
        lw=1,
        ax=ax,
        # Draw the chance-level diagonal only once, with the final fold.
        plot_chance_level=(fold == skf_knn_classifier.n_splits - 1),
    )
    interp_tpr = np.interp(mean_fpr_knn, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0  # pin the curve to the origin
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

# Mean ROC curve across folds, pinned to (1, 1) at the right edge
mean_tpr_knn = np.mean(tprs, axis=0)
mean_tpr_knn[-1] = 1.0
mean_auc_knn = auc(mean_fpr_knn, mean_tpr_knn)
std_auc_knn = np.std(aucs)
ax.plot(
    mean_fpr_knn,
    mean_tpr_knn,
    color="b",
    label=r"Mean ROC - KNN (AUC_KNN = %0.2f $\pm$ %0.2f)" % (mean_auc_knn, std_auc_knn),
    lw=2,
    alpha=0.8,
)

# Shaded band: mean TPR ± one standard deviation, clipped to [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr_knn + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_knn - std_tpr, 0)
ax.fill_between(
    mean_fpr_knn,
    tprs_lower,
    tprs_upper,
    color="grey",
    alpha=0.2,
    label=r"$\pm$ 1 std. dev.",
)
ax.set(
    xlim=[-0.05, 1.05],
    ylim=[-0.05, 1.05],
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Mean ROC curve - KNN",
)
ax.axis("square")
ax.legend(loc="lower right")
plt.show()

## 3. Stratified k-fold CV & Support Vector Machine classifier

In [None]:
# Stratified 5-fold cross-validation of a linear-kernel SVM on the MinMax-scaled,
# SMOTE-resampled data, plus a coefficient-based feature importance.
# NOTE(review): the scaler is fitted on every row before the folds are split, so
# each test fold influences the scaling applied to its training fold.
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
svm_classifier = SVC(C=4, kernel='linear', random_state=42)
skf_svm_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
lst_f1_svm_stratified = []
lst_precision_svm_stratified = []
lst_recall_svm_stratified = []
feature_importance_scores = {}

for train_index, test_index in skf_svm_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    svm_classifier.fit(X_train_fold, y_train_fold)
    y_pred_fold_svm = svm_classifier.predict(X_test_fold)
    accu_score = svm_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    f1_svm = f1_score(y_test_fold, y_pred_fold_svm, average='weighted')
    precision_svm = precision_score(y_test_fold, y_pred_fold_svm, average='weighted')
    recall_svm = recall_score(y_test_fold, y_pred_fold_svm, average='weighted')
    lst_f1_svm_stratified.append(f1_svm)
    lst_precision_svm_stratified.append(precision_svm)
    lst_recall_svm_stratified.append(recall_svm)

    # Accumulate the absolute linear weights over ALL folds. Re-assigning the
    # dict per fold would keep only the last fold, and signed weights can cancel
    # when normalised by their sum. Summing |coef_| over rows is identical to
    # |coef_[0]| when coef_ has a single row.
    fold_weights = np.abs(svm_classifier.coef_).sum(axis=0)
    for i, weight in enumerate(fold_weights):
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + weight

# Rescale to shares that sum to 1; skipped when every weight is zero to avoid
# dividing by zero.
total_score = sum(feature_importance_scores.values())
if total_score != 0:
    feature_importance_scores = {key: value / total_score for key, value in feature_importance_scores.items()}

print('List of F1-scores for SVM each fold:', lst_f1_svm_stratified)
print('Mean F1-score for SVM:', mean(lst_f1_svm_stratified), '±', stdev(lst_f1_svm_stratified))
print('List of Precision for SVM each fold:', lst_precision_svm_stratified)
print('Mean Precision for SVM:', mean(lst_precision_svm_stratified), '±', stdev(lst_precision_svm_stratified))
print('List of Recall for SVM each fold:', lst_recall_svm_stratified)
print('Mean Recall for SVM:', mean(lst_recall_svm_stratified), '±', stdev(lst_recall_svm_stratified))
if lst_accu_stratified:
    # Accuracy summary is printed once (it was previously emitted twice).
    print('List of possible accuracy:', lst_accu_stratified)
    print('Mean Accuracy for SVM:', mean(lst_accu_stratified) * 100, '±', stdev(lst_accu_stratified) * 100)
    print("\nFeature Importance Scores:")
    # Iterate over (index, score) pairs; enumerating the dict itself would
    # yield its keys and print the feature index in place of the score.
    for i, v in feature_importance_scores.items():
        print('Feature: %0d, Score: %.5f' % (i, v))
    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(list(feature_importance_scores.keys()), list(feature_importance_scores.values()))
    plt.title("SVM - Feature Importance")
    plt.xticks(range(len(feature_importance_scores)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# ROC CURVE for SVM Classifier

# Per-fold ROC curves for the SVM classifier, their mean curve and a ±1 std band.
mean_fpr_svm = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
last_fold = skf_svm_classifier.n_splits - 1  # chance line is drawn only with the final fold
for fold_idx, (train_idx, test_idx) in enumerate(skf_svm_classifier.split(X_scaled, y_resampled)):
    svm_classifier.fit(X_scaled[train_idx], y_resampled[train_idx])
    roc_display = RocCurveDisplay.from_estimator(
        svm_classifier, X_scaled[test_idx], y_resampled[test_idx],
        name=f"Fold {fold_idx + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=(fold_idx == last_fold),
    )
    fold_tpr = np.interp(mean_fpr_svm, roc_display.fpr, roc_display.tpr)
    fold_tpr[0] = 0.0  # pin the curve to the origin
    tprs.append(fold_tpr)
    aucs.append(roc_display.roc_auc)

# Mean ROC curve across folds, pinned to (1, 1) at the right edge
mean_tpr_svm = np.mean(tprs, axis=0)
mean_tpr_svm[-1] = 1.0
mean_auc_svm = auc(mean_fpr_svm, mean_tpr_svm)
std_auc_svm = np.std(aucs)
ax.plot(
    mean_fpr_svm, mean_tpr_svm, color="b", lw=2, alpha=0.8,
    label=r"Mean ROC - SVM (AUC_SVM = %0.2f $\pm$ %0.2f)" % (mean_auc_svm, std_auc_svm),
)

# Shaded band: mean TPR ± one standard deviation, clipped to [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr_svm + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_svm - std_tpr, 0)
ax.fill_between(
    mean_fpr_svm, tprs_lower, tprs_upper,
    color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
    xlabel="False Positive Rate", ylabel="True Positive Rate",
    title="Mean ROC curve - SVM",
)
ax.axis("square")
ax.legend(loc="lower right")
plt.show()

## 4. Stratified k-fold CV & Decision Tree classifier

In [None]:
# Stratified 5-fold cross-validation of a decision tree on the MinMax-scaled,
# SMOTE-resampled data, plus the tree's feature importances summed over folds.
# NOTE(review): the scaler is fitted on every row before the folds are split, so
# each test fold influences the scaling applied to its training fold.
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
dt_classifier = DecisionTreeClassifier(random_state=42, ccp_alpha=0.0, criterion='gini', max_depth=None, max_features=None, min_impurity_decrease=0.0, min_samples_leaf=1, min_samples_split=2, splitter='best')
skf_dt_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
lst_f1_dt_stratified = []
lst_precision_dt_stratified = []
lst_recall_dt_stratified = []
feature_importance_scores = {}

# Train and evaluate on X_scaled, the same matrix the ROC analysis of this
# classifier uses (X_scaled was otherwise computed but unused in this loop).
# Tree splits depend only on the ordering of feature values, so the
# min-max rescaling is not expected to change the resulting metrics.
for train_index, test_index in skf_dt_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    dt_classifier.fit(X_train_fold, y_train_fold)
    y_pred_fold_dt = dt_classifier.predict(X_test_fold)
    f1_dt = f1_score(y_test_fold, y_pred_fold_dt, average='weighted')
    precision_dt = precision_score(y_test_fold, y_pred_fold_dt, average='weighted')
    recall_dt = recall_score(y_test_fold, y_pred_fold_dt, average='weighted')
    lst_f1_dt_stratified.append(f1_dt)
    lst_precision_dt_stratified.append(precision_dt)
    lst_recall_dt_stratified.append(recall_dt)
    accu_score = dt_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    # Sum this fold's importances into the running per-feature totals.
    feature_importance_scores_fold = dt_classifier.feature_importances_
    for i, score in enumerate(feature_importance_scores_fold):
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + score

# Rescale the totals to shares that sum to 1; skipped when the total is zero to
# avoid dividing by zero.
total_score = sum(feature_importance_scores.values())
if total_score != 0:
    feature_importance_scores = {key: value / total_score for key, value in feature_importance_scores.items()}
print('List of F1-scores for Decision Tree each fold:', lst_f1_dt_stratified)
print('Mean F1-score for Decision Tree:', mean(lst_f1_dt_stratified), '±', stdev(lst_f1_dt_stratified))
print('List of Precision for Decision Tree each fold:', lst_precision_dt_stratified)
print('Mean Precision for Decision Tree:', mean(lst_precision_dt_stratified), '±', stdev(lst_precision_dt_stratified))
print('List of Recall for Decision Tree each fold:', lst_recall_dt_stratified)
print('Mean Recall for Decision Tree:', mean(lst_recall_dt_stratified), '±', stdev(lst_recall_dt_stratified))
print('List of possible accuracy:', lst_accu_stratified)
if lst_accu_stratified:
    print('\nMaximum Accuracy That can be obtained from this model is:',
          max(lst_accu_stratified)*100, '%')
    print('\nMinimum Accuracy:',
          min(lst_accu_stratified)*100, '%')
    print('\nOverall Accuracy:',
          mean(lst_accu_stratified)*100, '%')
    print('\nStandard Deviation is:', stdev(lst_accu_stratified))
    print("\nFeature Importance Scores:")
    # Iterate over (index, score) pairs; enumerating the dict itself would
    # yield its keys and print the feature index in place of the score.
    for i, v in feature_importance_scores.items():
        print('Feature: %0d, Score: %.5f' % (i, v))
    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(list(feature_importance_scores.keys()), list(feature_importance_scores.values()))
    plt.title("DT - Feature Importance")
    plt.xticks(range(len(feature_importance_scores)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# ROC CURVE for DT Classifier

# Per-fold ROC curves for the decision tree, their mean curve and a ±1 std band.
mean_fpr_dt = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
last_fold = skf_dt_classifier.n_splits - 1  # chance line is drawn only with the final fold
for fold_idx, (train_idx, test_idx) in enumerate(skf_dt_classifier.split(X_scaled, y_resampled)):
    dt_classifier.fit(X_scaled[train_idx], y_resampled[train_idx])
    roc_display = RocCurveDisplay.from_estimator(
        dt_classifier, X_scaled[test_idx], y_resampled[test_idx],
        name=f"Fold {fold_idx + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=(fold_idx == last_fold),
    )
    fold_tpr = np.interp(mean_fpr_dt, roc_display.fpr, roc_display.tpr)
    fold_tpr[0] = 0.0  # pin the curve to the origin
    tprs.append(fold_tpr)
    aucs.append(roc_display.roc_auc)

# Mean ROC curve across folds, pinned to (1, 1) at the right edge
mean_tpr_dt = np.mean(tprs, axis=0)
mean_tpr_dt[-1] = 1.0
mean_auc_dt = auc(mean_fpr_dt, mean_tpr_dt)
std_auc_dt = np.std(aucs)
ax.plot(
    mean_fpr_dt, mean_tpr_dt, color="b", lw=2, alpha=0.8,
    label=r"Mean ROC - DT (AUC_DT = %0.2f $\pm$ %0.2f)" % (mean_auc_dt, std_auc_dt),
)

# Shaded band: mean TPR ± one standard deviation, clipped to [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr_dt + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_dt - std_tpr, 0)
ax.fill_between(
    mean_fpr_dt, tprs_lower, tprs_upper,
    color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
    xlabel="False Positive Rate", ylabel="True Positive Rate",
    title="Mean ROC curve - DT",
)
ax.axis("square")
ax.legend(loc="lower right")
plt.show()

## 5. Stratified k-fold CV & Random Forest + Bagging classifier (RFBM)

In [None]:
# Stratified 5-fold cross-validation of a bagging ensemble of random forests
# (RFBM) on the MinMax-scaled, SMOTE-resampled data, plus a permutation-based
# feature importance.
# NOTE(review): the scaler is fitted on every row before the folds are split, so
# each test fold influences the scaling applied to its training fold.
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
base_rf_classifier = RandomForestClassifier(random_state=42, criterion= 'gini', max_depth=None, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators= 100)
rfbm_classifier = BaggingClassifier(base_rf_classifier, n_estimators=10, random_state=42)
skf_rfbm_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
lst_f1_rfbm_stratified = []
lst_precision_rfbm_stratified = []
lst_recall_rfbm_stratified = []
feature_importance_scores = {}
# Seeded generator so the column shuffles below are repeatable across runs.
permutation_rng = np.random.default_rng(42)
for train_index, test_index in skf_rfbm_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    rfbm_classifier.fit(X_train_fold, y_train_fold)
    y_pred_fold_rfbm = rfbm_classifier.predict(X_test_fold)
    f1_rfbm = f1_score(y_test_fold, y_pred_fold_rfbm, average='weighted')
    precision_rfbm = precision_score(y_test_fold, y_pred_fold_rfbm, average='weighted')
    recall_rfbm = recall_score(y_test_fold, y_pred_fold_rfbm, average='weighted')
    lst_f1_rfbm_stratified.append(f1_rfbm)
    lst_precision_rfbm_stratified.append(precision_rfbm)
    lst_recall_rfbm_stratified.append(recall_rfbm)
    accu_score = rfbm_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    # Permutation importance: shuffle one test-fold column at a time and
    # accumulate the resulting drop in accuracy over all folds.
    for i in range(X_scaled.shape[1]):
        perturbed_X_test = X_test_fold.copy()
        perturbed_X_test[:, i] = permutation_rng.permutation(perturbed_X_test[:, i])
        perturbed_accu_score = rfbm_classifier.score(perturbed_X_test, y_test_fold)
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + (accu_score - perturbed_accu_score)
print('List of F1-scores for RFBM each fold:', lst_f1_rfbm_stratified)
print('\nMean F1-score for RFBM:', mean(lst_f1_rfbm_stratified))
print('\nStandard Deviation of F1-score for RFBM:', stdev(lst_f1_rfbm_stratified))
print('List of Precision for RFBM each fold:', lst_precision_rfbm_stratified)
print('\nMean Precision:', mean(lst_precision_rfbm_stratified))
print('\nStandard Deviation of Precision RFBM:', stdev(lst_precision_rfbm_stratified))
print('List of Recall for RFBM each fold:', lst_recall_rfbm_stratified)
print('\nMean Recall RFBM:', mean(lst_recall_rfbm_stratified))
print('\nStandard Deviation of Recall RFBM:', stdev(lst_recall_rfbm_stratified))
print('List of possible accuracy:', lst_accu_stratified)
if lst_accu_stratified:
    print('\nMaximum Accuracy That can be obtained from this model is:',
          max(lst_accu_stratified)*100, '%')
    print('\nMinimum Accuracy:',
          min(lst_accu_stratified)*100, '%')
    print('\nOverall Accuracy:',
          mean(lst_accu_stratified)*100, '%')
    print('\nStandard Deviation is:', stdev(lst_accu_stratified))
    # Rescale the accumulated drops so they sum to 1; skipped when the total is
    # zero (no feature changed the accuracy) to avoid ZeroDivisionError.
    # NOTE(review): individual drops can be negative, so the shares are not
    # guaranteed to lie in [0, 1].
    total_score = sum(feature_importance_scores.values())
    if total_score != 0:
        feature_importance_scores = {key: value / total_score for key, value in feature_importance_scores.items()}
    print("\nFeature Importance Scores:")
    # Iterate over (index, score) pairs; enumerating the dict itself would
    # yield its keys and print the feature index in place of the score.
    for i, v in feature_importance_scores.items():
        print('Feature: %0d, Score: %.5f' % (i, v))
    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(list(feature_importance_scores.keys()), list(feature_importance_scores.values()))
    plt.title("RFBM - Feature Importance")
    plt.xticks(range(len(feature_importance_scores)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# ROC CURVE for Random Forest Bagging Classifier

# Per-fold ROC curves for the bagged random forest, their mean curve and a ±1 std band.
mean_fpr_rfbm = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
last_fold = skf_rfbm_classifier.n_splits - 1  # chance line is drawn only with the final fold
for fold_idx, (train_idx, test_idx) in enumerate(skf_rfbm_classifier.split(X_scaled, y_resampled)):
    rfbm_classifier.fit(X_scaled[train_idx], y_resampled[train_idx])
    roc_display = RocCurveDisplay.from_estimator(
        rfbm_classifier, X_scaled[test_idx], y_resampled[test_idx],
        name=f"Fold {fold_idx + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=(fold_idx == last_fold),
    )
    fold_tpr = np.interp(mean_fpr_rfbm, roc_display.fpr, roc_display.tpr)
    fold_tpr[0] = 0.0  # pin the curve to the origin
    tprs.append(fold_tpr)
    aucs.append(roc_display.roc_auc)

# Mean ROC curve across folds, pinned to (1, 1) at the right edge
mean_tpr_rfbm = np.mean(tprs, axis=0)
mean_tpr_rfbm[-1] = 1.0
mean_auc_rfbm = auc(mean_fpr_rfbm, mean_tpr_rfbm)
std_auc_rfbm = np.std(aucs)
ax.plot(
    mean_fpr_rfbm, mean_tpr_rfbm, color="b", lw=2, alpha=0.8,
    label=r"Mean ROC - RFBM (AUC_RFBM = %0.2f $\pm$ %0.2f)" % (mean_auc_rfbm, std_auc_rfbm),
)

# Shaded band: mean TPR ± one standard deviation, clipped to [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr_rfbm + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_rfbm - std_tpr, 0)
ax.fill_between(
    mean_fpr_rfbm, tprs_lower, tprs_upper,
    color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
    xlabel="False Positive Rate", ylabel="True Positive Rate",
    title="Mean ROC curve - RFBM",
)
ax.axis("square")
ax.legend(loc="lower right")
plt.show()

## 6. Stratified k-fold CV & KNN + Bagging classifier (KNNBM)

In [None]:
# Stratified 5-fold cross-validation of a bagging ensemble of KNN classifiers
# (KNNBM) on the MinMax-scaled, SMOTE-resampled data, plus a permutation-based
# feature importance.
# NOTE(review): the scaler is fitted on every row before the folds are split, so
# each test fold influences the scaling applied to its training fold.
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
base_knn_classifier = KNeighborsClassifier(algorithm='auto', leaf_size=10, metric='minkowski', n_neighbors=4, p= 4, weights='distance')
knnbm_classifier = BaggingClassifier(base_knn_classifier, n_estimators=10, random_state=42)
skf_knnbm_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lst_accu_stratified = []
lst_f1_knnbm_stratified = []
lst_precision_knnbm_stratified = []
lst_recall_knnbm_stratified = []
feature_importance_scores = {}
# Seeded generator so the column shuffles below are repeatable across runs.
permutation_rng = np.random.default_rng(42)
for train_index, test_index in skf_knnbm_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    knnbm_classifier.fit(X_train_fold, y_train_fold)
    y_pred_fold_knnbm = knnbm_classifier.predict(X_test_fold)
    f1_knnbm = f1_score(y_test_fold, y_pred_fold_knnbm, average='weighted')
    precision_knnbm = precision_score(y_test_fold, y_pred_fold_knnbm, average='weighted')
    recall_knnbm = recall_score(y_test_fold, y_pred_fold_knnbm, average='weighted')
    lst_f1_knnbm_stratified.append(f1_knnbm)
    lst_precision_knnbm_stratified.append(precision_knnbm)
    lst_recall_knnbm_stratified.append(recall_knnbm)
    accu_score = knnbm_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    # Permutation importance: shuffle one test-fold column at a time and
    # accumulate the resulting drop in accuracy over all folds.
    for i in range(X_scaled.shape[1]):
        perturbed_X_test = X_test_fold.copy()
        perturbed_X_test[:, i] = permutation_rng.permutation(perturbed_X_test[:, i])
        perturbed_accu_score = knnbm_classifier.score(perturbed_X_test, y_test_fold)
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + (accu_score - perturbed_accu_score)
print('List of F1-scores for KNNBM each fold:', lst_f1_knnbm_stratified)
print('\nMean F1-score for KNNBM:', mean(lst_f1_knnbm_stratified))
print('\nStandard Deviation of F1-score for KNNBM:', stdev(lst_f1_knnbm_stratified))
print('List of Precision for KNNBM each fold:', lst_precision_knnbm_stratified)
print('\nMean Precision:', mean(lst_precision_knnbm_stratified))
print('\nStandard Deviation of Precision KNNBM:', stdev(lst_precision_knnbm_stratified))
print('List of Recall for KNNBM each fold:', lst_recall_knnbm_stratified)
print('\nMean Recall KNNBM:', mean(lst_recall_knnbm_stratified))
print('\nStandard Deviation of Recall KNNBM:', stdev(lst_recall_knnbm_stratified))
print('List of possible accuracy:', lst_accu_stratified)
if lst_accu_stratified:
    print('\nMaximum Accuracy That can be obtained from this model is:',
          max(lst_accu_stratified)*100, '%')
    print('\nMinimum Accuracy:',
          min(lst_accu_stratified)*100, '%')
    print('\nOverall Accuracy:',
          mean(lst_accu_stratified)*100, '%')
    print('\nStandard Deviation is:', stdev(lst_accu_stratified))
    # Rescale the accumulated drops so they sum to 1; skipped when the total is
    # zero (no feature changed the accuracy) to avoid ZeroDivisionError.
    # NOTE(review): individual drops can be negative, so the shares are not
    # guaranteed to lie in [0, 1].
    total_score = sum(feature_importance_scores.values())
    if total_score != 0:
        feature_importance_scores = {key: value / total_score for key, value in feature_importance_scores.items()}
    print("\nFeature Importance Scores:")
    # Iterate over (index, score) pairs; enumerating the dict itself would
    # yield its keys and print the feature index in place of the score.
    for i, v in feature_importance_scores.items():
        print('Feature: %0d, Score: %.5f' % (i, v))
    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(list(feature_importance_scores.keys()), list(feature_importance_scores.values()))
    plt.title("KNNBM - Feature Importance")
    plt.xticks(range(len(feature_importance_scores)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# ROC CURVE for KNNBM Classifier

# Per-fold ROC curves for the bagged KNN classifier, their mean curve and a ±1 std band.
mean_fpr_knnbm = np.linspace(0, 1, 100)  # shared FPR grid every fold is interpolated onto
tprs, aucs = [], []

fig, ax = plt.subplots(figsize=(6, 6))
last_fold = skf_knnbm_classifier.n_splits - 1  # chance line is drawn only with the final fold
for fold_idx, (train_idx, test_idx) in enumerate(skf_knnbm_classifier.split(X_scaled, y_resampled)):
    knnbm_classifier.fit(X_scaled[train_idx], y_resampled[train_idx])
    roc_display = RocCurveDisplay.from_estimator(
        knnbm_classifier, X_scaled[test_idx], y_resampled[test_idx],
        name=f"Fold {fold_idx + 1}", alpha=0.3, lw=1, ax=ax,
        plot_chance_level=(fold_idx == last_fold),
    )
    fold_tpr = np.interp(mean_fpr_knnbm, roc_display.fpr, roc_display.tpr)
    fold_tpr[0] = 0.0  # pin the curve to the origin
    tprs.append(fold_tpr)
    aucs.append(roc_display.roc_auc)

# Mean ROC curve across folds, pinned to (1, 1) at the right edge
mean_tpr_knnbm = np.mean(tprs, axis=0)
mean_tpr_knnbm[-1] = 1.0
mean_auc_knnbm = auc(mean_fpr_knnbm, mean_tpr_knnbm)
std_auc_knnbm = np.std(aucs)
ax.plot(
    mean_fpr_knnbm, mean_tpr_knnbm, color="b", lw=2, alpha=0.8,
    label=r"Mean ROC - KNNBM (AUC_KNNBM = %0.2f $\pm$ %0.2f)" % (mean_auc_knnbm, std_auc_knnbm),
)

# Shaded band: mean TPR ± one standard deviation, clipped to [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr_knnbm + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_knnbm - std_tpr, 0)
ax.fill_between(
    mean_fpr_knnbm, tprs_lower, tprs_upper,
    color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.",
)

ax.set(
    xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
    xlabel="False Positive Rate", ylabel="True Positive Rate",
    title="Mean ROC curve - KNNBM",
)
ax.axis("square")
ax.legend(loc="lower right")
plt.show()

## 7. Stratified k-fold CV & SVM + Bagging classifier (SVMBM)

In [None]:
# Bagged linear-SVM (SVMBM) evaluated with stratified 5-fold cross-validation.
# Each fold records weighted F1 / precision / recall and accuracy, then estimates
# permutation feature importance as the accuracy lost when one column is shuffled.
# NOTE(review): the scaler is fitted on all of X_resampled before the folds are
# split, so test-fold rows influence the scaling -- confirm this is intended.
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
base_svm_classifier = SVC(C=4, kernel='linear', random_state=42)
svmbm_classifier = BaggingClassifier(base_svm_classifier, n_estimators=10, random_state=42)
skf_svmbm_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Seeded generator for the column shuffles below. The previous unseeded
# np.random.permutation made the importance scores differ on every run even
# though the model and the splitter were seeded.
perm_rng = np.random.default_rng(42)

lst_accu_stratified = []
lst_f1_svmbm_stratified = []
lst_precision_svmbm_stratified = []
lst_recall_svmbm_stratified = []
feature_importance_scores = {}  # feature index -> accuracy drop summed over folds
for train_index, test_index in skf_svmbm_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    svmbm_classifier.fit(X_train_fold, y_train_fold)
    y_pred_fold_svmbm = svmbm_classifier.predict(X_test_fold)
    f1_svmbm = f1_score(y_test_fold, y_pred_fold_svmbm, average='weighted')
    precision_svmbm = precision_score(y_test_fold, y_pred_fold_svmbm, average='weighted')
    recall_svmbm = recall_score(y_test_fold, y_pred_fold_svmbm, average='weighted')
    lst_f1_svmbm_stratified.append(f1_svmbm)
    lst_precision_svmbm_stratified.append(precision_svmbm)
    lst_recall_svmbm_stratified.append(recall_svmbm)
    accu_score = svmbm_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    # Permutation importance: shuffle one feature at a time in a fresh copy of
    # the test fold and accumulate how much the accuracy falls.
    for i in range(X_scaled.shape[1]):
        perturbed_X_test = X_test_fold.copy()
        perturbed_X_test[:, i] = perm_rng.permutation(perturbed_X_test[:, i])
        perturbed_accu_score = svmbm_classifier.score(perturbed_X_test, y_test_fold)
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + (accu_score - perturbed_accu_score)
print('List of F1-scores for SVMBM each fold:', lst_f1_svmbm_stratified)
print('\nMean F1-score for SVMBM:', mean(lst_f1_svmbm_stratified), '±', stdev(lst_f1_svmbm_stratified))
print('List of Precision for SVMBM each fold:', lst_precision_svmbm_stratified)
print('\nMean Precision:', mean(lst_precision_svmbm_stratified), '±', stdev(lst_precision_svmbm_stratified))
print('List of Recall for SVMBM each fold:', lst_recall_svmbm_stratified)
print('\nMean Recall SVMBM:', mean(lst_recall_svmbm_stratified), '±', stdev(lst_recall_svmbm_stratified))
print('List of possible accuracy:', lst_accu_stratified)
if lst_accu_stratified:
    print('\nMaximum Accuracy That can be obtained from this model is:',
          max(lst_accu_stratified)*100, '%')
    print('\nMinimum Accuracy:',
          min(lst_accu_stratified)*100, '%')
    print('\nOverall Accuracy:',
          mean(lst_accu_stratified)*100, '%')
    print('\nStandard Deviation is:', stdev(lst_accu_stratified))
    total_score = sum(feature_importance_scores.values())
    # Normalise so the scores sum to 1. If the accumulated drops are all zero or
    # cancel each other out, the division would fail (ZeroDivisionError or
    # NaN/inf), so the raw accumulated drops are reported instead.
    if np.isclose(total_score, 0.0):
        print("\nFeature importance total is zero; showing unnormalised scores.")
    else:
        feature_importance_scores = {key: value/total_score for key, value in feature_importance_scores.items()}
    print("\nFeature Importance Scores:")
    for i, v in enumerate(feature_importance_scores.values()):
        print('Feature: %0d, Score: %.5f' % (i, v))
    # Tick labels follow the column order used when the feature matrix was built.
    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(feature_importance_scores.keys(), feature_importance_scores.values())
    plt.title("SVMBM - Feature Importance")
    plt.xticks(range(len(feature_importance_scores)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# Cross-validated ROC analysis for the SVMBM classifier.
# The model is refitted on every training fold, each fold's ROC curve is drawn
# faintly, and the per-fold curves are interpolated onto a shared FPR grid so a
# mean curve and a +/- 1 standard-deviation band can be overlaid.
fig, ax = plt.subplots(figsize=(6, 6))
mean_fpr_svmbm = np.linspace(0, 1, 100)
tprs, aucs = [], []

# The chance-level diagonal is only requested on the final fold so it is drawn once.
last_fold = skf_svmbm_classifier.n_splits - 1
fold_splits = skf_svmbm_classifier.split(X_scaled, y_resampled)
for fold, (train, test) in enumerate(fold_splits):
    svmbm_classifier.fit(X_scaled[train], y_resampled[train])
    # Fold numbers are shown 1-based in the legend.
    viz = RocCurveDisplay.from_estimator(
        svmbm_classifier,
        X_scaled[test],
        y_resampled[test],
        ax=ax,
        name=f"Fold {fold + 1}",
        lw=1,
        alpha=0.3,
        plot_chance_level=(fold == last_fold),
    )
    # Resample this fold's TPR onto the common grid; pin the curve's start to 0.
    interp_tpr = np.interp(mean_fpr_svmbm, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

# Mean curve across folds, with its end pinned to 1.
mean_tpr_svmbm = np.mean(tprs, axis=0)
mean_tpr_svmbm[-1] = 1.0
mean_auc_svmbm = auc(mean_fpr_svmbm, mean_tpr_svmbm)
std_auc_svmbm = np.std(aucs)

# Variability band, clipped to the valid [0, 1] range.
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr_svmbm + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_svmbm - std_tpr, 0)

mean_curve_label = r"Mean ROC - SVMBM (AUC_SVMBM = %0.2f $\pm$ %0.2f)" % (mean_auc_svmbm, std_auc_svmbm)
ax.plot(mean_fpr_svmbm, mean_tpr_svmbm, color="b", lw=2, alpha=0.8, label=mean_curve_label)
ax.fill_between(
    mean_fpr_svmbm, tprs_lower, tprs_upper,
    color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.",
)

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Mean ROC curve - SVMBM")
ax.axis("square")
ax.legend(loc="lower right")
plt.show()

# 8. Stratified K-fold CV & Decision Tree CLASSIFIER + Bagging Classifier (DTBM)

In [None]:
# Bagged decision tree (DTBM) evaluated with stratified 5-fold cross-validation.
# Each fold records weighted F1 / precision / recall and accuracy, then estimates
# permutation feature importance as the accuracy lost when one column is shuffled.
# NOTE(review): the scaler is fitted on all of X_resampled before the folds are
# split, so test-fold rows influence the scaling -- confirm this is intended.
scaler = preprocessing.MinMaxScaler()
X_scaled = scaler.fit_transform(X_resampled)
base_dt_classifier = DecisionTreeClassifier(
    random_state=42,
    ccp_alpha=0.0,
    criterion='gini',
    max_depth=None,
    max_features=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    splitter='best',
)
dtbm_classifier = BaggingClassifier(base_dt_classifier, n_estimators=10, random_state=42)
skf_dtbm_classifier = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Seeded generator for the column shuffles below. The previous unseeded
# np.random.permutation made the importance scores differ on every run even
# though the model and the splitter were seeded.
perm_rng = np.random.default_rng(42)

lst_accu_stratified = []
lst_f1_dtbm_stratified = []
lst_precision_dtbm_stratified = []
lst_recall_dtbm_stratified = []
feature_importance_scores = {}  # feature index -> accuracy drop summed over folds
for train_index, test_index in skf_dtbm_classifier.split(X_scaled, y_resampled):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y_resampled[train_index], y_resampled[test_index]
    dtbm_classifier.fit(X_train_fold, y_train_fold)
    y_pred_fold_dtbm = dtbm_classifier.predict(X_test_fold)
    f1_dtbm = f1_score(y_test_fold, y_pred_fold_dtbm, average='weighted')
    precision_dtbm = precision_score(y_test_fold, y_pred_fold_dtbm, average='weighted')
    recall_dtbm = recall_score(y_test_fold, y_pred_fold_dtbm, average='weighted')
    lst_f1_dtbm_stratified.append(f1_dtbm)
    lst_precision_dtbm_stratified.append(precision_dtbm)
    lst_recall_dtbm_stratified.append(recall_dtbm)
    accu_score = dtbm_classifier.score(X_test_fold, y_test_fold)
    lst_accu_stratified.append(accu_score)
    # Permutation importance: shuffle one feature at a time in a fresh copy of
    # the test fold and accumulate how much the accuracy falls.
    for i in range(X_scaled.shape[1]):
        perturbed_X_test = X_test_fold.copy()
        perturbed_X_test[:, i] = perm_rng.permutation(perturbed_X_test[:, i])
        perturbed_accu_score = dtbm_classifier.score(perturbed_X_test, y_test_fold)
        feature_importance_scores[i] = feature_importance_scores.get(i, 0) + (accu_score - perturbed_accu_score)
print('List of F1-scores for DTBM each fold:', lst_f1_dtbm_stratified)
print('\nMean F1-score for DTBM:', mean(lst_f1_dtbm_stratified))
print('\nStandard Deviation of F1-score for DTBM:', stdev(lst_f1_dtbm_stratified))
print('List of Precision for DTBM each fold:', lst_precision_dtbm_stratified)
print('\nMean Precision:', mean(lst_precision_dtbm_stratified))
print('\nStandard Deviation of Precision DTBM:', stdev(lst_precision_dtbm_stratified))
print('List of Recall for DTBM each fold:', lst_recall_dtbm_stratified)
print('\nMean Recall DTBM:', mean(lst_recall_dtbm_stratified))
print('\nStandard Deviation of Recall DTBM:', stdev(lst_recall_dtbm_stratified))
print('List of possible accuracy:', lst_accu_stratified)
if lst_accu_stratified:
    print('\nMaximum Accuracy That can be obtained from this model is:',
          max(lst_accu_stratified)*100, '%')
    print('\nMinimum Accuracy:',
          min(lst_accu_stratified)*100, '%')
    print('\nOverall Accuracy:',
          mean(lst_accu_stratified)*100, '%')
    print('\nStandard Deviation is:', stdev(lst_accu_stratified))
    total_score = sum(feature_importance_scores.values())
    # Normalise so the scores sum to 1. If the accumulated drops are all zero or
    # cancel each other out, the division would fail (ZeroDivisionError or
    # NaN/inf), so the raw accumulated drops are reported instead.
    if np.isclose(total_score, 0.0):
        print("\nFeature importance total is zero; showing unnormalised scores.")
    else:
        feature_importance_scores = {key: value/total_score for key, value in feature_importance_scores.items()}
    print("\nFeature Importance Scores:")
    for i, v in enumerate(feature_importance_scores.values()):
        print('Feature: %0d, Score: %.5f' % (i, v))
    # Tick labels follow the column order used when the feature matrix was built.
    feature_names = ['beta', 'Ep', 'EDR', 'Age', 'Sex', 'SBP', 'Smoking', 'HDL', 'LDL ', 'Glucose']
    plt.figure(figsize=(8, 6))
    plt.bar(feature_importance_scores.keys(), feature_importance_scores.values())
    plt.title("DTBM - Feature Importance")
    plt.xticks(range(len(feature_importance_scores)), feature_names, rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.show()
else:
    print("No accuracy scores available.")

# Cross-validated ROC analysis for the Decision Tree Bagging (DTBM) classifier.
# The model is refitted on every training fold, each fold's ROC curve is drawn
# faintly, and the per-fold curves are interpolated onto a shared FPR grid so a
# mean curve and a +/- 1 standard-deviation band can be overlaid.
fig, ax = plt.subplots(figsize=(6, 6))
mean_fpr_dtbm = np.linspace(0, 1, 100)
tprs, aucs = [], []

# The chance-level diagonal is only requested on the final fold so it is drawn once.
last_fold = skf_dtbm_classifier.n_splits - 1
fold_splits = skf_dtbm_classifier.split(X_scaled, y_resampled)
for fold, (train, test) in enumerate(fold_splits):
    dtbm_classifier.fit(X_scaled[train], y_resampled[train])
    # Fold numbers are shown 1-based in the legend.
    viz = RocCurveDisplay.from_estimator(
        dtbm_classifier,
        X_scaled[test],
        y_resampled[test],
        ax=ax,
        name=f"Fold {fold + 1}",
        lw=1,
        alpha=0.3,
        plot_chance_level=(fold == last_fold),
    )
    # Resample this fold's TPR onto the common grid; pin the curve's start to 0.
    interp_tpr = np.interp(mean_fpr_dtbm, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

# Mean curve across folds, with its end pinned to 1.
mean_tpr_dtbm = np.mean(tprs, axis=0)
mean_tpr_dtbm[-1] = 1.0
mean_auc_dtbm = auc(mean_fpr_dtbm, mean_tpr_dtbm)
std_auc_dtbm = np.std(aucs)

# Variability band, clipped to the valid [0, 1] range.
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr_dtbm + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr_dtbm - std_tpr, 0)

mean_curve_label = r"Mean ROC - DTBM (AUC_DTBM = %0.2f $\pm$ %0.2f)" % (mean_auc_dtbm, std_auc_dtbm)
ax.plot(mean_fpr_dtbm, mean_tpr_dtbm, color="b", lw=2, alpha=0.8, label=mean_curve_label)
ax.fill_between(
    mean_fpr_dtbm, tprs_lower, tprs_upper,
    color="grey", alpha=0.2, label=r"$\pm$ 1 std. dev.",
)

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Mean ROC curve - DTBM")
ax.axis("square")
ax.legend(loc="lower right")
plt.show()

# ROC-AUC

In [None]:
# Overlay the mean ROC curves of all eight models on one figure.
# The font family is set through rcParams, so it applies to this figure and to
# anything plotted afterwards in the same session.
plt.rcParams["font.family"] = "Times New Roman"
plt.figure(figsize=(6, 5))

# Stand-alone classifiers are drawn dashed.
single_model_curves = [
    (mean_fpr_rf, mean_tpr_rf, "cornflowerblue", "ROC - RF"),
    (mean_fpr_knn, mean_tpr_knn, "lightcoral", "ROC - KNN"),
    (mean_fpr_svm, mean_tpr_svm, "limegreen", "ROC - SVM"),
    (mean_fpr_dt, mean_tpr_dt, "dimgrey", "ROC - DT"),
]
for fpr_values, tpr_values, curve_colour, curve_label in single_model_curves:
    plt.plot(fpr_values, tpr_values, color=curve_colour, linestyle='dashed', label=curve_label)

# Bagged counterparts reuse the same colours with the default line style.
bagged_model_curves = [
    (mean_fpr_rfbm, mean_tpr_rfbm, "cornflowerblue", "ROC - RFBM"),
    (mean_fpr_knnbm, mean_tpr_knnbm, "lightcoral", "ROC - KNNBM"),
    (mean_fpr_svmbm, mean_tpr_svmbm, "limegreen", "ROC - SVMBM"),
    (mean_fpr_dtbm, mean_tpr_dtbm, "dimgrey", "ROC - DTBM"),
]
for fpr_values, tpr_values, curve_colour, curve_label in bagged_model_curves:
    plt.plot(fpr_values, tpr_values, color=curve_colour, label=curve_label)

# Reference lines: the chance diagonal, then the top and left edges that a
# perfect classifier would trace (only the top edge gets a legend entry).
plt.plot([0, 1], [0, 1], linestyle="-.", color="gray", label="Random Classifier")
plt.plot([0, 1], [1, 1], color='black', lw=1, linestyle=':', label='Perfect Classifier')
plt.plot([0, 0], [0, 1], color='black', lw=1, linestyle=':')

# Axis limits, labels and legend.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel("False Positive Rate", fontsize=13)
plt.ylabel("True Positive Rate", fontsize=13)
plt.legend(loc="lower right", fontsize=9.8)

# Render the figure.
plt.show()
