<a href="https://colab.research.google.com/github/AnahitShekikyan/Progect_500B/blob/main/ADS_502_Final_updated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload the dataset through Colab's file-upload helper (this import only resolves inside Google Colab).
from google.colab import files
# Result of the upload call is kept in `cancer`; the following cells read the CSV from disk instead.
cancer = files.upload()

In [None]:
#library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#import data
# The path is hard-coded; presumably this is the file uploaded in the previous cell — verify the name matches.
df_original = pd.read_csv("/content/breast-cancer.csv")
# Preview the first rows
df_original.head()


In [None]:
#data trim, per study "best predictive accuracy obtained using one separating plane in the 3-D space of Worst Area, Worst Smoothness and Mean Texture."
# Take an explicit copy: the following cells assign into df['diagnosis'], and doing that on a
# slice of df_original raises SettingWithCopyWarning and may not write where expected.
df = df_original[['diagnosis', 'area_worst', 'smoothness_worst', 'texture_mean']].copy()
df.head()


In [None]:
#convert diagnosis to binary
# Label encoding: 'M' -> 1, 'B' -> 0 (presumably malignant/benign — confirm against the dataset description).
# Assigning through .loc[:, col] writes into the existing column; the explicit integer cast happens in a later cell.
df.loc[:, 'diagnosis'] = df['diagnosis'].replace({'M': 1, 'B': 0})
df.head()

# **Basic Data Information**

In [None]:
#get shape
# (rows, columns) of the trimmed frame
df.shape

In [None]:
#are there duplicates?
# Number of rows that are exact repeats of an earlier row across all four kept columns
df.duplicated().sum()

In [None]:
# Cast the encoded 0/1 labels to an integer dtype so the column is treated as numeric downstream
df['diagnosis'] = df['diagnosis'].astype(int)

In [None]:
# Bar chart of how many rows fall in each diagnosis class
sns.countplot(x='diagnosis', data=df)

In [None]:
# Share of each diagnosis class, expressed as a percentage of all rows
class_percentages = df['diagnosis'].value_counts().div(len(df)).mul(100)
print("percentage of each class", class_percentages)

# **Data Quality Report**

## **Continuous Features**

In [None]:
# Columns stored as float64 or int64 are treated as the continuous features
numeric_frame = df.select_dtypes(include=['float64', 'int64'])
conf = list(numeric_frame.columns)
conf

In [None]:
# Names to drop from the continuous-feature list; the single empty string is a placeholder
# that matches none of the kept column names, so the list passes through unchanged.
conf_exclude = ['']
filter_conf = list(filter(lambda name: name not in conf_exclude, conf))
filter_conf

In [None]:
#get summary stats on continuous
# pandas' built-in summary (count, mean, std, min, quartiles, max) for the filtered columns
df[filter_conf].describe()

In [None]:
# Data-quality table: one row per continuous feature with completeness, cardinality
# and distribution statistics.
continuous_frame = df[filter_conf]
lower_quartile = continuous_frame.quantile(0.25)
upper_quartile = continuous_frame.quantile(0.75)

data_quality_conf = pd.DataFrame({
    'Feature': filter_conf,
    'Count': continuous_frame.count().values,
    'Missing Values': continuous_frame.isnull().sum().values,
    'Cardinality': continuous_frame.nunique().values,
    'Min': continuous_frame.min().values,
    '1st Quartile': lower_quartile.values,
    'Mean': continuous_frame.mean().values,
    'Median': continuous_frame.median().values,
    '3rd Quartile': upper_quartile.values,
    'Max': continuous_frame.max().values,
    'Standard Deviation': continuous_frame.std().values,
})
data_quality_conf

# **Univariate Analysis**

In [None]:
# One histogram (with KDE overlay) per continuous feature, each in its own figure
plt.style.use('ggplot')
for column in filter_conf:
    hist_fig = plt.figure(figsize=(20, 4))
    # Left cell of a 1x2 grid; the right cell is intentionally left empty, as before
    hist_ax = hist_fig.add_subplot(1, 2, 1)
    sns.histplot(df[column], kde=True, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {column}')
    plt.show()

In [None]:
# One horizontal boxplot per continuous feature, each in its own figure
plt.style.use('ggplot')
for column in filter_conf:
    box_fig = plt.figure(figsize=(20, 4))
    # Left cell of a 1x2 grid; the right cell is intentionally left empty, as before
    box_ax = box_fig.add_subplot(1, 2, 1)
    sns.boxplot(x=df[column], ax=box_ax)
    box_ax.set_title(f'Boxplot of {column}')
    plt.show()

# **Multivariate Analysis**

In [None]:
#correlations, all
# Pairwise correlation (pandas default method) between every column in filter_conf
corr_matrix = df[filter_conf].corr()
corr_matrix

In [None]:
# Annotated heatmap of the correlation matrix
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    annot_kws={"size": 8},
)
plt.figure(figsize=(16, 12))
heatmap = sns.heatmap(corr_matrix, **heatmap_options)

# Tilt the x labels and keep the y labels horizontal so they stay legible
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)

# Render the figure
plt.show()

In [None]:
# Scatter-matrix of every column pair in df, coloured by diagnosis class
sns.pairplot(df, hue ="diagnosis", height=3)

# **Feature Scaling**

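[scikit-learn StandardScaler documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)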

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Isolate features (X) and target (y)
X = df[['area_worst', 'smoothness_worst', 'texture_mean']]
y = df['diagnosis']

# Scaling features
# NOTE(review): the scaler is fitted on every row before the cross-validation split performed in
# later cells, so each fold's test rows contribute to the statistics used for scaling.
# Consider fitting the scaler on the training portion of each fold instead.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# **Stratified K-Fold Partitioning**

In [None]:
# @title
# Initialize StratifiedKFold with 5 folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


# get list of partitions
def get_partitions(X, y):
    """Return the stratified train/test partitions of ``X`` and ``y``.

    The folds come from the module-level ``skf`` splitter.

    Args:
        X: Feature matrix, one row per sample (NumPy array or anything
            ``np.asarray`` can convert, e.g. a DataFrame).
        y: pandas Series of class labels; rows are selected positionally.

    Returns:
        A list with one ``(X_train, X_test, y_train, y_test)`` tuple per fold.
    """
    # Slice the argument that was passed in rather than the notebook-global
    # X_scaled, so the function honours its own parameter.
    features = np.asarray(X)
    partitions = []
    # Performing stratified k-fold cross-validation
    for train_index, test_index in skf.split(features, y):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        partitions.append((X_train, X_test, y_train, y_test))
    return partitions

## **Visualizations**

In [None]:
# Colour maps: one for train/test membership, one for the class labels
cmap_cv = plt.get_cmap('coolwarm')
cmap_data = plt.get_cmap('tab10')


def plot_cv_indices(cv, X, y, ax, n_splits, lw=10):
    """Draw which samples are in the train and test sets for every CV split.

    Each split becomes one horizontal band (0 = training, 1 = testing,
    coloured with ``cmap_cv``). A final band shows the class label of every
    sample, coloured with ``cmap_data``.

    Args:
        cv: Splitter exposing ``split(X=..., y=...)``.
        X: Feature matrix; only its length and the splitter's use of it matter.
        y: Class labels, one per sample.
        ax: Matplotlib axes to draw on.
        n_splits: Number of splits, used for the y-axis ticks and limits.
        lw: Line width of the band markers.

    Returns:
        The axes that were drawn on.
    """
    n_samples = len(X)
    sample_positions = range(n_samples)

    for fold_no, (train_idx, test_idx) in enumerate(cv.split(X=X, y=y)):
        membership = np.full(n_samples, np.nan)
        membership[train_idx] = 0  # Training set
        membership[test_idx] = 1  # Testing set
        ax.scatter(
            sample_positions,
            [fold_no + 0.5] * n_samples,
            c=membership,
            marker="_",
            lw=lw,
            cmap=cmap_cv,
            vmin=-0.2,
            vmax=1.2,
        )

    # Class band sits one row below the last split
    ax.scatter(
        sample_positions,
        [fold_no + 1.5] * n_samples,
        c=y,
        marker="_",
        lw=lw,
        cmap=cmap_data,
    )

    tick_labels = list(range(n_splits)) + ["class"]
    ax.set(
        yticks=np.arange(n_splits + 1) + 0.5,
        yticklabels=tick_labels,
        xlabel="Sample index",
        ylabel="CV iteration",
        ylim=[n_splits + 1.2, -0.2],
        xlim=[0, n_samples],
    )
    ax.set_title("Cross-Validation Splits", fontsize=15)
    return ax


# Draw the split diagram for the stratified 5-fold splitter
fig, ax = plt.subplots(figsize=(12, 6))
plot_cv_indices(skf, X_scaled, y, ax, n_splits=5)
plt.show()

# Plotting fold distribution
def plot_fold_distribution(cv, X, y, ax):
    """Plot a grouped bar chart of class counts in each fold's test set.

    The splitter is iterated exactly once, so both class counts for a fold
    come from the same partition even when ``cv`` shuffles without a fixed
    seed.

    Args:
        cv: Splitter exposing ``split(X, y)``.
        X: Feature matrix passed through to the splitter.
        y: pandas Series of binary labels (0/1), indexed positionally.
        ax: Matplotlib axes to draw on.
    """
    class_0 = []
    class_1 = []
    for _, test_idx in cv.split(X, y):
        fold_labels = y.iloc[test_idx]
        class_0.append(np.sum(fold_labels == 0))
        class_1.append(np.sum(fold_labels == 1))

    df_fold = pd.DataFrame({
        'Fold': list(range(len(class_0))),
        'Class 0': class_0,
        'Class 1': class_1,
    })
    df_fold.set_index('Fold').plot(kind='bar', ax=ax)
    ax.set_xlabel('Fold')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Classes Across Folds')


fig, ax = plt.subplots(figsize=(10, 6))
plot_fold_distribution(skf, X_scaled, y, ax)
plt.show()

# Plotting class distribution heatmap
def plot_class_distribution_heatmap(cv, X, y, ax):
    """Draw a heatmap of per-fold test-set class counts.

    Args:
        cv: Splitter exposing ``split(X, y)``.
        X: Feature matrix passed through to the splitter.
        y: pandas Series of non-negative integer labels, indexed positionally.
        ax: Matplotlib axes to draw on.
    """
    # One [count of class 0, count of class 1] row per fold
    per_fold_counts = [
        np.bincount(y.iloc[held_out], minlength=2)
        for _, held_out in cv.split(X, y)
    ]

    df_class_dist = pd.DataFrame(per_fold_counts, columns=['Class 0', 'Class 1'])
    sns.heatmap(df_class_dist, annot=True, cmap='Blues', fmt='d', ax=ax)
    ax.set_xlabel('Class')
    ax.set_ylabel('Fold')
    ax.set_title('Class Distribution Across Folds')


fig, ax = plt.subplots(figsize=(10, 6))
plot_class_distribution_heatmap(skf, X_scaled, y, ax)
plt.show()


## Cross-Validation Splits Plot

Description:

*   The first plot shows the indices of samples used for training (blue) and testing (red) across different cross-validation (CV) iterations.

*   The "class" row at the bottom indicates the actual class distribution of the samples.

Key Points:

*   **Consistent Distribution:** The plot shows a consistent distribution of training and testing samples across all folds, which ensures that each fold has a representative sample of the overall dataset.

*   **Class Distribution:** The "class" row shows the class label of every sample. The dataset is moderately imbalanced (roughly 63% Class 0 to 37% Class 1) rather than evenly split, which is why stratified splitting is used.


Usage:

*   This plot is useful for visualizing how the data is split into training and testing sets during each fold of the cross-validation process.

*   It helps in ensuring that the stratification process is working correctly, preserving the class distribution in each fold.

## Distribution of Classes Across Folds

Description:

*   The second plot is a bar chart showing the distribution of classes (Class 0 and Class 1) across different folds.

*   The x-axis represents the fold number, and the y-axis represents the count of samples for each class.


Key Points:

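*   **Preserved Class Ratio:** Every fold contains nearly the same number of Class 0 samples (about 71–72) and nearly the same number of Class 1 samples (about 42–43) as every other fold, indicating that the stratified cross-validation is preserving the overall class ratio in each fold.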

*   **Uniform Distribution:** The consistent height of the bars across folds shows that the distribution of classes is uniform across all folds.

Usage:

*   This plot is useful for confirming that each fold has a balanced distribution of the target classes.

*   It helps in validating the effectiveness of the StratifiedKFold method in maintaining class balance.

## Class Distribution Heatmap

Description:

*   The third plot is a heatmap showing the distribution of classes across different folds.

*   The x-axis represents the classes (Class 0 and Class 1), and the y-axis represents the fold number.


Key Points:

*  **Counts per Class:** The heatmap values indicate the count of each class in each fold.

*   **Color Intensity:** The color intensity represents the number of samples, with darker colors indicating higher counts.

Usage:

*   This plot provides a visual representation of the class distribution across folds.

*   It is useful for quickly identifying any discrepancies in the distribution of classes in different folds.

**Summary**

**Cross-Validation Splits Plot:** Ensures proper stratification and visualizes the data splitting process.

**Distribution of Classes Across Folds:** Confirms the uniform distribution of classes across folds, validating the effectiveness of stratified cross-validation.

**Class Distribution Heatmap:** Provides a detailed view of class counts in each fold, highlighting the balance maintained across folds.


# Models

## Baseline Model (Logistic Regression)

In [None]:
# Calculate and print mean and standard deviation of each metric
def print_metrics_summary(name, metrics):
    """Print the mean and sample standard deviation of a list of scores.

    Args:
        name: Label inserted into the two printed lines.
        metrics: Sequence of numeric per-fold scores.
    """
    average = np.mean(metrics)
    # ddof=1 gives the sample (n - 1) standard deviation
    spread = np.std(metrics, ddof=1)
    print(f'Mean {name}: {average:.4f}')
    print(f'Standard Deviation of {name}: {spread:.4f}')

In [None]:
# Registry with one (initially empty) metrics dict per model family
eval_metrics = {
    model_key: {}
    for model_key in ("logistic_regression", "neural_net", "random_forest")
}

In [None]:
# Per-fold score accumulators for the logistic-regression baseline
accuracy_scores, precisions, recalls, f1_scores, roc_aucs = [], [], [], [], []

for X_train, X_test, y_train, y_test in get_partitions(X_scaled, y):
    # Fit a fresh logistic regression on this fold's training split
    model = LogisticRegression(max_iter=10000)
    model.fit(X_train, y_train)

    # Hard predictions and positive-class probabilities for the held-out split
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Score this fold
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)

    # Record the scores
    for score_list, fold_value in (
        (accuracy_scores, accuracy),
        (precisions, precision),
        (recalls, recall),
        (f1_scores, f1),
        (roc_aucs, roc_auc),
    ):
        score_list.append(fold_value)

    # Report this fold
    for label, fold_value in (
        ('Accuracy', accuracy),
        ('Precision', precision),
        ('Recall', recall),
        ('F1 Score', f1),
        ('ROC AUC', roc_auc),
    ):
        print(f'Fold {label}: {fold_value:.4f}')
    print(confusion_matrix(y_test, y_pred))

# Aggregate (mean / standard deviation) over the folds
for label, score_list in (
    ('Accuracy', accuracy_scores),
    ('Precision', precisions),
    ('Recall', recalls),
    ('F1 Score', f1_scores),
    ('ROC AUC', roc_aucs),
):
    print_metrics_summary(label, score_list)

The output metrics indicate that the model is performing quite well. Here is a summary:

**Fold-wise Metrics**


1.   Fold Accuracy:

    *   Fold 1: 0.9649

    *   Fold 2: 0.9474

    *   Fold 3: 0.9649

    *   Fold 4: 0.9825

    *   Fold 5: 0.9558


2.   Fold Precision:

    *   Fold 1:  0.9535

    *   Fold 2: 1.0000

    *   Fold 3: 1.0000

    *   Fold 4: 0.9545

    *   Fold 5: 0.9744


3.   Fold Recall:

    *   Fold 1: 0.9535

    *   Fold 2: 0.8605

    *   Fold 3: 0.9048

    *   Fold 4: 1.0000

    *   Fold 5: 0.9048

   
4.   Fold F1 Score:

    *   Fold 1: 0.9535

    *   Fold 2: 0.9250

    *   Fold 3: 0.9500

    *   Fold 4: 0.9767

    *   Fold 5: 0.9383


5.   Fold ROC AUC:

    *   Fold 1: 0.9980

    *   Fold 2: 0.9866

    *   Fold 3: 0.9818

    *   Fold 4: 1.0000

    *   Fold 5: 0.9950


**Summary Metrics**

*   Mean Accuracy: 0.9631

*   Standard Deviation of Accuracy: 0.0131

*   Mean Precision: 0.9765

*   Standard Deviation of Precision: 0.0230

*   Mean Recall: 0.9247

*   Standard Deviation of Recall: 0.0534

*   Mean F1 Score: 0.9487

*   Standard Deviation of F1 Score: 0.0192

*   Mean ROC AUC: 0.9923

*   Standard Deviation of ROC AUC: 0.0078


**Accuracy:** The accuracy is consistently high across all folds, indicating that the model is performing well in distinguishing classes.


**Precision and Recall:** High precision indicates few false positives, while recall values are slightly more variable, reflecting differences in detecting true positives across folds.

**F1 Score:** This combines precision and recall, showing balanced performance.


**ROC AUC:** High values close to 1 indicate excellent performance in distinguishing classes.

**Confusion Matrices**

*   Fold 1: [[69 2], [ 2 41]]

*   Fold 2: [[71 0], [ 6 37]]

*   Fold 3: [[72 0], [ 4 38]]

*   Fold 4: [[70 2], [ 0 42]]

*   Fold 5: [[70 1], [ 4 38]]



**Conclusion**
The standard deviations are relatively low, suggesting stable performance. The use of StratifiedKFold ensures that the class distribution is maintained across each fold, contributing to reliable validation.

## Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

# Initialize lists to store metrics
accuracy_scores = []
precisions = []
recalls = []
f1_scores = []
roc_aucs = []

for part in get_partitions(X_scaled, y):
    # Unpack partition into constituent variables
    (X_train, X_test, y_train, y_test) = part

    # Small multilayer perceptron (two hidden layers of 5 and 2 units), fixed seed for repeatability
    clf = MLPClassifier(solver='lbfgs', max_iter=100, alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=2)

    # train the model
    clf.fit(X_train, y_train)

    # Make predictions on the test data.
    # Use clf — the network fitted on this fold — not the `model` variable left over
    # from the logistic-regression cell, which would report the wrong model's scores.
    y_pred = clf.predict(X_test)

    # Calculate metrics for this fold
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # Positive-class probabilities from the same network feed the ROC AUC
    y_prob = clf.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_prob)

    # Append metrics to lists
    accuracy_scores.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    roc_aucs.append(roc_auc)

    # Print metrics for this fold
    print(f'Fold Accuracy: {accuracy:.4f}')
    print(f'Fold Precision: {precision:.4f}')
    print(f'Fold Recall: {recall:.4f}')
    print(f'Fold F1 Score: {f1:.4f}')
    print(f'Fold ROC AUC: {roc_auc:.4f}')
    print(confusion_matrix(y_test, y_pred))

# display aggregate eval metrics
print_metrics_summary('Accuracy', accuracy_scores)
print_metrics_summary('Precision', precisions)
print_metrics_summary('Recall', recalls)
print_metrics_summary('F1 Score', f1_scores)
print_metrics_summary('ROC AUC', roc_aucs)

# Model Comparison

## ROC Curve Plotting

In [None]:
# Plot ROC curve for each fold
mean_fpr = np.linspace(0, 1, 100)
mean_tpr = np.zeros_like(mean_fpr)

# Create the logistic-regression estimator explicitly so this cell does not depend on
# whatever object the notebook-global `model` happens to reference when the cell is run.
model = LogisticRegression(max_iter=10000)

for train_index, test_index in skf.split(X_scaled, y):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Refit on this fold's training split and score the held-out split
    model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    # Resample this fold's curve onto the shared FPR grid so the folds can be averaged
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label=f'ROC curve (area = {roc_auc:.2f})')

mean_tpr /= skf.get_n_splits()
mean_auc = auc(mean_fpr, mean_tpr)

# Plot settings
plt.plot(mean_fpr, mean_tpr, color='b', lw=2, label=f'Mean ROC curve (area = {mean_auc:.2f})')
# Chance-level reference diagonal
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

The ROC curve plots the True Positive Rate (TPR) against the False Positive Rate (FPR) as the classification threshold varies.

**Mean ROC Curve:**

*   The mean ROC curve, plotted in blue, represents the average performance across all folds.

*   It has an AUC of 0.99, indicating excellent overall performance.

**Diagonal Line:** The dashed diagonal line represents the ROC curve of a random classifier with an AUC of 0.50. This serves as a baseline for comparison.

**AUC**: The AUC values for the individual folds are very high (ranging from 0.98 to 1.00), indicating that the model performs exceptionally well in distinguishing between the positive and negative classes.

The ROC curves for different folds are very close to each other, demonstrating consistent performance across the cross-validation folds. This suggests that the model's performance is stable and not heavily influenced by the particular train-test split.

The curves show a sharp increase in the True Positive Rate (TPR) with a very small increase in the False Positive Rate (FPR), indicating that the model has a high sensitivity (recall) and specificity.


# **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Function for training and evaluating a model using Stratified K-Fold cross-validation
def evaluate_model(model, X, y):
    """Cross-validate ``model`` with a seeded, shuffled stratified 5-fold split.

    The estimator is refitted in place on every fold.

    Args:
        model: Estimator exposing ``fit`` and ``predict``. ``predict_proba`` or
            ``decision_function`` is used for ROC AUC when available.
        X: Feature matrix, one row per sample (NumPy array, DataFrame, or
            anything ``np.asarray`` accepts).
        y: Binary class labels (pandas Series, array or list).

    Returns:
        Tuple ``(accuracy_scores, precisions, recalls, f1_scores, roc_aucs)``,
        each a list with one value per fold.
    """
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Convert once so positional fold indices work for both arrays and pandas objects
    features = np.asarray(X)
    labels = np.asarray(y)

    accuracy_scores = []
    precisions = []
    recalls = []
    f1_scores = []
    roc_aucs = []

    for train_index, test_index in cv.split(features, labels):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # ROC AUC needs a continuous score; hard labels are only the last resort
        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test)
        else:
            y_score = y_pred

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred))
        recalls.append(recall_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred))
        roc_aucs.append(roc_auc_score(y_test, y_score))

    return accuracy_scores, precisions, recalls, f1_scores, roc_aucs

# Random Forest: 100 trees, fixed seed; cross-validate and summarise every metric
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_metrics = evaluate_model(rf_model, X_scaled, y)
for metric_label, fold_values in zip(
    ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'), rf_metrics
):
    print_metrics_summary(f'Random Forest {metric_label}', fold_values)

# **CART**

In [None]:
# CART (Decision Tree): default split criterion, fixed seed; cross-validate and summarise
cart_model = DecisionTreeClassifier(random_state=42)
cart_metrics = evaluate_model(cart_model, X_scaled, y)
for metric_label, fold_values in zip(
    ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'), cart_metrics
):
    print_metrics_summary(f'CART {metric_label}', fold_values)

# **C5.0**

In [None]:
# C5.0 stand-in: a decision tree with the entropy criterion is used as a proxy
c50_model = DecisionTreeClassifier(criterion='entropy', random_state=42)
c50_metrics = evaluate_model(c50_model, X_scaled, y)
for metric_label, fold_values in zip(
    ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'), c50_metrics
):
    print_metrics_summary(f'C5.0 {metric_label}', fold_values)

# **Naive Bayes**

In [None]:
# Gaussian Naive Bayes: cross-validate and summarise every metric
nb_model = GaussianNB()
nb_metrics = evaluate_model(nb_model, X_scaled, y)
for metric_label, fold_values in zip(
    ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC'), nb_metrics
):
    print_metrics_summary(f'Naive Bayes {metric_label}', fold_values)

# **ROC Curves for All Models**

In [None]:
# Plotting the ROC curves for all models
mean_fpr = np.linspace(0, 1, 100)
plt.figure(figsize=(10, 6))

# The loop variable is deliberately not called `model`: that name is a notebook-global
# used by earlier cells, and rebinding it here would silently change what they evaluate on re-run.
for estimator, name in zip([rf_model, cart_model, c50_model, nb_model], ['Random Forest', 'CART', 'C5.0', 'Naive Bayes']):
    mean_tpr = np.zeros_like(mean_fpr)
    for train_index, test_index in skf.split(X_scaled, y):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Refit on this fold and score the held-out rows
        estimator.fit(X_train, y_train)
        y_prob = estimator.predict_proba(X_test)[:, 1] if hasattr(estimator, "predict_proba") else estimator.predict(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        # Resample onto the shared FPR grid so fold curves can be averaged
        mean_tpr += np.interp(mean_fpr, fpr, tpr)

    mean_tpr /= skf.get_n_splits()
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, lw=2, label=f'{name} (area = {mean_auc:.2f})')

# Chance-level reference diagonal
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()

# **Interpretation of the ROC Curve Plot**

**Random Forest (AUC = 0.99):** The red line represents the Random Forest classifier, which shows excellent performance with an AUC of 0.99. The curve is very close to the top-left corner, indicating a high TPR and a low FPR across various threshold settings. This suggests that the Random Forest model is highly effective at distinguishing between the malignant and benign tumor cases.

**CART (AUC = 0.93):** The blue line represents the CART (Decision Tree) classifier. With an AUC of 0.93, this model also performs well, but not as well as the Random Forest. The curve is slightly further from the top-left corner, indicating slightly lower sensitivity and specificity compared to the Random Forest.

**C5.0 (AUC = 0.93):** The purple line represents the C5.0 model (simulated using a Decision Tree with the entropy criterion). It has an identical AUC to the CART model, showing similar performance in terms of discriminative ability. The overlap in curves indicates that both models have comparable effectiveness.

**Naive Bayes (AUC = 0.99):** The black line represents the Naive Bayes classifier, which also shows excellent performance with an AUC of 0.99. The curve is close to the top-left corner, similar to the Random Forest, indicating high sensitivity and specificity.

**Excellent**

The Random Forest and Naive Bayes models both show excellent performance with AUCs of 0.99. Their ROC curves indicate that they have a high capability to correctly classify malignant and benign tumors.

**Good**

The CART and C5.0 models also perform well with AUCs of 0.93. While not as high as Random Forest and Naive Bayes, they still demonstrate strong discriminative ability.

**Consistency**

The consistency in high AUC values across models suggests that the features selected ("area_worst", "smoothness_worst", and "texture_mean") are effective in distinguishing between the two classes in the breast cancer dataset.

The ROC curve is a valuable tool for comparing the performance of different classifiers. In this case, it highlights that both the Random Forest and Naive Bayes models are particularly effective for the given dataset.