<a href="https://colab.research.google.com/github/AsianaHolloway/Clinical-Decision-Support-with-Machine-Learning-Predicting-Breast-Cancer-Outcomes/blob/main/Breast_Cancer_CDSS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install scikit-learn pandas numpy matplotlib seaborn imbalanced-learn

import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, RocCurveDisplay
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE


In [2]:
import pandas as pd

df = pd.read_csv('/content/breast_cancer_wisconsin.csv')
df.head()


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
# Drop ID/unnamed columns if present
to_drop = [c for c in df.columns if c.lower() in ['id', 'unnamed: 32', 'unnamed:32']]
df = df.drop(columns=to_drop, errors='ignore')

# Encode target: M=1 (malignant), B=0 (benign)
df['target'] = (df['diagnosis'].str.upper() == 'M').astype(int)
X = df.drop(columns=['diagnosis', 'target'])
y = df['target']

# Quick sanity checks
print(df['target'].value_counts())
df.info()


target
0    357
1    212
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)


In [5]:
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)


In [6]:
models = {
    "LogReg": Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=500, solver='liblinear'))
    ]),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42),
    "SVM": Pipeline([
        ('scaler', StandardScaler()),
        ('clf', SVC(kernel='rbf', probability=True, random_state=42))
    ])
}


In [7]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_table = []
for name, model in models.items():
    auc = cross_val_score(model, X_train_sm, y_train_sm, cv=cv, scoring='roc_auc').mean()
    acc = cross_val_score(model, X_train_sm, y_train_sm, cv=cv, scoring='accuracy').mean()
    cv_table.append({"model": name, "cv_auc": auc, "cv_acc": acc})

cv_df = pd.DataFrame(cv_table).sort_values('cv_auc', ascending=False)
cv_df


Unnamed: 0,model,cv_auc,cv_acc
0,LogReg,0.995999,0.973684
3,SVM,0.994829,0.973684
2,RandomForest,0.994521,0.961404
1,DecisionTree,0.935088,0.935088


In [8]:
metrics = []
for name, model in models.items():
    model.fit(X_train_sm, y_train_sm)
    y_pred = model.predict(X_test)
    y_proba = (model.predict_proba(X_test)[:,1]
               if hasattr(model, "predict_proba")
               else model.decision_function(X_test))

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    sens = recall_score(y_test, y_pred)       # sensitivity = recall for positive class
    auc = roc_auc_score(y_test, y_proba)

    metrics.append({"model": name, "accuracy": acc, "precision": prec, "sensitivity": sens, "auc": auc})

metrics_df = pd.DataFrame(metrics).sort_values('auc', ascending=False)
metrics_df


Unnamed: 0,model,accuracy,precision,sensitivity,auc
2,RandomForest,0.982456,1.0,0.952381,0.998347
3,SVM,0.982456,0.97619,0.97619,0.994709
0,LogReg,0.973684,0.97561,0.952381,0.994048
1,DecisionTree,0.921053,0.902439,0.880952,0.912698


In [9]:
import os
os.makedirs('/content/plots', exist_ok=True)

for name, model in models.items():
    model.fit(X_train_sm, y_train_sm)
    y_pred = model.predict(X_test)
    y_proba = (model.predict_proba(X_test)[:,1]
               if hasattr(model, "predict_proba")
               else model.decision_function(X_test))

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cbar=False, ax=ax)
    ax.set_title(f'Confusion Matrix — {name}')
    ax.set_xlabel('Predicted'); ax.set_ylabel('True')
    plt.tight_layout()
    plt.savefig(f'/content/plots/confusion_{name}.png', dpi=160)
    plt.close(fig)

    # ROC
    fig, ax = plt.subplots()
    RocCurveDisplay.from_predictions(y_test, y_proba, ax=ax)
    ax.set_title(f'ROC — {name}')
    plt.tight_layout()
    plt.savefig(f'/content/plots/roc_{name}.png', dpi=160)
    plt.close(fig)

# Save metrics
metrics_df.to_csv('/content/plots/metrics_summary.csv', index=False)
cv_df.to_csv('/content/plots/cv_summary.csv', index=False)


In [10]:
rf = models["RandomForest"].fit(X_train_sm, y_train_sm)
imp = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False).head(15)

fig, ax = plt.subplots(figsize=(6,5))
imp.plot(kind='barh', ax=ax)
ax.set_title('Top 15 Feature Importances (Random Forest)')
plt.tight_layout()
plt.savefig('/content/plots/feature_importance_rf.png', dpi=160)
plt.close(fig)


In [11]:
import os

# Create folder if not exists
os.makedirs('/content/plots', exist_ok=True)

# List contents
os.listdir('/content/plots')


['confusion_LogReg.png',
 'metrics_summary.csv',
 'cv_summary.csv',
 'roc_LogReg.png',
 'roc_RandomForest.png',
 'confusion_DecisionTree.png',
 'feature_importance_rf.png',
 'roc_SVM.png',
 'confusion_RandomForest.png',
 'roc_DecisionTree.png',
 'confusion_SVM.png']