In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import numpy as np

In [2]:
X, y = load_breast_cancer(return_X_y=True)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

In [4]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((455, 30), (455,), (114, 30), (114,))

In [5]:
model_pipeline1 = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=1000, solver='liblinear', random_state=21))
])

model_pipeline2 = Pipeline([
    ('scaler', MinMaxScaler()),
    ('model', LogisticRegression())
])

In [6]:
stdscaler_scores = cross_val_score(model_pipeline1, X_train, y_train, cv=5)
print(f"Standard Scaler: {np.mean(stdscaler_scores)}")

minmax_scores = cross_val_score(model_pipeline2, X_train, y_train, cv=5)
print(f"MinMax Scaler: {np.mean(minmax_scores)}")

Standard Scaler: 0.9692307692307693
MinMax Scaler: 0.9582417582417582


In [7]:
param_grid = {
    'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'model__penalty': ['l1', 'l2'],
}

grid = GridSearchCV(model_pipeline1, param_grid, cv=5, verbose=3, n_jobs=-1)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)
print(grid.score(X_test, y_test))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
{'model__C': 0.1, 'model__penalty': 'l2'}
0.9736263736263737
0.9912280701754386


In [8]:
optimal_model = grid.best_estimator_
optimal_model.fit(X_train, y_train)

In [9]:
y_pred = optimal_model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

Accuracy: 0.9912280701754386
ROC AUC: 0.988095238095238
Confusion Matrix: 
[[41  1]
 [ 0 72]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.98      0.99        42
           1       0.99      1.00      0.99        72

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114



In [10]:
print(f"ROC Curve: {roc_curve(y_test, y_pred)}")
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
print(f"AUC: {auc(fpr, tpr)}")

ROC Curve: (array([0.        , 0.02380952, 1.        ]), array([0., 1., 1.]), array([inf,  1.,  0.]))
AUC: 0.988095238095238


In [11]:
model_pipeline3 = Pipeline([
    ('scaler', StandardScaler()),
    ('model', DecisionTreeClassifier(random_state=21))
])

model_pipeline4 = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=21))
])

model_pipeline5 = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVC(random_state=21))
])

In [12]:
pipe3_params = {
    'model__max_depth': [3, 5, 7, 9, 11, 13],
    'model__min_samples_split': [2, 4, 6, 8, 10]
}

pipe4_params = {
    'model__n_estimators': [100, 200, 300, 400, 500],
    'model__max_depth': [3, 5, 7, 9, 11, 13]
}

pipe5_params = {
    'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'model__gamma': [0.001, 0.01, 0.1, 1, 10, 100]
}

In [13]:
grid3 = GridSearchCV(model_pipeline3, pipe3_params, cv=5, verbose=3, n_jobs=-1)
grid3.fit(X_train, y_train)

print(grid3.best_params_)
print(grid3.best_score_)
print(grid3.score(X_test, y_test))

Fitting 5 folds for each of 30 candidates, totalling 150 fits
{'model__max_depth': 5, 'model__min_samples_split': 2}
0.9428571428571428
0.9649122807017544


In [14]:
grid4 = GridSearchCV(model_pipeline4, pipe4_params, cv=5, verbose=3, n_jobs=-1)
grid4.fit(X_train, y_train)

print(grid4.best_params_)
print(grid4.best_score_)
print(grid4.score(X_test, y_test))

Fitting 5 folds for each of 30 candidates, totalling 150 fits
{'model__max_depth': 7, 'model__n_estimators': 200}
0.9604395604395604
0.956140350877193


In [15]:
grid5 = GridSearchCV(model_pipeline5, pipe5_params, cv=5, verbose=3, n_jobs=-1) 
grid5.fit(X_train, y_train)

print(grid5.best_params_)
print(grid5.best_score_)
print(grid5.score(X_test, y_test))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
{'model__C': 10, 'model__gamma': 0.01}
0.9714285714285715
0.9912280701754386


In [17]:
compare_models = {
    'Logistic Regression': optimal_model,
    'Decision Tree': grid3.best_estimator_,
    'Random Forest': grid4.best_estimator_,
    'SVC': grid5.best_estimator_
}

for model_name, model in compare_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    print(f"{model_name} AUC: {auc(fpr, tpr)}")

Logistic Regression AUC: 0.988095238095238
Decision Tree AUC: 0.9623015873015873
Random Forest AUC: 0.9503968253968255
SVC AUC: 0.988095238095238


So our best model is either LogisticRegression or SVC with an AUC of 0.988, \
Logistic regression is to be preferred because it is faster and easier to interpret.