In [None]:
#Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

<span style="color:green; font-weight:bold; font-size:24px">Loading and Preprocessing</span>

In [3]:
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

#### Check for missing values

In [7]:
print("Missing values:\n", X.isnull().sum())

Missing values:
 mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64


 <span style="color:blue; font-weight:bold; font-size:16px">Missing values can distort model training and lead to errors or misleading results. In this dataset, there are no missing values, so no imputation was needed.</span>

In [10]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

<span style="color:blue; font-weight:bold; font-size:16px">Feature scaling is essential for algorithms like SVM and KNN that are sensitive to feature magnitudes </span>

#### Split into train and test sets

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

 <span style="color:blue; font-weight:bold; font-size:16px">To evaluate model performance, we need to separate a portion of the data for testing, ensuring the model is evaluated on unseen data to mimic real-world scenarios.</span> 

<span style="color:green; font-weight:bold; font-size:24px">Feature scaling</span>

In [29]:
#  Models to Implement
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [31]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "k-Nearest Neighbors": KNeighborsClassifier()
}

In [33]:
#Train, predict, and evaluate models
results = []

In [37]:
from IPython.display import display, HTML

In [57]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    results.append({"Model": name, "Accuracy": acc})

    display(HTML(f'<span style="color:blue; font-size:18px; font-weight:bold;">************ {name} ***********</span>'))
    display(HTML(f'<span style="color:green; font-size:18px; font-weight:bold;">Accuracy: {acc:.8f}</span>'))
    print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



#### Compare Results

In [76]:
results_df = pd.DataFrame(results)
print("\n\n")
display(HTML(f'<span style="color:blue; font-size:18px; font-weight:bold;">************  Model Performance Comparison: \n ***********</span>'))
print("\n\n")
print(results_df.sort_values(by="Accuracy", ascending=False))
#print(results_df)









                     Model  Accuracy
0      Logistic Regression  0.973684
10     Logistic Regression  0.973684
20     Logistic Regression  0.973684
18  Support Vector Machine  0.973684
28  Support Vector Machine  0.973684
30     Logistic Regression  0.973684
15     Logistic Regression  0.973684
13  Support Vector Machine  0.973684
33  Support Vector Machine  0.973684
23  Support Vector Machine  0.973684
35     Logistic Regression  0.973684
8   Support Vector Machine  0.973684
38  Support Vector Machine  0.973684
40     Logistic Regression  0.973684
5      Logistic Regression  0.973684
3   Support Vector Machine  0.973684
43  Support Vector Machine  0.973684
25     Logistic Regression  0.973684
37           Random Forest  0.964912
32           Random Forest  0.964912
42           Random Forest  0.964912
27           Random Forest  0.964912
22           Random Forest  0.964912
2            Random Forest  0.964912
17           Random Forest  0.964912
7            Random Forest  0.96491

 <span style="color:green; font-weight:bold; font-size:24px"> Best and Worst Performing Models</span>

In [78]:
best_model = results_df.loc[results_df['Accuracy'].idxmax()]
worst_model = results_df.loc[results_df['Accuracy'].idxmin()]

In [81]:
display(HTML('<span style="color:blue; font-weight:bold; font-size:18px">Best Performing Model</span>'))
print(best_model)

Model       Logistic Regression
Accuracy               0.973684
Name: 0, dtype: object


In [83]:
display(HTML('<span style="color:blue; font-weight:bold; font-size:18px">Worst Performing Model</span>'))
print(worst_model)

Model       Decision Tree
Accuracy         0.947368
Name: 1, dtype: object
