In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [3]:
# Load the Breast Cancer dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

In [5]:
# Display basic info
print("Dataset Shape:", X.shape)
print("Target Labels:", data.target_names)

Dataset Shape: (569, 30)
Target Labels: ['malignant' 'benign']


In [7]:
# Check for missing values
print("Missing values:", X.isnull().sum().sum())

Missing values: 0


In [9]:
# Standardizing the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
# Splitting the dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [15]:
# Evaluation Function
def evaluate_model(name, model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n{name}")
    print(classification_report(y_test, y_pred, target_names=data.target_names))
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred)
    }

In [18]:
# Initialize models
models = [
    ("Logistic Regression", LogisticRegression(max_iter=10000)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Support Vector Machine", SVC()),
    ("k-Nearest Neighbors", KNeighborsClassifier())
]


In [20]:
# Evaluate all models
results = []
for name, model in models:
    results.append(evaluate_model(name, model))


Logistic Regression
              precision    recall  f1-score   support

   malignant       0.98      0.95      0.96        43
      benign       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


Decision Tree
              precision    recall  f1-score   support

   malignant       0.93      0.93      0.93        43
      benign       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114


Random Forest
              precision    recall  f1-score   support

   malignant       0.98      0.93      0.95        43
      benign       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97

In [22]:
# Create a DataFrame to compare models
results_df = pd.DataFrame(results)
print("\nModel Comparison:\n")
print(results_df.sort_values(by="F1 Score", ascending=False))


Model Comparison:

                    Model  Accuracy  Precision    Recall  F1 Score
0     Logistic Regression  0.973684   0.972222  0.985915  0.979021
3  Support Vector Machine  0.973684   0.972222  0.985915  0.979021
2           Random Forest  0.964912   0.958904  0.985915  0.972222
1           Decision Tree  0.947368   0.957746  0.957746  0.957746
4     k-Nearest Neighbors  0.947368   0.957746  0.957746  0.957746
