In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [3]:
# Loading the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Converting the data into a pandas DataFrame for better visualization
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y

# Check for any missing values
print(df.isnull().sum())

# Feature Scaling: Standardize the features (important for models like SVM and k-NN)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


In [5]:
# Logistic Regression
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)

# Performance metrics
print("Logistic Regression Accuracy:", accuracy_score(y_test, logreg_pred))
print(classification_report(y_test, logreg_pred))


Logistic Regression Accuracy: 0.9736842105263158
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [7]:
# Decision Tree Classifier
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)
dtree_pred = dtree.predict(X_test)

# Performance metrics
print("Decision Tree Accuracy:", accuracy_score(y_test, dtree_pred))
print(classification_report(y_test, dtree_pred))


Decision Tree Accuracy: 0.9473684210526315
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [9]:
# Random Forest Classifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Performance metrics
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print(classification_report(y_test, rf_pred))


Random Forest Accuracy: 0.9649122807017544
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [11]:
# Support Vector Machine
svm = SVC(random_state=42)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)

# Performance metrics
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))


SVM Accuracy: 0.9736842105263158
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [13]:
# k-Nearest Neighbors
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

# Performance metrics
print("k-NN Accuracy:", accuracy_score(y_test, knn_pred))
print(classification_report(y_test, knn_pred))


k-NN Accuracy: 0.9473684210526315
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



In [15]:
# Comparing the performance of all models
models = {
    "Logistic Regression": logreg,
    "Decision Tree": dtree,
    "Random Forest": rf,
    "SVM": svm,
    "k-NN": knn
}

# Creating a table to compare accuracy
accuracy_results = {}
for model_name, model in models.items():
    pred = model.predict(X_test)
    accuracy_results[model_name] = accuracy_score(y_test, pred)

# Display accuracy results
accuracy_df = pd.DataFrame(list(accuracy_results.items()), columns=["Model", "Accuracy"])
print(accuracy_df)

# Identify the best and worst performing models
best_model = accuracy_df.loc[accuracy_df['Accuracy'].idxmax()]
worst_model = accuracy_df.loc[accuracy_df['Accuracy'].idxmin()]

print(f"Best Performing Model: {best_model['Model']} with accuracy of {best_model['Accuracy']:.4f}")
print(f"Worst Performing Model: {worst_model['Model']} with accuracy of {worst_model['Accuracy']:.4f}")


                 Model  Accuracy
0  Logistic Regression  0.973684
1        Decision Tree  0.947368
2        Random Forest  0.964912
3                  SVM  0.973684
4                 k-NN  0.947368
Best Performing Model: Logistic Regression with accuracy of 0.9737
Worst Performing Model: Decision Tree with accuracy of 0.9474
