In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Load dataset
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

In [5]:
# Check for missing values
print("Missing values in dataset:")
print(df.isnull().sum())

Missing values in dataset:
mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


In [7]:
# Feature Scaling
X = df.drop(columns=['target'])
y = df['target']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


## Explanation of Preprocessing Steps 
1. Checked for missing values: No missing values found.
2. Applied StandardScaler to normalize feature values for better model performance.
3. Split data into training (80%) and testing (20%) sets to prevent overfitting.


In [13]:
# Step 2: Classification Algorithm Implementation (5 marks)
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Support Vector Machine": SVC(),
    "k-Nearest Neighbors": KNeighborsClassifier()
}

results = {}
explanations = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = {
        "Accuracy": accuracy,
        "Classification Report": classification_report(y_test, y_pred, output_dict=True)
    }
    print(f"\n{name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))


Logistic Regression Performance:
Accuracy: 0.9737
Confusion Matrix:
[[41  2]
 [ 1 70]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


Decision Tree Performance:
Accuracy: 0.9386
Confusion Matrix:
[[39  4]
 [ 3 68]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.91      0.92        43
           1       0.94      0.96      0.95        71

    accuracy                           0.94       114
   macro avg       0.94      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114


Random Forest Performance:
Accuracy: 0.9649
Confusion Matrix:
[[40  3]
 [ 1 70]]
Classification Report:
              precision  

# Algorithm Explanation

**Logistic Regression**

        - Uses the sigmoid function to predict probabilities for binary classification.
        - Suitable for this dataset because it is simple, interpretable, and works well for binary classification.
        - Achieved an accuracy of 97.37% with high precision and recall.
    
**Decision Tree**
        
        - Splits data based on feature values and creates a tree structure for decision making.
        - Suitable as it can handle both numerical and categorical data but may overfit.
        - Achieved an accuracy of 93.86%, slightly lower than other models due to possible overfitting.

**Random Forest**
       
        - Uses multiple decision trees to reduce overfitting and improve accuracy.
        - Suitable as it provides high accuracy and works well with missing data.
        - Achieved an accuracy of 96.49%, showing its robustness over a single decision tree.
  
**Support Vector Machine**
   
        - Finds the best hyperplane to separate classes and uses kernel tricks for non-linearity.
        - Suitable as it works well in high-dimensional datasets like this.
        - Achieved an accuracy of 97.37%, similar to Logistic Regression, making it a strong classifier.
  
**k-Nearest Neighbors**
   
        - Classifies based on the majority class of the nearest neighbors.
        - Suitable for small datasets but computationally expensive for large ones.
        - Achieved an accuracy of 94.74%, making it slightly less effective than Random Forest or SVM.
  

In [19]:
# Step 3: Model Comparison (2 marks)
comparison_df = pd.DataFrame({k: v['Accuracy'] for k, v in results.items()}, index=['Accuracy']).T
print("\nModel Comparison Table:")
print(comparison_df)

best_model = comparison_df['Accuracy'].idxmax()
worst_model = comparison_df['Accuracy'].idxmin()
print(f"Best Performing Model: {best_model}")
print(f"Worst Performing Model: {worst_model}")



Model Comparison Table:
                        Accuracy
Logistic Regression     0.973684
Decision Tree           0.938596
Random Forest           0.964912
Support Vector Machine  0.973684
k-Nearest Neighbors     0.947368
Best Performing Model: Logistic Regression
Worst Performing Model: Decision Tree
