 # Loading and Preprocessing

In [2]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

# Fetch the Breast Cancer dataset bundled with scikit-learn and wrap its
# arrays in pandas objects so each column keeps its feature name.
data = load_breast_cancer()
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = pd.Series(data=data["target"], name="target")

In [4]:
# Count the NaN entries in every feature column.
missing_values = X.isna().sum()
print("Missing values in each column:\n", missing_values)

# The Breast Cancer dataset from sklearn ships without any missing values,
# but verifying that before fitting any ML model is still good practice.

Missing values in each column:
 mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64


In [None]:
# Missing Values Handling:
# We checked for missing values to ensure the integrity of the dataset.
# As there are no missing values, no imputation was needed. If missing values were present, strategies like mean/median imputation would be used.

# Feature Scaling:
# We apply StandardScaler to scale the features to have a mean of 0 and a standard deviation of 1.
# Scaling is important for algorithms like Logistic Regression, SVM, KNN, and Gradient Descent-based models since they are sensitive to the magnitude of input features.
# It ensures faster convergence and better model performance.

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardizer that will rescale each feature column.
scaler = StandardScaler()

# Learn the scaling statistics from the full feature matrix X, then apply
# them to that same matrix to obtain the scaled features.
scaler.fit(X)
X_scaled = scaler.transform(X)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the samples for testing; random_state pins the shuffle so
# the split is reproducible from run to run.
# NOTE(review): X_scaled comes from a StandardScaler fitted on the full
# feature matrix X, so test-set statistics leak into the scaling. Consider
# splitting X first and fitting the scaler on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Classification Algorithm Implementation

# Logistic Regression

Logistic Regression is a linear model used for binary classification. It models the probability that a given input belongs to a certain class using the sigmoid function.

## Why Suitable

- Works well for linearly separable classes.
- Fast and interpretable for binary classification problems like this one (malignant vs. benign).

In [10]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression on the training split
# (fit() hands back the estimator itself, so the call can be chained).
lr = LogisticRegression().fit(X_train, y_train)

# Score the predictions on the held-out split.
y_pred_lr = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.9736842105263158


# Decision Tree Classifier

A decision tree splits the data based on feature values to create branches that lead to decision outcomes (class labels). It learns rules from data by recursively splitting it.

## Why Suitable

- Captures non-linear relationships.
- Easy to visualize and interpret.

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree with a fixed seed so the run is repeatable.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the predictions on the held-out split.
y_pred_dt = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", dt_accuracy)


Decision Tree Accuracy: 0.9473684210526315


# Random Forest Classifier

An ensemble method that builds multiple decision trees and combines their outputs (via majority voting) to improve accuracy and reduce overfitting.

## Why Suitable

- Handles high-dimensional data well.
- More robust than a single decision tree.

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest with a fixed seed so the run is repeatable.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the predictions on the held-out split.
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

Random Forest Accuracy: 0.9649122807017544


# Support Vector Machine (SVM)

SVM finds the hyperplane that best separates the classes with the maximum margin. It can use different kernel functions to model complex decision boundaries.

## Why Suitable

- Effective in high-dimensional spaces.
- Performs well on binary classification problems with clear margins.

In [20]:
from sklearn.svm import SVC

# Fit a support vector classifier with its default settings.
svm = SVC().fit(X_train, y_train)

# Score the predictions on the held-out split.
y_pred_svm = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", svm_accuracy)

SVM Accuracy: 0.9736842105263158


# k-Nearest Neighbors (k-NN)

k-NN is a lazy learner that assigns class labels based on the majority vote of the k nearest training examples in the feature space.

## Why Suitable

- Simple and effective for smaller datasets.
- No explicit training, good for baseline comparison.

In [22]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-NN classifier that votes among the 5 nearest training samples.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Score the predictions on the held-out split.
y_pred_knn = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("k-NN Accuracy:", knn_accuracy)

k-NN Accuracy: 0.9473684210526315


# Model Comparison

In [24]:
# Side-by-side test accuracy for every fitted model.
# Labels are space-padded so the printed scores line up in one column.
model_predictions = [
    ("Logistic Regression Accuracy:      ", y_pred_lr),
    ("Decision Tree Accuracy:            ", y_pred_dt),
    ("Random Forest Accuracy:            ", y_pred_rf),
    ("Support Vector Machine Accuracy:   ", y_pred_svm),
    ("k-Nearest Neighbors Accuracy:      ", y_pred_knn),
]
for label, predictions in model_predictions:
    print(label, accuracy_score(y_test, predictions))

Logistic Regression Accuracy:       0.9736842105263158
Decision Tree Accuracy:             0.9473684210526315
Random Forest Accuracy:             0.9649122807017544
Support Vector Machine Accuracy:    0.9736842105263158
k-Nearest Neighbors Accuracy:       0.9473684210526315


In [None]:
# Best Performing Algorithm
# Support Vector Machine (SVM) and Logistic Regression tied for the highest accuracy (97.37% in this run).
# SVM is especially effective on small- to medium-sized datasets with clear margins between classes, which applies to the Breast Cancer dataset.

# Worst Performing Algorithm
# Decision Tree Classifier and k-NN tied for the lowest accuracy (94.74% in this run).
# For the decision tree, this could be due to overfitting, as decision trees tend to memorize the training data, especially without pruning.

# Conclusion
# SVM and Logistic Regression are the top performers (97.37%), with Random Forest close behind (96.49%); all three provide high accuracy and robustness.
# Logistic Regression's strong result suggests the classes in this dataset are close to linearly separable.
# Decision Tree and k-NN are the weakest performers here (94.74%); for the decision tree this is possibly due to overfitting or high variance.