In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Loading the breast cancer dataset
data = load_breast_cancer()

In [3]:
# Converting the dataset into a DataFrame
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Displaying first few rows of the dataset
print(df.head())

# Checking for missing values
print(df.isnull().sum())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [7]:
# Preprocessing: Handling missing values (in this case, there are no missing values)
# Feature Scaling (important for SVM, k-NN, and Logistic Regression)
scaler = StandardScaler()
X = df.drop('target', axis=1)
y = df['target']

In [9]:
# Applying Standard Scaler to features
X_scaled = scaler.fit_transform(X)

# Spliting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
Explanation of Preprocessing:

Missing values: The dataset does not have any missing values, since isnull().sum() has shown zeros. 
Feature Scaling: We used StandardScaler to scale the features. This is important for algorithms like Logistic Regression, SVM, and k-NN since these algorithms are sensitive to the scale of the data. Random Forest and Decision Tree are not sensitive to feature scaling, but it’s still a good practice to scale all features to a common range.

In [None]:
2. Classification Algorithm Implementation 

a) Logistic Regression
Logistic Regression is a linear model used for binary classification. It outputs, probabilities using the logistic function. It works well when the data is linearly separable.

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print(f"Logistic Regression Accuracy: {accuracy_log_reg:.4f}")

Logistic Regression Accuracy: 0.9737


In [None]:
b) Decision Tree Classifier
A Decision Tree recursively splits the data into smaller subsets based on the most significant feature. It works well for both classification and regression problems.

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {accuracy_dt:.4f}")


Decision Tree Accuracy: 0.9474


In [None]:
c) Random Forest Classifier
Random Forest is an ensemble method that builds multiple decision trees and merges their results to improve classification accuracy. It is robust and works well on a variety of datasets.

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.4f}")


Random Forest Accuracy: 0.9649


In [None]:
d) Support Vector Machine (SVM)
 SVM is a powerful classification algorithm that finds the hyperplane that best separates the data. It works well with high-dimensional data and is especially good when the data is not linearly separable.

In [21]:
from sklearn.svm import SVC

# Support Vector Machine (SVM)
svm_classifier = SVC(random_state=42)
svm_classifier.fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.4f}")


SVM Accuracy: 0.9737


In [None]:
e) k-Nearest Neighbors (k-NN)
k-NN is a simple algorithm that classifies data based on the majority class among the k-nearest neighbors. It’s a non-parametric algorithm.

In [23]:
from sklearn.neighbors import KNeighborsClassifier

# k-Nearest Neighbors (k-NN)
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)
y_pred_knn = knn_classifier.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"k-NN Accuracy: {accuracy_knn:.4f}")


k-NN Accuracy: 0.9474


In [None]:
3. Model Comparison 

In [25]:
# Collecting the accuracies of each model
accuracies = {
    'Logistic Regression': accuracy_log_reg,
    'Decision Tree': accuracy_dt,
    'Random Forest': accuracy_rf,
    'SVM': accuracy_svm,
    'k-NN': accuracy_knn
}

# Sorting the accuracies
sorted_accuracies = sorted(accuracies.items(), key=lambda x: x[1], reverse=True)

# Displaying the results
for model, accuracy in sorted_accuracies:
    print(f"{model}: {accuracy:.4f}")

# Determine the best and worst models
best_model = sorted_accuracies[0]
worst_model = sorted_accuracies[-1]

print(f"\nBest Model: {best_model[0]} with accuracy {best_model[1]:.4f}")
print(f"Worst Model: {worst_model[0]} with accuracy {worst_model[1]:.4f}")


Logistic Regression: 0.9737
SVM: 0.9737
Random Forest: 0.9649
Decision Tree: 0.9474
k-NN: 0.9474

Best Model: Logistic Regression with accuracy 0.9737
Worst Model: k-NN with accuracy 0.9474


In [None]:
Summary of the Steps:
Data Loading and Preprocessing: Loaded the dataset, checked for missing values, scaled the features, and split the data into training and testing sets.
Classification Algorithms: Implemented five classification models (Logistic Regression, Decision Tree, Random Forest, SVM, and k-NN) and evaluated their performance using accuracy.
Model Comparison: Compared the models' performances based on accuracy and identified the best and worst models.