In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [26]:
# Load scikit-learn's bundled breast cancer dataset; the returned object
# exposes .data, .target, .feature_names and .target_names.
data = load_breast_cancer()
# Show the raw feature matrix (NumPy abbreviates large arrays when printing).
print(data.data)

[[1.799e+01 1.038e+01 1.228e+02 ... 2.654e-01 4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 ... 1.860e-01 2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 ... 2.430e-01 3.613e-01 8.758e-02]
 ...
 [1.660e+01 2.808e+01 1.083e+02 ... 1.418e-01 2.218e-01 7.820e-02]
 [2.060e+01 2.933e+01 1.401e+02 ... 2.650e-01 4.087e-01 1.240e-01]
 [7.760e+00 2.454e+01 4.792e+01 ... 0.000e+00 2.871e-01 7.039e-02]]


In [None]:
# Summarise the dataset: column labels, matrix dimensions and class labels.
for label, value in (
    ("Feature names:", data.feature_names),
    ("Data shape:", data.data.shape),
    ("Target name:", data.target_names),
):
    print(label, value)

Feature names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Data shape: (569, 30)
Target name: ['malignant' 'benign']


In [27]:
# Feature matrix and label vector used for modelling.
X, y = data.data, data.target

In [15]:
# Wrap the feature matrix in a DataFrame with named columns for inspection.
df = pd.DataFrame(data.data, columns=data.feature_names)

In [25]:
# Preview the first rows of the feature table.
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [19]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [20]:
# Fraction of null values in each column (0.0 means the column has no missing values).
df.isnull().sum()/len(df)

mean radius                0.0
mean texture               0.0
mean perimeter             0.0
mean area                  0.0
mean smoothness            0.0
mean compactness           0.0
mean concavity             0.0
mean concave points        0.0
mean symmetry              0.0
mean fractal dimension     0.0
radius error               0.0
texture error              0.0
perimeter error            0.0
area error                 0.0
smoothness error           0.0
compactness error          0.0
concavity error            0.0
concave points error       0.0
symmetry error             0.0
fractal dimension error    0.0
worst radius               0.0
worst texture              0.0
worst perimeter            0.0
worst area                 0.0
worst smoothness           0.0
worst compactness          0.0
worst concavity            0.0
worst concave points       0.0
worst symmetry             0.0
worst fractal dimension    0.0
dtype: float64

In [34]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [35]:
# Scaler used to standardize the features before model fitting.
scaler = StandardScaler()

In [43]:
# Learn the scaling statistics from the training split and apply them to it.
scaled_X_train = scaler.fit_transform(X_train)

In [44]:
# Apply the already-fitted scaler to the test split; it is not refitted here,
# so no test-set statistics enter the preprocessing.
scaled_X_test = scaler.transform(X_test)

In [39]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [41]:
# define function for executing the model

def build_model(model,model_name,x_train,y_train,x_test,y_test):
    """Fit ``model`` on the training data and print its metrics on the test data.

    Parameters
    ----------
    model : object exposing ``fit(x, y)`` and ``predict(x)``; it is fitted in place.
    model_name : label printed in the report header.
    x_train, y_train : data passed to ``model.fit``.
    x_test : features passed to ``model.predict``.
    y_test : true labels the predictions are scored against.

    Returns
    -------
    None. The report (accuracy, classification report, confusion matrix)
    is written to stdout.
    """
    print("model_name : ", model_name, 'model')

    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    print(model)
    print("-------------------------------------")

    # Each metric is called as metric(y_true, y_pred) and printed under its heading.
    report_sections = (
        ("Accuracy:", accuracy_score),
        ("Classification Report:\n", classification_report),
        ("Confusion Matrix:\n", confusion_matrix),
    )
    for heading, metric in report_sections:
        print(heading, metric(y_test, predictions))
    

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

- Logistic Regression is a linear classification algorithm that models the probability of an instance belonging to a particular class using the sigmoid function.
 
- It calculates a linear combination of input features and applies the sigmoid to output a probability between 0 and 1.

Suitable for :
- Suitable when the relationship between the features and the target is approximately linear.

- Fast, interpretable, and works well for binary or multiclass classification.

- Performs well when features are scaled and dataset is not highly complex.

In [None]:
# Logistic regression with default settings, trained and scored on the standardized features.
lr_model = LogisticRegression()
build_model(
    model=lr_model,
    model_name="Logistic Regression",
    x_train=scaled_X_train,
    y_train=y_train,
    x_test=scaled_X_test,
    y_test=y_test,
)

model_name :  Logistic Regression model
LogisticRegression()
-------------------------------------
Accuracy: 0.9736842105263158
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

Confusion Matrix:
 [[41  2]
 [ 1 70]]


In [None]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

- Splits data recursively based on feature values that maximize information gain or minimize impurity (e.g. Gini, entropy).

- Creates a tree-like model where each node represents a feature split, and leaves represent class labels.

Suitable for :
- Good for capturing non-linear relationships and feature interactions.

- Easy to interpret and visualize.

- Can overfit on noisy data if not pruned properly.

In [None]:
# Seed the tree: features are randomly permuted at each split, so when several
# candidate splits are equally good an unseeded tree can differ between runs.
# 42 matches the seed used for the train/test split.
dt_model = DecisionTreeClassifier(random_state=42)
build_model(dt_model,"Decision Tree Classifier",scaled_X_train,y_train,scaled_X_test,y_test)

model_name :  Decision Tree Classifier model
DecisionTreeClassifier()
-------------------------------------
Accuracy: 0.9385964912280702
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.93      0.92        43
           1       0.96      0.94      0.95        71

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.93       114
weighted avg       0.94      0.94      0.94       114

Confusion Matrix:
 [[40  3]
 [ 4 67]]


In [48]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

- An ensemble of decision trees, each trained on random subsets of data and features (bagging + feature randomness).

- Outputs the majority class prediction across all trees.

Suitable for :
- Reduces overfitting compared to single decision trees.

- Handles non-linear relationships and high dimensionality effectively; missing values are only supported natively in recent scikit-learn versions, so impute them first otherwise.

- Suitable for most classification problems due to its robustness and generalization.

In [49]:
# Seed the forest: bootstrap sampling and per-split feature subsampling are
# random, so an unseeded model gives different metrics on every run.
# 42 matches the seed used for the train/test split.
rfc = RandomForestClassifier(random_state=42)
build_model(rfc,"Random Forest Classifier",scaled_X_train,y_train,scaled_X_test,y_test)

model_name :  Random Forest Classifier model
RandomForestClassifier()
-------------------------------------
Accuracy: 0.9649122807017544
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Confusion Matrix:
 [[40  3]
 [ 1 70]]


In [50]:
# Support Vector Machine (SVM)
from sklearn.svm import SVC

- Finds the optimal hyperplane that maximizes the margin between classes.

- Uses kernels (linear, polynomial, RBF) to handle linear or non-linear data by transforming it into higher dimensions.

Suitable for :
- Effective in high-dimensional spaces and with clear margin separation.

- Good for small to medium-sized datasets where the number of features is large.

- Requires careful parameter tuning and scaling of features.

In [52]:
# Support vector classifier with default settings, trained and scored on the standardized features.
svc_model = SVC()
build_model(
    model=svc_model,
    model_name="Support Vector Machine",
    x_train=scaled_X_train,
    y_train=y_train,
    x_test=scaled_X_test,
    y_test=y_test,
)

model_name :  Support Vector Machine model
SVC()
-------------------------------------
Accuracy: 0.9824561403508771
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.98        43
           1       0.97      1.00      0.99        71

    accuracy                           0.98       114
   macro avg       0.99      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114

Confusion Matrix:
 [[41  2]
 [ 0 71]]


In [53]:
# k-Nearest Neighbors (k-NN)
from sklearn.neighbors import KNeighborsClassifier

- Instance-based learning (lazy learning) where no explicit model is built.

- Classifies a data point based on the majority label among its k nearest neighbors using a distance metric (e.g. Euclidean).

Suitable for :
- Simple and effective for small datasets with clear clusters.

- Computationally expensive for large datasets as it stores all data points.

- Sensitive to feature scaling and irrelevant features.

In [None]:
# k-NN classifier voting over the 4 nearest training points.
# NOTE(review): an even k allows tied votes in a two-class problem — confirm
# that 4 is intentional or consider an odd value.
knn_model = KNeighborsClassifier(n_neighbors=4)
build_model(
    model=knn_model,
    model_name="K-Nearest Neighbors",
    x_train=scaled_X_train,
    y_train=y_train,
    x_test=scaled_X_test,
    y_test=y_test,
)

model_name :  K-Nearest Neighbors model
KNeighborsClassifier(n_neighbors=4)
-------------------------------------
Accuracy: 0.956140350877193
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.95      0.94        43
           1       0.97      0.96      0.96        71

    accuracy                           0.96       114
   macro avg       0.95      0.96      0.95       114
weighted avg       0.96      0.96      0.96       114

Confusion Matrix:
 [[41  2]
 [ 3 68]]


Best Model Performance Report
- Model Name: Support Vector Machine (SVM)
- Algorithm: sklearn.svm.SVC()

1. Overall Accuracy
Accuracy: 0.9825 (98.25%)


2. Classification Report

    Macro Average:
    Precision: 0.99 | Recall: 0.98 | F1-Score: 0.98

    Weighted Average:
    Precision: 0.98 | Recall: 0.98 | F1-Score: 0.98

- Precision: Very high for both classes, indicating few false positives.
- Recall: Perfect (1.00) for class 1 (benign), meaning no benign sample was misclassified; slightly lower (0.95) for class 0 (malignant), i.e. 2 of the 43 malignant samples were predicted as benign.
- F1-Score: Overall excellent balance between precision and recall for both classes.

3. Confusion Matrix
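- False Positives (class 0 / malignant predicted as class 1 / benign): 2. Here class 1 (benign) is treated as the positive class; in clinical terms these are two missed malignant cases.
- False Negatives (class 1 / benign predicted as class 0 / malignant): 0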