In [3]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the breast cancer dataset that ships with scikit-learn
data = load_breast_cancer()

# Feature matrix and target vector
X, y = data.data, data.target

# Labelled DataFrame (feature columns plus a 'target' column) for easier inspection
df = pd.DataFrame(X, columns=data.feature_names).assign(target=y)


In [5]:
# Count missing values per column (features and target) and show the tally
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Feature scaling: standardize every feature column.
# NOTE: the scaler is fitted on all rows of X here, i.e. before any train/test split.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64


In [25]:
from sklearn.preprocessing import StandardScaler

# This cell previously read `X_train` / `X_test`, which are only created by a
# cell further down the notebook, so it raised NameError on a fresh
# Restart & Run All. Derive the partitions here from the raw features instead.
# The same test_size / random_state as the modelling split is used so the
# same rows end up in each partition.
X_train_raw, X_test_raw = train_test_split(X, test_size=0.3, random_state=42)

# Fit the scaler on the training rows only, then apply the learned
# mean / standard deviation to both partitions.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Display the first few rows of the scaled training and test data
X_train_scaled[:5], X_test_scaled[:5]


(array([[-0.12348985, -0.29680142, -0.17050713, -0.20861569, -1.2016799 ,
         -0.7731696 , -0.76231194, -0.93324109, -1.22994935, -0.94816603,
         -0.53359339, -0.86028757, -0.61678096, -0.39177533, -1.35556152,
         -0.52503193, -0.4817033 , -0.97940018, -0.88459317, -0.68548672,
         -0.19761978, -0.5067476 , -0.30791001, -0.27357592, -1.50742388,
         -0.44926047, -0.57223884, -0.84082156, -0.8563616 , -0.76574773],
        [-0.22826757, -0.65795149, -0.25377521, -0.2965028 , -1.80463697,
         -0.58761605, -0.09198533, -0.54268359, -1.41998468, -0.61249143,
         -0.83040055, -0.12266723, -0.78254381, -0.53126109, -0.36490698,
          0.40861926,  0.57668457, -0.2482875 , -1.03572382,  0.10768859,
         -0.42291745, -0.45849468, -0.4652873 , -0.43812681, -1.27301714,
          0.02704209,  0.31804488, -0.37706655, -1.3415819 , -0.41480748],
        [ 0.14553402, -1.23056444,  0.24583328, -0.01024193,  0.5191843 ,
          1.57000613,  0.73231958,  

### Preprocessing Steps and Justification

#### Explanation:
- **Missing Values Handling:** 
    - We checked for missing values in the dataset. Since there were no missing values, no further imputation was required.
    
- **Train-Test Split:** 
    - We divided the dataset into **70% training data** and **30% testing data** (`test_size=0.3`, `random_state=42`). 
    - This ensures that the models are trained on a sufficient amount of data while leaving enough unseen data to evaluate their performance effectively.
    
- **Feature Scaling:** 
    - We applied **StandardScaler()** to standardize the feature values (zero mean and unit variance per feature). This is necessary because:
        - Some machine learning models, such as **SVM** and **k-NN**, are sensitive to the scale of input features. Features with larger numerical values could dominate the learning process if scaling isn't applied.
        - Standardization ensures that all features contribute equally to the model, allowing for better performance.


### Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw (unscaled) features first, so that the held-out rows never
# influence the scaling statistics. Splitting an array that was already
# standardized on the full dataset leaks test-set information into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the scaler on the training partition only and reuse its statistics for
# the test partition. Later cells keep using X_train / X_test, which are now
# the standardized arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the Logistic Regression model
# (max_iter raised well above the default to give the solver room to converge)
log_reg = LogisticRegression(max_iter=10000)
log_reg.fit(X_train, y_train)

# Make predictions and calculate accuracy on the held-out test partition
y_pred_log = log_reg.predict(X_test)
log_reg_accuracy = log_reg.score(X_test, y_test)

# Output the accuracy
print(f"Logistic Regression Accuracy: {log_reg_accuracy}")



Logistic Regression Accuracy: 0.9824561403508771


**How it works**: 
Logistic Regression is a linear model used for binary classification. It predicts the probability of the dependent variable belonging to a particular class (0 or 1). The logistic function (sigmoid function) is used to model the relationship between the input features and the binary target variable. It outputs values between 0 and 1, which can be interpreted as probabilities.

**Why it is suitable for this dataset**:
- The breast cancer dataset is a binary classification problem (malignant or benign).
- Logistic Regression is suitable when the log-odds of the target are approximately a linear function of the input features (i.e., the classes are close to linearly separable), which makes it an appropriate choice for this dataset.
- It is computationally efficient, easy to interpret, and provides probability scores for classification.


### Decision Tree Classifier Implementation

In [13]:
from sklearn.tree import DecisionTreeClassifier

# Build the Decision Tree with a fixed seed and fit it on the training split
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Held-out accuracy, plus the per-sample predictions for the test split
dt_accuracy = dt.score(X_test, y_test)
y_pred_dt = dt.predict(X_test)

# Report the test-set accuracy
print(f"Decision Tree Accuracy: {dt_accuracy}")


Decision Tree Accuracy: 0.9415204678362573


**How it works**:  
A Decision Tree is a tree-like model in which each internal node tests a feature (or attribute), each branch represents the outcome of that test (a decision rule), and each leaf represents the predicted outcome. The tree is built by recursively splitting the data on the feature values that most reduce impurity, as measured by a criterion such as Gini impurity or entropy (i.e., maximizing information gain).

**Why it is suitable for this dataset**:

- Decision Trees can capture non-linear relationships between features, which makes them suitable for datasets where relationships are more complex.
- It is interpretable and easy to visualize, making it easier to understand the model's decision-making process.
- It works well with both numerical and categorical data.


### Random Forest Classifier Implementation

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Build the Random Forest with a fixed seed and fit it on the training split
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Held-out accuracy, plus the per-sample predictions for the test split
rf_accuracy = rf.score(X_test, y_test)
y_pred_rf = rf.predict(X_test)

# Report the test-set accuracy
print(f"Random Forest Accuracy: {rf_accuracy}")


Random Forest Accuracy: 0.9707602339181286


**How it works**:  
A Random Forest is an ensemble of decision trees where each tree is built using a random subset of the features and samples. The final prediction is made by averaging the predictions of all individual trees (for regression) or using the majority vote (for classification).

**Why it is suitable for this dataset**:

- Random Forest handles high-dimensional data (many features) and works well with large datasets, making it suitable for complex datasets like breast cancer data.
- It reduces overfitting by averaging the results of multiple trees, improving generalization.
- Random Forest can handle both classification and regression problems and is less sensitive to noise compared to a single decision tree.


### Support Vector Machine (SVM) Implementation

In [19]:
from sklearn.svm import SVC

# Build the Support Vector Classifier with a fixed seed and fit it on the training split
svm = SVC(random_state=42).fit(X_train, y_train)

# Held-out accuracy, plus the per-sample predictions for the test split
svm_accuracy = svm.score(X_test, y_test)
y_pred_svm = svm.predict(X_test)

# Report the test-set accuracy
print(f"SVM Accuracy: {svm_accuracy}")


SVM Accuracy: 0.9707602339181286


**How it works**:  
Support Vector Machine (SVM) is a supervised machine learning algorithm that finds the hyperplane that best separates the data into two classes. SVM works by maximizing the margin between the classes and finding the optimal hyperplane that minimizes classification errors. SVM can also be extended to multi-class problems and supports both linear and non-linear decision boundaries through the kernel trick.

**Why it is suitable for this dataset**:

- SVM is effective for high-dimensional spaces, making it well-suited for datasets with many features, like the breast cancer dataset.
- It is powerful for binary classification tasks where a clear margin of separation exists between the classes.
- By using kernels, SVM can model non-linear decision boundaries if necessary.

### k-Nearest Neighbors (k-NN) Implementation

In [21]:
from sklearn.neighbors import KNeighborsClassifier

# Build the k-NN classifier with library defaults and fit it on the training split
knn = KNeighborsClassifier().fit(X_train, y_train)

# Held-out accuracy, plus the per-sample predictions for the test split
knn_accuracy = knn.score(X_test, y_test)
y_pred_knn = knn.predict(X_test)

# Report the test-set accuracy
print(f"k-NN Accuracy: {knn_accuracy}")


k-NN Accuracy: 0.9590643274853801


**How it works**:  
k-Nearest Neighbors (k-NN) is a simple, instance-based learning algorithm. It classifies a data point based on the majority class among its `k` nearest neighbors in the feature space. It doesn't learn a model explicitly but rather memorizes the training data and makes predictions based on similarity (distance metric) to nearby data points.

**Why it is suitable for this dataset**:

- k-NN works well when the dataset has clear clusters, and the decision boundary is not necessarily linear.
- It is easy to implement and classifies each point based on its distances to the training examples.
- It doesn't make strong assumptions about the underlying distribution of data, making it a flexible classifier.


In [27]:
# Gather each model's test accuracy under its display name
model_accuracies = dict(
    zip(
        ("Logistic Regression", "Decision Tree", "Random Forest", "SVM", "k-NN"),
        (log_reg_accuracy, dt_accuracy, rf_accuracy, svm_accuracy, knn_accuracy),
    )
)

# Emit the header followed by one "<model>: <accuracy>" line per entry
report_lines = [f"{name}: {score}" for name, score in model_accuracies.items()]
print("\nModel Comparison:")
print("\n".join(report_lines))



Model Comparison:
Logistic Regression: 0.9824561403508771
Decision Tree: 0.9415204678362573
Random Forest: 0.9707602339181286
SVM: 0.9707602339181286
k-NN: 0.9590643274853801
