# Breast Cancer Prediction - Hyperparameter Tuning
This notebook selects features with a variance threshold and recursive feature elimination (RFE), tunes a Logistic Regression classifier using GridSearchCV, and evaluates the optimized model on held-out validation and test sets.



In [1]:
# Import appropriate libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import RFE, VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [2]:
# Load the breast cancer dataset
# Only the loader call is guarded. If it fails, the message is printed and the
# original exception is re-raised so the notebook stops here with the real
# traceback instead of failing later on an undefined `cancer`.
try:
    cancer = load_breast_cancer()
except Exception:
    print("Error loading the dataset. Please ensure you have the required libraries installed.")
    raise
else:
    print("Dataset loaded successfully.")
    print(f"Number of samples: {cancer.data.shape[0]}")
    print(f"Number of features: {cancer.data.shape[1]}")
    

Dataset loaded successfully.
Number of samples: 569
Number of features: 30


In [3]:
# Split the dataset for model training, testing and validation
# Step 1: hold out 20% of all samples as the final test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    cancer.data,
    cancer.target,
    test_size=0.2,
    random_state=42,
)
# Step 2: take 25% of the remaining 80% as the validation set, which gives an
# overall 60% / 20% / 20% train / validation / test partition.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=42,
)

In [4]:
# Variance Threshold Filtering
# The selector is fitted on the training split only; the same fitted column
# mask is then applied to the validation and test splits.
# NOTE(review): the variances are measured on the raw, unscaled features, so the
# 0.01 cut-off depends on each feature's units - confirm this is intended.
selector = VarianceThreshold(threshold=0.01)
selector.fit(X_train)
var_mask = selector.get_support()

X_train_var = selector.transform(X_train)
X_val_var = selector.transform(X_val)
X_test_var = selector.transform(X_test)

# Names of the columns that survived the filter.
selected_features = np.array(cancer.feature_names)[var_mask]
print(
    "Features kept after variance threshold filtering:",
    selected_features,
    sep="\n",
)



Features kept after variance threshold filtering:
['mean radius' 'mean texture' 'mean perimeter' 'mean area' 'radius error'
 'texture error' 'perimeter error' 'area error' 'worst radius'
 'worst texture' 'worst perimeter' 'worst area' 'worst compactness'
 'worst concavity']


In [5]:
# Standardise the variance-filtered training features before ranking them.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train_var).transform(X_train_var)

# Feature Selection using RFE
# A liblinear logistic regression is the estimator wrapped by RFE; this first
# pass keeps 10 of the variance-filtered features.
lr = LogisticRegression(solver='liblinear')
rfe = RFE(estimator=lr, n_features_to_select=10)
rfe.fit(X_train_scaled, y_train)
X_train_rfe = rfe.transform(X_train_scaled)

# The RFE mask indexes the variance-filtered columns, so the names are looked
# up through var_mask first. The score is computed on the same training data
# the selector was fitted on.
print(
    "Features selected by RFE:",
    np.array(cancer.feature_names)[var_mask][rfe.get_support()],
    "Ranking of features by RFE:",
    rfe.ranking_,
    "RFE score:",
    rfe.score(X_train_scaled, y_train),
    sep="\n",
)

Features selected by RFE:
['mean perimeter' 'mean area' 'radius error' 'texture error' 'area error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst concavity']
Ranking of features by RFE:
[3 5 1 1 1 1 4 1 1 1 1 1 2 1]
RFE score:
0.9648093841642229


In [11]:
# Re-run recursive feature elimination, this time keeping only 5 features.
# Reuses `lr` and `X_train_scaled` created in the earlier RFE cell.
rfe = RFE(estimator=lr, n_features_to_select=5)
rfe.fit(X_train_scaled, y_train)
X_train_rfe = rfe.transform(X_train_scaled)

# Save the mask for later use on val/test
rfe_mask = rfe.get_support()

# The mask is relative to the variance-filtered columns, hence the double
# indexing. The score is computed on the data the selector was fitted on.
print(
    "Features selected by RFE:",
    np.array(cancer.feature_names)[var_mask][rfe_mask],
    "Ranking of features by RFE:",
    rfe.ranking_,
    "RFE score:",
    rfe.score(X_train_scaled, y_train),
    sep="\n",
)

Features selected by RFE:
['area error' 'worst radius' 'worst texture' 'worst area'
 'worst concavity']
Ranking of features by RFE:
[ 8 10  5  6  3  4  9  1  1  1  2  1  7  1]
RFE score:
0.9648093841642229


In [13]:
# Scaling lives inside the pipeline so that, within each CV fold, the scaler is
# fitted on that fold's training portion only. This is why the grid is fed the
# unscaled, variance-filtered features below.
# random_state is fixed because the liblinear solver uses the RNG to shuffle the
# data; without it, repeated runs of this search are not guaranteed to match.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000, solver='liblinear', random_state=42))
])

# Search over the regularisation type and the inverse regularisation strength C.
param_grid = {
    'logreg__penalty': ['l1', 'l2'],
    'logreg__C': [0.01, 0.1, 1, 10, 100],
}

# NOTE(review): rfe_mask was chosen by RFE fitted on the whole training split,
# i.e. outside these CV folds, so the cross-validated accuracy may be slightly
# optimistic. Consider moving feature selection into the pipeline.
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', verbose=1)
grid.fit(X_train_var[:, rfe_mask], y_train)

print("Best params:", grid.best_params_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best params: {'logreg__C': 0.1, 'logreg__penalty': 'l1'}


In [14]:
def print_split_metrics(split_name, y_true, y_pred):
    """Print the classification report, confusion matrix and MSE for one split.

    Parameters
    ----------
    split_name : str
        Label used in the report header, e.g. "Validation" or "Test".
    y_true : array-like
        Ground-truth class labels for the split.
    y_pred : array-like
        Labels predicted by the model for the same samples.
    """
    print(f"{split_name} Report:\n", classification_report(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    # The labels are 0/1, so the mean squared difference between truth and
    # prediction equals the fraction of misclassified samples.
    print("MSE:", np.mean((y_true - y_pred) ** 2))


# Validation performance
# The same variance + RFE column selection used for training is applied here.
y_val_pred = grid.predict(X_val_var[:, rfe_mask])
print_split_metrics("Validation", y_val, y_val_pred)

# Final test performance
y_test_pred = grid.predict(X_test_var[:, rfe_mask])
print_split_metrics("Test", y_test, y_test_pred)

Validation Report:
               precision    recall  f1-score   support

           0       0.89      0.95      0.92        44
           1       0.97      0.93      0.95        70

    accuracy                           0.94       114
   macro avg       0.93      0.94      0.94       114
weighted avg       0.94      0.94      0.94       114

Confusion Matrix:
 [[42  2]
 [ 5 65]]
MSE: 0.06140350877192982
Test Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95        43
           1       0.97      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114

Confusion Matrix:
 [[41  2]
 [ 2 69]]
MSE: 0.03508771929824561
