In [1]:
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd

# Define column names as per UCI dataset description
column_names = [
    "ID", "Diagnosis", "Radius_mean", "Texture_mean", "Perimeter_mean", "Area_mean",
    "Smoothness_mean", "Compactness_mean", "Concavity_mean", "Concave_points_mean",
    "Symmetry_mean", "Fractal_dimension_mean", "Radius_se", "Texture_se", "Perimeter_se",
    "Area_se", "Smoothness_se", "Compactness_se", "Concavity_se", "Concave_points_se",
    "Symmetry_se", "Fractal_dimension_se", "Radius_worst", "Texture_worst", "Perimeter_worst",
    "Area_worst", "Smoothness_worst", "Compactness_worst", "Concavity_worst",
    "Concave_points_worst", "Symmetry_worst", "Fractal_dimension_worst"
]

# Load the dataset
df = pd.read_csv("wdbc.data", header=None, names=column_names)

# Save to CSV with column names
df.to_csv("breast_cancer_diagnostic.csv", index=False)

# Display the first few rows
print(df.head())

         ID Diagnosis  Radius_mean  Texture_mean  Perimeter_mean  Area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   Smoothness_mean  Compactness_mean  Concavity_mean  Concave_points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  Radius_worst  Texture_worst  Perimeter_worst  Area_wor

# mRMR Feature Selection with Grid Search

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from mrmr import mrmr_classif
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

# ✅ Custom Transformer for mRMR Feature Selection
class MRMRFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, K=10):
        self.K = K
        self.selected_features = None

    def fit(self, X, y):
        if not isinstance(X, pd.DataFrame):  # Ensure X is a DataFrame
            raise ValueError("X must be a Pandas DataFrame for mRMR")
        self.selected_features = mrmr_classif(X=X, y=y, K=self.K)
        return self

    def transform(self, X):
        return X[self.selected_features]

# ✅ Load Dataset
X = df.iloc[:, 2:36]  # Features
Y = df.iloc[:, 1]  # Target Variable

# ✅ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# ✅ Define Models
models = [
    ('LR', LogisticRegression(max_iter=1000)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC())
]

cv_folds = 5  # Use 5-fold cross-validation
best_overall_score = 0
best_K = None
best_model_name = None

# ✅ Iterate over K values from 5 to 27
for K in range(5, 28):
    print(f"\nEvaluating K={K}...\n")
    
    # Feature Selection
    selector = MRMRFeatureSelector(K=K)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Standardize After Feature Selection
    sc = StandardScaler()
    X_train_selected = sc.fit_transform(X_train_selected)
    X_test_selected = sc.transform(X_test_selected)

    for name, model in models:
        # Perform Cross-Validation
        cv_scores = cross_val_score(model, X_train_selected, y_train, cv=cv_folds, scoring='accuracy')
        mean_accuracy = np.mean(cv_scores)

        # Train on full training set
        model.fit(X_train_selected, y_train)
        y_pred = model.predict(X_test_selected)
        test_accuracy = accuracy_score(y_test, y_pred)

        print(f"K={K}, {name}: CV Accuracy = {mean_accuracy:.4f}, Test Accuracy = {test_accuracy:.4f}")

        # ✅ Track best-performing model
        if mean_accuracy > best_overall_score:
            best_overall_score = mean_accuracy
            best_K = K
            best_model_name = name

# ✅ Final Results
print(f"\n🎯 Best Model: {best_model_name} with K={best_K}")
print(f"📈 Best Cross-Validation Accuracy: {best_overall_score:.4f}")



Evaluating K=5...



100%|█████████████████████████████████████████████| 5/5 [00:00<00:00, 79.83it/s]

K=5, LR: CV Accuracy = 0.9385, Test Accuracy = 0.9737
K=5, LDA: CV Accuracy = 0.9473, Test Accuracy = 0.9474
K=5, KNN: CV Accuracy = 0.9451, Test Accuracy = 0.9649
K=5, CART: CV Accuracy = 0.9451, Test Accuracy = 0.9298





K=5, NB: CV Accuracy = 0.9429, Test Accuracy = 0.9474
K=5, SVM: CV Accuracy = 0.9495, Test Accuracy = 0.9561

Evaluating K=6...



100%|█████████████████████████████████████████████| 6/6 [00:00<00:00, 78.25it/s]

K=6, LR: CV Accuracy = 0.9429, Test Accuracy = 0.9737
K=6, LDA: CV Accuracy = 0.9451, Test Accuracy = 0.9737





K=6, KNN: CV Accuracy = 0.9429, Test Accuracy = 0.9649
K=6, CART: CV Accuracy = 0.9385, Test Accuracy = 0.9386
K=6, NB: CV Accuracy = 0.9429, Test Accuracy = 0.9474
K=6, SVM: CV Accuracy = 0.9473, Test Accuracy = 0.9561

Evaluating K=7...



100%|█████████████████████████████████████████████| 7/7 [00:00<00:00, 71.59it/s]


K=7, LR: CV Accuracy = 0.9473, Test Accuracy = 0.9737
K=7, LDA: CV Accuracy = 0.9451, Test Accuracy = 0.9649
K=7, KNN: CV Accuracy = 0.9451, Test Accuracy = 0.9649
K=7, CART: CV Accuracy = 0.9341, Test Accuracy = 0.9298
K=7, NB: CV Accuracy = 0.9407, Test Accuracy = 0.9561
K=7, SVM: CV Accuracy = 0.9473, Test Accuracy = 0.9649

Evaluating K=8...



100%|█████████████████████████████████████████████| 8/8 [00:00<00:00, 70.96it/s]


K=8, LR: CV Accuracy = 0.9473, Test Accuracy = 0.9737
K=8, LDA: CV Accuracy = 0.9363, Test Accuracy = 0.9649
K=8, KNN: CV Accuracy = 0.9407, Test Accuracy = 0.9649
K=8, CART: CV Accuracy = 0.9275, Test Accuracy = 0.9386
K=8, NB: CV Accuracy = 0.9363, Test Accuracy = 0.9737
K=8, SVM: CV Accuracy = 0.9407, Test Accuracy = 0.9649

Evaluating K=9...



100%|█████████████████████████████████████████████| 9/9 [00:00<00:00, 68.57it/s]


K=9, LR: CV Accuracy = 0.9495, Test Accuracy = 0.9737
K=9, LDA: CV Accuracy = 0.9341, Test Accuracy = 0.9649
K=9, KNN: CV Accuracy = 0.9319, Test Accuracy = 0.9737
K=9, CART: CV Accuracy = 0.9407, Test Accuracy = 0.9123
K=9, NB: CV Accuracy = 0.9385, Test Accuracy = 0.9649
K=9, SVM: CV Accuracy = 0.9407, Test Accuracy = 0.9649

Evaluating K=10...



100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 65.66it/s]


K=10, LR: CV Accuracy = 0.9495, Test Accuracy = 0.9737
K=10, LDA: CV Accuracy = 0.9363, Test Accuracy = 0.9561
K=10, KNN: CV Accuracy = 0.9297, Test Accuracy = 0.9737
K=10, CART: CV Accuracy = 0.9297, Test Accuracy = 0.9298
K=10, NB: CV Accuracy = 0.9341, Test Accuracy = 0.9737
K=10, SVM: CV Accuracy = 0.9407, Test Accuracy = 0.9649

Evaluating K=11...



100%|███████████████████████████████████████████| 11/11 [00:00<00:00, 67.90it/s]


K=11, LR: CV Accuracy = 0.9473, Test Accuracy = 0.9737
K=11, LDA: CV Accuracy = 0.9363, Test Accuracy = 0.9561
K=11, KNN: CV Accuracy = 0.9363, Test Accuracy = 0.9737
K=11, CART: CV Accuracy = 0.9275, Test Accuracy = 0.9123
K=11, NB: CV Accuracy = 0.9297, Test Accuracy = 0.9649
K=11, SVM: CV Accuracy = 0.9385, Test Accuracy = 0.9561

Evaluating K=12...



100%|███████████████████████████████████████████| 12/12 [00:00<00:00, 58.32it/s]


K=12, LR: CV Accuracy = 0.9736, Test Accuracy = 0.9737
K=12, LDA: CV Accuracy = 0.9516, Test Accuracy = 0.9474
K=12, KNN: CV Accuracy = 0.9538, Test Accuracy = 0.9649
K=12, CART: CV Accuracy = 0.9385, Test Accuracy = 0.9211
K=12, NB: CV Accuracy = 0.9319, Test Accuracy = 0.9649
K=12, SVM: CV Accuracy = 0.9758, Test Accuracy = 0.9649

Evaluating K=13...



100%|███████████████████████████████████████████| 13/13 [00:00<00:00, 65.85it/s]


K=13, LR: CV Accuracy = 0.9736, Test Accuracy = 0.9737
K=13, LDA: CV Accuracy = 0.9495, Test Accuracy = 0.9474
K=13, KNN: CV Accuracy = 0.9582, Test Accuracy = 0.9561
K=13, CART: CV Accuracy = 0.9385, Test Accuracy = 0.9298
K=13, NB: CV Accuracy = 0.9319, Test Accuracy = 0.9649
K=13, SVM: CV Accuracy = 0.9736, Test Accuracy = 0.9649

Evaluating K=14...



100%|███████████████████████████████████████████| 14/14 [00:00<00:00, 65.83it/s]


K=14, LR: CV Accuracy = 0.9736, Test Accuracy = 0.9737
K=14, LDA: CV Accuracy = 0.9495, Test Accuracy = 0.9649
K=14, KNN: CV Accuracy = 0.9560, Test Accuracy = 0.9649
K=14, CART: CV Accuracy = 0.9385, Test Accuracy = 0.9298
K=14, NB: CV Accuracy = 0.9341, Test Accuracy = 0.9649
K=14, SVM: CV Accuracy = 0.9692, Test Accuracy = 0.9649

Evaluating K=15...



100%|███████████████████████████████████████████| 15/15 [00:00<00:00, 68.11it/s]


K=15, LR: CV Accuracy = 0.9780, Test Accuracy = 0.9737
K=15, LDA: CV Accuracy = 0.9538, Test Accuracy = 0.9561
K=15, KNN: CV Accuracy = 0.9626, Test Accuracy = 0.9649
K=15, CART: CV Accuracy = 0.9385, Test Accuracy = 0.9298
K=15, NB: CV Accuracy = 0.9341, Test Accuracy = 0.9649
K=15, SVM: CV Accuracy = 0.9670, Test Accuracy = 0.9561

Evaluating K=16...



100%|███████████████████████████████████████████| 16/16 [00:00<00:00, 68.30it/s]


K=16, LR: CV Accuracy = 0.9758, Test Accuracy = 0.9737
K=16, LDA: CV Accuracy = 0.9538, Test Accuracy = 0.9561
K=16, KNN: CV Accuracy = 0.9648, Test Accuracy = 0.9649
K=16, CART: CV Accuracy = 0.9407, Test Accuracy = 0.9298
K=16, NB: CV Accuracy = 0.9231, Test Accuracy = 0.9649
K=16, SVM: CV Accuracy = 0.9692, Test Accuracy = 0.9561

Evaluating K=17...



100%|███████████████████████████████████████████| 17/17 [00:00<00:00, 66.26it/s]


K=17, LR: CV Accuracy = 0.9758, Test Accuracy = 0.9825
K=17, LDA: CV Accuracy = 0.9516, Test Accuracy = 0.9561
K=17, KNN: CV Accuracy = 0.9648, Test Accuracy = 0.9649
K=17, CART: CV Accuracy = 0.9385, Test Accuracy = 0.9035
K=17, NB: CV Accuracy = 0.9253, Test Accuracy = 0.9649
K=17, SVM: CV Accuracy = 0.9736, Test Accuracy = 0.9649

Evaluating K=18...



100%|███████████████████████████████████████████| 18/18 [00:00<00:00, 71.53it/s]


K=18, LR: CV Accuracy = 0.9758, Test Accuracy = 0.9825
K=18, LDA: CV Accuracy = 0.9516, Test Accuracy = 0.9561
K=18, KNN: CV Accuracy = 0.9626, Test Accuracy = 0.9649
K=18, CART: CV Accuracy = 0.9297, Test Accuracy = 0.9035
K=18, NB: CV Accuracy = 0.9319, Test Accuracy = 0.9649
K=18, SVM: CV Accuracy = 0.9780, Test Accuracy = 0.9649

Evaluating K=19...



100%|███████████████████████████████████████████| 19/19 [00:00<00:00, 69.07it/s]


K=19, LR: CV Accuracy = 0.9758, Test Accuracy = 0.9825
K=19, LDA: CV Accuracy = 0.9516, Test Accuracy = 0.9561
K=19, KNN: CV Accuracy = 0.9626, Test Accuracy = 0.9649
K=19, CART: CV Accuracy = 0.9297, Test Accuracy = 0.9123
K=19, NB: CV Accuracy = 0.9297, Test Accuracy = 0.9649
K=19, SVM: CV Accuracy = 0.9780, Test Accuracy = 0.9649

Evaluating K=20...



100%|███████████████████████████████████████████| 20/20 [00:00<00:00, 65.36it/s]


K=20, LR: CV Accuracy = 0.9736, Test Accuracy = 0.9825
K=20, LDA: CV Accuracy = 0.9538, Test Accuracy = 0.9561
K=20, KNN: CV Accuracy = 0.9582, Test Accuracy = 0.9649
K=20, CART: CV Accuracy = 0.9253, Test Accuracy = 0.9123
K=20, NB: CV Accuracy = 0.9341, Test Accuracy = 0.9649
K=20, SVM: CV Accuracy = 0.9758, Test Accuracy = 0.9649

Evaluating K=21...



100%|███████████████████████████████████████████| 21/21 [00:00<00:00, 69.14it/s]


K=21, LR: CV Accuracy = 0.9714, Test Accuracy = 0.9825
K=21, LDA: CV Accuracy = 0.9495, Test Accuracy = 0.9474
K=21, KNN: CV Accuracy = 0.9538, Test Accuracy = 0.9649
K=21, CART: CV Accuracy = 0.9319, Test Accuracy = 0.9211
K=21, NB: CV Accuracy = 0.9341, Test Accuracy = 0.9649
K=21, SVM: CV Accuracy = 0.9802, Test Accuracy = 0.9737

Evaluating K=22...



100%|███████████████████████████████████████████| 22/22 [00:00<00:00, 68.55it/s]


K=22, LR: CV Accuracy = 0.9736, Test Accuracy = 0.9737
K=22, LDA: CV Accuracy = 0.9516, Test Accuracy = 0.9474
K=22, KNN: CV Accuracy = 0.9604, Test Accuracy = 0.9649
K=22, CART: CV Accuracy = 0.9341, Test Accuracy = 0.9123
K=22, NB: CV Accuracy = 0.9341, Test Accuracy = 0.9649
K=22, SVM: CV Accuracy = 0.9780, Test Accuracy = 0.9737

Evaluating K=23...



100%|███████████████████████████████████████████| 23/23 [00:00<00:00, 59.87it/s]


K=23, LR: CV Accuracy = 0.9714, Test Accuracy = 0.9737
K=23, LDA: CV Accuracy = 0.9516, Test Accuracy = 0.9561
K=23, KNN: CV Accuracy = 0.9582, Test Accuracy = 0.9649
K=23, CART: CV Accuracy = 0.9297, Test Accuracy = 0.9211
K=23, NB: CV Accuracy = 0.9275, Test Accuracy = 0.9649
K=23, SVM: CV Accuracy = 0.9758, Test Accuracy = 0.9737

Evaluating K=24...



100%|███████████████████████████████████████████| 24/24 [00:00<00:00, 46.84it/s]


K=24, LR: CV Accuracy = 0.9736, Test Accuracy = 0.9737
K=24, LDA: CV Accuracy = 0.9538, Test Accuracy = 0.9561
K=24, KNN: CV Accuracy = 0.9538, Test Accuracy = 0.9649
K=24, CART: CV Accuracy = 0.9385, Test Accuracy = 0.9211
K=24, NB: CV Accuracy = 0.9253, Test Accuracy = 0.9649
K=24, SVM: CV Accuracy = 0.9758, Test Accuracy = 0.9737

Evaluating K=25...



100%|███████████████████████████████████████████| 25/25 [00:00<00:00, 33.03it/s]


K=25, LR: CV Accuracy = 0.9758, Test Accuracy = 0.9737
K=25, LDA: CV Accuracy = 0.9495, Test Accuracy = 0.9561
K=25, KNN: CV Accuracy = 0.9560, Test Accuracy = 0.9649
K=25, CART: CV Accuracy = 0.9275, Test Accuracy = 0.9211
K=25, NB: CV Accuracy = 0.9253, Test Accuracy = 0.9649
K=25, SVM: CV Accuracy = 0.9714, Test Accuracy = 0.9737

Evaluating K=26...



100%|███████████████████████████████████████████| 26/26 [00:00<00:00, 29.13it/s]


K=26, LR: CV Accuracy = 0.9758, Test Accuracy = 0.9737
K=26, LDA: CV Accuracy = 0.9516, Test Accuracy = 0.9561
K=26, KNN: CV Accuracy = 0.9582, Test Accuracy = 0.9649
K=26, CART: CV Accuracy = 0.9231, Test Accuracy = 0.9386
K=26, NB: CV Accuracy = 0.9275, Test Accuracy = 0.9649
K=26, SVM: CV Accuracy = 0.9736, Test Accuracy = 0.9649

Evaluating K=27...



100%|███████████████████████████████████████████| 27/27 [00:01<00:00, 16.98it/s]


K=27, LR: CV Accuracy = 0.9758, Test Accuracy = 0.9737
K=27, LDA: CV Accuracy = 0.9516, Test Accuracy = 0.9561
K=27, KNN: CV Accuracy = 0.9582, Test Accuracy = 0.9649
K=27, CART: CV Accuracy = 0.9187, Test Accuracy = 0.9386
K=27, NB: CV Accuracy = 0.9297, Test Accuracy = 0.9649
K=27, SVM: CV Accuracy = 0.9670, Test Accuracy = 0.9649

🎯 Best Model: SVM with K=21
📈 Best Cross-Validation Accuracy: 0.9802
