In [3]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load dataset
# Update the dataset path as necessary
dataset_path = "breast-cancer.csv"  # Replace with the correct file path
data = pd.read_csv(dataset_path)

# Encode target variable (Assuming 'diagnosis' is the target)
le = LabelEncoder()
data['diagnosis'] = le.fit_transform(data['diagnosis'])

# Separate features and target
X = data.drop(columns=['diagnosis'], axis=1)
y = data['diagnosis']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# --------------------
# Feature Selection
# --------------------

# 1. Filter Method: SelectKBest with Chi-Squared
select_k_best = SelectKBest(score_func=chi2, k=10)
X_train_kbest = select_k_best.fit_transform(np.abs(X_train), y_train)
X_test_kbest = select_k_best.transform(np.abs(X_test))

# 2. Wrapper Method: Recursive Feature Elimination (RFE)
logistic_model = LogisticRegression(max_iter=500)
rfe = RFE(estimator=logistic_model, n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

# 3. Embedded Method: Feature importance using Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
importances = rf_model.feature_importances_

# Select top 10 features based on importance
top_features = np.argsort(importances)[-10:]
X_train_embedded = X_train[:, top_features]
X_test_embedded = X_test[:, top_features]

# --------------------
# Machine Learning Model
# --------------------
# Train and evaluate models for each feature selection method

methods = {
    "Filter (SelectKBest)": (X_train_kbest, X_test_kbest),
    "Wrapper (RFE)": (X_train_rfe, X_test_rfe),
    "Embedded (Random Forest)": (X_train_embedded, X_test_embedded)
}

for method, (X_train_fs, X_test_fs) in methods.items():
    # Initialize Random Forest Classifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_fs, y_train)
    
    # Predict and calculate accuracy
    y_pred = model.predict(X_test_fs)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy using {method}: {accuracy * 100:.2f}%")


Accuracy using Filter (SelectKBest): 85.96%
Accuracy using Wrapper (RFE): 95.61%
Accuracy using Embedded (Random Forest): 95.61%
