In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score

In [21]:
data = pd.read_csv('breast_cancer_data.csv')

X = data.drop(columns=['id', 'diagnosis'])
y = data['diagnosis']

y = y.map({'M': 1, 'B': 0})

In [22]:
# Standardizing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# MinMaxScaler for non-negative features
min_max_scaler = MinMaxScaler()
X_min_max = min_max_scaler.fit_transform(X)

In [23]:
# 1.Filter Method (Chi-Square Test)
filter_selector = SelectKBest(chi2, k=10)
X_filter = filter_selector.fit_transform(X_min_max, y)

# 2.Wrapper Method (Recursive Feature Elimination)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rfe_selector = RFE(rf_model, n_features_to_select=10)
X_rfe = rfe_selector.fit_transform(X_scaled, y)

# 3.Embedded Method (Random Forest Feature Importance)
rf_model.fit(X_scaled, y)
importances = rf_model.feature_importances_
selected_features = [i for i, x in enumerate(importances) if x > 0.05]
X_embedded = X_scaled[:, selected_features]

In [24]:
# Split data for train/test
X_train_filter, X_test_filter, y_train, y_test = train_test_split(X_filter, y, test_size=0.2, random_state=42)
X_train_rfe, X_test_rfe = train_test_split(X_rfe, test_size=0.2, random_state=42)
X_train_embedded, X_test_embedded = train_test_split(X_embedded, test_size=0.2, random_state=42)

# Train Random Forest model on Filtered Features
model_filter = RandomForestClassifier(n_estimators=100, random_state=42)
model_filter.fit(X_train_filter, y_train)
y_pred_filter = model_filter.predict(X_test_filter)
accuracy_filter = accuracy_score(y_test, y_pred_filter)

# Train Random Forest model on RFE Features
model_rfe = RandomForestClassifier(n_estimators=100, random_state=42)
model_rfe.fit(X_train_rfe, y_train)
y_pred_rfe = model_rfe.predict(X_test_rfe)
accuracy_rfe = accuracy_score(y_test, y_pred_rfe)

# Train Random Forest model on Embedded Features
model_embedded = RandomForestClassifier(n_estimators=100, random_state=42)
model_embedded.fit(X_train_embedded, y_train)
y_pred_embedded = model_embedded.predict(X_test_embedded)
accuracy_embedded = accuracy_score(y_test, y_pred_embedded)

In [None]:
# Displaying all three acuracy
print(f"Accuracy with Filter Method (Chi-Square): {accuracy_filter * 100:.2f}%")
print(f"Accuracy with Wrapper Method (RFE): {accuracy_rfe * 100:.2f}%")
print(f"Accuracy with Embedded Method (Random Forest): {accuracy_embedded * 100:.2f}%")

# Display highest accuracy method
best_method = max(accuracy_filter, accuracy_rfe, accuracy_embedded)
if best_method == accuracy_filter:
    print("Filter Method (Chi-Square) provided the highest accuracy.")
elif best_method == accuracy_rfe:
    print("Wrapper Method (RFE) provided the highest accuracy.")
else:
    print("Embedded Method (Random Forest) provided the highest accuracy.")

Accuracy with Filter Method (Chi-Square): 95.61%
Accuracy with Wrapper Method (RFE): 95.61%
Accuracy with Embedded Method (Random Forest): 95.61%
Filter Method (Chi-Square) provided the highest accuracy.
