In [19]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from imblearn.over_sampling import SMOTE   # NEW

In [20]:

# ============================================================
# 1. LOAD DATA
# ============================================================

df = pd.read_csv('BreastCancer.csv')

# Target variable (binary)
y = df['diagnosis'].map({'yes': 1, 'no': 0})
X = df.drop(columns=['diagnosis'])


In [21]:
# Identify categorical and numeric columns
categorical = X.select_dtypes(include=['object']).columns
numeric = X.select_dtypes(exclude=['object']).columns


In [22]:

# One-hot encode categoricals
preprocess = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), categorical),
    ('num', 'passthrough', numeric)
])

X_enc = preprocess.fit_transform(X)


In [23]:
# Extract feature names
feature_names = (
    preprocess.named_transformers_['cat']
    .get_feature_names_out(categorical).tolist()
    + numeric.tolist()
)

In [27]:
# ============================================================
# 2. BALANCE THE DATASET (SMOTE)
# ============================================================

sm = SMOTE(random_state=42,k_neighbors=4)
X_bal, y_bal = sm.fit_resample(X_enc, y)

print("\nDataset balanced using SMOTE:")
print("Original:", np.bincount(y))
print("Balanced:", np.bincount(y_bal))


Dataset balanced using SMOTE:
Original: [ 5 14]
Balanced: [14 14]


In [28]:
# ============================================================
# 3. TRAIN-TEST SPLIT (AFTER BALANCING)
# ============================================================

X_train, X_test, y_train, y_test = train_test_split(
    X_bal, y_bal, test_size=0.3, random_state=42
)

In [29]:
# ============================================================
# 4. FILTER METHOD — CHI-SQUARE (Requires Scaling)
# ============================================================

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

k = 5
chi2_selector = SelectKBest(chi2, k=k)
chi2_selector.fit(X_train_scaled, y_train)

chi2_features = [feature_names[i] for i in chi2_selector.get_support(indices=True)]

print("\nCHI-SQUARE SELECTED FEATURES:")
print(chi2_features)


CHI-SQUARE SELECTED FEATURES:
['gender_M', 'Excerise level_low', 'familyhistory_yes', 'cholsterol', 'BMI']


In [30]:
# ============================================================
# 5. WRAPPER METHODS — FORWARD & BACKWARD
# ============================================================

logreg = LogisticRegression(max_iter=1000)

# Forward Selection
fwd_selector = SequentialFeatureSelector(
    logreg, n_features_to_select=5, direction='forward'
)
fwd_selector.fit(X_train, y_train)
fwd_features = [feature_names[i] for i in fwd_selector.get_support(indices=True)]

print("\nFORWARD SELECTION FEATURES:")
print(fwd_features)

# Backward Selection
bwd_selector = SequentialFeatureSelector(
    logreg, n_features_to_select=5, direction='backward'
)
bwd_selector.fit(X_train, y_train)
bwd_features = [feature_names[i] for i in bwd_selector.get_support(indices=True)]

print("\nBACKWARD SELECTION FEATURES:")
print(bwd_features)



FORWARD SELECTION FEATURES:
['gender_M', 'Smoking_yes', 'familyhistory_yes', 'age', 'cholsterol']

BACKWARD SELECTION FEATURES:
['medication_yes', 'age', 'bp', 'cholsterol', 'BMI']


In [31]:
# ============================================================
# 6. EMBEDDED METHOD — ELASTIC NET
# ============================================================

elnet = LogisticRegression(
    penalty='elasticnet', solver='saga',
    l1_ratio=0.5, max_iter=3000
)
elnet.fit(X_train, y_train)
coef = elnet.coef_[0]
elnet_features = [feature_names[i] for i, c in enumerate(coef) if abs(c) > 1e-6]

print("\nELASTIC NET SELECTED FEATURES:")
print(elnet_features)



ELASTIC NET SELECTED FEATURES:
['gender_M', 'Excerise level_low', 'familyhistory_yes', 'age', 'bp', 'cholsterol', 'BMI']


In [33]:
# ============================================================
# 7. MODEL EVALUATION FUNCTION
# ============================================================

def evaluate(features):
    idx = [feature_names.index(f) for f in features]
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train[:, idx], y_train)
    pred = clf.predict(X_test[:, idx])
    return accuracy_score(y_test, pred), f1_score(y_test, pred)

# Full model
clf_full = LogisticRegression(max_iter=1000)
clf_full.fit(X_train, y_train)
pred_full = clf_full.predict(X_test)

acc_full = accuracy_score(y_test, pred_full)
f1_full = f1_score(y_test, pred_full)

In [34]:
# ============================================================
# 8. COMPARE METHOD PERFORMANCE
# ============================================================

results = pd.DataFrame({
    'Method': ['Full', 'Chi2', 'Forward', 'Backward', 'ElasticNet'],
    'Accuracy': [
        acc_full,
        evaluate(chi2_features)[0],
        evaluate(fwd_features)[0],
        evaluate(bwd_features)[0],
        evaluate(elnet_features)[0]
    ],
    'F1 Score': [
        f1_full,
        evaluate(chi2_features)[1],
        evaluate(fwd_features)[1],
        evaluate(bwd_features)[1],
        evaluate(elnet_features)[1]
    ]
})

print("\nMODEL PERFORMANCE COMPARISON:")
print(results)


MODEL PERFORMANCE COMPARISON:
       Method  Accuracy  F1 Score
0        Full  0.555556       0.5
1        Chi2  0.555556       0.5
2     Forward  0.333333       0.0
3    Backward  0.555556       0.5
4  ElasticNet  0.555556       0.5
