# 03 – Feature Selection

Techniques:
1. Filter: Chi-Square (categorical / non-negative features vs. target) or ANOVA F-test (numeric features)
2. Wrapper: Recursive Feature Elimination (RFE) with estimator
3. Embedded: Tree-based feature importances (RandomForest, XGBoost)

Goal: Decide on a reduced, performant, and interpretable feature subset for modeling.

In [None]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import chi2, SelectKBest, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load the dataset; the label column is 'target' when present, otherwise 'num'.
df = pd.read_csv(Path('../data/heart_disease.csv'))
target_col = 'target' if 'target' in df.columns else 'num'
y = df[target_col]
X = df.drop(columns=[target_col])

# Split columns by dtype. Checking "is numeric" (rather than dtype == 'object')
# also routes string/category-typed columns to the categorical branch; the
# median imputer and scaler in the numeric branch cannot process them.
numeric = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
categorical = [c for c in X.columns if c not in numeric]

# Numeric branch: median imputation followed by standardization.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric),
    ('cat', categorical_transformer, categorical),
])

# Stratified 80/20 split with a fixed seed so the selections are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

## 1. Embedded: RandomForest Feature Importance

In [None]:
# Fit preprocessing + a 300-tree random forest, then rank the encoded features
# by the forest's impurity-based importances.
rf_pipeline = Pipeline([
    ('prep', preprocessor),
    ('rf', RandomForestClassifier(n_estimators=300, random_state=42)),
])
rf_pipeline.fit(X_train, y_train)
importances = rf_pipeline.named_steps['rf'].feature_importances_

# Rebuild the post-transform feature names: numeric columns first, followed by
# the one-hot columns produced by the fitted encoder (if any categoricals exist).
num_feats = numeric
if categorical:
    ohe = rf_pipeline.named_steps['prep'].named_transformers_['cat'].named_steps['ohe']
    cat_feats = list(ohe.get_feature_names_out(categorical))
else:
    ohe = None
    cat_feats = []
all_feats = num_feats + cat_feats

# Keep at most the 25 most important features, highest first.
feat_imp = pd.Series(importances, index=all_feats).sort_values(ascending=False).head(25)
feat_imp.head()

In [None]:
# Horizontal bar chart of the importances; flip the y-axis so the first
# (most important) entry of feat_imp is drawn at the top.
importance_axes = feat_imp.plot(kind='barh', figsize=(6, 10))
importance_axes.invert_yaxis()

## 2. Wrapper: RFE with LogisticRegression

In [None]:
# RFE operates on the already-encoded matrix, so fit the preprocessor on the
# training split and run the elimination outside of a pipeline.
prep_only = preprocessor
prep_only.fit(X_train)
X_train_enc = prep_only.transform(X_train)

# L2-regularised logistic regression is the estimator RFE uses to rank features;
# the elimination stops at 10 features.
log_reg = LogisticRegression(
    max_iter=1000,
    penalty='l2',
    solver='liblinear',
)
rfe = RFE(estimator=log_reg, n_features_to_select=10).fit(X_train_enc, y_train)

# Map the boolean support mask back to feature names. all_feats is the name
# list built in the RandomForest cell from the same preprocessor.
selected_mask = rfe.support_
selected_features = np.asarray(all_feats)[selected_mask]
selected_features

## 3. Filter: Chi-Square (on non-negative features)

In [None]:
# Chi2 requires non-negative inputs, so the numeric columns are rescaled to
# [0, 1] with MinMaxScaler here instead of being standardized.
from sklearn.preprocessing import MinMaxScaler
mm_numeric = Pipeline([('imputer', SimpleImputer(strategy='median')), ('mm', MinMaxScaler())])
# Impute categoricals with the most frequent value before encoding. Without
# this step a missing value is either rejected by OneHotEncoder or encoded as
# an extra category (depending on the scikit-learn version), which would make
# the encoded feature names differ from a most-frequent-imputed encoding.
chi_categorical = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])
chi_preprocessor = ColumnTransformer([('num', mm_numeric, numeric), ('cat', chi_categorical, categorical)])
X_train_chi = chi_preprocessor.fit_transform(X_train)

# Select the 10 best-scoring features, capped at the number of encoded columns
# (older scikit-learn versions raise when k exceeds n_features).
chi_selector = SelectKBest(score_func=chi2, k=min(10, X_train_chi.shape[1]))
chi_selector.fit(X_train_chi, y_train)
chi_mask = chi_selector.get_support()

# Rebuild the encoded feature names (numeric first, then one-hot columns) and
# keep the ones flagged by the selector.
chi_ohe = chi_preprocessor.named_transformers_['cat'].named_steps['ohe'] if categorical else None
chi_all = numeric + (list(chi_ohe.get_feature_names_out(categorical)) if categorical else [])
chi_feature_names = list(np.array(chi_all)[chi_mask])
chi_feature_names

## 4. Consolidate & Decide Final Subset
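Intersection / union strategy: start with the intersection of the RF and RFE selections, then add high-scoring chi2 features that are missing but conceptually relevant. Note that the cell below simply unions in every chi2-selected feature; prune the result manually if needed.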

In [None]:
# Candidate set = (RF top-15 intersected with the RFE selection) united with
# the chi2 selection.
rf_top_15 = set(feat_imp.index[:15])
rfe_set = set(selected_features)
chi_set = set(chi_feature_names)
intersection = rf_top_15 & rfe_set
# Sort the result: iteration order of a set of strings can differ between
# interpreter runs, which would make the list order non-reproducible.
candidate_final = sorted(intersection | chi_set)
candidate_final

## 5. Save Selected Feature List

In [None]:
import json
import pathlib

# Make sure the output directory exists, then write the feature list as JSON.
models_dir = pathlib.Path('../models')
models_dir.mkdir(exist_ok=True)
with open('../models/selected_features.json', 'w') as f:
    json.dump(candidate_final, f)
print('Saved selected feature list.')

## Notes
- RFE cost grows with #features * #iterations; consider RFECV for automatic selection with cross-val.
- Chi2 is only valid for non-negative features (categorical indicators or counts); rescale other features to [0, 1] first.
- Stability selection or permutation importance can validate robustness.