In [1]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Load cleaned dataset
df = pd.read_csv("data/heart_disease_cleveland_clean.csv")

# Separate the predictors from the label column.
X = df.drop(columns="target")
y = df["target"]

# Train/test split (80/20, stratified on the label, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Load preprocessor
# The path is relative to the working directory, like the dataset path above,
# so the notebook is not tied to one user's machine.
# NOTE(review): assumes the notebook is run from the project root that holds
# both `data/` and `models/` -- confirm.
# SECURITY: joblib.load unpickles arbitrary Python objects; only load files
# that come from a trusted source.
preprocessor = joblib.load("models/final_pipeline.pkl")

# Refit the loaded preprocessor on the training split only, then apply the
# fitted transformation to the held-out split.
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)


In [6]:
# Rebuild the column names of the transformed matrix: the columns handled by
# the first transformer, followed by the one-hot encoder's expanded columns.
# NOTE(review): relies on the first transformer holding the non-encoded
# (presumably numeric) columns and the second being 'cat' -- confirm against
# the saved preprocessor.
leading_cols = preprocessor.transformers_[0][2]
encoded_input_cols = preprocessor.transformers_[1][2]
ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
cat_names = ohe.get_feature_names_out(encoded_input_cols)
all_features = leading_cols + list(cat_names)

# Fit a 200-tree random forest (fixed seed) and read its feature importances.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(
    X_train_trans, y_train
)
importances = rf.feature_importances_

# Label each importance with its feature name and rank from most to least important.
feat_imp = pd.Series(importances, index=all_features).sort_values(ascending=False)
feat_imp.head(15)


thal_3.0     0.090375
thalach      0.087598
oldpeak      0.085813
cp_4.0       0.084182
ca_0.0       0.074695
age          0.073600
thal_7.0     0.073571
chol         0.068672
trestbps     0.066678
exang_1.0    0.035474
slope_2.0    0.029744
cp_3.0       0.029335
slope_1.0    0.028958
exang_0.0    0.025954
ca_1.0       0.018111
dtype: float64

In [7]:
# Recursive feature elimination around a liblinear logistic regression,
# keeping the 10 surviving features.
lr = LogisticRegression(solver="liblinear", max_iter=1000)
rfe = RFE(lr, n_features_to_select=10).fit(X_train_trans, y_train)

# Boolean mask indexed by feature name; True marks a feature RFE kept.
selected_features = pd.Series(rfe.support_, index=all_features)
selected_features[selected_features].index.tolist()


['oldpeak',
 'cp_3.0',
 'cp_4.0',
 'restecg_0.0',
 'slope_1.0',
 'ca_0.0',
 'ca_1.0',
 'ca_2.0',
 'thal_3.0',
 'thal_7.0']

In [8]:
# The chi-squared test only accepts non-negative feature values, so rescale
# every transformed column to MinMaxScaler's default [0, 1] range first.
X_train_chi2 = MinMaxScaler().fit_transform(X_train_trans)

# Keep the 10 features with the highest chi-squared score against the target.
chi2_selector = SelectKBest(chi2, k=10)
chi2_selector.fit(X_train_chi2, y_train)

# Boolean mask indexed by feature name; True marks a feature the selector kept.
chi2_features = pd.Series(chi2_selector.get_support(), index=all_features)
chi2_features[chi2_features].index.tolist()


['cp_3.0',
 'cp_4.0',
 'exang_0.0',
 'exang_1.0',
 'slope_1.0',
 'slope_2.0',
 'ca_0.0',
 'ca_2.0',
 'thal_3.0',
 'thal_7.0']

In [10]:
# Collect the feature lists chosen by each method once, so the report lines
# below stay short.
rf_top10 = feat_imp.head(10).index.tolist()
rfe_selected = selected_features[selected_features].index.tolist()
chi2_selected = chi2_features[chi2_features].index.tolist()

# Append (mode "a") to the metrics report rather than overwriting it.
# The path is relative to the working directory so the notebook is not tied
# to one user's machine.
# NOTE(review): assumes the notebook is run from the project root and that
# `results/` already exists -- confirm.
with open("results/evaluation_metrics.txt", "a", encoding="utf-8") as f:
    f.write("\n--- Feature Selection ---\n")
    f.write("Top 10 RF features: " + str(rf_top10) + "\n")
    f.write("RFE selected: " + str(rfe_selected) + "\n")
    f.write("Chi2 selected: " + str(chi2_selected) + "\n")
