# 03 - Feature Selection (Importance, RFE, Chi-Square)

In [15]:
# Required Libs
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# Import Dataset
heart_disease_clean = pd.read_csv(r'C:\Users\eyad0\Documents\python\Heart_Disease_Project\data\heart_disease_clean.csv')

In [16]:
# 2) Split features & target
x = heart_disease_clean.drop("num", axis=1)
y = heart_disease_clean["num"]

# 3) Train/Test Split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

# 4) Train Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(x_train, y_train)

# 5) Get Feature Importances
importances = rf.feature_importances_
feature_names = x.columns

fi_df = pd.DataFrame({"Feature": feature_names, "Importance": importances})
fi_df = fi_df.sort_values(by="Importance", ascending=False)

# Print feature ranking
print(fi_df)

      Feature  Importance
5     thalach    0.143979
7     oldpeak    0.121443
3        chol    0.108823
0         age    0.106639
2    trestbps    0.097531
16   thal_7.0    0.066591
10       cp_4    0.057483
6       exang    0.044042
17     ca_1.0    0.036725
13    slope_2    0.035043
1         sex    0.031087
18     ca_2.0    0.030665
19     ca_3.0    0.026751
12  restecg_2    0.026450
4         fbs    0.018708
9        cp_3    0.015191
15   thal_6.0    0.014149
14    slope_3    0.009049
8        cp_2    0.008137
11  restecg_1    0.001513


In [17]:

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Features
rfe = RFE(estimator=log_reg, n_features_to_select=10)
rfe.fit(x_train, y_train)

selected_features = x_train.columns[rfe.support_]
print("Selected Features by RFE:")
print(selected_features)

rfe_ranking = pd.DataFrame({
    "Feature": x_train.columns,
    "Rank": rfe.ranking_
}).sort_values(by="Rank")

print("\nFeature Rankings:")
print(rfe_ranking)


Selected Features by RFE:
Index(['sex', 'fbs', 'oldpeak', 'cp_4', 'slope_2', 'thal_6.0', 'thal_7.0',
       'ca_1.0', 'ca_2.0', 'ca_3.0'],
      dtype='object')

Feature Rankings:
      Feature  Rank
1         sex     1
4         fbs     1
7     oldpeak     1
10       cp_4     1
13    slope_2     1
15   thal_6.0     1
17     ca_1.0     1
16   thal_7.0     1
19     ca_3.0     1
18     ca_2.0     1
12  restecg_2     2
5     thalach     3
0         age     4
8        cp_2     5
6       exang     6
11  restecg_1     7
9        cp_3     8
3        chol     9
2    trestbps    10
14    slope_3    11


In [18]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Random Forest
rf_estimator = RandomForestClassifier(n_estimators=200, random_state=42)

# Best 10 Features
rfe_rf = RFE(estimator=rf_estimator, n_features_to_select=10)
rfe_rf.fit(x_train, y_train)

# Selected Features
selected_features_rf = x_train.columns[rfe_rf.support_]
print("Selected Features by RFE with Random Forest:")
print(selected_features_rf)

# Ranking For Features
rfe_rf_ranking = pd.DataFrame({
    "Feature": x_train.columns,
    "Rank (RF)": rfe_rf.ranking_
}).sort_values(by="Rank (RF)")

print("\nFeature Rankings (RFE + RF):")
print(rfe_rf_ranking)

Selected Features by RFE with Random Forest:
Index(['age', 'trestbps', 'chol', 'thalach', 'exang', 'oldpeak', 'cp_4',
       'slope_2', 'thal_7.0', 'ca_1.0'],
      dtype='object')

Feature Rankings (RFE + RF):
      Feature  Rank (RF)
0         age          1
2    trestbps          1
3        chol          1
5     thalach          1
7     oldpeak          1
6       exang          1
13    slope_2          1
10       cp_4          1
17     ca_1.0          1
16   thal_7.0          1
18     ca_2.0          2
1         sex          3
19     ca_3.0          4
12  restecg_2          5
4         fbs          6
9        cp_3          7
15   thal_6.0          8
14    slope_3          9
8        cp_2         10
11  restecg_1         11


In [19]:
# Logistic & RFE
features_logreg = set(selected_features)

# Random Forest RFE
features_rf_rfe = set(selected_features_rf)

# Random Forest Importance
features_rf_importance = set(fi_df.head(10)["Feature"])

common_features = features_logreg & features_rf_rfe & features_rf_importance
print("Common Features:", common_features)

x_train_selected = x_train[list(common_features)]
x_test_selected = x_test[list(common_features)]

Common Features: {'ca_1.0', 'cp_4', 'oldpeak', 'slope_2', 'thal_7.0'}


In [20]:
# Scale features to non-negative (required for chi2)
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x)

# Chi-Square Test
chi2_vals, p_vals = chi2(x_scaled, y)

chi2_df = pd.DataFrame({
    "Feature": x.columns,
    "Chi2 Stat": chi2_vals,
    "p-value": p_vals
}).sort_values(by="Chi2 Stat", ascending=False)

print(chi2_df)


      Feature  Chi2 Stat       p-value
16   thal_7.0  46.721875  1.742480e-09
10       cp_4  45.129615  3.736737e-09
6       exang  41.534482  2.083076e-08
18     ca_2.0  34.648382  5.485601e-07
19     ca_3.0  29.999871  4.894734e-06
13    slope_2  22.408432  1.661808e-04
9        cp_3  21.974965  2.027333e-04
8        cp_2  16.942450  1.983352e-03
7     oldpeak  16.451213  2.469786e-03
17     ca_1.0  15.262296  4.187001e-03
15   thal_6.0  10.943440  2.720708e-02
1         sex   7.499223  1.117436e-01
4         fbs   6.658223  1.550901e-01
11  restecg_1   6.557266  1.612178e-01
12  restecg_2   6.458947  1.673955e-01
14    slope_3   5.528483  2.372371e-01
5     thalach   3.133976  5.356604e-01
0         age   1.244886  8.706545e-01
2    trestbps   0.622014  9.605862e-01
3        chol   0.175931  9.963506e-01


In [21]:
# Final selected features from Step 4 (Top 10 Chi-Square Features)
final_features = ['thal_7.0', 'cp_4', 'exang', 'ca_2.0', 'ca_3.0', 
                  'slope_2', 'cp_3', 'cp_2', 'oldpeak', 'ca_1.0', 'num']

# Create new dataset
heart_disease_selected = heart_disease_clean[final_features]

# Save to CSV
heart_disease_selected.to_csv(r'C:\Users\eyad0\Documents\python\Heart_Disease_Project\data\heart_disease_selected.csv', index=False)

print("New dataset saved as data/heart_disease_selected.csv")
print("Shape:", heart_disease_selected.shape)
print("Columns:", heart_disease_selected.columns.tolist())


New dataset saved as data/heart_disease_selected.csv
Shape: (303, 11)
Columns: ['thal_7.0', 'cp_4', 'exang', 'ca_2.0', 'ca_3.0', 'slope_2', 'cp_3', 'cp_2', 'oldpeak', 'ca_1.0', 'num']
