In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('/content/heart_disease_preprocessed_2.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,0,145,233,1,2,150.0,0,2.3,2,0,2,0
1,67,1,3,160,286,0,2,108.0,1,1.5,1,3,1,1
2,67,1,3,120,229,0,2,129.0,1,2.6,1,2,3,1
3,37,1,2,130,250,0,0,187.0,0,3.5,2,0,1,0
4,41,0,1,130,204,0,2,172.0,0,1.4,0,0,1,0


## Chi-square test

Is done on original features ( without encoding or standardizing) because it assess the relationship between raw original categories and the original target.


In [3]:
from sklearn.feature_selection import chi2, SelectKBest

categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

selector = SelectKBest(score_func=chi2, k='all')
selector.fit(df[categorical_features], df['target'])

p_values = selector.pvalues_

chi2_results = pd.DataFrame({'feature': categorical_features, 'p_value': p_values})
chi2_results = chi2_results.sort_values(by='p_value', ascending=True)

display(chi2_results)

Unnamed: 0,feature,p_value
6,ca,4.6677259999999995e-20
7,thal,2.335529e-10
4,exang,6.886243e-10
1,cp,2.515476e-06
5,slope,2.771374e-06
3,restecg,0.003279787
0,sex,0.006404399
2,fbs,0.6848935


# One-Hot Encoding & Standardizing using a pipline
The columns to apply One-Hot Encoding to are:
`sex`,`cp`,`fbs`,`restecg`,`exang`,`slope`,`ca`,`thal`

The columns to apply Standardization to are:
`age`, `trestbps`,`chol`,`thalach`,`oldpeak`

In [8]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
target = 'target'

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

In [9]:
X = df.drop(target, axis=1)
y = df[target]

X_processed = preprocessor.fit_transform(X)

print("original features:", X.shape)
print("processed features:", X_processed.shape)

original features: (303, 13)
processed features: (303, 28)


In [10]:
onehot_features = list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))
processed_column_names = numerical_features + onehot_features

X_processed_df = pd.DataFrame(X_processed, columns=processed_column_names)

display(X_processed_df.head())

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,slope_0,slope_1,slope_2,ca_0,ca_1,ca_2,ca_3,thal_1,thal_2,thal_3
0,0.948726,0.821446,-0.26504,0.015306,0.7963,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.392002,1.723905,0.851214,-1.835388,0.011015,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,1.392002,-0.682652,-0.349285,-0.910041,1.090782,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,-1.932564,-0.081013,0.093004,1.645679,1.974227,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-1.489288,-0.081013,-0.87582,0.984717,-0.087146,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
processed_df = X_processed_df.copy()
processed_df['target'] = y.values
display(processed_df.head())

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,slope_1,slope_2,ca_0,ca_1,ca_2,ca_3,thal_1,thal_2,thal_3,target
0,0.948726,0.821446,-0.26504,0.015306,0.7963,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1,1.392002,1.723905,0.851214,-1.835388,0.011015,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1
2,1.392002,-0.682652,-0.349285,-0.910041,1.090782,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
3,-1.932564,-0.081013,0.093004,1.645679,1.974227,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,-1.489288,-0.081013,-0.87582,0.984717,-0.087146,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0


## Feature importance (random forest)



In [12]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_processed_df, y)

feature_importances = rf_model.feature_importances_

feature_importance_df = pd.DataFrame({
    'feature': processed_column_names,
    'importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

display(feature_importance_df)

Unnamed: 0,feature,importance
21,ca_0,0.09444
3,thalach,0.090319
25,thal_1,0.080775
0,age,0.080133
10,cp_3,0.076245
27,thal_3,0.075888
4,oldpeak,0.074463
2,chol,0.062182
1,trestbps,0.057085
16,exang_0,0.044155


## Recursive feature elimination (RFE)


Applying RFE with a suitable estimator (e.g., Logistic Regression or a tree-based model) on the processed features to select a subset of features.


In [14]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

estimator = LogisticRegression(solver='liblinear', random_state=42)

# Select the top 10 features
rfe = RFE(estimator=estimator, n_features_to_select=10)

# Fit RFE to the processed data
rfe.fit(X_processed_df, y)

# Get the ranked features and support (selection)
feature_ranking = rfe.ranking_
feature_support = rfe.support_

# Create a DataFrame for RFE results
rfe_results_df = pd.DataFrame({
    'feature': X_processed_df.columns,
    'ranking': feature_ranking,
    'selected': feature_support
})

# Sort by rank
rfe_results_df = rfe_results_df.sort_values(by='ranking')

# Display the RFE results
display(rfe_results_df)

Unnamed: 0,feature,ranking,selected
6,sex_1,1,True
5,sex_0,1,True
10,cp_3,1,True
9,cp_2,1,True
13,restecg_0,1,True
23,ca_2,1,True
18,slope_0,1,True
17,exang_1,1,True
21,ca_0,1,True
27,thal_3,1,True


## Combine and analyze results


Combining the results from the Chi-Square test, Feature Importance, and RFE to identify the most relevant features.


In [15]:
combined_results = chi2_results.merge(feature_importance_df, on='feature', how='left')
combined_results = combined_results.merge(rfe_results_df, on='feature', how='left')

display(combined_results.head())

Unnamed: 0,feature,p_value,importance,ranking,selected
0,ca,4.6677259999999995e-20,,,
1,thal,2.335529e-10,,,
2,exang,6.886243e-10,,,
3,cp,2.515476e-06,,,
4,slope,2.771374e-06,,,


#⭐Observation:
The above results displays only chi-square results because chi-square was done on the original features with no encoding or standardizing whereas RFE and feature importance was done on processed features, so we need to analyze which categories scored best on each feature.*italicized text*



In [16]:


print("Top features from Chi-Square test (low p-value):")
display(chi2_results)

print("\nTop features by Random Forest Feature Importance:")
display(feature_importance_df.head(15))

print("\nFeatures selected by RFE (ranking = 1):")
display(rfe_results_df[rfe_results_df['selected'] == True])




# Features to consider for the final model based on this analysis:
relevant_features = [
    'age', 'trestbps', 'chol', 'thalach', 'oldpeak',
    'sex_0', 'sex_1',
    'cp_0', 'cp_1', 'cp_2', 'cp_3', # Include all one-hot categories if any part is important
    'fbs_0', 'fbs_1', # fbs is less important, but include for completeness.
    'restecg_0', 'restecg_1', 'restecg_2', # Include all one-hot categories if any part is important
    'exang_0', 'exang_1',
    'slope_0', 'slope_1', 'slope_2', # Include all one-hot categories if any part is important
    'ca_0', 'ca_1', 'ca_2', 'ca_3', # Include all one-hot categories if any part is important
    'thal_1', 'thal_2', 'thal_3' # Include all one-hot categories if any part is important
]



refined_relevant_features = [
    'age', 'trestbps', 'chol', 'thalach', 'oldpeak',
    'sex_0', 'sex_1',
    'cp_0', 'cp_1', 'cp_2', 'cp_3',
    'restecg_0', 'restecg_2',
    'exang_0', 'exang_1',
    'slope_0', 'slope_1',
    'ca_0', 'ca_2',
    'thal_1', 'thal_3'
]

print("\nProposed list of relevant features based on combined analysis:")
print(refined_relevant_features)

# Create a DataFrame of the selected features
relevant_features_df = pd.DataFrame({'feature': refined_relevant_features})
display(relevant_features_df)

Top features from Chi-Square test (low p-value):


Unnamed: 0,feature,p_value
6,ca,4.6677259999999995e-20
7,thal,2.335529e-10
4,exang,6.886243e-10
1,cp,2.515476e-06
5,slope,2.771374e-06
3,restecg,0.003279787
0,sex,0.006404399
2,fbs,0.6848935



Top features by Random Forest Feature Importance:


Unnamed: 0,feature,importance
21,ca_0,0.09444
3,thalach,0.090319
25,thal_1,0.080775
0,age,0.080133
10,cp_3,0.076245
27,thal_3,0.075888
4,oldpeak,0.074463
2,chol,0.062182
1,trestbps,0.057085
16,exang_0,0.044155



Features selected by RFE (ranking = 1):


Unnamed: 0,feature,ranking,selected
6,sex_1,1,True
5,sex_0,1,True
10,cp_3,1,True
9,cp_2,1,True
13,restecg_0,1,True
23,ca_2,1,True
18,slope_0,1,True
17,exang_1,1,True
21,ca_0,1,True
27,thal_3,1,True



Proposed list of relevant features based on combined analysis:
['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'sex_0', 'sex_1', 'cp_0', 'cp_1', 'cp_2', 'cp_3', 'restecg_0', 'restecg_2', 'exang_0', 'exang_1', 'slope_0', 'slope_1', 'ca_0', 'ca_2', 'thal_1', 'thal_3']


Unnamed: 0,feature
0,age
1,trestbps
2,chol
3,thalach
4,oldpeak
5,sex_0
6,sex_1
7,cp_0
8,cp_1
9,cp_2


#🔽Saving the new refined dataset

In [19]:
relevant_features_dataset = pd.DataFrame(X_processed_df, columns=refined_relevant_features)
relevant_features_dataset['target'] = y.values
display(relevant_features_dataset.head())
relevant_features_dataset.to_csv('selected_features_dataset.csv', index=False)
print("Dataset saved successfully ✅")

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,restecg_2,exang_0,exang_1,slope_0,slope_1,ca_0,ca_2,thal_1,thal_3,target
0,0.948726,0.821446,-0.26504,0.015306,0.7963,0.0,1.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,1.392002,1.723905,0.851214,-1.835388,0.011015,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1
2,1.392002,-0.682652,-0.349285,-0.910041,1.090782,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1
3,-1.932564,-0.081013,0.093004,1.645679,1.974227,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0
4,-1.489288,-0.081013,-0.87582,0.984717,-0.087146,1.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0


Dataset saved successfully ✅


## Summary:

### Data Analysis Key Findings

*   The Chi-Square test indicated that original categorical features such as `ca`, `thal`, `exang`, `cp`, and `slope` have a statistically significant relationship with the target variable (low p-values), while `fbs` appears independent (high p-value).
*   Random Forest feature importance highlighted `ca_0`, `thalach`, and `thal_1` as the most important processed features. Other highly ranked features included `age`, `cp_3`, `thal_3`, and `oldpeak`.
*   Recursive Feature Elimination (RFE) selected a subset of processed features, including one-hot encoded components related to `sex`, `cp`, `restecg`, `ca`, `slope`, `exang`, and `thal`, indicating their relevance according to the Logistic Regression estimator.
*   Combining the insights from all three methods allowed for the identification of features that consistently demonstrated importance or were selected across different techniques. Features related to `age`, `thalach`, `oldpeak`, `chol`, `trestbps` (numerical) and `sex`, `cp`, `restecg`, `exang`, `slope`, `ca`, `thal` (categorical, considering their one-hot encoded parts) were frequently identified as relevant.


