# Feature Engineering for Heart Disease Dataset
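This notebook performs feature engineering on the cleaned heart disease dataset: derived-feature creation, normalization, standardization, categorical encoding, and feature selection. The encoded result is saved to `../data/processed/heart_disease_features.csv`.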

In [32]:
# Third-party data, plotting and scikit-learn imports.
# NOTE(review): in the cells shown here only `pd` and the project helpers below
# are referenced directly; the remaining names may be leftovers — confirm before pruning.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
import sys, os
# Add the parent directory to the module search path; presumably this is what
# lets the `scripts` package imported below resolve when the notebook runs from
# a subdirectory. The path is relative, so it depends on the kernel's working directory.
sys.path.append("..")
# Project-local feature-engineering helpers used by the cells that follow.
from scripts.feature_engineering import(
    normalize_features, standardize_features,
    encode_categorical, create_derived_features,
    feature_selection_analysis
)

In [33]:
# 1. Load processed dataset
# Reads the cleaned CSV (path is relative to the kernel's working directory)
# into the frame every later cell starts from.
df = pd.read_csv(
    filepath_or_buffer="../data/processed/heart_disease_cleaned.csv",
)

In [34]:
# 2. Apply derived features
# Rebinds `df` to whatever the project helper returns. The previews further
# down show `age_group` and `risk_score` columns that are presumably added here.
# NOTE(review): because `df` is overwritten, re-running only this cell applies
# the helper to already-derived data.
df = df.pipe(create_derived_features)

In [35]:
# 3. Apply transformations - Normalization
# Pass the helper its own copy of the frame. The encoded preview later in this
# notebook shows rescaled chol/trestbps although only `df` is handed to the
# encoder, so the scaling helpers appear to write into their input. Copying
# keeps `df_norm` an independent normalized frame rather than a possible alias
# of `df`, and keeps this cell from altering `df` as a side effect.
df_norm = normalize_features(df.copy(), ['chol', 'trestbps'])

In [36]:
# 3. Apply transformations - Standardization
# Standardize the same two numeric columns and preview the first rows.
# NOTE(review): the encoding cell below passes `df`, yet its output carries the
# standardized values — this helper seems to modify `df` in place; confirm.
df_std = standardize_features(
    df,
    ['chol', 'trestbps'],
)
df_std.head(n=5)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,age_group,risk_score
0,63.0,Male,typical angina,0.7554,0.273889,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0.0,Senior,197.8
1,67.0,Male,asymptomatic,1.628081,0.842068,False,lv hypertrophy,108.0,True,1.5,flat,2.7,normal,2.0,Senior,235.6
2,67.0,Male,asymptomatic,-0.699068,0.231008,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1.0,Senior,185.4
3,37.0,Male,non-anginal,-0.117281,0.456135,False,normal,187.0,False,3.5,downsloping,0.0,normal,0.0,Middle-aged,202.0
4,41.0,Female,atypical angina,-0.117281,-0.037002,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0.0,Middle-aged,174.4


In [37]:
# 4. Encode categorical features
# Encode the standardized frame explicitly. Passing `df` only produced
# standardized chol/trestbps in the output because the scaling step had
# modified `df` behind the scenes; feeding `df_std` states the intended
# lineage (derive -> standardize -> encode) and no longer relies on that
# side effect.
df_encoded = encode_categorical(
    df_std,
    ['sex', 'cp', 'restecg', 'slope', 'thal', 'fbs', 'exang', 'age_group'],
)
df_encoded.head()

Unnamed: 0,age,trestbps,chol,thalch,oldpeak,ca,num,risk_score,sex_Male,cp_atypical angina,...,restecg_st-t abnormality,slope_flat,slope_upsloping,thal_normal,thal_reversable defect,fbs_True,exang_True,age_group_Middle-aged,age_group_Senior,age_group_Young
0,63.0,0.7554,0.273889,150.0,2.3,0.0,0.0,197.8,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,67.0,1.628081,0.842068,108.0,1.5,2.7,2.0,235.6,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,67.0,-0.699068,0.231008,129.0,2.6,2.0,1.0,185.4,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,37.0,-0.117281,0.456135,187.0,3.5,0.0,0.0,202.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,41.0,-0.117281,-0.037002,172.0,1.4,0.0,0.0,174.4,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [38]:
# 5. Feature selection
# The helper returns a pair, unpacked here; the summary cell prints the first
# element as the chi² picks and the second as RandomForest importances.
# NOTE(review): scikit-learn's chi2 scorer rejects negative inputs, while the
# standardized chol/trestbps columns contain negatives — confirm the helper
# rescales (or otherwise handles) them before scoring.
(selected_features, importance_scores) = feature_selection_analysis(
    df_encoded,
    target_col="num",
)

In [39]:
# 6. Save transformed dataset (encoded version)
# Writes the encoded frame without its index column; the target path is
# relative to the kernel's working directory.
df_encoded.to_csv(
    path_or_buf="../data/processed/heart_disease_features.csv",
    index=False,
)

In [40]:
# Summary: confirmation message, the selected feature names, and the ten
# leading rows of the importance table.
print("✅ Feature engineering complete. Transformed dataset saved.")
print("Top Selected Features (chi²):", [*selected_features])
# One call with a newline separator emits the heading and the table exactly as
# two consecutive prints would.
print(
    "\nFeature Importance (RandomForest):",
    importance_scores.head(10),
    sep="\n",
)

✅ Feature engineering complete. Transformed dataset saved.
Top Selected Features (chi²): ['age', 'chol', 'thalch', 'oldpeak', 'ca', 'risk_score', 'cp_atypical angina', 'thal_reversable defect', 'exang_True', 'age_group_Middle-aged']

Feature Importance (RandomForest):
               feature  importance
3               thalch    0.118582
4              oldpeak    0.108964
6           risk_score    0.102108
0                  age    0.101841
5                   ca    0.094503
1             trestbps    0.084340
2                 chol    0.082455
18          exang_True    0.054069
8   cp_atypical angina    0.032651
9       cp_non-anginal    0.027338
