# Feature Engineering for Heart Disease Dataset
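This notebook performs feature engineering on the cleaned heart disease dataset: derived-feature creation, normalization, standardization, categorical encoding, and feature selection. The encoded dataset is saved to `../data/processed/heart_disease_features.csv`.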

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
import sys, os
# Add the parent directory to the import search path so that the
# project-local `scripts` package imported below can be found.
# NOTE(review): ".." is resolved against the kernel's working directory —
# this presumably assumes the notebook is run from its own folder; confirm.
sys.path.append("..")
# Project-local helpers; each one implements a step that is called in the
# cells below (derived features, scaling, encoding, feature selection).
from scripts.feature_engineering import(
    normalize_features, standardize_features,
    encode_all_categoricals,
    create_derived_features,
    feature_selection_analysis
)

In [3]:
# Step 1 — read the cleaned heart-disease CSV into the working DataFrame `df`.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv(
    filepath_or_buffer="../data/processed/heart_disease_cleaned.csv",
)

In [4]:
# 2. Apply derived features
# Run the project helper on the loaded frame, then drop the `age_group`
# column (presumably one of the columns the helper adds — confirm in
# scripts/feature_engineering) before any further processing.
df = create_derived_features(df).drop(columns=["age_group"])

# df.info() prints its summary itself, so it is called first; df.head() is
# left as the final expression because Jupyter only renders the value of a
# cell's last expression — a mid-cell df.head() is silently discarded.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         920 non-null    float64
 1   sex         920 non-null    object 
 2   cp          920 non-null    object 
 3   trestbps    920 non-null    float64
 4   chol        920 non-null    float64
 5   fbs         920 non-null    bool   
 6   restecg     920 non-null    object 
 7   thalch      920 non-null    float64
 8   exang       920 non-null    bool   
 9   oldpeak     920 non-null    float64
 10  slope       920 non-null    object 
 11  ca          920 non-null    float64
 12  thal        920 non-null    object 
 13  num         920 non-null    float64
 14  risk_score  920 non-null    float64
dtypes: bool(2), float64(8), object(5)
memory usage: 95.4+ KB


In [5]:
# Step 3a — normalization of the `chol` and `trestbps` columns.
# The result is kept in its own frame, `df_norm`.
# NOTE(review): whether normalize_features returns a copy or also modifies
# `df` in place is not visible from this notebook — confirm in
# scripts/feature_engineering.
df_norm = normalize_features(
    df,
    ["chol", "trestbps"],
)

In [12]:
# Step 3b — standardization of the same two columns (`chol`, `trestbps`).
# The result is kept in its own frame, `df_std`.
# NOTE(review): whether standardize_features returns a copy or also modifies
# `df` in place is not visible from this notebook — confirm in
# scripts/feature_engineering.
df_std = standardize_features(
    df,
    ["chol", "trestbps"],
)


In [13]:
# Step 4 — encode the categorical columns of `df` and preview the result.
# NOTE(review): the recorded preview shows `trestbps`/`chol` on a
# standardized scale even though `df` (not `df_std`) is passed in —
# presumably the scaling helpers above modify `df` in place, or the output
# comes from stale kernel state. Confirm which frame is meant to be encoded.
df_encoded = encode_all_categoricals(
    df,
)
df_encoded.head(5)

Unnamed: 0,age,trestbps,chol,thalch,oldpeak,ca,num,risk_score,sex_Male,cp_atypical angina,cp_non-anginal,cp_typical angina,fbs_True,restecg_normal,restecg_st-t abnormality,exang_True,slope_flat,slope_upsloping,thal_normal,thal_reversable defect
0,63.0,0.7554,0.273889,150.0,2.3,0.0,0.0,197.8,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,67.0,1.628081,0.842068,108.0,1.5,2.7,2.0,235.6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,67.0,-0.699068,0.231008,129.0,2.6,2.0,1.0,185.4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
3,37.0,-0.117281,0.456135,187.0,3.5,0.0,0.0,202.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,41.0,-0.117281,-0.037002,172.0,1.4,0.0,0.0,174.4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [14]:
# Step 5 — run the feature-selection helper on the encoded frame, using
# `num` as the target column. It yields two values: the selected feature
# names and a table of importance scores, both reported in the summary cell.
# NOTE(review): the summary labels the importances as RandomForest-based; if
# the helper does not fix a random_state, the ranking may differ between
# runs — confirm in scripts/feature_engineering.
(selected_features, importance_scores) = feature_selection_analysis(
    df_encoded,
    target_col="num",
)

In [15]:
# Step 6 — persist the encoded frame as CSV. index=False keeps the row index
# out of the written file.
df_encoded.to_csv(
    path_or_buf="../data/processed/heart_disease_features.csv",
    index=False,
)

In [11]:
# Final summary: confirmation message, the selected feature names, and the
# ten highest-ranked rows of the importance table.
print("✅ Feature engineering complete. Transformed dataset saved.")
print("Top Selected Features (chi²):", list(selected_features))
print("\nFeature Importance (RandomForest):")
# Leave the table as the cell's last expression instead of print()-ing it so
# Jupyter renders it with its rich (HTML) DataFrame display.
importance_scores.head(10)

✅ Feature engineering complete. Transformed dataset saved.
Top Selected Features (chi²): ['age', 'chol', 'thalch', 'oldpeak', 'ca', 'risk_score', 'cp_atypical angina', 'cp_non-anginal', 'exang_True', 'thal_reversable defect']

Feature Importance (RandomForest):
               feature  importance
3               thalch    0.117991
6           risk_score    0.110276
0                  age    0.110155
4              oldpeak    0.106508
5                   ca    0.095870
2                 chol    0.089730
1             trestbps    0.080955
14          exang_True    0.060883
8   cp_atypical angina    0.033617
17         thal_normal    0.027170
