In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import chi2

In [3]:
data = pd.read_csv('../data/raw/heart.csv')

In [4]:
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [5]:
data = pd.get_dummies(data=data, drop_first=True)
data.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,True,True,False,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,True,True,False,False,False,True,False,False,True
3,48,138,214,0,108,1.5,1,False,False,False,False,True,False,True,True,False
4,54,150,195,0,122,0.0,0,True,False,True,False,True,False,False,False,True


In [6]:
data = data.astype(int)
data

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0,0,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1,1,1,0,0,1,1,0,0,1,0
914,68,144,193,1,141,3,1,1,0,0,0,1,0,0,1,0
915,57,130,131,0,115,1,1,1,0,0,0,1,0,1,1,0
916,57,130,236,0,174,0,1,0,1,0,0,0,0,0,1,0


In [7]:
numeric_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
sacler = StandardScaler()
data[numeric_cols] = sacler.fit_transform(data[numeric_cols])
data.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,-1.43314,0.410909,0.82507,0,1.382928,-0.727592,0,1,1,0,0,1,0,0,0,1
1,-0.478484,1.491752,-0.171961,0,0.754157,0.282891,1,0,0,1,0,1,0,0,1,0
2,-1.751359,-0.129513,0.770188,0,-1.525138,-0.727592,0,1,1,0,0,0,1,0,0,1
3,-0.584556,0.302825,0.13904,0,-1.132156,0.282891,1,0,0,0,0,1,0,1,1,0
4,0.051881,0.951331,-0.034755,0,-0.581981,-0.727592,0,1,0,1,0,1,0,0,0,1


In [8]:
data.columns

Index(['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
       'HeartDisease', 'Sex_M', 'ChestPainType_ATA', 'ChestPainType_NAP',
       'ChestPainType_TA', 'RestingECG_Normal', 'RestingECG_ST',
       'ExerciseAngina_Y', 'ST_Slope_Flat', 'ST_Slope_Up'],
      dtype='object')

In [9]:
selected_features = [
    'Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR',
    'Oldpeak', 'Sex_M', 'ChestPainType_ATA',
    'ChestPainType_NAP','ChestPainType_TA', 'RestingECG_Normal', 'RestingECG_ST',
    'ExerciseAngina_Y', 'ST_Slope_Flat', 'ST_Slope_Up'
]

correlation = {
    feature: pearsonr(data[feature], data['HeartDisease'])[0]
    for feature in selected_features
}

correlated_data = pd.DataFrame(list(correlation.items()), columns=['Feature', 'Correlation'])
correlated_data.sort_values(by='Correlation', ascending=False)

Unnamed: 0,Feature,Correlation
13,ST_Slope_Flat,0.554134
12,ExerciseAngina_Y,0.494282
5,Oldpeak,0.392385
6,Sex_M,0.305445
0,Age,0.282039
3,FastingBS,0.267291
1,RestingBP,0.107589
11,RestingECG_ST,0.102527
9,ChestPainType_TA,-0.05479
10,RestingECG_Normal,-0.09158


In [10]:
X = data.drop(columns=['HeartDisease'])
y = data['HeartDisease']

scaler = MinMaxScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

chi2_stats, p_values = chi2(X_scaled, y)

alpha = 0.05
decision = ["Reject Null (Keep Feature)" if p < alpha else "Accept Null (Drop Feature)" for p in p_values]

chi2_results = pd.DataFrame({
    "Feature": X.columns,
    "Chi2_Stat": chi2_stats,
    "p_value": p_values,
    "Decision": decision
}).sort_values(by="Chi2_Stat", ascending=False)

chi2_results

Unnamed: 0,Feature,Chi2_Stat,p_value,Decision
14,ST_Slope_Up,202.447182,6.106858e-46,Reject Null (Keep Feature)
13,ST_Slope_Flat,140.635329,1.933233e-32,Reject Null (Keep Feature)
12,ExerciseAngina_Y,133.640134,6.549289e-31,Reject Null (Keep Feature)
7,ChestPainType_ATA,120.349494,5.304212e-28,Reject Null (Keep Feature)
3,FastingBS,50.296983,1.321526e-12,Reject Null (Keep Feature)
8,ChestPainType_NAP,32.427756,1.237068e-08,Reject Null (Keep Feature)
6,Sex_M,18.006243,2.201817e-05,Reject Null (Keep Feature)
4,MaxHR,8.738325,0.003115906,Reject Null (Keep Feature)
11,RestingECG_ST,7.778755,0.005286421,Reject Null (Keep Feature)
5,Oldpeak,6.361276,0.01166376,Reject Null (Keep Feature)


In [11]:
data.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,-1.43314,0.410909,0.82507,0,1.382928,-0.727592,0,1,1,0,0,1,0,0,0,1
1,-0.478484,1.491752,-0.171961,0,0.754157,0.282891,1,0,0,1,0,1,0,0,1,0
2,-1.751359,-0.129513,0.770188,0,-1.525138,-0.727592,0,1,1,0,0,0,1,0,0,1
3,-0.584556,0.302825,0.13904,0,-1.132156,0.282891,1,0,0,0,0,1,0,1,1,0
4,0.051881,0.951331,-0.034755,0,-0.581981,-0.727592,0,1,0,1,0,1,0,0,0,1


In [12]:
final_data = data[['Age', 'Cholesterol', 'MaxHR', 'Oldpeak', 'FastingBS',
                   'Sex_M', 'ChestPainType_ATA', 'ChestPainType_NAP',
                  'RestingECG_ST', 'ExerciseAngina_Y', 'ST_Slope_Flat', 'ST_Slope_Up', 'HeartDisease']]

In [13]:
final_data.head()

Unnamed: 0,Age,Cholesterol,MaxHR,Oldpeak,FastingBS,Sex_M,ChestPainType_ATA,ChestPainType_NAP,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,HeartDisease
0,-1.43314,0.82507,1.382928,-0.727592,0,1,1,0,0,0,0,1,0
1,-0.478484,-0.171961,0.754157,0.282891,0,0,0,1,0,0,1,0,1
2,-1.751359,0.770188,-1.525138,-0.727592,0,1,1,0,1,0,0,1,0
3,-0.584556,0.13904,-1.132156,0.282891,0,0,0,0,0,1,1,0,1
4,0.051881,-0.034755,-0.581981,-0.727592,0,1,0,1,0,0,0,1,0


In [14]:
final_data.to_csv("../data/processed/heart.csv", index=False)