In [18]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

df = pd.read_csv("data/heart_disease.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [19]:
# Identify missing values
print(f"Missing values per column:\n{df.isnull().sum()}")

Missing values per column:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
target      0
dtype: int64


In [20]:
# Strategy: Impute with median for numerical features
imputer = SimpleImputer(strategy="median")
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

In [21]:
print(df_imputed.isnull().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [22]:
# Create new features
df_imputed["age_group"] = pd.cut(
    df_imputed["age"], bins=[0, 40, 55, 70, 100], labels=[0, 1, 2, 3]
)
df_imputed["chol_risk"] = (df_imputed["chol"] > 240).astype(int)
df_imputed["bp_risk"] = (df_imputed["trestbps"] > 140).astype(int)
df_imputed["heart_rate_reserve"] = 220 - df_imputed["age"] - df_imputed["thalach"]

# One-hot encode categorical variables
df_encoded = pd.get_dummies(
    df_imputed, columns=["cp", "restecg", "slope", "thal"], drop_first=True
).astype("int")
print(df_encoded)

     age  sex  trestbps  chol  fbs  thalach  exang  oldpeak  ca  target  ...  \
0     63    1       145   233    1      150      0        2   0       0  ...   
1     67    1       160   286    0      108      1        1   3       1  ...   
2     67    1       120   229    0      129      1        2   2       1  ...   
3     37    1       130   250    0      187      0        3   0       0  ...   
4     41    0       130   204    0      172      0        1   0       0  ...   
..   ...  ...       ...   ...  ...      ...    ...      ...  ..     ...  ...   
298   45    1       110   264    0      132      0        1   0       1  ...   
299   68    1       144   193    1      141      0        3   2       1  ...   
300   57    1       130   131    0      115      1        1   1       1  ...   
301   57    0       130   236    0      174      0        0   1       1  ...   
302   38    1       138   175    0      173      0        0   0       0  ...   

     heart_rate_reserve  cp_2.0  cp_3.0

In [23]:
# split data into train and test set
X = df_encoded.drop("target", axis=1)
y = df_encoded["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=100, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

Training set: (242, 22)
Test set: (61, 22)


In [24]:
df_encoded.to_csv("data/heart_disease_preprocessed.csv", index=False)
X_train.to_csv("data/X_train.csv", index=False)
X_test.to_csv("data/X_test.csv", index=False)
y_train.to_csv("data/y_train.csv", index=False)
y_test.to_csv("data/y_test.csv", index=False)
print("Done Exporting")

Done Exporting
