# Heart Disease Dataset – Complete Data Preprocessing Notebook

This notebook covers:
- Data Cleaning
- Feature Engineering
- Data Transformation
- Categorical Encoding

The goal is to prepare raw medical data for Machine Learning models.

In [28]:
import zipfile
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


## Load Dataset

In [29]:
from google.colab import files
uploaded = files.upload()


Saving heart+disease.zip to heart+disease (1).zip


## Data Cleaning

In [30]:
zip_path = "/content/heart+disease.zip"
extract_path = "/content/heart_disease_raw"

with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(extract_path)

print("Dataset extracted successfully!")


Dataset extracted successfully!


In [31]:
columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]

files = [
    "processed.cleveland.data",
    "processed.hungarian.data",
    "processed.switzerland.data",
    "processed.va.data"
]

dfs = []
for f in files:
    df = pd.read_csv(f"{extract_path}/{f}", header=None, names=columns)
    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)
data.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


## Feature Engineering

In [32]:
data.info()
data.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       920 non-null    float64
 1   sex       920 non-null    float64
 2   cp        920 non-null    float64
 3   trestbps  920 non-null    object 
 4   chol      920 non-null    object 
 5   fbs       920 non-null    object 
 6   restecg   920 non-null    object 
 7   thalach   920 non-null    object 
 8   exang     920 non-null    object 
 9   oldpeak   920 non-null    object 
 10  slope     920 non-null    object 
 11  ca        920 non-null    object 
 12  thal      920 non-null    object 
 13  target    920 non-null    int64  
dtypes: float64(3), int64(1), object(10)
memory usage: 100.8+ KB


Unnamed: 0,age,sex,cp,target
count,920.0,920.0,920.0,920.0
mean,53.51087,0.78913,3.25,0.995652
std,9.424685,0.408148,0.930969,1.142693
min,28.0,0.0,1.0,0.0
25%,47.0,1.0,3.0,0.0
50%,54.0,1.0,4.0,1.0
75%,60.0,1.0,4.0,2.0
max,77.0,1.0,4.0,4.0


In [33]:
data.replace("?", np.nan, inplace=True)

for col in data.columns:
    data[col] = pd.to_numeric(data[col], errors="coerce")


In [34]:
for col in data.select_dtypes(include=[np.number]).columns:
    data[col] = data[col].fillna(data[col].median())


## Data Transformation

In [35]:
data["age_group"] = pd.cut(
    data["age"],
    bins=[0, 40, 55, 70, 120],
    labels=["young", "middle", "senior", "elder"]
)


## Categorical Encoding

In [36]:
data["age_group"] = pd.cut(
    data["age"],
    bins=[0, 40, 55, 70, 120],
    labels=["young", "middle", "senior", "elder"]
)


## Final Dataset Ready for ML

In [37]:
data["weight_est"] = data["chol"] / 2.5
data["bmi"] = data["weight_est"] / (1.70 ** 2)


In [38]:
data["age_chol_interaction"] = data["age"] * data["chol"]


In [39]:
scaler = StandardScaler()

scale_cols = ["age", "trestbps", "chol", "thalach", "oldpeak", "bmi"]
data[scale_cols] = scaler.fit_transform(data[scale_cols])


In [40]:
categorical_cols = ["sex", "cp", "restecg", "slope", "thal", "age_group"]
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)


In [41]:
data.info()
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   920 non-null    float64
 1   trestbps              920 non-null    float64
 2   chol                  920 non-null    float64
 3   fbs                   920 non-null    float64
 4   thalach               920 non-null    float64
 5   exang                 920 non-null    float64
 6   oldpeak               920 non-null    float64
 7   ca                    920 non-null    float64
 8   target                920 non-null    int64  
 9   weight_est            920 non-null    float64
 10  bmi                   920 non-null    float64
 11  age_chol_interaction  920 non-null    float64
 12  sex_1.0               920 non-null    bool   
 13  cp_2.0                920 non-null    bool   
 14  cp_3.0                920 non-null    bool   
 15  cp_4.0                9

Unnamed: 0,age,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target,weight_est,...,cp_4.0,restecg_1.0,restecg_2.0,slope_2.0,slope_3.0,thal_6.0,thal_7.0,age_group_middle,age_group_senior,age_group_elder
0,1.007386,0.705176,0.303643,1.0,0.489727,0.0,1.368109,0.0,0,93.2,...,False,False,True,False,True,True,False,False,True,False
1,1.432034,1.518569,0.789967,0.0,-1.181478,1.0,0.611589,3.0,2,114.4,...,True,False,True,True,False,False,False,False,True,False
2,1.432034,-0.650479,0.266939,0.0,-0.345875,1.0,1.651804,2.0,1,91.6,...,True,False,True,True,False,False,True,False,True,False
3,-1.752828,-0.108217,0.459634,0.0,1.961979,0.0,2.502889,0.0,0,100.0,...,False,False,False,False,True,False,False,False,False,False
4,-1.32818,-0.108217,0.037541,0.0,1.36512,0.0,0.517024,0.0,0,81.6,...,False,False,True,False,False,False,False,True,False,False


In [42]:
data.to_csv("heart_disease_processed.csv", index=False)
