# Dataset: 2025 Heart Disease (Smoking, Diabetes) — Cleaning Notebook

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

# Location of the raw export, resolved relative to the notebook's working directory.
raw_file = 'heart_2025.csv'

# Read the whole file into memory with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=raw_file)

# Preview the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,smoking,diabetes,bmi,heart_disease
0,67,1,2,111,536,0,2,88,0,1.3,3,2,3,1,0,23.4,1
1,57,1,3,109,107,0,2,119,0,5.4,2,0,3,0,1,35.4,0
2,43,1,4,171,508,0,1,113,0,3.7,3,0,7,1,1,29.9,0
3,71,0,4,90,523,0,2,152,0,4.7,2,1,3,1,0,15.2,1
4,36,1,2,119,131,0,2,128,0,5.9,3,1,3,1,0,16.7,1


## Basic checks

In [4]:
# Print column dtypes, non-null counts and memory usage (written to stdout).
df.info()

# Summary statistics for every column, one row per column, capped at the
# first 20 columns so the rendered table stays readable.
df.describe(include='all').transpose().iloc[:20]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3069 entries, 0 to 3068
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            3069 non-null   int64  
 1   sex            3069 non-null   int64  
 2   cp             3069 non-null   int64  
 3   trestbps       3069 non-null   int64  
 4   chol           3069 non-null   int64  
 5   fbs            3069 non-null   int64  
 6   restecg        3069 non-null   int64  
 7   thalach        3069 non-null   int64  
 8   exang          3069 non-null   int64  
 9   oldpeak        3069 non-null   float64
 10  slope          3069 non-null   int64  
 11  ca             3069 non-null   int64  
 12  thal           3069 non-null   int64  
 13  smoking        3069 non-null   int64  
 14  diabetes       3069 non-null   int64  
 15  bmi            3069 non-null   float64
 16  heart_disease  3069 non-null   int64  
dtypes: float64(2), int64(15)
memory usage: 407.7 KB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,3069.0,52.50114,13.70752,29.0,41.0,53.0,64.0,76.0
sex,3069.0,0.555556,0.496985,0.0,0.0,1.0,1.0,1.0
cp,3069.0,2.489736,1.110417,1.0,2.0,2.0,3.0,4.0
trestbps,3069.0,145.306289,31.602321,90.0,118.0,146.0,172.0,200.0
chol,3069.0,347.219941,146.853319,100.0,215.0,350.0,475.0,600.0
fbs,3069.0,0.14565,0.352813,0.0,0.0,0.0,0.0,1.0
restecg,3069.0,1.004236,0.812016,0.0,0.0,1.0,2.0,2.0
thalach,3069.0,135.15738,43.353197,60.0,98.0,135.0,172.0,210.0
exang,3069.0,0.176279,0.381119,0.0,0.0,0.0,0.0,1.0
oldpeak,3069.0,3.189052,1.802815,0.0,1.6,3.2,4.7,6.2


## Cleaning steps (fill in as needed)

In [5]:
# Cleaning pipeline: normalise column names, impute numeric gaps, label the
# `sex` column, drop rows with an out-of-range age, and write the result to CSV.
# The cell rebinds `df` to the cleaned frame and leaves `out_file` as its output.

# 1) Trim columns, standardise names: strip whitespace, lower-case, and collapse
#    every run of characters outside [a-z0-9_] into a single underscore.
df.columns = (df.columns
              .str.strip()
              .str.lower()
              .str.replace('[^a-z0-9_]+','_', regex=True))

# 2) Handle missing values.
#    'number' selects every numeric dtype (int32, float32, nullable Int64, ...),
#    not only int64/float64, so no numeric column is silently skipped.
num_cols = df.select_dtypes(include='number').columns
# Treat +/-inf as missing so the median fill below covers them as well.
df[num_cols] = df[num_cols].replace([np.inf, -np.inf], np.nan)
# NOTE(review): the median is applied to every numeric column, which presumably
# includes integer-coded categoricals and the target column — confirm that
# median imputation is intended for those.
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# 3) Categorical normalisation: map the known spellings/codes of `sex` to
#    'Male' / 'Female'. Values that match no key are left as their cleaned token.
if 'sex' in df.columns:
    sex_labels = {'male':'Male','m':'Male','1':'Male','female':'Female','f':'Female','0':'Female'}
    sex_tokens = (df['sex'].astype(str).str.strip().str.lower()
                  # A column that ever held NaN is float64, so its codes
                  # stringify as '1.0'/'0.0'; drop the trailing '.0' so they
                  # still match the '1'/'0' keys.
                  .str.replace(r'\.0+$', '', regex=True))
    df['sex'] = sex_tokens.replace(sex_labels)

# 4) Range sanity checks: keep rows whose age lies in [AGE_MIN, AGE_MAX]
#    (both ends inclusive). `.copy()` detaches the result from the original
#    frame so later column assignments do not raise SettingWithCopyWarning.
AGE_MIN, AGE_MAX = 18, 100
if 'age' in df.columns:
    df = df[df['age'].between(AGE_MIN, AGE_MAX)].copy()

# 5) Save cleaned frame (row index omitted) and echo the output path.
out_file = 'heart_2025_cleaned.csv'
df.to_csv(out_file, index=False)
out_file

'heart_2025_cleaned.csv'