# Dataset: 2025 Heart Disease (Smoking, Diabetes) — Cleaning Notebook

In [1]:
#Import modules required to perform data cleaning
import pandas as pd
import numpy as np
from pathlib import Path

# Load the raw heart-disease extract from the working directory into a DataFrame
raw_file = 'heart_2020_cleaned.csv'
df = pd.read_csv(filepath_or_buffer=raw_file)

# Preview the first five rows to confirm the file parsed as expected
df.head(5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


## Basic checks

In [2]:
# Column names, non-null counts and dtypes (df.info() prints straight to stdout)
df.info()

# Unique values per column, used to spot the categorical variables that will need
# relabelling in the cleaning step
for col in df.columns:
    unique_vals = df[col].unique()
    print(f"Column: {col}")
    print(f"Unique values ({len(unique_vals)}): {unique_vals}\n")

# Summary statistics for every column (numeric and categorical).
# Kept as the LAST expression of the cell on purpose: a notebook only renders the
# value of a cell's final expression, so when this line sat in the middle of the
# cell the table was computed and then silently discarded.
df.describe(include='all').T.head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

## Cleaning steps for Dataset 3

In [3]:
# 1) Standardise column names: trim surrounding whitespace, lowercase, and collapse
#    every run of characters outside [a-z0-9_] into a single underscore
df.columns = (df.columns
              .str.strip()
              .str.lower()
              .str.replace('[^a-z0-9_]+', '_', regex=True))

# 2) Handle missing values in the numeric columns
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Treat +/-inf as missing, then impute each numeric column with its own median
numeric_part = df[num_cols].replace([np.inf, -np.inf], np.nan)
df[num_cols] = numeric_part.fillna(numeric_part.median())

# 3) Categorical normalisation
# Maps the raw value (after strip + lowercase) to a clearer label, per column.
# Values that are not listed are left as they are.
category_labels = {
    # More descriptive diabetes status labels
    'diabetic': {
        'no': 'Non-Diabetic',
        'yes': 'Diabetic',
        'no, borderline diabetes': 'Prediabetic',
        'yes (during pregnancy)': 'Pregnancy Diabetes',
    },
    # "American Indian/Alaskan Native" is relabelled as "Native American"
    'race': {'american indian/alaskan native': 'Native American'},
    # More descriptive smoking status labels
    'smoking': {'yes': 'Smoker', 'no': 'Non-Smoker'},
}
for col, labels in category_labels.items():
    if col in df.columns:
        raw_values = df[col]
        # astype(str) would turn a missing value into the literal text 'nan';
        # mask() puts NaN back at the originally-missing positions so that
        # missingness is not hidden in the cleaned file.
        df[col] = (raw_values.astype(str).str.strip().str.lower()
                   .replace(labels)
                   .mask(raw_values.isna()))

# 4) Capitalise the first letter of each word in all string columns (the header
#    row is not affected), again keeping missing values missing
for col in df.select_dtypes(include='object').columns:
    raw_values = df[col]
    df[col] = raw_values.astype(str).str.title().mask(raw_values.isna())

# 5) Save the cleaned dataset into a new file (the raw file is left untouched)
out_file = 'heart_2020_recleaned.csv'
df.to_csv(out_file, index=False)
out_file

'heart_2020_recleaned.csv'