# Heart Attack Risk and Prediction Dataset In India

In [3]:
# Import modules required to perform data cleaning
import pandas as pd
import numpy as np
from pathlib import Path

# Location of the raw export, relative to the notebook's working directory
raw_file = 'heart_attack_prediction_india.csv'

# Parse the CSV into the working DataFrame that the later cells operate on
df = pd.read_csv(filepath_or_buffer=raw_file)

# Preview the first five records (rendered as this cell's output)
df.head(n=5)

Unnamed: 0,Patient_ID,State_Name,Age,Gender,Diabetes,Hypertension,Obesity,Smoking,Alcohol_Consumption,Physical_Activity,...,Diastolic_BP,Air_Pollution_Exposure,Family_History,Stress_Level,Healthcare_Access,Heart_Attack_History,Emergency_Response_Time,Annual_Income,Health_Insurance,Heart_Attack_Risk
0,1,Rajasthan,42,Female,0,0,1,1,0,0,...,119,1,0,4,0,0,157,611025,0,0
1,2,Himachal Pradesh,26,Male,0,0,0,0,1,1,...,115,0,0,7,0,0,331,174527,0,0
2,3,Assam,78,Male,0,0,1,0,0,1,...,117,0,1,10,1,0,186,1760112,1,0
3,4,Odisha,58,Male,1,0,1,0,0,1,...,65,0,0,1,1,1,324,1398213,0,0
4,5,Karnataka,22,Male,0,0,0,0,0,1,...,109,0,0,9,0,0,209,97987,0,1


# Basic Checks

In [4]:
# Structural overview: column dtypes and non-null counts (printed to stdout)
df.info()

# Summary statistics for all columns, numeric and non-numeric alike.
# Transposed so each original column becomes a row; only the first 20 are shown.
df.describe(include='all').transpose().head(n=20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Patient_ID               10000 non-null  int64 
 1   State_Name               10000 non-null  object
 2   Age                      10000 non-null  int64 
 3   Gender                   10000 non-null  object
 4   Diabetes                 10000 non-null  int64 
 5   Hypertension             10000 non-null  int64 
 6   Obesity                  10000 non-null  int64 
 7   Smoking                  10000 non-null  int64 
 8   Alcohol_Consumption      10000 non-null  int64 
 9   Physical_Activity        10000 non-null  int64 
 10  Diet_Score               10000 non-null  int64 
 11  Cholesterol_Level        10000 non-null  int64 
 12  Triglyceride_Level       10000 non-null  int64 
 13  LDL_Level                10000 non-null  int64 
 14  HDL_Level                10000 non-null

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Patient_ID,10000.0,,,,5000.5,2886.89568,1.0,2500.75,5000.5,7500.25,10000.0
State_Name,10000.0,28.0,Chhattisgarh,399.0,,,,,,,
Age,10000.0,,,,49.3949,17.280301,20.0,35.0,49.0,64.0,79.0
Gender,10000.0,2.0,Male,5516.0,,,,,,,
Diabetes,10000.0,,,,0.0929,0.290307,0.0,0.0,0.0,0.0,1.0
Hypertension,10000.0,,,,0.2469,0.43123,0.0,0.0,0.0,0.0,1.0
Obesity,10000.0,,,,0.3037,0.459878,0.0,0.0,0.0,1.0,1.0
Smoking,10000.0,,,,0.3014,0.458889,0.0,0.0,0.0,1.0,1.0
Alcohol_Consumption,10000.0,,,,0.3528,0.477865,0.0,0.0,0.0,1.0,1.0
Physical_Activity,10000.0,,,,0.5958,0.490761,0.0,0.0,1.0,1.0,1.0


# Cleaning Steps

In [5]:
# Cleaning pipeline: standardise column names, neutralise non-finite values,
# drop identifier columns, derive helper features, and write the cleaned CSV.

# 1) Trim columns, standardise names
#    Strip surrounding whitespace, lower-case, then collapse every run of
#    characters outside [a-z0-9_] into a single underscore.
df.columns = (df.columns
              .str.strip()
              .str.lower()
              .str.replace('[^a-z0-9_]+', '_', regex=True))

# 2) Handle missing values (example)
#    Treat +/-inf as missing, then impute each numeric column with its median.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[num_cols] = df[num_cols].replace([np.inf, -np.inf], np.nan)
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# 3) Drop irrelevant columns
#    `drop` returns a new frame, so `df` itself keeps every column.
df_clean = df.drop(columns=["patient_id", "state_name"])

# 4) Create income quartiles to compare low vs high income status
df_clean["income_group"] = pd.qcut(
    df_clean["annual_income"],
    q=4,
    labels=["low", "lower-mid", "upper-mid", "high"],
)

# 5) Categorize diet quality
#    Bins are right-closed: [0, 3] -> poor, (3, 7] -> average, (7, 10] -> good.
#    include_lowest=True is required so a diet_score of exactly 0 lands in
#    "poor"; without it the lowest edge is excluded and 0 becomes NaN.
df_clean["diet_category"] = pd.cut(
    df_clean["diet_score"],
    bins=[0, 3, 7, 10],
    labels=["poor", "average", "good"],
    include_lowest=True,
)

# 6) Multiple risk factors
#    Row-wise sum of the listed indicator columns.
#    `heart_attack_risk` is deliberately NOT part of this list: it appears to be
#    the outcome this dataset is meant to predict, and summing it into a feature
#    would leak the label into `num_risk_factors`.
#    NOTE(review): `healthcare_access` is kept as in the original list, but its
#    name suggests a protective factor rather than a risk - confirm its encoding.
risk_factors = [
    "smoking",
    "diabetes",
    "hypertension",
    "obesity",
    "alcohol_consumption",
    "air_pollution_exposure",
    "family_history",
    "healthcare_access",
    "heart_attack_history",
]
df_clean["num_risk_factors"] = df_clean[risk_factors].sum(axis=1)

# 7) Save cleaned
#    Written without the index; the bare `out_file` echoes the path as output.
out_file = 'heart_attack_risk_India_cleaned.csv'
df_clean.to_csv(out_file, index=False)
out_file

'heart_attack_risk_India_cleaned.csv'