# Heart Disease Risk Analysis Using Key Health Indicators: A Comprehensive Study with CDC 2022 Data

In [159]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import sklearn as sk

In [160]:
# Read the raw CSV into a DataFrame
file_path = "../data/raw/heart.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five records
df.head(n=5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,1.57,63.5,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [161]:
# Display the number of rows in the dataset (len() of a DataFrame counts rows only)
len(df)

445132

## Preprocessing Data

In [162]:
# Print each column's name, non-null count and dtype for the freshly loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      445132 non-null  object 
 1   Sex                        445132 non-null  object 
 2   GeneralHealth              443934 non-null  object 
 3   PhysicalHealthDays         434205 non-null  float64
 4   MentalHealthDays           436065 non-null  float64
 5   LastCheckupTime            436824 non-null  object 
 6   PhysicalActivities         444039 non-null  object 
 7   SleepHours                 439679 non-null  float64
 8   RemovedTeeth               433772 non-null  object 
 9   HadHeartAttack             442067 non-null  object 
 10  HadAngina                  440727 non-null  object 
 11  HadStroke                  443575 non-null  object 
 12  HadAsthma                  443359 non-null  object 
 13  HadSkinCancer              44

In [163]:
# Columns removed from the frame before further processing
columns = [
    "State",
    "RemovedTeeth",
    "RaceEthnicityCategory",
    "ChestScan",
    "TetanusLast10Tdap",
    "HighRiskLastYear",
]

# errors="ignore" keeps this cell re-runnable: labels that were already dropped
# are skipped instead of raising KeyError on a second execution.
df = df.drop(columns=columns, errors="ignore")

In [164]:
# Preview the first rows after the column drop
df.head()

Unnamed: 0,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,HadHeartAttack,HadAngina,HadStroke,...,ECigaretteUsage,AgeCategory,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,CovidPos
0,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,No,No,No,...,Not at all (right now),Age 80 or older,,,,No,No,Yes,No,No
1,Female,Excellent,0.0,0.0,,No,6.0,No,No,No,...,Never used e-cigarettes in my entire life,Age 80 or older,1.6,68.04,26.57,No,No,No,No,No
2,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,No,No,No,...,Never used e-cigarettes in my entire life,Age 55 to 59,1.57,63.5,25.61,No,No,No,No,Yes
3,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,No,No,No,...,Never used e-cigarettes in my entire life,,1.65,63.5,23.3,No,No,Yes,Yes,No
4,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,No,No,No,...,Never used e-cigarettes in my entire life,Age 40 to 44,1.57,53.98,21.77,Yes,No,No,Yes,No


### Inspect missing values and remove duplicates

In [165]:
# Count the NaN entries of every column
missing_values = df.isna().sum()

# Show only the columns that have gaps, largest count first
(
    missing_values.loc[missing_values.gt(0)]
    .sort_values(ascending=False)
    .rename("Missing Values")
    .to_frame()
)

Unnamed: 0,Missing Values
PneumoVaxEver,77040
HIVTesting,66127
CovidPos,50764
BMI,48806
FluVaxLast12,47121
AlcoholDrinkers,46574
WeightInKilograms,42078
ECigaretteUsage,35660
SmokerStatus,35462
HeightInMeters,28652


In [166]:
# Keep a copy of the rows that repeat an earlier row (first occurrence is not flagged)
duplicate_rows = df.loc[df.duplicated(keep="first")]

# Drop those repeats from df itself, keeping each first occurrence
df.drop_duplicates(keep="first", inplace=True)

print(f"Removed {duplicate_rows.shape[0]} duplicate rows")

Removed 4226 duplicate rows


### Fill missing values (imputation)

In [167]:
# Fill missing values column by column: the most frequent value (mode) for
# object columns and the column mean for all other columns.

for col in df.columns:
    if df[col].isna().all():
        # Nothing to impute from: mode() would be empty and the mean would be NaN.
        continue
    if df[col].dtype == "object":
        # mode() ignores NaN by default; take the first entry when there are ties.
        mode_value = df[col].mode().iloc[0]
        df[col] = df[col].fillna(mode_value)
    else:
        mean_value = df[col].mean()
        # Note: round(2) is applied to the whole column, not only the filled entries.
        df[col] = df[col].fillna(mean_value).round(2)

In [168]:
# Re-check non-null counts and dtypes after the cleaning steps above
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 440906 entries, 0 to 445131
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Sex                        440906 non-null  object 
 1   GeneralHealth              440906 non-null  object 
 2   PhysicalHealthDays         440906 non-null  float64
 3   MentalHealthDays           440906 non-null  float64
 4   LastCheckupTime            440906 non-null  object 
 5   PhysicalActivities         440906 non-null  object 
 6   SleepHours                 440906 non-null  float64
 7   HadHeartAttack             440906 non-null  object 
 8   HadAngina                  440906 non-null  object 
 9   HadStroke                  440906 non-null  object 
 10  HadAsthma                  440906 non-null  object 
 11  HadSkinCancer              440906 non-null  object 
 12  HadCOPD                    440906 non-null  object 
 13  HadDepressiveDisorder      440906 

### Categorical to Numerical Conversion

In [169]:
# Value-to-code tables, keyed by column name, consumed by map_columns and
# preprocess_data. The "Boolean" entry is not a column: preprocess_data applies
# it to every remaining object column.
mappings = {
    "Sex": {"Female": 0, "Male": 1},
    # Codes start at 0 for this column only.
    "GeneralHealth": {
        level: code
        for code, level in enumerate(
            ["Excellent", "Very good", "Good", "Fair", "Poor"]
        )
    },
    # All tables below number their answers from 1 in the listed order.
    "LastCheckupTime": {
        answer: code
        for code, answer in enumerate(
            [
                "Within past year (anytime less than 12 months ago)",
                "Within past 2 years (1 year but less than 2 years ago)",
                "Within past 5 years (2 years but less than 5 years ago)",
                "5 or more years ago",
            ],
            start=1,
        )
    },
    "HadDiabetes": {
        answer: code
        for code, answer in enumerate(
            [
                "Yes",
                "Yes, but only during pregnancy (female)",
                "No",
                "No, pre-diabetes or borderline diabetes",
            ],
            start=1,
        )
    },
    "SmokerStatus": {
        answer: code
        for code, answer in enumerate(
            [
                "Current smoker - now smokes every day",
                "Current smoker - now smokes some days",
                "Former smoker",
                "Never smoked",
            ],
            start=1,
        )
    },
    "ECigaretteUsage": {
        answer: code
        for code, answer in enumerate(
            [
                "Never used e-cigarettes in my entire life",
                "Use them every day",
                "Use them some days",
                "Not at all (right now)",
            ],
            start=1,
        )
    },
    "AgeCategory": {
        band: code
        for code, band in enumerate(
            [
                "Age 18 to 24",
                "Age 25 to 29",
                "Age 30 to 34",
                "Age 35 to 39",
                "Age 40 to 44",
                "Age 45 to 49",
                "Age 50 to 54",
                "Age 55 to 59",
                "Age 60 to 64",
                "Age 65 to 69",
                "Age 70 to 74",
                "Age 75 to 79",
                "Age 80 or older",
            ],
            start=1,
        )
    },
    "CovidPos": {
        answer: code
        for code, answer in enumerate(
            [
                "Yes",
                "No",
                "Tested positive using home test without a health professional",
            ],
            start=1,
        )
    },
    "Boolean": {"Yes": 1, "No": 0},
}

def map_columns(df, column_mappings=None):
    """Encode categorical columns of ``df`` in place using value-to-code tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    column_mappings : dict, optional
        ``{column name: {raw value: code}}``. Defaults to the notebook-level
        ``mappings`` table. Entries whose column is absent from ``df`` are
        skipped.

    Returns
    -------
    None

    Warns
    -----
    UserWarning
        When a column holds non-null values that are missing from its table;
        ``Series.map`` converts those values to NaN.
    """
    import warnings

    if column_mappings is None:
        column_mappings = mappings

    for column, mapping in column_mappings.items():
        if column not in df.columns:
            continue

        series = df[column]
        present = series.notna()
        is_key = series.isin(list(mapping))

        # Re-running the cell must not wipe an encoded column: if no value is a
        # raw key and every non-null value is already one of the codes, mapping
        # again would turn the whole column into NaN, so leave it alone.
        already_encoded = (
            present.any()
            and not is_key.any()
            and series[present].isin(list(mapping.values())).all()
        )
        if already_encoded:
            continue

        # Surface values the table does not know instead of losing them silently.
        unknown = series[present & ~is_key].unique()
        if len(unknown) > 0:
            warnings.warn(
                f"Column {column!r}: {len(unknown)} distinct value(s) missing from "
                f"the mapping will become NaN, e.g. {list(unknown[:5])}",
                stacklevel=2,
            )

        df[column] = series.map(mapping)

def preprocess_data(df):
    """Encode every categorical column of ``df`` in place.

    First applies the per-column tables via ``map_columns``; then maps each
    remaining object-dtype column that has no entry in ``mappings`` with the
    ``mappings["Boolean"]`` table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.

    Returns
    -------
    None

    Warns
    -----
    UserWarning
        When a remaining object column holds non-null values outside the
        Boolean table; ``Series.map`` converts those values to NaN.
    """
    import warnings

    # Apply the per-column tables first
    map_columns(df)

    # Every object column left over is mapped with the Boolean table
    yes_no = mappings["Boolean"]
    for column in df.columns:
        if column in mappings or df[column].dtype != "object":
            continue

        values = df[column]
        # Report values the Boolean table cannot encode before they are lost.
        unexpected = values[values.notna() & ~values.isin(list(yes_no))].unique()
        if len(unexpected) > 0:
            warnings.warn(
                f"Column {column!r}: {len(unexpected)} distinct value(s) outside "
                f"the Boolean mapping will become NaN, e.g. {list(unexpected[:5])}",
                stacklevel=2,
            )

        df[column] = values.map(yes_no)

# Encode the categorical columns of df in place (preprocess_data has no return value)
preprocess_data(df)