In [2]:
# Import dependencies for data cleansing
import pandas as pd
from sklearn.model_selection import train_test_split


# Import Data

In [3]:
# Load the raw survey data from heart_2020_cleaned.csv into a DataFrame
heart_2020_cleaned = pd.read_csv(filepath_or_buffer='heart_2020_cleaned.csv')
# Preview the first 5 rows to confirm the file parsed as expected
heart_2020_cleaned.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [4]:
# Review the dtype pandas inferred for every column of the loaded frame,
# to see which columns are numeric (float64) and which are not (object).
heart_2020_cleaned.dtypes

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

# Separating Datasets for balancing purposes

In [5]:

# Split the rows into two subsets according to the HeartDisease label:
# one frame for the 'Yes' cases and one for the 'No' cases.
heart_disease_label = heart_2020_cleaned['HeartDisease']
heart_disease_yes = heart_2020_cleaned.loc[heart_disease_label.eq('Yes')]
heart_disease_no = heart_2020_cleaned.loc[heart_disease_label.eq('No')]



In [6]:
# Preview the first 5 rows of the subset whose HeartDisease value is 'Yes'
heart_disease_yes.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
5,Yes,28.87,Yes,No,No,6.0,0.0,Yes,Female,75-79,Black,No,No,Fair,12.0,No,No,No
10,Yes,34.3,Yes,No,No,30.0,0.0,Yes,Male,60-64,White,Yes,No,Poor,15.0,Yes,No,No
35,Yes,32.98,Yes,No,Yes,10.0,0.0,Yes,Male,75-79,White,Yes,Yes,Poor,4.0,No,No,Yes
42,Yes,25.06,No,No,No,0.0,0.0,Yes,Female,80 or older,White,Yes,No,Good,7.0,No,No,Yes
43,Yes,30.23,Yes,No,No,6.0,2.0,Yes,Female,75-79,White,Yes,Yes,Fair,8.0,No,Yes,No


In [7]:
# Preview the first 5 rows of the subset whose HeartDisease value is 'No'
heart_disease_no.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [8]:
# Balancing step 1: both classes will be sampled down to the same row count,
# which is the size of whichever subset is smaller.
min_count = min(map(len, (heart_disease_yes, heart_disease_no)))
# Show the per-class sample size that will be used
min_count


27373

In [9]:
# Balancing step 2: draw min_count rows from each class subset.
# The same fixed seed is passed to both draws so the result is reproducible.
sampling_options = {'n': min_count, 'random_state': 42}
balanced_heart_disease_yes = heart_disease_yes.sample(**sampling_options)
balanced_heart_disease_no = heart_disease_no.sample(**sampling_options)


# Combining into a single balanced dataset


In [10]:
# Stack the two equally sized class samples into one frame.
# ignore_index=True discards the original row labels and renumbers from 0,
# which is what concat(...).reset_index(drop=True) did.
balanced_dataset = pd.concat(
    [balanced_heart_disease_yes, balanced_heart_disease_no],
    ignore_index=True,
)
# Display the first 5 rows of the balanced dataset.
# Only the last expression of a cell is rendered automatically, so the preview
# has to go through display() to be shown at all.
display(balanced_dataset.head())
# Validate the shape of the balanced dataset (rows, columns)
balanced_dataset.shape


(54746, 18)

# Train/Test Split

In [11]:
# Split the balanced dataset into training (80%) and testing (20%) sets.
# random_state fixes the shuffle so the split is reproducible.
# Each part is copied explicitly: later cells add and drop columns on these
# frames, and doing that on a row selection of balanced_dataset can raise
# pandas' SettingWithCopyWarning.
train_set, test_set = (
    split.copy()
    for split in train_test_split(balanced_dataset, test_size=0.2, random_state=42)
)


In [12]:

# Report the row counts of the two splits as (training rows, testing rows)
tuple(len(split) for split in (train_set, test_set))

(43796, 10950)

# Encoding 

In [13]:
# Encoding age_category
# AgeCategory labels listed from youngest to oldest; a label's position in
# this list is the ordinal value it will be encoded as.
age_category_order = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49',
                      '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older']

# Lookup table from age-category label to its ordinal rank (0 = '18-24').
# NOTE: a label that is not in this table becomes NaN when passed through
# Series.map.
age_category_mapping = {category: index for index, category in enumerate(age_category_order)}

# The mapping is deliberately not applied to train_set in this cell. The next
# cell applies it to both the training and testing sets, so doing it here is
# redundant, and it would raise KeyError if this cell were re-run after
# 'AgeCategory' has been dropped.

In [14]:
# Build the encoded feature tables for the training and testing sets.
#
# New frames are produced instead of editing train_set / test_set in place,
# so this cell can be re-run safely. An in-place drop of 'AgeCategory' raises
# KeyError the second time the cell runs.


def _with_ordinal_age(frame):
    """Return a copy of ``frame`` with 'AgeCategory' replaced by 'AgeCategoryOrdinal'.

    'AgeCategoryOrdinal' holds the value age_category_mapping assigns to each
    'AgeCategory' label (NaN for a label missing from the mapping).
    """
    return (
        frame
        .assign(AgeCategoryOrdinal=frame['AgeCategory'].map(age_category_mapping))
        .drop(columns='AgeCategory')
    )


train_features = _with_ordinal_age(train_set)
test_features = _with_ordinal_age(test_set)

# Category levels are taken from the training set only and then imposed on
# both sets. Calling get_dummies independently on each set derives the levels
# from whatever values happen to be present, so a level absent from one set
# would give the two tables different columns, and drop_first could drop a
# different baseline level in each.
categorical_columns = [
    column for column in train_features.columns
    if not pd.api.types.is_numeric_dtype(train_features[column])
]
category_levels = {
    column: sorted(train_features[column].dropna().unique())
    for column in categorical_columns
}


def _one_hot(frame):
    """One-hot encode the categorical columns of ``frame`` using the training levels.

    The first level of every column is dropped (drop_first=True). A value that
    is not one of the training levels is encoded as all-zero dummies.
    """
    typed = frame.assign(**{
        column: pd.Categorical(frame[column], categories=levels)
        for column, levels in category_levels.items()
    })
    return pd.get_dummies(typed, drop_first=True)


# Apply one-hot encoding to the remaining non-numeric columns ('AgeCategory'
# has already been replaced by its ordinal encoding above).
encoded_train_set = _one_hot(train_features)
encoded_test_set = _one_hot(test_features)

# Both tables must expose exactly the same features, in the same order.
assert list(encoded_test_set.columns) == list(encoded_train_set.columns)

In [15]:
# Convert boolean columns to binary (1/0)
#
# The boolean columns are discovered from the frames themselves rather than
# listed by hand. A hand-maintained list is easy to leave incomplete: columns
# such as 'Sex_Male', the 'Race_*' dummies and 'Diabetic_No, borderline diabetes'
# would keep their boolean dtype and be exported as True/False instead of 1/0.


def _booleans_to_binary(frame):
    """Return a copy of ``frame`` with every bool-dtype column cast to int.

    True becomes 1 and False becomes 0; columns of any other dtype are left
    unchanged.
    """
    boolean_columns = frame.select_dtypes(include='bool').columns
    return frame.astype({column: int for column in boolean_columns})


# Names of the training-set columns that are converted, kept under the
# original variable name for reference.
columns_to_convert = list(encoded_train_set.select_dtypes(include='bool').columns)

encoded_train_set = _booleans_to_binary(encoded_train_set)
encoded_test_set = _booleans_to_binary(encoded_test_set)


In [16]:
# Sanity-check the encoded tables: for the training set and then the testing
# set, show the first rows followed by the (rows, columns) shape.
for encoded_set in (encoded_train_set, encoded_test_set):
    display(encoded_set.head())
    display(encoded_set.shape)

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,AgeCategoryOrdinal,HeartDisease_Yes,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,...,Diabetic_Yes,Diabetic_Yes (during pregnancy),PhysicalActivity_Yes,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
49873,28.34,0.0,0.0,8.0,9,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,0
29833,26.58,0.0,0.0,7.0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
34118,23.73,0.0,0.0,7.0,4,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7237,23.71,0.0,0.0,7.0,10,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
14143,29.62,0.0,4.0,6.0,7,1,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


(43796, 27)

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,AgeCategoryOrdinal,HeartDisease_Yes,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,...,Diabetic_Yes,Diabetic_Yes (during pregnancy),PhysicalActivity_Yes,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
13501,53.22,30.0,30.0,16.0,8,1,1,0,0,1,...,0,0,0,0,0,1,0,1,0,0
15356,21.52,0.0,0.0,8.0,7,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
11209,35.62,0.0,0.0,8.0,11,1,1,0,0,0,...,0,0,1,0,0,0,1,0,1,0
13251,18.32,30.0,0.0,7.0,12,1,1,0,1,1,...,0,0,0,1,0,0,0,0,1,0
27143,23.67,30.0,0.0,2.0,6,1,1,0,0,1,...,0,0,0,1,0,0,0,1,1,0


(10950, 27)

In [17]:
# Write each encoded table to its own CSV file so the model-training notebooks
# can load them. index=False keeps the row labels out of the files.
export_targets = {
    'heart_2020_encoded_train.csv': encoded_train_set,
    'heart_2020_encoded_test.csv': encoded_test_set,
}
for csv_path, encoded_set in export_targets.items():
    encoded_set.to_csv(csv_path, index=False)

In [18]:
# List the column labels of the encoded training set, to check by eye which
# features (and the 'HeartDisease_Yes' label column) the encoded table holds.
encoded_train_set.columns


Index(['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime',
       'AgeCategoryOrdinal', 'HeartDisease_Yes', 'Smoking_Yes',
       'AlcoholDrinking_Yes', 'Stroke_Yes', 'DiffWalking_Yes', 'Sex_Male',
       'Race_Asian', 'Race_Black', 'Race_Hispanic', 'Race_Other', 'Race_White',
       'Diabetic_No, borderline diabetes', 'Diabetic_Yes',
       'Diabetic_Yes (during pregnancy)', 'PhysicalActivity_Yes',
       'GenHealth_Fair', 'GenHealth_Good', 'GenHealth_Poor',
       'GenHealth_Very good', 'Asthma_Yes', 'KidneyDisease_Yes',
       'SkinCancer_Yes'],
      dtype='object')