In [27]:
# Import dependencies for data cleansing
import pandas as pd
from sklearn.model_selection import train_test_split


# Import Data

In [28]:
# Load the cleaned 2020 heart-disease data from a CSV in the working directory.
heart_2020_cleaned = pd.read_csv(filepath_or_buffer='heart_2020_cleaned.csv')
# Preview the top five rows to confirm the file parsed as expected.
heart_2020_cleaned.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


# Separating the dataset by class for balancing

In [29]:

# Split the rows into one frame per HeartDisease label ('Yes' / 'No');
# rows carrying any other label end up in neither frame.
heart_disease_yes = heart_2020_cleaned.loc[heart_2020_cleaned['HeartDisease'].eq('Yes')]
heart_disease_no = heart_2020_cleaned.loc[heart_2020_cleaned['HeartDisease'].eq('No')]



In [30]:
# Preview the top five rows of the HeartDisease == 'Yes' subset.
heart_disease_yes.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
5,Yes,28.87,Yes,No,No,6.0,0.0,Yes,Female,75-79,Black,No,No,Fair,12.0,No,No,No
10,Yes,34.3,Yes,No,No,30.0,0.0,Yes,Male,60-64,White,Yes,No,Poor,15.0,Yes,No,No
35,Yes,32.98,Yes,No,Yes,10.0,0.0,Yes,Male,75-79,White,Yes,Yes,Poor,4.0,No,No,Yes
42,Yes,25.06,No,No,No,0.0,0.0,Yes,Female,80 or older,White,Yes,No,Good,7.0,No,No,Yes
43,Yes,30.23,Yes,No,No,6.0,2.0,Yes,Female,75-79,White,Yes,Yes,Fair,8.0,No,Yes,No


In [31]:
# Preview the top five rows of the HeartDisease == 'No' subset.
heart_disease_no.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [32]:
# The smaller of the two class subsets fixes how many rows are drawn from
# each class when the dataset is balanced.
min_count = min(map(len, (heart_disease_yes, heart_disease_no)))
# Show the per-class sample size.
min_count


27373

In [33]:
# Draw min_count rows from each class subset, using the same fixed seed for
# both draws so the selection is reproducible.
balanced_heart_disease_yes, balanced_heart_disease_no = (
    class_subset.sample(n=min_count, random_state=42)
    for class_subset in (heart_disease_yes, heart_disease_no)
)


# Combining into a single balanced dataset


In [34]:
# Stack the two equally sized class samples ('Yes' rows first, then 'No')
# and relabel the rows 0..n-1 so the original row labels are discarded.
balanced_dataset = pd.concat(
    [balanced_heart_disease_yes, balanced_heart_disease_no],
    ignore_index=True,
)
# Preview the top five rows of the combined frame.
balanced_dataset.head(n=5)


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,Yes,25.61,Yes,No,Yes,30.0,30.0,No,Female,50-54,Asian,No,Yes,Poor,3.0,No,No,Yes
1,Yes,27.12,Yes,No,No,3.0,0.0,No,Male,75-79,White,"No, borderline diabetes",Yes,Good,7.0,No,No,No
2,Yes,30.9,No,No,Yes,4.0,30.0,Yes,Female,70-74,White,Yes,No,Fair,7.0,Yes,Yes,No
3,Yes,27.46,No,No,No,0.0,0.0,No,Female,70-74,White,Yes,Yes,Excellent,8.0,No,No,No
4,Yes,26.63,Yes,No,No,0.0,0.0,No,Male,80 or older,White,No,No,Excellent,5.0,No,No,Yes


# Train/test split

In [35]:
# Hold out 20% of the balanced rows for testing; the fixed seed keeps the
# split reproducible across runs.
train_set, test_set = train_test_split(
    balanced_dataset,
    test_size=0.2,
    random_state=42,
)


In [36]:

# Row counts of the two splits, shown as (train, test).
train_set.shape[0], test_set.shape[0]

(43796, 10950)

# Encoding non-numeric columns

In [37]:
# One-hot encode every non-numeric column of the training split (this
# includes the HeartDisease label, which becomes HeartDisease_Yes).
# drop_first=True removes the first level of each encoded column so the
# resulting dummy columns are not perfectly collinear (dummy variable trap).
encoded_dataset = pd.get_dummies(train_set, drop_first=True)

# Show the preview and the shape as two separate outputs. Ending the cell
# with a `(head, shape)` tuple would fall back to a wrapped, plain-text repr
# of the DataFrame instead of the rich table rendering.
display(encoded_dataset.head())
display(encoded_dataset.shape)


(         BMI  PhysicalHealth  MentalHealth  SleepTime  HeartDisease_Yes  \
 49873  28.34             0.0           0.0        8.0             False   
 29833  26.58             0.0           0.0        7.0             False   
 34118  23.73             0.0           0.0        7.0             False   
 7237   23.71             0.0           0.0        7.0              True   
 14143  29.62             0.0           4.0        6.0              True   
 
        Smoking_Yes  AlcoholDrinking_Yes  Stroke_Yes  DiffWalking_Yes  \
 49873         True                False       False            False   
 29833        False                False       False            False   
 34118         True                False       False            False   
 7237          True                False       False            False   
 14143         True                False       False            False   
 
        Sex_Male  ...  Diabetic_Yes  Diabetic_Yes (during pregnancy)  \
 49873     False  ...         

# Encoding AgeCategory as an ordinal feature

In [39]:
# Age brackets listed from youngest to oldest; a bracket's position in this
# list becomes its ordinal code (0 for '18-24' ... 12 for '80 or older').
age_category_order = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49',
                      '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older']

# Lookup table: age bracket label -> ordinal position.
age_category_mapping = {category: index for index, category in enumerate(age_category_order)}

# Build the ordinal column on a NEW frame rather than assigning into
# train_set. train_set is the output of train_test_split, so writing a column
# into it can raise SettingWithCopyWarning, and it would also change what
# earlier cells that read train_set produce when they are re-run.
# The original 'AgeCategory' column is dropped once its ordinal form exists.
train_set_ordinal = (
    train_set
    .assign(AgeCategoryOrdinal=train_set['AgeCategory'].map(age_category_mapping))
    .drop(columns='AgeCategory')
)

# Series.map yields NaN for any label absent from the mapping; fail loudly
# instead of letting missing ordinals flow into the encoded features.
assert train_set_ordinal['AgeCategoryOrdinal'].notna().all(), \
    "AgeCategory contains values missing from age_category_order"

# One-hot encode the remaining non-numeric columns; AgeCategory is no longer
# among them because it is now represented by the numeric ordinal column.
# NOTE(review): test_set is not transformed in this section - the same mapping
# and encoding presumably need to be applied to it before evaluation; confirm
# against later cells.
encoded_dataset_with_ordinal_age = pd.get_dummies(train_set_ordinal, drop_first=True)

# Show a preview and the shape to verify the transformation.
display(encoded_dataset_with_ordinal_age.head())
display(encoded_dataset_with_ordinal_age.shape)


Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,AgeCategoryOrdinal,HeartDisease_Yes,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,...,Diabetic_Yes,Diabetic_Yes (during pregnancy),PhysicalActivity_Yes,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
49873,28.34,0.0,0.0,8.0,9,False,True,False,False,False,...,False,False,True,False,True,False,False,False,False,False
29833,26.58,0.0,0.0,7.0,1,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
34118,23.73,0.0,0.0,7.0,4,False,True,False,False,False,...,False,False,True,False,False,False,False,False,False,False
7237,23.71,0.0,0.0,7.0,10,True,True,False,False,False,...,False,False,True,False,False,False,False,False,False,False
14143,29.62,0.0,4.0,6.0,7,True,True,False,False,False,...,False,False,False,True,False,False,False,False,False,False


(43796, 27)