In [1044]:
# Import dependencies for data cleansing
import pandas as pd
from sklearn.model_selection import train_test_split


# Import Data

In [1045]:
# Load the 2022 survey extract (the file name suggests rows with missing values were already removed)
data_2022 = pd.read_csv(filepath_or_buffer='heart_2022_no_nans.csv')
# Preview the first five rows
data_2022.head(n=5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [1046]:
# Load the 2020 dataset, which serves as the reference schema for the 2022 data
heart_2020_cleaned = pd.read_csv(filepath_or_buffer='heart_2020_cleaned.csv')
# Preview the first five rows
heart_2020_cleaned.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [1047]:
# List every column of the raw 2022 frame with its pandas dtype (taken before any renaming),
# to see which columns are numeric and which hold text categories
data_2022.dtypes


State                         object
Sex                           object
GeneralHealth                 object
PhysicalHealthDays           float64
MentalHealthDays             float64
LastCheckupTime               object
PhysicalActivities            object
SleepHours                   float64
RemovedTeeth                  object
HadHeartAttack                object
HadAngina                     object
HadStroke                     object
HadAsthma                     object
HadSkinCancer                 object
HadCOPD                       object
HadDepressiveDisorder         object
HadKidneyDisease              object
HadArthritis                  object
HadDiabetes                   object
DeafOrHardOfHearing           object
BlindOrVisionDifficulty       object
DifficultyConcentrating       object
DifficultyWalking             object
DifficultyDressingBathing     object
DifficultyErrands             object
SmokerStatus                  object
ECigaretteUsage               object
C

In [1048]:
# List every column of the 2020 frame with its pandas dtype; this is the schema
# the 2022 data is reshaped to match in the following sections
heart_2020_cleaned.dtypes

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

# Renaming Columns and Selecting Columns

In [1049]:
# Rename the 2022 columns to the names used by the 2020 dataset so both frames
# share one schema. `rename` ignores keys that are not present in the frame, so
# re-running this cell after the columns were already renamed is a no-op.
#
# NOTE(review): a shared name does not guarantee a shared definition -- each pair
# below should be checked against the survey codebooks. In particular:
#   * 'HadHeartAttack' -> 'HeartDisease': the 2022 frame also carries 'HadAngina';
#     confirm whether the 2020 'HeartDisease' flag covers more than heart attacks.
#   * 'AlcoholDrinkers' -> 'AlcoholDrinking': confirm both years ask the same question.
columns_2022_to_2020 = {
    'PhysicalHealthDays': 'PhysicalHealth',
    'MentalHealthDays': 'MentalHealth',
    'SleepHours': 'SleepTime',
    'HadHeartAttack': 'HeartDisease',
    'PhysicalActivities': 'PhysicalActivity',
    'AlcoholDrinkers': 'AlcoholDrinking',
    'SmokerStatus': 'Smoking',  # multi-level status in 2022; the 2020 column is Yes/No
    'HadStroke': 'Stroke',
    'DifficultyWalking': 'DiffWalking',
    'HadKidneyDisease': 'KidneyDisease',
    'HadSkinCancer': 'SkinCancer',
    'HadAsthma': 'Asthma',
    'RaceEthnicityCategory': 'Race',  # category labels differ from 2020 and need mapping
    'HadDiabetes': 'Diabetic',  # category labels differ from 2020 and need mapping
    'GeneralHealth': 'GenHealth',  # counterpart of the 2020 GenHealth column
}

# Rebind instead of mutating in place: the result is the same frame contents,
# without the hidden-state pitfalls of `inplace=True`.
data_2022 = data_2022.rename(columns=columns_2022_to_2020)





# Final Selection of Columns

In [1050]:
# The 18 columns of the 2020 dataset, listed in the 2020 column order.
columns_needed = [
    'HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke', 'PhysicalHealth',
    'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic',
    'PhysicalActivity', 'GenHealth', 'SleepTime', 'Asthma', 'KidneyDisease', 'SkinCancer',
]

# Take an explicit copy: several columns of this frame are reassigned further
# down, and copying at the point of selection guarantees those writes never
# touch (or warn about) `data_2022`.
data_2022_final = data_2022[columns_needed].copy()


In [1051]:
# Preview the first 5 rows of the column-reduced 2022 frame; the category labels
# shown here are still the raw 2022 ones (not yet mapped to the 2020 labels)
data_2022_final.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,27.99,Former smoker,No,No,4.0,0.0,No,Female,Age 65 to 69,"White only, Non-Hispanic",No,Yes,Very good,9.0,No,No,No
1,No,30.13,Former smoker,No,No,0.0,0.0,No,Male,Age 70 to 74,"White only, Non-Hispanic",Yes,Yes,Very good,6.0,No,No,No
2,No,31.66,Former smoker,Yes,No,0.0,0.0,Yes,Male,Age 75 to 79,"White only, Non-Hispanic",No,No,Very good,8.0,No,No,No
3,No,31.32,Never smoked,No,No,5.0,0.0,Yes,Female,Age 80 or older,"White only, Non-Hispanic",No,Yes,Fair,9.0,No,No,Yes
4,No,33.07,Never smoked,No,No,3.0,15.0,No,Female,Age 80 or older,"White only, Non-Hispanic",No,Yes,Good,5.0,No,No,No


# HeartDisease

In [1052]:
# Compare the HeartDisease class balance of the 2020 data with the 2022 subset.
# NOTE(review): if the 'Yes' share differs noticeably between the two years,
# confirm that both columns are built from the same survey question.
print(
    heart_2020_cleaned['HeartDisease'].value_counts(),
    data_2022_final['HeartDisease'].value_counts(),
    sep='\n',
)

HeartDisease
No     292422
Yes     27373
Name: count, dtype: int64
HeartDisease
No     232587
Yes     13435
Name: count, dtype: int64


# BMI

In [1053]:
# Frequency of each distinct BMI value in the 2020 data and in the 2022 subset
# (BMI is a float column, so this lists many distinct values)
print(
    heart_2020_cleaned['BMI'].value_counts(),
    data_2022_final['BMI'].value_counts(),
    sep='\n',
)


BMI
26.63    3762
27.46    2767
27.44    2723
24.41    2696
27.12    2525
         ... 
59.85       1
50.59       1
92.53       1
62.95       1
46.56       1
Name: count, Length: 3604, dtype: int64
BMI
26.63    2727
27.46    2040
27.44    1964
24.41    1922
27.12    1893
         ... 
41.65       1
33.24       1
25.57       1
13.64       1
45.28       1
Name: count, Length: 3514, dtype: int64


# Smoking

In [1054]:
# Compare the Smoking categories of the 2020 data with those of the 2022 subset
# to see whether the labels need to be harmonised
print(
    heart_2020_cleaned['Smoking'].value_counts(),
    data_2022_final['Smoking'].value_counts(),
    sep='\n',
)

Smoking
No     187887
Yes    131908
Name: count, dtype: int64
Smoking
Never smoked                             147737
Former smoker                             68527
Current smoker - now smokes every day     21659
Current smoker - now smokes some days      8099
Name: count, dtype: int64


In [1055]:
# Collapse the four-level 2022 smoking status into the binary Yes/No used by the
# 2020 data: only 'Never smoked' becomes 'No'; former and current smokers become 'Yes'.
smoking_mapping_2022_to_2020 = {
    'Never smoked': 'No',
    'Former smoker': 'Yes',
    'Current smoker - now smokes every day': 'Yes',
    'Current smoker - now smokes some days': 'Yes',
}

# Work on an independent copy so the column assignment below never writes into a slice.
data_2022_final = data_2022_final.copy()

# `replace` leaves values that are not dictionary keys untouched, so re-running
# this cell on already-converted 'Yes'/'No' values is a no-op. (`Series.map`
# would turn every value that is not a key -- including 'Yes'/'No' -- into NaN.)
data_2022_final['Smoking'] = data_2022_final['Smoking'].replace(smoking_mapping_2022_to_2020)

# Fail loudly if a category outside the mapping (or a missing value) is present.
unexpected_smoking = set(data_2022_final['Smoking'].unique()) - {'Yes', 'No'}
assert not unexpected_smoking, f"Unmapped Smoking values: {unexpected_smoking}"



In [1056]:
# Confirm that Smoking now only holds the binary labels
smoking_counts_2022 = data_2022_final['Smoking'].value_counts()
print(smoking_counts_2022)

Smoking
No     147737
Yes     98285
Name: count, dtype: int64


# AlcoholDrinking

In [1057]:
# Compare the AlcoholDrinking class balance of the 2020 data with the 2022 subset.
# NOTE(review): a large gap in the 'Yes' share between the two years would
# suggest the two source columns do not measure the same thing -- verify
# against the survey documentation before combining them.
print(
    heart_2020_cleaned['AlcoholDrinking'].value_counts(),
    data_2022_final['AlcoholDrinking'].value_counts(),
    sep='\n',
)


AlcoholDrinking
No     298018
Yes     21777
Name: count, dtype: int64
AlcoholDrinking
Yes    135307
No     110715
Name: count, dtype: int64


# Stroke


In [1058]:
# Compare the Stroke class balance of the 2020 data with the 2022 subset
print(
    heart_2020_cleaned['Stroke'].value_counts(),
    data_2022_final['Stroke'].value_counts(),
    sep='\n',
)

Stroke
No     307726
Yes     12069
Name: count, dtype: int64
Stroke
No     235910
Yes     10112
Name: count, dtype: int64


# PhysicalHealth

In [1059]:
# Compare the distribution of PhysicalHealth values in the 2020 data and the 2022 subset
print(
    heart_2020_cleaned['PhysicalHealth'].value_counts(),
    data_2022_final['PhysicalHealth'].value_counts(),
    sep='\n',
)


PhysicalHealth
0.0     226589
30.0     19509
2.0      14880
1.0      10489
3.0       8617
5.0       7606
10.0      5453
15.0      5012
7.0       4629
4.0       4468
20.0      3216
14.0      2893
6.0       1270
25.0      1164
8.0        924
21.0       626
12.0       605
28.0       446
29.0       204
9.0        180
18.0       167
16.0       135
27.0       124
17.0       110
13.0        91
22.0        89
11.0        85
24.0        67
26.0        66
23.0        46
19.0        35
Name: count, dtype: int64
PhysicalHealth
0.0     152802
30.0     17160
2.0      14728
1.0      10058
3.0       9137
5.0       8939
10.0      6068
7.0       5221
4.0       4906
15.0      4845
20.0      2950
14.0      2813
6.0       1421
25.0      1123
8.0        961
12.0       610
21.0       584
28.0       365
9.0        211
29.0       178
18.0       151
16.0       126
27.0       103
17.0       101
11.0        95
13.0        87
22.0        72
26.0        64
24.0        60
23.0        54
19.0        29
Name: count, d

# MentalHealth

In [1060]:
# Compare the distribution of MentalHealth values in the 2020 data and the 2022 subset
print(
    heart_2020_cleaned['MentalHealth'].value_counts(),
    data_2022_final['MentalHealth'].value_counts(),
    sep='\n',
)


MentalHealth
0.0     205401
30.0     17373
2.0      16495
5.0      14149
10.0     10513
3.0      10466
15.0      9896
1.0       9291
7.0       5528
20.0      5431
4.0       5379
14.0      2048
25.0      1954
6.0       1510
8.0       1094
12.0       757
28.0       515
21.0       352
29.0       317
18.0       211
9.0        203
16.0       152
17.0       128
27.0       126
13.0       110
22.0        98
11.0        83
23.0        68
24.0        67
26.0        59
19.0        21
Name: count, dtype: int64
MentalHealth
0.0     150454
2.0      13810
30.0     13702
5.0      11623
3.0       8849
10.0      8831
1.0       8244
15.0      8061
20.0      4925
4.0       4568
7.0       4485
25.0      1647
14.0      1573
6.0       1326
8.0        973
12.0       715
28.0       484
21.0       301
29.0       261
18.0       166
16.0       165
9.0        145
17.0       132
27.0       113
22.0       101
13.0        96
11.0        69
24.0        68
26.0        59
23.0        51
19.0        25
Name: count, dtype

# DiffWalking


In [1061]:
# Compare the DiffWalking class balance of the 2020 data with the 2022 subset
print(
    heart_2020_cleaned['DiffWalking'].value_counts(),
    data_2022_final['DiffWalking'].value_counts(),
    sep='\n',
)


DiffWalking
No     275385
Yes     44410
Name: count, dtype: int64
DiffWalking
No     209952
Yes     36070
Name: count, dtype: int64


# Sex 

In [1062]:
# Compare the Sex categories of the 2020 data with those of the 2022 subset
print(
    heart_2020_cleaned['Sex'].value_counts(),
    data_2022_final['Sex'].value_counts(),
    sep='\n',
)

Sex
Female    167805
Male      151990
Name: count, dtype: int64
Sex
Female    127811
Male      118211
Name: count, dtype: int64


# AgeCategory

In [1063]:
# Compare the AgeCategory labels of the 2020 data with those of the 2022 subset
# to see whether the labels need to be harmonised
print(
    heart_2020_cleaned['AgeCategory'].value_counts(),
    data_2022_final['AgeCategory'].value_counts(),
    sep='\n',
)

AgeCategory
65-69          34151
60-64          33686
70-74          31065
55-59          29757
50-54          25382
80 or older    24153
45-49          21791
75-79          21482
18-24          21064
40-44          21006
35-39          20550
30-34          18753
25-29          16955
Name: count, dtype: int64
AgeCategory
Age 65 to 69       28557
Age 60 to 64       26720
Age 70 to 74       25739
Age 55 to 59       22224
Age 50 to 54       19913
Age 75 to 79       18136
Age 80 or older    17816
Age 40 to 44       16973
Age 45 to 49       16753
Age 35 to 39       15614
Age 30 to 34       13346
Age 18 to 24       13122
Age 25 to 29       11109
Name: count, dtype: int64


In [1064]:
# Translate the 2022 age-bracket labels ('Age 65 to 69') into the 2020 labels ('65-69').
age_category_mapping_2022_to_2020 = {
    'Age 65 to 69': '65-69',
    'Age 60 to 64': '60-64',
    'Age 70 to 74': '70-74',
    'Age 55 to 59': '55-59',
    'Age 50 to 54': '50-54',
    'Age 75 to 79': '75-79',
    'Age 80 or older': '80 or older',
    'Age 40 to 44': '40-44',
    'Age 45 to 49': '45-49',
    'Age 35 to 39': '35-39',
    'Age 30 to 34': '30-34',
    'Age 18 to 24': '18-24',
    'Age 25 to 29': '25-29',
}

# `replace` leaves values that are not dictionary keys untouched, so re-running
# this cell on already-converted labels is a no-op. (`Series.map` would turn
# every value that is not a key into NaN on a second run.)
data_2022_final['AgeCategory'] = data_2022_final['AgeCategory'].replace(age_category_mapping_2022_to_2020)

# Fail loudly if a label outside the mapping (or a missing value) is present.
unexpected_age_categories = (
    set(data_2022_final['AgeCategory'].unique()) - set(age_category_mapping_2022_to_2020.values())
)
assert not unexpected_age_categories, f"Unmapped AgeCategory values: {unexpected_age_categories}"


In [1065]:
# Confirm that AgeCategory now uses the 2020-style bracket labels
age_counts_2022 = data_2022_final['AgeCategory'].value_counts()
print(age_counts_2022)

AgeCategory
65-69          28557
60-64          26720
70-74          25739
55-59          22224
50-54          19913
75-79          18136
80 or older    17816
40-44          16973
45-49          16753
35-39          15614
30-34          13346
18-24          13122
25-29          11109
Name: count, dtype: int64


# Race

In [1066]:
# Compare the Race categories of the 2020 data with those of the 2022 subset
# to see whether the labels need to be harmonised
print(
    heart_2020_cleaned['Race'].value_counts(),
    data_2022_final['Race'].value_counts(),
    sep='\n',
)

Race
White                             245212
Hispanic                           27446
Black                              22939
Other                              10928
Asian                               8068
American Indian/Alaskan Native      5202
Name: count, dtype: int64
Race
White only, Non-Hispanic         186336
Hispanic                          22570
Black only, Non-Hispanic          19330
Other race only, Non-Hispanic     12205
Multiracial, Non-Hispanic          5581
Name: count, dtype: int64


In [1067]:
# Translate the 2022 race/ethnicity labels into the 2020 Race labels.
# The 2022 categories have no direct counterpart for the 2020 'Asian' and
# 'American Indian/Alaskan Native' labels, so after this mapping the 2022 frame
# only ever contains 'White', 'Hispanic', 'Black' and 'Other'.
race_mapping = {
    'White only, Non-Hispanic': 'White',
    'Hispanic': 'Hispanic',
    'Black only, Non-Hispanic': 'Black',
    'Other race only, Non-Hispanic': 'Other',
    'Multiracial, Non-Hispanic': 'Other',  # no direct 2020 equivalent, folded into 'Other'
}

# `replace` leaves values that are not dictionary keys untouched, so re-running
# this cell on already-converted labels is a no-op. (`Series.map` would turn
# 'White', 'Black' and 'Other' into NaN on a second run.)
data_2022_final['Race'] = data_2022_final['Race'].replace(race_mapping)

# Fail loudly if a label outside the mapping (or a missing value) is present.
unexpected_races = set(data_2022_final['Race'].unique()) - set(race_mapping.values())
assert not unexpected_races, f"Unmapped Race values: {unexpected_races}"


In [1068]:
# Confirm that Race now uses the 2020-style labels
race_counts_2022 = data_2022_final['Race'].value_counts()
print(race_counts_2022)

Race
White       186336
Hispanic     22570
Black        19330
Other        17786
Name: count, dtype: int64


# Diabetic

In [1069]:
# Compare the Diabetic categories of the 2020 data with those of the 2022 subset
# to see whether the labels need to be harmonised
print(
    heart_2020_cleaned['Diabetic'].value_counts(),
    data_2022_final['Diabetic'].value_counts(),
    sep='\n',
)


Diabetic
No                         269653
Yes                         40802
No, borderline diabetes      6781
Yes (during pregnancy)       2559
Name: count, dtype: int64
Diabetic
No                                         204834
Yes                                         33813
No, pre-diabetes or borderline diabetes      5392
Yes, but only during pregnancy (female)      1983
Name: count, dtype: int64


In [1070]:
# Translate the 2022 diabetes labels into the 2020 Diabetic labels.
diabetic_mapping_2022_to_2020 = {
    'No': 'No',  # non-diabetic
    'Yes': 'Yes',  # diabetic
    'Yes, but only during pregnancy (female)': 'Yes (during pregnancy)',
    'No, pre-diabetes or borderline diabetes': 'No, borderline diabetes',
}

# `replace` leaves values that are not dictionary keys untouched, so re-running
# this cell on already-converted labels is a no-op. (`Series.map` would turn the
# two long 2020-style labels into NaN on a second run.)
data_2022_final['Diabetic'] = data_2022_final['Diabetic'].replace(diabetic_mapping_2022_to_2020)

# Fail loudly if a label outside the mapping (or a missing value) is present.
unexpected_diabetic = (
    set(data_2022_final['Diabetic'].unique()) - set(diabetic_mapping_2022_to_2020.values())
)
assert not unexpected_diabetic, f"Unmapped Diabetic values: {unexpected_diabetic}"


In [1071]:
# Confirm that Diabetic now uses the 2020-style labels
diabetic_counts_2022 = data_2022_final['Diabetic'].value_counts()
print(diabetic_counts_2022)

Diabetic
No                         204834
Yes                         33813
No, borderline diabetes      5392
Yes (during pregnancy)       1983
Name: count, dtype: int64


# PhysicalActivity

In [1072]:
# Compare the PhysicalActivity class balance of the 2020 data with the 2022 subset
print(
    heart_2020_cleaned['PhysicalActivity'].value_counts(),
    data_2022_final['PhysicalActivity'].value_counts(),
    sep='\n',
)


PhysicalActivity
Yes    247957
No      71838
Name: count, dtype: int64
PhysicalActivity
Yes    191318
No      54704
Name: count, dtype: int64


# GenHealth

In [1073]:
# Compare the GenHealth categories of the 2020 data with those of the 2022 subset
print(
    heart_2020_cleaned['GenHealth'].value_counts(),
    data_2022_final['GenHealth'].value_counts(),
    sep='\n',
)

GenHealth
Very good    113858
Good          93129
Excellent     66842
Fair          34677
Poor          11289
Name: count, dtype: int64
GenHealth
Very good    86999
Good         77409
Excellent    41525
Fair         30659
Poor          9430
Name: count, dtype: int64


# SleepTime

In [1074]:
# Compare the distribution of SleepTime values in the 2020 data and the 2022 subset
print(
    heart_2020_cleaned['SleepTime'].value_counts(),
    data_2022_final['SleepTime'].value_counts(),
    sep='\n',
)


SleepTime
7.0     97751
8.0     97602
6.0     66721
5.0     19184
9.0     16041
10.0     7796
4.0      7750
12.0     2205
3.0      1992
2.0       788
1.0       551
11.0      415
14.0      243
16.0      236
15.0      189
18.0      102
13.0       97
20.0       64
24.0       30
17.0       21
22.0        9
19.0        3
23.0        3
21.0        2
Name: count, dtype: int64
SleepTime
7.0     76447
8.0     69927
6.0     53981
5.0     16417
9.0     11859
4.0      6478
10.0     5468
3.0      1618
12.0     1476
2.0       740
1.0       563
11.0      339
16.0      155
15.0      154
14.0      148
18.0       83
13.0       79
20.0       50
24.0       13
17.0       11
23.0        6
19.0        5
22.0        5
Name: count, dtype: int64


# Asthma

In [1075]:
# Compare the Asthma class balance of the 2020 data with the 2022 subset
print(
    heart_2020_cleaned['Asthma'].value_counts(),
    data_2022_final['Asthma'].value_counts(),
    sep='\n',
)

Asthma
No     276923
Yes     42872
Name: count, dtype: int64
Asthma
No     209493
Yes     36529
Name: count, dtype: int64


# KidneyDisease

In [1076]:
# Compare the KidneyDisease class balance of the 2020 data with the 2022 subset
print(
    heart_2020_cleaned['KidneyDisease'].value_counts(),
    data_2022_final['KidneyDisease'].value_counts(),
    sep='\n',
)

KidneyDisease
No     308016
Yes     11779
Name: count, dtype: int64
KidneyDisease
No     234738
Yes     11284
Name: count, dtype: int64


# SkinCancer

In [1077]:
# Compare the SkinCancer class balance of the 2020 data with the 2022 subset
print(
    heart_2020_cleaned['SkinCancer'].value_counts(),
    data_2022_final['SkinCancer'].value_counts(),
    sep='\n',
)

SkinCancer
No     289976
Yes     29819
Name: count, dtype: int64
SkinCancer
No     225001
Yes     21021
Name: count, dtype: int64


# Validating 2022 vs 2020 data once more

In [1078]:
# Print the column dtypes of the harmonised 2022 frame, followed by those of
# the 2020 frame, to check that both schemas agree
print(
    data_2022_final.dtypes,
    heart_2020_cleaned.dtypes,
    sep='\n',
)

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object
HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCa

In [1079]:
# Take the column order from the 2020 frame itself instead of a hand-typed list,
# so the 2022 layout is guaranteed to follow the 2020 one.
columns_order_2020 = heart_2020_cleaned.columns.tolist()

# Every 2020 column must exist in the 2022 frame before it can be reordered.
missing_2020_columns = [
    column for column in columns_order_2020 if column not in data_2022_final.columns
]
assert not missing_2020_columns, f"2022 data lacks 2020 columns: {missing_2020_columns}"

# Rearrange the 2022 dataset to match the 2020 columns order
data_2022_final = data_2022_final[columns_order_2020]


In [1080]:
# Preview the first 5 rows of the harmonised 2022 frame, now using the
# 2020 column order and category labels
data_2022_final.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,27.99,Yes,No,No,4.0,0.0,No,Female,65-69,White,No,Yes,Very good,9.0,No,No,No
1,No,30.13,Yes,No,No,0.0,0.0,No,Male,70-74,White,Yes,Yes,Very good,6.0,No,No,No
2,No,31.66,Yes,Yes,No,0.0,0.0,Yes,Male,75-79,White,No,No,Very good,8.0,No,No,No
3,No,31.32,No,No,No,5.0,0.0,Yes,Female,80 or older,White,No,Yes,Fair,9.0,No,No,Yes
4,No,33.07,No,No,No,3.0,15.0,No,Female,80 or older,White,No,Yes,Good,5.0,No,No,No


In [1081]:
# Count missing values per column of the 2022 data; a non-zero count would mean
# a value was lost during the harmonisation steps above
data_2022_final.isna().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [1082]:
# (rows, columns) of the harmonised 2022 frame, as a final size check before export
data_2022_final.shape

(246022, 18)

In [1083]:
# Write the harmonised (not yet encoded) 2022 data to disk, leaving out the row index
data_2022_final.to_csv(path_or_buf='heart_2022_cleaned.csv', index=False)


# Encoding

In [1084]:
# Ordinal-encode AgeCategory.
# The brackets are listed from youngest to oldest; each one is encoded as its
# position in this list (0 for '18-24' up to 12 for '80 or older').
age_category_order = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49',
                      '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older']

# Map the age categories to ordinal values
age_category_mapping = {category: index for index, category in enumerate(age_category_order)}

# 'AgeCategory' is removed by the encoding cell that follows, so only derive the
# ordinal column while the source column still exists; this keeps the cell
# re-runnable instead of raising a KeyError.
if 'AgeCategory' in data_2022_final.columns:
    age_category_ordinal = data_2022_final['AgeCategory'].map(age_category_mapping)
    # `map` yields NaN for any label missing from the mapping -- refuse to continue silently.
    assert age_category_ordinal.notna().all(), "AgeCategory holds labels missing from age_category_order"
    # `assign` returns a new frame, avoiding a write into a slice of an earlier frame.
    data_2022_final = data_2022_final.assign(AgeCategoryOrdinal=age_category_ordinal)

In [1085]:
# Replace 'AgeCategory' with its ordinal encoding.
# The guard makes the cell re-runnable: once 'AgeCategory' has been dropped, a
# second run skips this step instead of failing with a KeyError. The frame is
# rebuilt with `assign`/`drop` rather than mutated in place.
if 'AgeCategory' in data_2022_final.columns:
    age_category_ordinal = data_2022_final['AgeCategory'].map(age_category_mapping)
    # `map` yields NaN for any label missing from the mapping -- refuse to continue silently.
    assert age_category_ordinal.notna().all(), "AgeCategory holds labels missing from age_category_mapping"
    data_2022_final = (
        data_2022_final
        .assign(AgeCategoryOrdinal=age_category_ordinal)
        .drop(columns='AgeCategory')
    )

# One-hot encode the remaining non-numeric columns. `drop_first=True` drops the
# first level of each column, and only levels present in this frame get a column.
# NOTE(review): the 2022 Race column lacks two of the 2020 levels, so the dummy
# columns produced here may not line up with an encoding of the 2020 data --
# confirm before feeding both to the same model.
heart_2022_encoded = pd.get_dummies(data_2022_final, drop_first=True)


In [1086]:
# Convert boolean columns to binary (1/0).
# Select the columns by dtype rather than by a hand-typed list: a hard-coded list
# is easy to leave incomplete (e.g. missing 'Sex_Male' or the 'Race_*' dummies),
# which would leave the exported CSV with a mix of True/False and 1/0 columns.
columns_to_convert = heart_2022_encoded.select_dtypes(include=['bool']).columns.tolist()

heart_2022_encoded[columns_to_convert] = heart_2022_encoded[columns_to_convert].astype(int)



In [1087]:
# Show the first rows of the encoded frame, then its (rows, columns) shape
display(heart_2022_encoded.head(), heart_2022_encoded.shape)


Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,AgeCategoryOrdinal,HeartDisease_Yes,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,...,Diabetic_Yes,Diabetic_Yes (during pregnancy),PhysicalActivity_Yes,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,27.99,4.0,0.0,9.0,9,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
1,30.13,0.0,0.0,6.0,10,0,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0
2,31.66,0.0,0.0,8.0,11,0,1,1,0,1,...,0,0,0,0,0,0,1,0,0,0
3,31.32,5.0,0.0,9.0,12,0,0,0,0,1,...,0,0,1,1,0,0,0,0,0,1
4,33.07,3.0,15.0,5.0,12,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


(246022, 25)

In [1088]:
# Class balance of the encoded target column (1 = 'Yes', 0 = 'No')
target_counts_2022 = heart_2022_encoded['HeartDisease_Yes'].value_counts()
print(target_counts_2022)

HeartDisease_Yes
0    232587
1     13435
Name: count, dtype: int64


In [1089]:
# List the column names of the encoded dataset
encoded_columns_2022 = heart_2022_encoded.columns
print(encoded_columns_2022)


Index(['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime',
       'AgeCategoryOrdinal', 'HeartDisease_Yes', 'Smoking_Yes',
       'AlcoholDrinking_Yes', 'Stroke_Yes', 'DiffWalking_Yes', 'Sex_Male',
       'Race_Hispanic', 'Race_Other', 'Race_White',
       'Diabetic_No, borderline diabetes', 'Diabetic_Yes',
       'Diabetic_Yes (during pregnancy)', 'PhysicalActivity_Yes',
       'GenHealth_Fair', 'GenHealth_Good', 'GenHealth_Poor',
       'GenHealth_Very good', 'Asthma_Yes', 'KidneyDisease_Yes',
       'SkinCancer_Yes'],
      dtype='object')


In [1090]:
# Write the encoded 2022 data to disk (row index omitted) for use in model training
heart_2022_encoded.to_csv(path_or_buf='heart_2022_encoded.csv', index=False)
