In [1]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, TargetEncoder
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_regression, make_swiss_roll
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR, SVC
import seaborn as sns
import pickle

In [2]:
#Read 2020 and 2022 Data in a DataFrame
heart_2020 = pd.read_csv('heart_2020_cleaned.csv')
heart_2022 = pd.read_csv('heart_2022_cleaned.csv')

In [3]:
#Display 2020 Data info
heart_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [4]:
#Display 2022 Data info
heart_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246022 entries, 0 to 246021
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      246022 non-null  object 
 1   BMI               246022 non-null  float64
 2   Smoking           246022 non-null  object 
 3   AlcoholDrinking   246022 non-null  object 
 4   Stroke            246022 non-null  object 
 5   PhysicalHealth    246022 non-null  float64
 6   MentalHealth      246022 non-null  float64
 7   DiffWalking       246022 non-null  object 
 8   Sex               246022 non-null  object 
 9   AgeCategory       246022 non-null  object 
 10  Race              246022 non-null  object 
 11  Diabetic          246022 non-null  object 
 12  PhysicalActivity  246022 non-null  object 
 13  GenHealth         246022 non-null  object 
 14  SleepTime         246022 non-null  float64
 15  Asthma            246022 non-null  object 
 16  KidneyDisease     24

In [5]:
# Define the order for age categories based on the dataset's unique values
age_category_order = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49',
                      '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older']

# Map the age categories to ordinal values for the 2020 data
age_category_mapping = {category: index for index, category in enumerate(age_category_order)}
heart_2020['AgeCategoryOrdinal'] = heart_2020['AgeCategory'].map(age_category_mapping)

In [6]:
#Drop the 2020 'AgeCategory' column
heart_2020 = heart_2020.drop('AgeCategory', axis=1)

In [7]:
#Dsiplay the 2020 DataFrame
heart_2020

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,AgeCategoryOrdinal
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,White,Yes,Yes,Very good,5.0,Yes,No,Yes,7
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,White,No,Yes,Very good,7.0,No,No,No,12
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,White,Yes,Yes,Fair,8.0,Yes,No,No,9
3,No,24.21,No,No,No,0.0,0.0,No,Female,White,No,No,Good,6.0,No,No,Yes,11
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,White,No,Yes,Very good,8.0,No,No,No,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,Hispanic,Yes,No,Fair,6.0,Yes,No,No,8
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,Hispanic,No,Yes,Very good,5.0,Yes,No,No,3
319792,No,24.24,No,No,No,0.0,0.0,No,Female,Hispanic,No,Yes,Good,6.0,No,No,No,5
319793,No,32.81,No,No,No,0.0,0.0,No,Female,Hispanic,No,No,Good,12.0,No,No,No,1


In [8]:
# Define the order for age categories based on the dataset's unique values
age_category_order = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49',
                      '50-54', '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older']

# Map the age categories to ordinal values for the 2022 data
age_category_mapping = {category: index for index, category in enumerate(age_category_order)}
heart_2022['AgeCategoryOrdinal'] = heart_2022['AgeCategory'].map(age_category_mapping)

In [9]:
#Drop the 2022 'AgeCategory' column
heart_2022 = heart_2022.drop('AgeCategory', axis=1)

In [10]:
#Drop the 2020 'Race' column
heart_2020 = heart_2020.drop('Race', axis=1)

In [11]:
#Drop the 2022 'Race' column
heart_2022 = heart_2022.drop('Race', axis=1)

In [12]:
#Seperate the 'target' column from the 2020 data
heart_2020_yes = heart_2020[heart_2020['HeartDisease'] == 'Yes']
heart_2020_no = heart_2020[heart_2020['HeartDisease'] == 'No']

In [13]:
#Set minimum size for sampling an equal number of rows from both dataframes
min_count = min(len(heart_2020_yes), len(heart_2020_no))
min_count

27373

In [14]:
#Use undersampling the balanced the 2020 data
balanced_heart_2020_yes = heart_2020_yes.sample(n=min_count, random_state=42)
balanced_heart_2020_no = heart_2020_no.sample(n=min_count, random_state=42)

In [15]:
#Concatenate the 2020 balanced data into a single dataframe
balanced_training_2020 = pd.concat([balanced_heart_2020_yes, balanced_heart_2020_no]).reset_index(drop=True)

In [16]:
#Train, test, split the data
train_set, test_set = train_test_split(balanced_training_2020, test_size=0.2, random_state=42)

In [17]:
#Encode the 2020 data
encoded_2020 = pd.get_dummies(train_set, drop_first=True)  
encoded_2020.head(), encoded_2020.shape

(         BMI  PhysicalHealth  MentalHealth  SleepTime  AgeCategoryOrdinal  \
 49873  28.34             0.0           0.0        8.0                   9   
 29833  26.58             0.0           0.0        7.0                   1   
 34118  23.73             0.0           0.0        7.0                   4   
 7237   23.71             0.0           0.0        7.0                  10   
 14143  29.62             0.0           4.0        6.0                   7   
 
        HeartDisease_Yes  Smoking_Yes  AlcoholDrinking_Yes  Stroke_Yes  \
 49873             False         True                False       False   
 29833             False        False                False       False   
 34118             False         True                False       False   
 7237               True         True                False       False   
 14143              True         True                False       False   
 
        DiffWalking_Yes  ...  Diabetic_Yes  Diabetic_Yes (during pregnancy)  \
 4987

In [18]:
#Display the encoded 2020 data
encoded_2020.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43796 entries, 49873 to 15795
Data columns (total 22 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   BMI                               43796 non-null  float64
 1   PhysicalHealth                    43796 non-null  float64
 2   MentalHealth                      43796 non-null  float64
 3   SleepTime                         43796 non-null  float64
 4   AgeCategoryOrdinal                43796 non-null  int64  
 5   HeartDisease_Yes                  43796 non-null  bool   
 6   Smoking_Yes                       43796 non-null  bool   
 7   AlcoholDrinking_Yes               43796 non-null  bool   
 8   Stroke_Yes                        43796 non-null  bool   
 9   DiffWalking_Yes                   43796 non-null  bool   
 10  Sex_Male                          43796 non-null  bool   
 11  Diabetic_No, borderline diabetes  43796 non-null  bool   
 12  Diabe

In [19]:
#Label the features as 'X' and the target as 'y'
X = encoded_2020.drop('HeartDisease_Yes', axis=1)
y = encoded_2020['HeartDisease_Yes']

In [20]:
#One hot encode the 'X' features
X_2020_encoded = pd.get_dummies(X, drop_first=True)

In [21]:
#Train, test, split the encoded 2020 data
X_train, X_test, y_train, y_test = train_test_split(X_2020_encoded, y, test_size=0.2, random_state=42)

In [22]:
#Scale the 2020 data
scaler = StandardScaler()
X_train_2020_scaled = scaler.fit_transform(X_train)
X_test_2020_scaled = scaler.transform(X_test)

In [23]:
#Create the SVC model and fit the scaled encoded 2020 data
model = SVC(kernel='linear')
model.fit(X_train_2020_scaled, y_train)

In [24]:
#Predict the 2020 data
linear_svc_predictions_2020 = model.predict(X_test_2020_scaled)

In [25]:
#Score the model on the 2020 data
print('2020 Linear SVC Train Accuracy: %.3f' % model.score(X_train_2020_scaled, y_train))
print('2020 Linear SVC Test Accuracy: %.3f' % model.score(X_test_2020_scaled, y_test))
print('2020 Linear SVC Accuracy Score: %.3f' % accuracy_score(y_test, linear_svc_predictions_2020))

2020 Linear SVC Train Accuracy: 0.761
2020 Linear SVC Test Accuracy: 0.764
2020 Linear SVC Accuracy Score: 0.764


In [26]:
#Display the 2020 confusion matrix
print("2020 Confusion Matrix:\n", confusion_matrix(y_test, linear_svc_predictions_2020))

2020 Confusion Matrix:
 [[3245 1130]
 [ 937 3448]]


In [27]:
#Seperate results into True negative/postives and False negative/positives
tn, fp, fn, tp = confusion_matrix(y_test, linear_svc_predictions_2020).ravel()

In [28]:
#Display the 2020 false rates
print(f"2020 False Positive Rate: {fp / (fp + tn)}")
print(f"2020 False Negative Rate: {fn / (fn + tp)}")

2020 False Positive Rate: 0.2582857142857143
2020 False Negative Rate: 0.2136830102622577


In [29]:
#Display 2020 true rates
print(f"2020 True Positive Rate: {tp / (tp + fn)}")
print(f"2020 True Negative Rate: {tn / (tn + fp)}")

2020 True Positive Rate: 0.7863169897377423
2020 True Negative Rate: 0.7417142857142857


In [30]:
#Display the 2020 Classification Report
cm_2020 = confusion_matrix(y_test, linear_svc_predictions_2020)
report_2020 = classification_report(y_test, linear_svc_predictions_2020)
print("Confusion Matrix for 2020 Data:")
print(cm_2020)
print("Classification Report for 2020 Data:")
print(report_2020)

Confusion Matrix for 2020 Data:
[[3245 1130]
 [ 937 3448]]
Classification Report for 2020 Data:
              precision    recall  f1-score   support

       False       0.78      0.74      0.76      4375
        True       0.75      0.79      0.77      4385

    accuracy                           0.76      8760
   macro avg       0.76      0.76      0.76      8760
weighted avg       0.76      0.76      0.76      8760



In [43]:
#Display 2020 accuracy, precision, recall and F1 scores
print(f'2020 Accuracy: {accuracy_score(y_test, linear_svc_predictions_2020)}')
print(f'2020 Precision: {precision_score(y_test, linear_svc_predictions_2020)}')
print(f'2020 Recall: {recall_score(y_test, linear_svc_predictions_2020)}')
print(f'2020 F1 Score: {f1_score(y_test, linear_svc_predictions_2020)}')

2020 Accuracy: 0.7640410958904109
2020 Precision: 0.7531673219746614
2020 Recall: 0.7863169897377423
2020 F1 Score: 0.7693852504741716


In [34]:
#Encode the 2022 data
encoded_2022 = pd.get_dummies(heart_2022, drop_first=True)  

In [35]:
#Display encoded 2022 data info
encoded_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246022 entries, 0 to 246021
Data columns (total 22 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   BMI                               246022 non-null  float64
 1   PhysicalHealth                    246022 non-null  float64
 2   MentalHealth                      246022 non-null  float64
 3   SleepTime                         246022 non-null  float64
 4   AgeCategoryOrdinal                246022 non-null  int64  
 5   HeartDisease_Yes                  246022 non-null  bool   
 6   Smoking_Yes                       246022 non-null  bool   
 7   AlcoholDrinking_Yes               246022 non-null  bool   
 8   Stroke_Yes                        246022 non-null  bool   
 9   DiffWalking_Yes                   246022 non-null  bool   
 10  Sex_Male                          246022 non-null  bool   
 11  Diabetic_No, borderline diabetes  246022 non-null  b

In [36]:
#Seperate 2022 data into 'X' and 'y'
X_2022 = encoded_2022.drop('HeartDisease_Yes', axis=1)
y_2022 = encoded_2022['HeartDisease_Yes']

In [37]:
#One Hot encode the 'X_2022' features
X_2022_encoded = pd.get_dummies(X_2022, drop_first=True)

In [38]:
#Display 2022 encoded data
X_2022_encoded

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,AgeCategoryOrdinal,Smoking_Yes,AlcoholDrinking_Yes,Stroke_Yes,DiffWalking_Yes,Sex_Male,...,Diabetic_Yes,Diabetic_Yes (during pregnancy),PhysicalActivity_Yes,GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good,Asthma_Yes,KidneyDisease_Yes,SkinCancer_Yes
0,27.99,4.0,0.0,9.0,9,True,False,False,False,False,...,False,False,True,False,False,False,True,False,False,False
1,30.13,0.0,0.0,6.0,10,True,False,False,False,True,...,True,False,True,False,False,False,True,False,False,False
2,31.66,0.0,0.0,8.0,11,True,True,False,True,True,...,False,False,False,False,False,False,True,False,False,False
3,31.32,5.0,0.0,9.0,12,False,False,False,True,False,...,False,False,True,True,False,False,False,False,False,True
4,33.07,3.0,15.0,5.0,12,False,False,False,False,False,...,False,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,32.28,0.0,0.0,6.0,8,False,True,False,False,True,...,False,False,True,False,False,False,True,False,False,False
246018,24.34,0.0,7.0,7.0,1,False,False,False,False,False,...,False,False,True,True,False,False,False,False,False,False
246019,29.86,0.0,15.0,7.0,9,False,True,True,False,True,...,True,False,True,False,True,False,False,False,False,False
246020,28.66,2.0,2.0,7.0,6,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False


In [39]:
#Scale 2022 data
X_test_2022_scaled = scaler.fit_transform(X_2022_encoded)

In [40]:
#Predict 2022 data
linear_svc_predictions_2022 = model.predict(X_test_2022_scaled)

In [41]:
#Display 2022 accuracy scores
print('2022 Linear SVC Test Accuracy: %.3f' % model.score(X_test_2022_scaled, y_2022))
print('2022 Linear SVC Accuracy Score: %.3f' % accuracy_score(y_2022, linear_svc_predictions_2022))

2022 Linear SVC Test Accuracy: 0.569
2022 Linear SVC Accuracy Score: 0.569


In [42]:
# Confusion Matrix and Classification Report for the 2022 data
cm_2022 = confusion_matrix(y_2022, linear_svc_predictions_2022)
report_2022 = classification_report(y_2022, linear_svc_predictions_2022)
print("Confusion Matrix for 2022 Data:")
print(cm_2022)
print("Classification Report for 2022 Data:")
print(report_2022)

Confusion Matrix for 2022 Data:
[[127839 104748]
 [  1165  12270]]
Classification Report for 2022 Data:
              precision    recall  f1-score   support

       False       0.99      0.55      0.71    232587
        True       0.10      0.91      0.19     13435

    accuracy                           0.57    246022
   macro avg       0.55      0.73      0.45    246022
weighted avg       0.94      0.57      0.68    246022

