In [1]:
!pip install scikit-learn



In [2]:
# Import the required libraries and dependencies
import pandas as pd
from pathlib import Path
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [6]:
# Read the CSV file into a Pandas DataFrame
# Set the index using the Ticker column
heart_df = pd.read_csv(
    Path("../content/heart_2022_no_nans.csv"))

# Review the DataFrame
heart_df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.60,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.70,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,Virgin Islands,Male,Very good,0.0,0.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,No,...,1.78,102.06,32.28,Yes,No,No,No,"Yes, received tetanus shot but not sure what type",No,No
246018,Virgin Islands,Female,Fair,0.0,7.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.93,90.72,24.34,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
246019,Virgin Islands,Male,Good,0.0,15.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,No,...,1.68,83.91,29.86,Yes,Yes,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
246020,Virgin Islands,Female,Excellent,2.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.70,83.01,28.66,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No


In [7]:
# Use general attributes in dataset to create dataframe
generalAttributes_df = heart_df[['AgeCategory','RaceEthnicityCategory','HadHeartAttack','HeightInMeters','WeightInKilograms','BMI','GeneralHealth','Sex']].copy()
generalAttributes_df.head()

Unnamed: 0,AgeCategory,RaceEthnicityCategory,HadHeartAttack,HeightInMeters,WeightInKilograms,BMI,GeneralHealth,Sex
0,Age 65 to 69,"White only, Non-Hispanic",No,1.6,71.67,27.99,Very good,Female
1,Age 70 to 74,"White only, Non-Hispanic",No,1.78,95.25,30.13,Very good,Male
2,Age 75 to 79,"White only, Non-Hispanic",No,1.85,108.86,31.66,Very good,Male
3,Age 80 or older,"White only, Non-Hispanic",No,1.7,90.72,31.32,Fair,Female
4,Age 80 or older,"White only, Non-Hispanic",No,1.55,79.38,33.07,Good,Female


In [8]:
# Split target column from dataset
y = generalAttributes_df['HadHeartAttack']
X = generalAttributes_df.drop(columns='HadHeartAttack')

In [9]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)
X.head()

Unnamed: 0,HeightInMeters,WeightInKilograms,BMI,AgeCategory_Age 18 to 24,AgeCategory_Age 25 to 29,AgeCategory_Age 30 to 34,AgeCategory_Age 35 to 39,AgeCategory_Age 40 to 44,AgeCategory_Age 45 to 49,AgeCategory_Age 50 to 54,...,"RaceEthnicityCategory_Multiracial, Non-Hispanic","RaceEthnicityCategory_Other race only, Non-Hispanic","RaceEthnicityCategory_White only, Non-Hispanic",GeneralHealth_Excellent,GeneralHealth_Fair,GeneralHealth_Good,GeneralHealth_Poor,GeneralHealth_Very good,Sex_Female,Sex_Male
0,1.6,71.67,27.99,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
1,1.78,95.25,30.13,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
2,1.85,108.86,31.66,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
3,1.7,90.72,31.32,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0
4,1.55,79.38,33.07,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,1,0


In [10]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
X_train.shape

(184516, 28)

In [12]:
X_test.shape

(61506, 28)

In [13]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Instantiate the model with k = 3 neighbors
model = KNeighborsClassifier(n_neighbors=3)

In [15]:
# Train the model
model.fit(X_train_scaled, y_train)

In [16]:
# Create predictions
y_pred = model.predict(X_test_scaled)

In [17]:
#print confusion matrix
confusion_matrix(y_pred,y_test)

array([[57173,  3121],
       [  969,   243]])

In [18]:
# Calculating the confusion matrix
knn = confusion_matrix(y_pred, y_test)
knn_df = pd.DataFrame(
    knn, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, y_pred)

In [19]:
# Displaying results
print("Confusion Matrix KNN Model")
display(knn_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix KNN Model


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,57173,3121
Actual 1,969,243


Accuracy Score : 0.9335024225278835
Classification Report
              precision    recall  f1-score   support

          No       0.95      0.98      0.97     58142
         Yes       0.20      0.07      0.11      3364

    accuracy                           0.93     61506
   macro avg       0.57      0.53      0.54     61506
weighted avg       0.91      0.93      0.92     61506



In [20]:
# Creating the decision tree classifier instance
DTmodel = tree.DecisionTreeClassifier()

In [21]:
# Fitting the model
DTmodel = DTmodel.fit(X_train_scaled,y_train)

In [22]:
# Making predictions using the testing data
predictions = DTmodel.predict(X_test_scaled)

In [23]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [24]:
# Displaying results
print("Confusion Matrix Decision Tree")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix Decision Tree


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,56523,1619
Actual 1,3049,315


Accuracy Score : 0.9241049653692323
Classification Report
              precision    recall  f1-score   support

          No       0.95      0.97      0.96     58142
         Yes       0.16      0.09      0.12      3364

    accuracy                           0.92     61506
   macro avg       0.56      0.53      0.54     61506
weighted avg       0.91      0.92      0.91     61506



In [25]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [26]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [27]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

In [28]:
# Calculating the confusion matrix
cmrf = confusion_matrix(y_test, rf_predictions)
cmrf_df = pd.DataFrame(
    cmrf, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, rf_predictions)

In [29]:
# Displaying results
print("Confusion Matrix Random Forest")
display(cmrf_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

Confusion Matrix Random Forest


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,57281,861
Actual 1,3165,199


Accuracy Score : 0.9345429714174227
Classification Report
              precision    recall  f1-score   support

          No       0.95      0.99      0.97     58142
         Yes       0.19      0.06      0.09      3364

    accuracy                           0.93     61506
   macro avg       0.57      0.52      0.53     61506
weighted avg       0.91      0.93      0.92     61506



In [30]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.35957707248266363, 'BMI'),
 (0.3037466344158205, 'WeightInKilograms'),
 (0.15176662189360846, 'HeightInMeters'),
 (0.025149656497156527, 'GeneralHealth_Poor'),
 (0.01584287787666591, 'GeneralHealth_Fair'),
 (0.01499097468592014, 'AgeCategory_Age 80 or older'),
 (0.010644843238756389, 'AgeCategory_Age 75 to 79'),
 (0.008996727314994861, 'AgeCategory_Age 70 to 74'),
 (0.00877883830443165, 'RaceEthnicityCategory_White only, Non-Hispanic'),
 (0.007648638683573209, 'GeneralHealth_Excellent'),
 (0.0076395316171351075, 'GeneralHealth_Very good'),
 (0.006913121806183278, 'AgeCategory_Age 65 to 69'),
 (0.006909636407717134, 'GeneralHealth_Good'),
 (0.0065734269419814575, 'RaceEthnicityCategory_Black only, Non-Hispanic'),
 (0.006436870018525896, 'RaceEthnicityCategory_Hispanic'),
 (0.006242572368786965, 'Sex_Female'),
 (0.00606931896324883, 'Sex_Male'),
 (0.005925943629437344, 'RaceEthnicityCategory_Other race only, Non-Hispanic'),
 (0.005282728249298181, 'AgeCategory_Age 60 to 64'),
 (0.005