In [1]:
!pip install scikit-learn




In [2]:
# Import the required libraries and dependencies
import pandas as pd
from pathlib import Path
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Read the CSV file into a Pandas DataFrame
# Set the index using the Ticker column
heart_df = pd.read_csv(
    Path("../content/heart_2022_no_nans.csv"))

# Review the DataFrame
heart_df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.60,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.70,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,Virgin Islands,Male,Very good,0.0,0.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,No,...,1.78,102.06,32.28,Yes,No,No,No,"Yes, received tetanus shot but not sure what type",No,No
246018,Virgin Islands,Female,Fair,0.0,7.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.93,90.72,24.34,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
246019,Virgin Islands,Male,Good,0.0,15.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,No,...,1.68,83.91,29.86,Yes,Yes,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
246020,Virgin Islands,Female,Excellent,2.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.70,83.01,28.66,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No


In [6]:
# Use illness history in dataset to create dataframe
illnessHistory_df = heart_df[['HadHeartAttack','HadDiabetes','HadStroke','HadAsthma','HadKidneyDisease','HadAngina','HadCOPD','HadArthritis','CovidPos','RemovedTeeth']].copy()
illnessHistory_df.head()

Unnamed: 0,HadHeartAttack,HadDiabetes,HadStroke,HadAsthma,HadKidneyDisease,HadAngina,HadCOPD,HadArthritis,CovidPos,RemovedTeeth
0,No,No,No,No,No,No,No,Yes,No,None of them
1,No,Yes,No,No,No,No,No,Yes,No,None of them
2,No,No,No,No,No,No,No,Yes,Yes,"6 or more, but not all"
3,No,No,No,No,No,No,No,Yes,Yes,None of them
4,No,No,No,No,No,No,No,Yes,No,1 to 5


In [7]:
# Split target column from dataset
y = illnessHistory_df['HadHeartAttack']
X = illnessHistory_df.drop(columns='HadHeartAttack')

In [8]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)
X.head()

Unnamed: 0,HadDiabetes_No,"HadDiabetes_No, pre-diabetes or borderline diabetes",HadDiabetes_Yes,"HadDiabetes_Yes, but only during pregnancy (female)",HadStroke_No,HadStroke_Yes,HadAsthma_No,HadAsthma_Yes,HadKidneyDisease_No,HadKidneyDisease_Yes,...,HadCOPD_Yes,HadArthritis_No,HadArthritis_Yes,CovidPos_No,CovidPos_Tested positive using home test without a health professional,CovidPos_Yes,RemovedTeeth_1 to 5,"RemovedTeeth_6 or more, but not all",RemovedTeeth_All,RemovedTeeth_None of them
0,1,0,0,0,1,0,1,0,1,0,...,0,0,1,1,0,0,0,0,0,1
1,0,0,1,0,1,0,1,0,1,0,...,0,0,1,1,0,0,0,0,0,1
2,1,0,0,0,1,0,1,0,1,0,...,0,0,1,0,0,1,0,1,0,0
3,1,0,0,0,1,0,1,0,1,0,...,0,0,1,0,0,1,0,0,0,1
4,1,0,0,0,1,0,1,0,1,0,...,0,0,1,1,0,0,1,0,0,0


In [9]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
X_train.shape

(184516, 23)

In [10]:
X_test.shape

(61506, 23)

In [11]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Instantiate the model with k = 3 neighbors
model = KNeighborsClassifier(n_neighbors=3)

In [13]:
# Train the model
model.fit(X_train_scaled, y_train)

In [14]:
# Create predictions
y_pred = model.predict(X_test_scaled)

In [15]:
#print confusion matrix
confusion_matrix(y_pred,y_test)

array([[57107,  2525],
       [ 1035,   839]])

In [16]:
# Calculating the confusion matrix
knn = confusion_matrix(y_pred, y_test)
knn_df = pd.DataFrame(
    knn, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, y_pred)

In [17]:
# Displaying results
print("Confusion Matrix KNN Model")
display(knn_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix KNN Model


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,57107,2525
Actual 1,1035,839


Accuracy Score : 0.9421194680193802
Classification Report
              precision    recall  f1-score   support

          No       0.96      0.98      0.97     58142
         Yes       0.45      0.25      0.32      3364

    accuracy                           0.94     61506
   macro avg       0.70      0.62      0.65     61506
weighted avg       0.93      0.94      0.93     61506



In [18]:
# Creating the decision tree classifier instance
DTmodel = tree.DecisionTreeClassifier()

In [19]:
# Fitting the model
DTmodel = DTmodel.fit(X_train_scaled,y_train)

In [20]:
# Making predictions using the testing data
predictions = DTmodel.predict(X_test_scaled)

In [21]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [22]:
# Displaying results
print("Confusion Matrix Decision Tree")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix Decision Tree


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,57692,450
Actual 1,2853,511


Accuracy Score : 0.9462979221539362
Classification Report
              precision    recall  f1-score   support

          No       0.95      0.99      0.97     58142
         Yes       0.53      0.15      0.24      3364

    accuracy                           0.95     61506
   macro avg       0.74      0.57      0.60     61506
weighted avg       0.93      0.95      0.93     61506



In [23]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [24]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [25]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

In [26]:
# Calculating the confusion matrix
cmrf = confusion_matrix(y_test, rf_predictions)
cmrf_df = pd.DataFrame(
    cmrf, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, rf_predictions)

In [27]:
# Displaying results
print("Confusion Matrix Random Forest")
display(cmrf_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

Confusion Matrix Random Forest


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,57638,504
Actual 1,2791,573


Accuracy Score : 0.9464279907651286
Classification Report
              precision    recall  f1-score   support

          No       0.95      0.99      0.97     58142
         Yes       0.53      0.17      0.26      3364

    accuracy                           0.95     61506
   macro avg       0.74      0.58      0.62     61506
weighted avg       0.93      0.95      0.93     61506



In [28]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.3548147005057821, 'HadAngina_Yes'),
 (0.3235644730608795, 'HadAngina_No'),
 (0.036617344490333416, 'HadStroke_Yes'),
 (0.03465349148213466, 'HadStroke_No'),
 (0.024618827911138997, 'HadDiabetes_Yes'),
 (0.021126413426903586, 'RemovedTeeth_All'),
 (0.019877291827073453, 'HadDiabetes_No'),
 (0.017730583349771258, 'RemovedTeeth_None of them'),
 (0.016256192079746573, 'HadCOPD_Yes'),
 (0.0157016513660443, 'HadCOPD_No'),
 (0.01388576969373011, 'HadArthritis_Yes'),
 (0.012720789221570994, 'HadArthritis_No'),
 (0.01251869708274417, 'RemovedTeeth_1 to 5'),
 (0.012379026688157485, 'RemovedTeeth_6 or more, but not all'),
 (0.012238722662465888, 'CovidPos_No'),
 (0.012170507592543421, 'CovidPos_Yes'),
 (0.011477161167698284, 'HadKidneyDisease_Yes'),
 (0.01118713632386945, 'HadKidneyDisease_No'),
 (0.01088555966607568, 'HadAsthma_No'),
 (0.010606535601271241, 'HadAsthma_Yes'),
 (0.006469073227038804, 'HadDiabetes_No, pre-diabetes or borderline diabetes'),
 (0.00550931115351244,
  'CovidPos_Tes