In [None]:
!pip install scikit-learn



In [None]:
# Import the required libraries and dependencies
import pandas as pd
from pathlib import Path
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the CSV file into a Pandas DataFrame
# Set the index using the Ticker column
heart_df = pd.read_csv(
    Path("../content/heart_2022_no_nans.csv"))

# Review the DataFrame
heart_df

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.60,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.70,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,Virgin Islands,Male,Very good,0.0,0.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,No,...,1.78,102.06,32.28,Yes,No,No,No,"Yes, received tetanus shot but not sure what type",No,No
246018,Virgin Islands,Female,Fair,0.0,7.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.93,90.72,24.34,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
246019,Virgin Islands,Male,Good,0.0,15.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,No,...,1.68,83.91,29.86,Yes,Yes,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
246020,Virgin Islands,Female,Excellent,2.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.70,83.01,28.66,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No


In [39]:
# Use general attributes in dataset to create dataframe
lifestyle_df = heart_df[['SleepHours','AlcoholDrinkers','HadHeartAttack','SmokerStatus','ECigaretteUsage','PhysicalActivities','DifficultyErrands']].copy()
lifestyle_df.head()

Unnamed: 0,SleepHours,AlcoholDrinkers,HadHeartAttack,SmokerStatus,ECigaretteUsage,PhysicalActivities,DifficultyErrands
0,9.0,No,No,Former smoker,Never used e-cigarettes in my entire life,Yes,No
1,6.0,No,No,Former smoker,Never used e-cigarettes in my entire life,Yes,No
2,8.0,Yes,No,Former smoker,Never used e-cigarettes in my entire life,No,No
3,9.0,No,No,Never smoked,Never used e-cigarettes in my entire life,Yes,No
4,5.0,No,No,Never smoked,Never used e-cigarettes in my entire life,Yes,No


In [40]:
# Split target column from dataset
y = lifestyle_df['HadHeartAttack']
X = lifestyle_df.drop(columns='HadHeartAttack')

In [41]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)
X.head()

Unnamed: 0,SleepHours,AlcoholDrinkers_No,AlcoholDrinkers_Yes,SmokerStatus_Current smoker - now smokes every day,SmokerStatus_Current smoker - now smokes some days,SmokerStatus_Former smoker,SmokerStatus_Never smoked,ECigaretteUsage_Never used e-cigarettes in my entire life,ECigaretteUsage_Not at all (right now),ECigaretteUsage_Use them every day,ECigaretteUsage_Use them some days,PhysicalActivities_No,PhysicalActivities_Yes,DifficultyErrands_No,DifficultyErrands_Yes
0,9.0,1,0,0,0,1,0,1,0,0,0,0,1,1,0
1,6.0,1,0,0,0,1,0,1,0,0,0,0,1,1,0
2,8.0,0,1,0,0,1,0,1,0,0,0,1,0,1,0
3,9.0,1,0,0,0,0,1,1,0,0,0,0,1,1,0
4,5.0,1,0,0,0,0,1,1,0,0,0,0,1,1,0


In [42]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [43]:
X_train.shape

(184516, 15)

In [44]:
X_test.shape

(61506, 15)

In [45]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [46]:
# Instantiate the model with k = 3 neighbors
model = KNeighborsClassifier(n_neighbors=3)

In [47]:
# Train the model
model.fit(X_train_scaled, y_train)

In [None]:
# Create predictions
y_pred = model.predict(X_test_scaled)

In [None]:
#print confusion matrix
confusion_matrix(y_pred,y_test)

array([[57776,  3311],
       [  366,    53]])

In [None]:
# Calculating the confusion matrix
knn = confusion_matrix(y_pred, y_test)
knn_df = pd.DataFrame(
    knn, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, y_pred)

In [None]:
# Displaying results
print("Confusion Matrix KNN Model")
display(knn_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix KNN Model


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,57776,3311
Actual 1,366,53


Accuracy Score : 0.9402172145806913
Classification Report
              precision    recall  f1-score   support

          No       0.95      0.99      0.97     58142
         Yes       0.13      0.02      0.03      3364

    accuracy                           0.94     61506
   macro avg       0.54      0.50      0.50     61506
weighted avg       0.90      0.94      0.92     61506



In [None]:
# Creating the decision tree classifier instance
DTmodel = tree.DecisionTreeClassifier()

In [None]:
# Fitting the model
DTmodel = DTmodel.fit(X_train_scaled,y_train)

In [None]:
# Making predictions using the testing data
predictions = DTmodel.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Displaying results
print("Confusion Matrix Decision Tree")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix Decision Tree


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,58108,34
Actual 1,3358,6


Accuracy Score : 0.9448509088544207
Classification Report
              precision    recall  f1-score   support

          No       0.95      1.00      0.97     58142
         Yes       0.15      0.00      0.00      3364

    accuracy                           0.94     61506
   macro avg       0.55      0.50      0.49     61506
weighted avg       0.90      0.94      0.92     61506



In [None]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix
cmrf = confusion_matrix(y_test, rf_predictions)
cmrf_df = pd.DataFrame(
    cmrf, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, rf_predictions)

In [None]:
# Displaying results
print("Confusion Matrix Random Forest")
display(cmrf_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

Confusion Matrix Random Forest


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,58106,36
Actual 1,3357,7


Accuracy Score : 0.9448346502780216
Classification Report
              precision    recall  f1-score   support

          No       0.95      1.00      0.97     58142
         Yes       0.16      0.00      0.00      3364

    accuracy                           0.94     61506
   macro avg       0.55      0.50      0.49     61506
weighted avg       0.90      0.94      0.92     61506



In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.4231764731144447, 'SleepHours'),
 (0.10590806005785515, 'SmokerStatus_Never smoked'),
 (0.06389966482664167, 'DifficultyErrands_No'),
 (0.06345699267094213, 'DifficultyErrands_Yes'),
 (0.050851071396047276, 'SmokerStatus_Former smoker'),
 (0.04694407654464776, 'AlcoholDrinkers_Yes'),
 (0.04684222038679333, 'PhysicalActivities_Yes'),
 (0.046762763094397725, 'AlcoholDrinkers_No'),
 (0.04435966526977509, 'PhysicalActivities_No'),
 (0.026084117359870557,
  'ECigaretteUsage_Never used e-cigarettes in my entire life'),
 (0.021542885636183468, 'SmokerStatus_Current smoker - now smokes every day'),
 (0.017456565232330603, 'ECigaretteUsage_Use them every day'),
 (0.016882969482344322, 'ECigaretteUsage_Not at all (right now)'),
 (0.013046646543020124, 'SmokerStatus_Current smoker - now smokes some days'),
 (0.012785828384706094, 'ECigaretteUsage_Use them some days')]