In [1]:
!pip install scikit-learn



In [2]:
# Import the required libraries and dependencies
import pandas as pd
from pathlib import Path
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Read the CSV file into a Pandas DataFrame
# Set the index using the Ticker column
heart_df = pd.read_csv(
    Path("../content/heart_2022_no_nans.csv"))

# Review the DataFrame
heart_df.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.6,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.7,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [7]:
# Use general attributes in dataset to create dataframe
lifestyle_df = heart_df[['SleepHours','AlcoholDrinkers','HadHeartAttack','SmokerStatus','ECigaretteUsage','PhysicalActivities','DifficultyErrands']].copy()
lifestyle_df.head()

Unnamed: 0,SleepHours,AlcoholDrinkers,HadHeartAttack,SmokerStatus,ECigaretteUsage,PhysicalActivities,DifficultyErrands
0,9.0,No,No,Former smoker,Never used e-cigarettes in my entire life,Yes,No
1,6.0,No,No,Former smoker,Never used e-cigarettes in my entire life,Yes,No
2,8.0,Yes,No,Former smoker,Never used e-cigarettes in my entire life,No,No
3,9.0,No,No,Never smoked,Never used e-cigarettes in my entire life,Yes,No
4,5.0,No,No,Never smoked,Never used e-cigarettes in my entire life,Yes,No


In [8]:
# Split target column from dataset
y = lifestyle_df['HadHeartAttack']
X = lifestyle_df.drop(columns='HadHeartAttack')

In [9]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)
X.head()

Unnamed: 0,SleepHours,AlcoholDrinkers_No,AlcoholDrinkers_Yes,SmokerStatus_Current smoker - now smokes every day,SmokerStatus_Current smoker - now smokes some days,SmokerStatus_Former smoker,SmokerStatus_Never smoked,ECigaretteUsage_Never used,ECigaretteUsage_Never used e-cigarettes in my entire life,ECigaretteUsage_Not at all (right now),ECigaretteUsage_Use them every day,ECigaretteUsage_Use them some days,PhysicalActivities_No,PhysicalActivities_Yes,DifficultyErrands_No,DifficultyErrands_Yes
0,9.0,1,0,0,0,1,0,0,1,0,0,0,0,1,1,0
1,6.0,1,0,0,0,1,0,0,1,0,0,0,0,1,1,0
2,8.0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0
3,9.0,1,0,0,0,0,1,0,1,0,0,0,0,1,1,0
4,5.0,1,0,0,0,0,1,0,1,0,0,0,0,1,1,0


In [10]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
X_train.shape

(115561, 16)

In [12]:
X_test.shape

(38521, 16)

In [13]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Instantiate the model with k = 3 neighbors
model = KNeighborsClassifier(n_neighbors=3)

In [15]:
# Train the model
model.fit(X_train_scaled, y_train)

In [16]:
# Create predictions
y_pred = model.predict(X_test_scaled)

In [17]:
#print confusion matrix
confusion_matrix(y_pred,y_test)

array([[36051,  2100],
       [  330,    40]])

In [18]:
# Print classification report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

          No       0.99      0.94      0.97     38151
         Yes       0.02      0.11      0.03       370

    accuracy                           0.94     38521
   macro avg       0.50      0.53      0.50     38521
weighted avg       0.98      0.94      0.96     38521



In [19]:
# Creating the decision tree classifier instance
DTmodel = tree.DecisionTreeClassifier()

In [20]:
# Fitting the model
DTmodel = DTmodel.fit(X_train_scaled,y_train)

In [21]:
# Making predictions using the testing data
predictions = DTmodel.predict(X_test_scaled)

In [22]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [23]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,36357,24
Actual 1,2138,2


Accuracy Score : 0.9438747696061889
Classification Report
              precision    recall  f1-score   support

          No       0.94      1.00      0.97     36381
         Yes       0.08      0.00      0.00      2140

    accuracy                           0.94     38521
   macro avg       0.51      0.50      0.49     38521
weighted avg       0.90      0.94      0.92     38521



In [24]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [25]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [26]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

In [27]:
# Calculating the confusion matrix
cmrf = confusion_matrix(y_test, rf_predictions)
cmrf_df = pd.DataFrame(
    cmrf, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, rf_predictions)

In [28]:
# Displaying results
print("Confusion Matrix Random Forest")
display(cmrf_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

Confusion Matrix Random Forest


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,36355,26
Actual 1,2137,3


Accuracy Score : 0.9438488097401417
Classification Report
              precision    recall  f1-score   support

          No       0.94      1.00      0.97     36381
         Yes       0.10      0.00      0.00      2140

    accuracy                           0.94     38521
   macro avg       0.52      0.50      0.49     38521
weighted avg       0.90      0.94      0.92     38521

