In [1]:
# Import the dependencies
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Point at the dataset (path is relative to the notebook) and load it into a DataFrame
data = Path("Resources/heart_attack_prediction_dataset.csv")
data_df = pd.read_csv(filepath_or_buffer=data)

# Preview the first five rows to confirm the file loaded as expected
data_df.head(n=5)

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [3]:
# Keep only the seven variables chosen as most important, plus the target column.
# The target ('Heart Attack Risk') is listed last so it is easy to split off later.
selected_columns = [
    'Income',
    'Sedentary Hours Per Day',
    'Exercise Hours Per Week',
    'Triglycerides',
    'Cholesterol',
    'Age',
    'Heart Rate',
    'Heart Attack Risk',
]

# .copy() gives an independent frame so later edits never touch data_df
top7_data_df = data_df.loc[:, selected_columns].copy()
top7_data_df

Unnamed: 0,Income,Sedentary Hours Per Day,Exercise Hours Per Week,Triglycerides,Cholesterol,Age,Heart Rate,Heart Attack Risk
0,261404,6.615001,4.168189,286,208,67,72,0
1,285768,4.963459,1.813242,235,389,21,98,0
2,235282,9.463426,2.078353,587,324,21,72,0
3,125640,7.648981,9.828130,378,383,84,73,0
4,160555,1.514821,5.804299,231,318,66,93,0
...,...,...,...,...,...,...,...,...
8758,235420,10.806373,7.917342,67,121,60,61,0
8759,217881,3.833038,16.558426,617,120,28,73,0
8760,36998,2.375214,3.148438,527,250,47,105,1
8761,209943,0.029104,3.789950,114,178,36,60,0


In [4]:
# Check the data types of the controlled dataframe.
# .info() also lists the non-null count per column, so missing values would show up here.
top7_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Income                   8763 non-null   int64  
 1   Sedentary Hours Per Day  8763 non-null   float64
 2   Exercise Hours Per Week  8763 non-null   float64
 3   Triglycerides            8763 non-null   int64  
 4   Cholesterol              8763 non-null   int64  
 5   Age                      8763 non-null   int64  
 6   Heart Rate               8763 non-null   int64  
 7   Heart Attack Risk        8763 non-null   int64  
dtypes: float64(2), int64(6)
memory usage: 547.8 KB


In [5]:
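# NOTE: one-hot encoding is not needed here, because every column kept in
# top7_data_df is already numeric (int64 / float64 in the .info() output).
# The original encoding step is kept below for reference only; `data_df_2`
# is not defined anywhere in the visible notebook, so it cannot run as written.
# # Change the categorical data types to numbers
# final_data_df = pd.get_dummies(data_df_2)
# final_data_df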

In [6]:
# Build the feature matrix: every selected column except the target.
# drop() returns a new DataFrame, so top7_data_df itself is left untouched.
X = top7_data_df.drop(columns="Heart Attack Risk")
X.head()

Unnamed: 0,Income,Sedentary Hours Per Day,Exercise Hours Per Week,Triglycerides,Cholesterol,Age,Heart Rate
0,261404,6.615001,4.168189,286,208,67,72
1,285768,4.963459,1.813242,235,389,21,98
2,235282,9.463426,2.078353,587,324,21,72
3,125640,7.648981,9.82813,378,383,84,73
4,160555,1.514821,5.804299,231,318,66,93


In [7]:
# Define the target vector as a 1-D NumPy array.
# Series.ravel() is deprecated since pandas 2.2; .to_numpy() returns the same
# underlying values as an ndarray without triggering the deprecation warning.
y = top7_data_df["Heart Attack Risk"].to_numpy()

# Peek at the first five labels
y[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [8]:
# Hold out part of the data for evaluation.
# test_size=0.25 is scikit-learn's default when neither size is given; it is written
# out here so the split ratio is visible. The fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=78,
)

In [9]:
# Create the scaler, fit it on the training split only, and scale that split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# fit() returns the estimator itself, so X_scaler is simply another name for the fitted scaler
X_scaler = scaler

# Scale the test split with the statistics learned from the training split
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Configure a random forest with 500 trees; the fixed seed makes training repeatable
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [11]:
# Train the forest on the scaled training data.
# y_train is already a 1-D array, so no extra .ravel() call is needed here.
# fit() trains the estimator in place and returns it; the trailing ";" hides its repr.
rf_model.fit(X_train_scaled, y_train);

In [12]:
# Predict the class label for every row of the scaled test split
predictions = rf_model.predict(X=X_test_scaled)

In [13]:
# Labels for the confusion-matrix table: rows are actual classes, columns are predictions
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]

# Compute the raw confusion matrix and wrap it in a labelled DataFrame for display
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=column_labels)

# Fraction of test rows whose predicted label matches the actual label
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [14]:
# Build the per-class precision / recall / f1 report up front, then show everything in order
report_text = classification_report(y_test, predictions)

print("Confusion Matrix")
display(cm_df)  # rich table rendering instead of plain text

print(f"Accuracy Score : {acc_score}")

print("Classification Report")
print(report_text)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1351,88
Actual 1,710,42


Accuracy Score : 0.6357827476038339
Classification Report
              precision    recall  f1-score   support

           0       0.66      0.94      0.77      1439
           1       0.32      0.06      0.10       752

    accuracy                           0.64      2191
   macro avg       0.49      0.50      0.43      2191
weighted avg       0.54      0.64      0.54      2191



In [15]:
# Read the fitted forest's feature-importance array once and reuse it below
importances = rf_model.feature_importances_

# Pair each importance with its column name and rank the pairs, highest importance first
importances_sorted = sorted(
    zip(importances, X.columns),
    reverse=True,
)

# Show up to ten entries (only seven features were used, so all of them are listed)
importances_sorted[:10]

[(0.15543550833045147, 'Exercise Hours Per Week'),
 (0.15430897158255769, 'Sedentary Hours Per Day'),
 (0.1538739662674657, 'Income'),
 (0.1475679534968583, 'Triglycerides'),
 (0.14018450932677515, 'Cholesterol'),
 (0.12519009900519593, 'Age'),
 (0.12343899199069581, 'Heart Rate')]