In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load dataset
data = pd.read_csv('heart_attack_prediction_dataset.csv')  # Replace with your dataset path

In [3]:
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Patient ID                       8763 non-null   object 
 1   Age                              8763 non-null   int64  
 2   Sex                              8763 non-null   object 
 3   Cholesterol                      8763 non-null   int64  
 4   Blood Pressure                   8763 non-null   object 
 5   Heart Rate                       8763 non-null   int64  
 6   Diabetes                         8763 non-null   int64  
 7   Family History                   8763 non-null   int64  
 8   Smoking                          8763 non-null   int64  
 9   Obesity                          8763 non-null   int64  
 10  Alcohol Consumption              8763 non-null   int64  
 11  Exercise Hours Per Week          8763 non-null   float64
 12  Diet                

In [4]:
# Check the data types
print(data.dtypes)

Patient ID                          object
Age                                  int64
Sex                                 object
Cholesterol                          int64
Blood Pressure                      object
Heart Rate                           int64
Diabetes                             int64
Family History                       int64
Smoking                              int64
Obesity                              int64
Alcohol Consumption                  int64
Exercise Hours Per Week            float64
Diet                                object
Previous Heart Problems              int64
Medication Use                       int64
Stress Level                         int64
Sedentary Hours Per Day            float64
Income                               int64
BMI                                float64
Triglycerides                        int64
Physical Activity Days Per Week      int64
Sleep Hours Per Day                  int64
Country                             object
Continent  

In [5]:
# Check for any missing values before filling
print("Missing values before filling:")
print(data.isnull().sum())

Missing values before filling:
Patient ID                         0
Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Heart Rate                         0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Exercise Hours Per Week            0
Diet                               0
Previous Heart Problems            0
Medication Use                     0
Stress Level                       0
Sedentary Hours Per Day            0
Income                             0
BMI                                0
Triglycerides                      0
Physical Activity Days Per Week    0
Sleep Hours Per Day                0
Country                            0
Continent                          0
Hemisphere                         0
Heart Attack Risk                  0
dtype: 

In [6]:
# Split the Blood Pressure column into two new columns
data[['Systolic', 'Diastolic']] = data['Blood Pressure'].str.split('/', expand=True)
data['Systolic'] = pd.to_numeric(data['Systolic'])
data['Diastolic'] = pd.to_numeric(data['Diastolic'])

# Drop the original Blood Pressure column
data.drop(columns=['Blood Pressure'], inplace=True)

# Mapping categorical variables
data['Sex'] = data['Sex'].map({'Male': 1, 'Female': 0})  # Example mapping
data['Diet'] = data['Diet'].map({'healthy': 1, 'average': 0, 'unhealthy': -1})  # Example mapping

# Select only the specified features
features = data[['Age', 'Sex', 'Cholesterol', 'Systolic', 'Diastolic', 
                 'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 
                 'Obesity', 'Alcohol Consumption', 'Exercise Hours Per Week', 
                 'Diet', 'Previous Heart Problems', 'Stress Level', 'BMI', 
                 'Physical Activity Days Per Week', 'Sleep Hours Per Day']]

# Target variable
target = data['Heart Attack Risk']

# Check the final feature DataFrame
features.head()

Unnamed: 0,Age,Sex,Cholesterol,Systolic,Diastolic,Heart Rate,Diabetes,Family History,Smoking,Obesity,Alcohol Consumption,Exercise Hours Per Week,Diet,Previous Heart Problems,Stress Level,BMI,Physical Activity Days Per Week,Sleep Hours Per Day
0,67,1,208,158,88,72,0,0,1,0,0,4.168189,,0,9,31.251233,0,6
1,21,1,389,165,93,98,1,1,1,1,1,1.813242,,1,1,27.194973,1,7
2,21,0,324,174,99,72,1,0,0,0,0,2.078353,,1,9,28.176571,4,4
3,84,1,383,163,100,73,1,1,1,0,1,9.82813,,1,9,36.464704,3,4
4,66,1,318,91,88,93,1,1,1,1,0,5.804299,,1,6,21.809144,1,5


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [9]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [10]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)

# Generate evaluation metrics
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1094   31]
 [ 607   21]]
              precision    recall  f1-score   support

           0       0.64      0.97      0.77      1125
           1       0.40      0.03      0.06       628

    accuracy                           0.64      1753
   macro avg       0.52      0.50      0.42      1753
weighted avg       0.56      0.64      0.52      1753



In [22]:
# Example: Predict heart attack risk for a new patient
#'Age', 'Sex', 'Cholesterol', 'Systolic', 'Diastolic', 'Heart Rate', 'Diabetes', 'Family History', 'Smoking', 'Obesity', 'Alcohol Consumption', 'Exercise Hours Per Week', 'Diet', 'Previous Heart Problems', 'Stress Level', 'BMI', 'Physical Activity Days Per Week', 'Sleep Hours Per Day'

new_patient_data = [[90	,1,358,102,73,84,0,0,1,0,1,4.09,1,0,7,28.88,4,10]]
new_patient_data_scaled = scaler.transform(new_patient_data)  # Don't forget to scale
prediction = model.predict(new_patient_data_scaled)
print("Heart Attack Risk Prediction:", prediction)


Heart Attack Risk Prediction: [1]




In [12]:
import joblib

joblib.dump(model, 'heart_attack_prediction_model.pkl')

['heart_attack_prediction_model.pkl']