## Importing Required Libraries
We'll begin by importing the necessary libraries for our machine learning task.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

## Loading the Dataset
Next, we'll load the dataset into a pandas DataFrame.

In [2]:
# Read the heart-disease CSV into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
csv_path = "Heart_Disease_Prediction.csv"
data = pd.read_csv(csv_path)

# A bare last expression is rendered as the cell's output.
data

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,1,3,172,199,1,0,162,0,0.5,1,0,7,Absence
266,44,1,2,120,263,0,0,173,0,0.0,1,0,7,Absence
267,56,0,2,140,294,0,2,153,0,1.3,2,0,3,Absence
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6,Absence


## Exploratory Data Analysis (EDA)
EDA helps us understand the structure and characteristics of the dataset. Let's take a look at the data.

In [3]:
# A notebook cell only renders its *final* expression automatically, so the
# preview has to be shown explicitly; otherwise its result is discarded.
# `display` is available as a builtin inside IPython/Jupyter kernels.
display(data.head())  # First few rows of the dataset

# info() prints its report (dtypes, non-null counts) directly to stdout.
data.info()

# Descriptive statistics of the numeric columns; rendered as the cell output
# because it is the last expression.
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      270 non-null    int64  
 1   Sex                      270 non-null    int64  
 2   Chest pain type          270 non-null    int64  
 3   BP                       270 non-null    int64  
 4   Cholesterol              270 non-null    int64  
 5   FBS over 120             270 non-null    int64  
 6   EKG results              270 non-null    int64  
 7   Max HR                   270 non-null    int64  
 8   Exercise angina          270 non-null    int64  
 9   ST depression            270 non-null    float64
 10  Slope of ST              270 non-null    int64  
 11  Number of vessels fluro  270 non-null    int64  
 12  Thallium                 270 non-null    int64  
 13  Heart Disease            270 non-null    object 
dtypes: float64(1), int64(12), 

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


## Data Preprocessing
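We need to preprocess the data before building our machine learning model. This involves dropping rows with missing values, splitting the data into features and target and then into training and testing sets, and standardizing the features. No categorical encoding is applied: the feature columns are already stored as numbers.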

In [4]:
# Drop rows containing missing values into a *new* frame instead of mutating
# `data` in place, so the frame displayed by earlier cells stays valid and
# this cell gives the same result every time it is re-run.
data_clean = data.dropna()

# No categorical encoding is applied here: every feature column is already
# stored as a number (int64/float64); only the target holds string labels.

# Separate the predictors (X) from the target labels (y)
X = data_clean.drop(columns="Heart Disease")
y = data_clean["Heart Disease"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training rows only and
# then reused on the test rows, so test-set statistics never influence training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Training the Model
Now, we'll train a Random Forest Classifier using the preprocessed data.

In [5]:
# Hyper-parameters for the forest: 100 trees, with a fixed seed so repeated
# runs build the same ensemble.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)

# Fit on the standardized training features; fit() is the last expression, so
# the fitted estimator is shown as the cell output.
model.fit(X_train_scaled, y_train)

## Evaluating the Model
Let's evaluate the model's performance on the test set.

In [6]:
# Predict labels for the held-out rows, then report the fraction that match
# the true labels.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7592592592592593


In [7]:
# Show the labels the model predicted for the test rows.
y_pred

array(['Absence', 'Presence', 'Absence', 'Absence', 'Absence', 'Presence',
       'Presence', 'Absence', 'Absence', 'Absence', 'Absence', 'Absence',
       'Presence', 'Presence', 'Presence', 'Absence', 'Absence',
       'Absence', 'Presence', 'Absence', 'Presence', 'Absence', 'Absence',
       'Absence', 'Absence', 'Presence', 'Absence', 'Presence', 'Absence',
       'Absence', 'Absence', 'Absence', 'Absence', 'Absence', 'Absence',
       'Absence', 'Presence', 'Absence', 'Presence', 'Presence',
       'Absence', 'Absence', 'Absence', 'Presence', 'Absence', 'Absence',
       'Absence', 'Presence', 'Presence', 'Presence', 'Absence',
       'Absence', 'Absence', 'Presence'], dtype=object)

## Saving the Model
We'll save the trained model to a file using the pickle module for later use.

In [8]:
# Serialize the fitted classifier to disk; the `with` block closes the file
# even if pickling fails.
with open("heart_disease_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)


In [9]:
# Persist the fitted StandardScaler alongside the model so that new inputs can
# be transformed with the same statistics the model was trained on.
with open("scaler.pkl", mode="wb") as scaler_file:
    pickle.dump(scaler, scaler_file)


### Create the Flask App
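Create a new Python file, for example `app.py`, and import the necessary libraries there. The next two notebook cells are not part of the app itself: they only display the held-out test features (`X_test`) and labels (`y_test`).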

In [10]:
# Show the unscaled feature rows that were held out for testing.
X_test

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
30,57,1,3,128,229,0,2,150,0,0.4,2,1,7
116,46,1,4,120,249,0,2,144,0,0.8,1,0,7
79,56,1,2,120,236,0,0,178,0,0.8,1,0,3
127,52,0,3,136,196,0,2,169,0,0.1,2,0,3
196,58,0,4,100,248,0,2,122,0,1.0,2,0,3
137,56,1,4,125,249,1,2,144,1,1.2,2,1,3
209,37,1,3,130,250,0,0,187,0,3.5,3,0,3
45,58,1,3,140,211,1,2,165,0,0.0,1,0,3
158,56,1,1,120,193,0,2,162,0,1.9,2,0,7
247,65,0,3,155,269,0,0,148,0,0.8,1,0,3


In [11]:
# Show the true labels of the held-out test rows.
y_test

30     Presence
116    Presence
79      Absence
127     Absence
196     Absence
137    Presence
209     Absence
45      Absence
158     Absence
247     Absence
183     Absence
268     Absence
227    Presence
82     Presence
165     Absence
194     Absence
226    Presence
146    Presence
104    Presence
60      Absence
221    Presence
266     Absence
46     Presence
42      Absence
185     Absence
9      Presence
22      Absence
199    Presence
109     Absence
24      Absence
113     Absence
68      Absence
144    Presence
224     Absence
252    Presence
6      Presence
120    Presence
67     Presence
119    Presence
118     Absence
25      Absence
125     Absence
244     Absence
19      Absence
77      Absence
216     Absence
90      Absence
208    Presence
93     Presence
180     Absence
15      Absence
152     Absence
232     Absence
250    Presence
Name: Heart Disease, dtype: object