In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [7]:
dataset = pd.read_csv('heart_disease_dataset.csv')
dataset.head()

Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Alcohol Intake,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,75,0,228,119,66,2,2,1,0,0,1,8,119,1,1,1
1,48,1,204,165,62,2,0,5,0,0,0,9,70,1,3,0
2,53,1,234,91,67,0,2,3,1,0,1,5,196,1,1,1
3,69,0,192,90,72,2,0,4,0,1,0,7,107,1,2,0
4,62,0,172,163,93,0,0,6,0,1,0,2,183,1,0,0


In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Age                      1000 non-null   int64
 1   Gender                   1000 non-null   int64
 2   Cholesterol              1000 non-null   int64
 3   Blood Pressure           1000 non-null   int64
 4   Heart Rate               1000 non-null   int64
 5   Smoking                  1000 non-null   int64
 6   Alcohol Intake           1000 non-null   int64
 7   Exercise Hours           1000 non-null   int64
 8   Family History           1000 non-null   int64
 9   Diabetes                 1000 non-null   int64
 10  Obesity                  1000 non-null   int64
 11  Stress Level             1000 non-null   int64
 12  Blood Sugar              1000 non-null   int64
 13  Exercise Induced Angina  1000 non-null   int64
 14  Chest Pain Type          1000 non-null   int64
 15  Heart

In [9]:
print(len(dataset))

1000


In [13]:
print(X.dtypes)

Age                        int64
Gender                     int64
Cholesterol                int64
Blood Pressure             int64
Heart Rate                 int64
Smoking                    int64
Alcohol Intake             int64
Exercise Hours             int64
Family History             int64
Diabetes                   int64
Obesity                    int64
Stress Level               int64
Blood Sugar                int64
Exercise Induced Angina    int64
Chest Pain Type            int64
dtype: object


In [14]:
print(y.dtypes)

int64


In [15]:
X = dataset.drop('Heart Disease', axis=1)
y = dataset['Heart Disease']

In [16]:
cat_cols = X.select_dtypes(include=['object', 'category']).columns
print(cat_cols)

Index([], dtype='object')


In [17]:
cat_cols = X.select_dtypes(include=['object', 'category']).columns
if len(cat_cols) > 0:
    dummies = pd.get_dummies(X[cat_cols], drop_first=True)
    X[dummies.columns] = dummies
    X.drop(cat_cols, axis=1, inplace=True)
else:
    print("No categorical columns found.")


No categorical columns found.


In [18]:
print(X.head)

<bound method NDFrame.head of      Age  Gender  Cholesterol  Blood Pressure  Heart Rate  Smoking  \
0     75       0          228             119          66        2   
1     48       1          204             165          62        2   
2     53       1          234              91          67        0   
3     69       0          192              90          72        2   
4     62       0          172             163          93        0   
..   ...     ...          ...             ...         ...      ...   
995   56       0          269             111          86        0   
996   78       0          334             145          76        0   
997   79       1          151             179          81        0   
998   60       0          326             151          68        1   
999   53       1          226             116          82        2   

     Alcohol Intake  Exercise Hours  Family History  Diabetes  Obesity  \
0                 2               1               0    

In [19]:
#split dataset
x = dataset.iloc[:, 0:15]
y = dataset.iloc[:, 15]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


In [20]:
#feature scaling
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [21]:
#define the KNN model
classifier = KNeighborsClassifier(n_neighbors = 15, metric = "manhattan")
classifier.fit(X_train, y_train)

In [22]:
y_pred = classifier.predict(X_test)

In [23]:
#Evaluatethe model

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Classification Report
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)


Accuracy: 0.81
Confusion Matrix:
 [[104   8]
 [ 30  58]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.93      0.85       112
           1       0.88      0.66      0.75        88

    accuracy                           0.81       200
   macro avg       0.83      0.79      0.80       200
weighted avg       0.82      0.81      0.80       200



In [24]:
print(f1_score(y_test, y_pred))

0.7532467532467533


In [25]:
print(accuracy_score(y_test, y_pred))

0.81


In [26]:
#Testing the model
new_data = [[62,	0,	172,	163,	93,	0,	0,	6,	0,	1,	0,	2,	183,	1,	0]]
new_data_scaled = sc_X.transform(new_data)
new_prediction = classifier.predict(new_data_scaled)
print("Predicted class:", new_prediction[0])

Predicted class: 0




In [27]:
import joblib

In [28]:
#save the trained KNN model

joblib.dump(classifier, 'knn_model.pkl')

['knn_model.pkl']

In [29]:
#save the standard scaler
joblib.dump(classifier, 'scaler.pkl')

['scaler.pkl']