# Disease Prediction from Medical Records
This project predicts the presence of heart disease from patients' medical records using several classification algorithms, and reports the accuracy and F1 score of each model.

In [63]:
#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier
import tkinter as tk
from tkinter import filedialog, messagebox
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix


In [64]:
# Read the heart-disease records from the hard-coded CSV location into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/sample_data/DiseasePred.csv')

# Preview the first rows to confirm the file was parsed as expected.
df.head(n=5)

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


# Data Exploration / Preprocessing


In [65]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,0.677778,3.174074,131.344444,249.659259,0.148148,1.022222,149.677778,0.32963,1.05,1.585185,0.67037,4.696296
std,9.109067,0.468195,0.95009,17.861608,51.686237,0.355906,0.997891,23.165717,0.470952,1.14521,0.61439,0.943896,1.940659
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0
25%,48.0,0.0,3.0,120.0,213.0,0.0,0.0,133.0,0.0,0.0,1.0,0.0,3.0
50%,55.0,1.0,3.0,130.0,245.0,0.0,2.0,153.5,0.0,0.8,2.0,0.0,3.0
75%,61.0,1.0,4.0,140.0,280.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0


In [66]:
# Column names, non-null counts and dtypes; only the 'Heart Disease' target is non-numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      270 non-null    int64  
 1   Sex                      270 non-null    int64  
 2   Chest pain type          270 non-null    int64  
 3   BP                       270 non-null    int64  
 4   Cholesterol              270 non-null    int64  
 5   FBS over 120             270 non-null    int64  
 6   EKG results              270 non-null    int64  
 7   Max HR                   270 non-null    int64  
 8   Exercise angina          270 non-null    int64  
 9   ST depression            270 non-null    float64
 10  Slope of ST              270 non-null    int64  
 11  Number of vessels fluro  270 non-null    int64  
 12  Thallium                 270 non-null    int64  
 13  Heart Disease            270 non-null    object 
dtypes: float64(1), int64(12), 

In [67]:
# True if at least one cell anywhere in the frame is missing.
df.isna().to_numpy().any()

False

In [68]:
# Count the missing entries in each column.
missing_values = df.isna().sum(axis=0)
print(missing_values)

Age                        0
Sex                        0
Chest pain type            0
BP                         0
Cholesterol                0
FBS over 120               0
EKG results                0
Max HR                     0
Exercise angina            0
ST depression              0
Slope of ST                0
Number of vessels fluro    0
Thallium                   0
Heart Disease              0
dtype: int64


In [69]:
# Encode the target with a dedicated encoder. LE keeps the
# 'Absence'/'Presence' -> 0/1 mapping, so LE.inverse_transform can turn
# model predictions back into readable labels later on.
LE = LabelEncoder()

df['Heart Disease'] = LE.fit_transform(df['Heart Disease'])

categorical_columns = ['Chest pain type', 'EKG results', 'Slope of ST', 'Thallium']

# Use one encoder per feature column. Refitting the shared LE here would
# overwrite the target mapping on every iteration and leave LE holding only
# the classes of the last column.
feature_encoders = {}
for column in categorical_columns:
    feature_encoders[column] = LabelEncoder()
    df[column] = feature_encoders[column].fit_transform(df[column])

df.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,3,130,322,0,2,109,0,2.4,1,3,0,1
1,67,0,2,115,564,0,2,160,0,1.6,1,0,2,0
2,57,1,1,124,261,0,0,141,0,0.3,0,0,2,1
3,64,1,3,128,263,0,0,105,1,0.2,1,1,2,0
4,74,0,1,120,269,0,2,121,1,0.2,0,1,0,0


In [70]:
scaler = MinMaxScaler()

numcolumns = ['Age', 'BP', 'Cholesterol', 'Max HR', 'ST depression']

# Min-max scale a *copy* purely for inspection. Fitting a scaler on the whole
# dataset and writing it back into df would leak test-set statistics (min/max)
# into the features before the train/test split. It is also unnecessary: the
# StandardScaler that is fitted on the training split later in the notebook
# yields the same standardised values whether or not this affine rescale has
# been applied first.
df_minmax = df.copy()
df_minmax[numcolumns] = scaler.fit_transform(df[numcolumns])

df_minmax.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0.854167,1,3,0.339623,0.447489,0,2,0.290076,0,0.387097,1,3,0,1
1,0.791667,0,2,0.198113,1.0,0,2,0.679389,0,0.258065,1,0,2,0
2,0.583333,1,1,0.283019,0.308219,0,0,0.534351,0,0.048387,0,0,2,1
3,0.729167,1,3,0.320755,0.312785,0,0,0.259542,1,0.032258,1,1,2,0
4,0.9375,0,1,0.245283,0.326484,0,2,0.381679,1,0.032258,0,1,0,0


In [71]:
# Separate the target column from the predictor columns.
y = df['Heart Disease']
x = df.drop(columns=['Heart Disease'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(216, 13) (54, 13) (216,) (54,)


In [72]:
# Learn mean/variance from the training split only, then apply the same
# transformation to both splits so no test-set statistics are used.
scaler = StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Classification / Model Training Phase

## Logistic Regression

In [73]:
# Fit a logistic-regression classifier on the training split.
LR = LogisticRegression(random_state=42).fit(x_train, y_train)

# Score the held-out predictions (weighted F1 averages per-class F1 by support).
LR_pred = LR.predict(x_test)
LR_acc, LR_f1 = (
    accuracy_score(y_test, LR_pred),
    f1_score(y_test, LR_pred, average='weighted'),
)

print("Accuracy:", LR_acc)
print("F1 Score:", LR_f1)


Accuracy: 92.5925925925926
F1 Score: 92.51633986928105


## Support Vector Machine (SVM)

In [74]:
# Support-vector classifier with probability estimates enabled.
svm = SVC(random_state=12, probability=True)
svm.fit(x_train, y_train)

# Evaluate on the held-out split.
svm_pred = svm.predict(x_test)
svm_f1 = f1_score(y_test, svm_pred, average='weighted')
svm_acc = accuracy_score(y_test, svm_pred)

print("Accuracy:", svm_acc)
print("F1 Score:", svm_f1)


Accuracy: 87.03703703703704
F1 Score: 87.08805406479824


## XGBoost Classifier

In [75]:
# Gradient-boosted trees with 10 boosting rounds.
# `use_label_encoder` has been dropped: the installed xgboost ignores it and
# emits a 'Parameters: { "use_label_encoder" } are not used.' warning.
# NOTE(review): the name `xgb` rebinds the `import xgboost as xgb` module alias;
# it is kept so that any cell referring to the model by this name still works.
xgb = XGBClassifier(eval_metric='logloss', random_state=42, n_estimators=10)
xgb.fit(x_train, y_train)

# Evaluate on the held-out split (weighted F1 averages per-class F1 by support).
xgb_y_pred = xgb.predict(x_test)
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
xgb_f1 = f1_score(y_test, xgb_y_pred, average='weighted')
print("Accuracy:", xgb_accuracy)
print("F1 Score:", xgb_f1)

Accuracy: 87.03703703703704
F1 Score: 86.97569065242892


Parameters: { "use_label_encoder" } are not used.



## K Nearest Neighbors (KNN)

In [76]:
# Number of neighbours that vote on each prediction.
# NOTE(review): 99 is large relative to the 216 training rows — confirm it was
# tuned rather than chosen arbitrarily.
k = 99
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(x_train, y_train)

# Evaluate on the held-out split (weighted F1 averages per-class F1 by support).
knn_y_pred = knn.predict(x_test)
knn_accuracy = accuracy_score(y_test, knn_y_pred)
knn_f1 = f1_score(y_test, knn_y_pred, average='weighted')
print("Accuracy:", knn_accuracy)
# Label corrected from "F1 SCore" so the output matches the other model cells.
print("F1 Score:", knn_f1)

Accuracy: 88.88888888888889
F1 SCore 88.42592592592592


## Random Forest

In [77]:
# Random forest with default hyper-parameters and a fixed seed.
RF = RandomForestClassifier(random_state=42).fit(x_train, y_train)

# Evaluate on the held-out split.
RF_pred = RF.predict(x_test)
RF_acc, RF_f1 = (
    accuracy_score(y_test, RF_pred),
    f1_score(y_test, RF_pred, average='weighted'),
)

print("Accuracy:", RF_acc)
print("F1 Score:", RF_f1)

Accuracy: 75.92592592592592
F1 Score: 75.5233494363929


# Comparing Models

In [78]:
# Test-set accuracy of each model, keyed by its short name.
model_acc = dict(LR=LR_acc, SVM=svm_acc, XGB=xgb_accuracy, KNN=knn_accuracy, RF=RF_acc)

# Weighted F1 score of each model, in the same key order as model_acc.
model_f1 = dict(LR=LR_f1, SVM=svm_f1, XGB=xgb_f1, KNN=knn_f1, RF=RF_f1)

# One row per model, with both metrics side by side.
results_df = pd.DataFrame({
    "Model": [*model_acc],
    "Accuracy": [*model_acc.values()],
    "F1 Score": [*model_f1.values()],
})

results_df


Unnamed: 0,Model,Accuracy,F1 Score
0,LR,0.925926,0.925163
1,SVM,0.87037,0.870881
2,XGB,0.87037,0.869757
3,KNN,0.888889,0.884259
4,RF,0.759259,0.755233


In [79]:
# Short name of the model with the highest test accuracy (the first one wins ties).
best_model = max(model_acc.items(), key=lambda entry: entry[1])[0]
best_model

'LR'