<a href="https://colab.research.google.com/github/FW0912/ResearchHeartDiseasePrediction/blob/main/Research_Heart_Disease_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install library for fetching dataset
!pip install ucimlrepo



In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from ucimlrepo import fetch_ucirepo

In [None]:
# Fetch Cleveland dataset from UCI (heart disease dataset)
heart_disease = fetch_ucirepo(id=45)
X_Cdataset = heart_disease.data.features

# Binarize the target (num): the values 2, 3 and 4 are collapsed into 1 so the
# models only predict
# 0 : No heart disease predicted
# 1 : Heart disease predicted
# replace() without inplace=True returns a new DataFrame, which avoids the
# SettingWithCopyWarning and leaves the fetched object untouched.
Y_Cdataset = heart_disease.data.targets.replace({2 : 1, 3 : 1, 4 : 1})
df_Cdataset = X_Cdataset.join(Y_Cdataset)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y_Cdataset.replace({2 : 1, 3 : 1, 4 : 1}, inplace=True)


In [None]:
# Check for null values
print(df_Cdataset.isna().sum().sum())

# Replace null values using KNN-Imputer
# Only the feature columns are passed to the imputer: including the target
# ('num') would let the label influence the neighbour search and therefore the
# imputed feature values (target leakage).
# NOTE(review): KNNImputer fills a gap with the mean of the neighbours' values,
# which may be non-integer; confirm the imputed columns are not among the
# categorical ones that are one-hot encoded afterwards.
# NOTE(review): the imputer is still fitted before the train/test split.
featureColumns_Cdataset = df_Cdataset.columns.drop('num')
imputer = KNNImputer()
df_Cdataset.loc[:, featureColumns_Cdataset] = imputer.fit_transform(df_Cdataset[featureColumns_Cdataset])

6


In [None]:
# One-hot encode the multi-level categorical features; drop_first=True omits the
# first level of each encoded feature.
df_Cdataset = pd.get_dummies(
    df_Cdataset,
    columns=['cp', 'restecg', 'thal'],
    drop_first=True,
)

In [None]:
# Separate the target column 'num' from the predictor columns
Y_Cdataset = df_Cdataset['num']
X_Cdataset = df_Cdataset.drop(columns='num')

In [None]:
# Stratified 80/20 train/test split with a fixed seed, then re-attach the target
# to each feature frame so every split is also available as one DataFrame.
(xTrain_Cdataset, xTest_Cdataset,
 yTrain_Cdataset, yTest_Cdataset) = train_test_split(
    X_Cdataset,
    Y_Cdataset,
    test_size=0.2,
    random_state=0,
    stratify=Y_Cdataset,
)
testDf_Cdataset = xTest_Cdataset.join(yTest_Cdataset)
trainDf_Cdataset = xTrain_Cdataset.join(yTrain_Cdataset)

In [None]:
# Rebuild the feature/target pairs from the joined train and test frames
# (drop returns a new DataFrame, so the x frames are independent objects).
yTrain_Cdataset = trainDf_Cdataset['num']
xTrain_Cdataset = trainDf_Cdataset.drop(columns='num')
yTest_Cdataset = testDf_Cdataset['num']
xTest_Cdataset = testDf_Cdataset.drop(columns='num')

In [None]:
# Standardization
# Fit the scaler on the training split only, then apply the same mean/std to the
# test split. Calling fit_transform on the test split would refit the scaler on
# test data, scaling the two splits with different statistics.
scaler = StandardScaler()
xTrain_Cdataset[:] = scaler.fit_transform(xTrain_Cdataset)
xTest_Cdataset[:] = scaler.transform(xTest_Cdataset)

In [None]:
# Box-cox transformation
# Box-Cox requires strictly positive input, hence the min() > 0 guard below.
# NOTE(review): this cell runs after the Standardization cell, which centres
# every column of xTrain_Cdataset on zero mean; a zero-mean column cannot have
# min() > 0, so the loop body is effectively never executed. If Box-Cox is
# intended it probably has to run before standardization — confirm.
# NOTE(review): only the training minimum is checked; the test column is
# transformed without its own positivity check.
lambdas = {}  # feature name -> lambda fitted on the training split
continuous = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

for feature in continuous:
  if xTrain_Cdataset[feature].min() > 0:
    # Without lmbda, boxcox returns (transformed values, fitted lambda)
    xTrain_Cdataset[feature], lambdas[feature] = boxcox(xTrain_Cdataset[feature])
    # Reuse the lambda fitted on the training split for the test split
    xTest_Cdataset[feature] = boxcox(xTest_Cdataset[feature], lmbda=lambdas[feature])

In [None]:
# Function for parameter tuning
def tuneParameters(model, parameters, xTrain, yTrain, n_splits=3, scoring='accuracy', random_state=0):
  """Grid-search `parameters` for `model` with stratified k-fold cross-validation.

  Args:
    model: scikit-learn estimator to tune.
    parameters: parameter grid passed to GridSearchCV (dict or list of dicts).
    xTrain: training features.
    yTrain: training labels, used for fitting and for stratifying the folds.
    n_splits: number of cross-validation folds (default 3, the previous
      hard-coded value).
    scoring: GridSearchCV scoring specification (default 'accuracy').
    random_state: seed for shuffling the folds (default 0).

  Returns:
    Tuple (best_estimator, best_params): the estimator with the best mean
    cross-validation score and the parameter combination that produced it.
  """
  # Shuffled, seeded folds keep the search reproducible between runs
  cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  grid = GridSearchCV(model, parameters, cv=cv, scoring=scoring)
  grid.fit(xTrain, yTrain)
  return grid.best_estimator_, grid.best_params_

In [None]:
# Decision Tree
# Untuned base estimator with a fixed seed; it is passed to tuneParameters for
# both the Cleveland and the Statlog grid searches.
DTmodel = DecisionTreeClassifier(random_state=0)

In [None]:
# Decision Tree grid search on the Cleveland training split.
# DTparameters is also reused for the Statlog dataset further down.
DTparameters = dict(
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=list(range(1, 6)),
    min_samples_split=list(range(2, 6)),
    min_samples_leaf=list(range(1, 6)),
    max_leaf_nodes=list(range(2, 6)),
)

optimizedDT_Cdataset, optimizedDTparameters_Cdataset = tuneParameters(
    DTmodel,
    DTparameters,
    xTrain_Cdataset,
    yTrain_Cdataset,
)
print(optimizedDTparameters_Cdataset)

{'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 4, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [None]:
# Accuracy of the tuned Decision Tree on the Cleveland test split.
# The predictions stay in predDT_Cdataset for the metric cells that follow.
predDT_Cdataset = optimizedDT_Cdataset.predict(xTest_Cdataset)
accDT_Cdataset = accuracy_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predDT_Cdataset),
)
print(accDT_Cdataset)

0.7868852459016393


In [None]:
# Precision of the tuned Decision Tree on the Cleveland test split
precDT_Cdataset = precision_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predDT_Cdataset),
)
print(precDT_Cdataset)

0.7586206896551724


In [None]:
# Recall of the tuned Decision Tree on the Cleveland test split
recallDT_Cdataset = recall_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predDT_Cdataset),
)
print(recallDT_Cdataset)

0.7857142857142857


In [None]:
# F1-score of the tuned Decision Tree on the Cleveland test split
f1DT_Cdataset = f1_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predDT_Cdataset),
)
print(f1DT_Cdataset)

0.7719298245614034


In [None]:
# Random Forest
# Untuned base estimator with a fixed seed; it is passed to tuneParameters for
# both the Cleveland and the Statlog grid searches.
RFmodel = RandomForestClassifier(random_state=0)

In [None]:
# Random Forest grid search on the Cleveland training split.
# RFparameters is also reused for the Statlog dataset further down.
RFparameters = dict(
    n_estimators=[10, 30, 50, 70, 100],
    criterion=['gini', 'entropy'],
    max_depth=[2, 3, 4],
    min_samples_split=list(range(2, 6)),
    min_samples_leaf=list(range(1, 6)),
    bootstrap=[True, False],
)

optimizedRF_Cdataset, optimizedRFparameters_Cdataset = tuneParameters(
    RFmodel,
    RFparameters,
    xTrain_Cdataset,
    yTrain_Cdataset,
)
print(optimizedRFparameters_Cdataset)

{'bootstrap': True, 'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 30}


In [None]:
# Accuracy of the tuned Random Forest on the Cleveland test split.
# The predictions stay in predRF_Cdataset for the metric cells that follow.
predRF_Cdataset = optimizedRF_Cdataset.predict(xTest_Cdataset)
accRF_Cdataset = accuracy_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predRF_Cdataset),
)
print(accRF_Cdataset)

0.8852459016393442


In [None]:
# Precision of the tuned Random Forest on the Cleveland test split
precRF_Cdataset = precision_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predRF_Cdataset),
)
print(precRF_Cdataset)

0.8888888888888888


In [None]:
# Recall of the tuned Random Forest on the Cleveland test split
recallRF_Cdataset = recall_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predRF_Cdataset),
)
print(recallRF_Cdataset)

0.8571428571428571


In [None]:
# F1-score of the tuned Random Forest on the Cleveland test split
f1RF_Cdataset = f1_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predRF_Cdataset),
)
print(f1RF_Cdataset)

0.8727272727272727


In [None]:
# Support Vector Machine
# Untuned base estimator with a fixed seed; it is passed to tuneParameters for
# both the Cleveland and the Statlog grid searches.
SVMmodel = SVC(random_state=0)

In [None]:
# SVM grid search on the Cleveland training split.
# SVMparameters is also reused for the Statlog dataset further down.
SVMparameters = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001],
)

optimizedSVM_Cdataset, optimizedSVMparameters_Cdataset = tuneParameters(
    SVMmodel,
    SVMparameters,
    xTrain_Cdataset,
    yTrain_Cdataset,
)
print(optimizedSVMparameters_Cdataset)

{'C': 1, 'gamma': 1, 'kernel': 'linear'}


In [None]:
# Accuracy of the tuned SVM on the Cleveland test split.
# The predictions stay in predSVM_Cdataset for the metric cells that follow.
predSVM_Cdataset = optimizedSVM_Cdataset.predict(xTest_Cdataset)
accSVM_Cdataset = accuracy_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predSVM_Cdataset),
)
print(accSVM_Cdataset)

0.8688524590163934


In [None]:
# Precision of the tuned SVM on the Cleveland test split
precSVM_Cdataset = precision_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predSVM_Cdataset),
)
print(precSVM_Cdataset)

0.8571428571428571


In [None]:
# Recall of the tuned SVM on the Cleveland test split
recallSVM_Cdataset = recall_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predSVM_Cdataset),
)
print(recallSVM_Cdataset)

0.8571428571428571


In [None]:
# F1-score of the tuned SVM on the Cleveland test split
f1SVM_Cdataset = f1_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predSVM_Cdataset),
)
print(f1SVM_Cdataset)

0.8571428571428571


In [None]:
# K-Nearest Neighbor
# Untuned base estimator; it is passed to tuneParameters for both the Cleveland
# and the Statlog grid searches.
KNNmodel = KNeighborsClassifier()

In [None]:
# KNN grid search on the Cleveland training split (n_neighbors from 2 to 29).
# KNNparameters is also reused for the Statlog dataset further down.
KNNparameters = dict(
    n_neighbors=np.arange(2, 30),
    weights=['uniform', 'distance'],
    leaf_size=[10, 30, 50, 70, 100],
)

optimizedKNN_Cdataset, optimizedKNNparameters_Cdataset = tuneParameters(
    KNNmodel,
    KNNparameters,
    xTrain_Cdataset,
    yTrain_Cdataset,
)
print(optimizedKNNparameters_Cdataset)

{'leaf_size': 10, 'n_neighbors': 15, 'weights': 'distance'}


In [None]:
# Accuracy of the tuned KNN on the Cleveland test split.
# The predictions stay in predKNN_Cdataset for the metric cells that follow.
predKNN_Cdataset = optimizedKNN_Cdataset.predict(xTest_Cdataset)
accKNN_Cdataset = accuracy_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predKNN_Cdataset),
)
print(accKNN_Cdataset)

0.8360655737704918


In [None]:
# Precision of the tuned KNN on the Cleveland test split
precKNN_Cdataset = precision_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predKNN_Cdataset),
)
print(precKNN_Cdataset)

0.8


In [None]:
# Recall of the tuned KNN on the Cleveland test split
recallKNN_Cdataset = recall_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predKNN_Cdataset),
)
print(recallKNN_Cdataset)

0.8571428571428571


In [None]:
# F1-score of the tuned KNN on the Cleveland test split
f1KNN_Cdataset = f1_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predKNN_Cdataset),
)
print(f1KNN_Cdataset)

0.8275862068965518


In [None]:
# Logistic Regression
# Untuned base estimator with a fixed seed; it is passed to tuneParameters for
# both the Cleveland and the Statlog grid searches.
LRmodel = LogisticRegression(random_state=0)

In [None]:
# Sanity check: total number of missing values left in the Cleveland training features
xTrain_Cdataset.isna().sum().sum()

0

In [None]:
# Sanity check: total number of missing values in the Cleveland training labels
yTrain_Cdataset.isna().sum().sum()

0

In [None]:
# Logistic Regression grid search on the Cleveland training split.
# LRparameters is also reused for the Statlog dataset further down.
LRparameters = dict(
    C=[0.1, 1, 10, 100, 1000],
    solver=['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'],
)

optimizedLR_Cdataset, optimizedLRparameters_Cdataset = tuneParameters(
    LRmodel,
    LRparameters,
    xTrain_Cdataset,
    yTrain_Cdataset,
)
print(optimizedLRparameters_Cdataset)

{'C': 0.1, 'solver': 'lbfgs'}


In [None]:
# Accuracy of the tuned Logistic Regression on the Cleveland test split.
# The predictions stay in predLR_Cdataset for the metric cells that follow.
predLR_Cdataset = optimizedLR_Cdataset.predict(xTest_Cdataset)
accLR_Cdataset = accuracy_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predLR_Cdataset),
)
print(accLR_Cdataset)

0.819672131147541


In [None]:
# Precision of the tuned Logistic Regression on the Cleveland test split
precLR_Cdataset = precision_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predLR_Cdataset),
)
print(precLR_Cdataset)

0.8148148148148148


In [None]:
# Recall of the tuned Logistic Regression on the Cleveland test split
recallLR_Cdataset = recall_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predLR_Cdataset),
)
print(recallLR_Cdataset)

0.7857142857142857


In [None]:
# F1-score of the tuned Logistic Regression on the Cleveland test split
f1LR_Cdataset = f1_score(
    y_true=np.ravel(yTest_Cdataset),
    y_pred=np.ravel(predLR_Cdataset),
)
print(f1LR_Cdataset)

0.7999999999999999


In [None]:
# Fetch Statlog dataset from UCI
statlog_heart = fetch_ucirepo(id=145)
X_Sdataset = statlog_heart.data.features

# Re-encode the target to the same convention as the Cleveland dataset
# 0 : No heart disease
# 1 : Heart disease
# NOTE(review): the UCI description of Statlog (Heart) codes the target as
# 1 = absence, 2 = presence; the assertion fails loudly if the fetched data uses
# any other coding. Without this step the precision/recall/F1 cells (default
# pos_label=1) would treat "absence" as the positive class.
rawTargets_Sdataset = statlog_heart.data.targets
assert set(np.unique(rawTargets_Sdataset.to_numpy())) <= {1, 2}, 'unexpected Statlog target coding'
Y_Sdataset = (rawTargets_Sdataset == 2).astype(int)

df_Sdataset = X_Sdataset.join(Y_Sdataset)

In [None]:
# Check for null values: prints the total count of missing cells in the Statlog
# frame (no imputation step follows for this dataset)
print(df_Sdataset.isna().sum().sum())

0


In [None]:
# One-hot encode the multi-level categorical features; drop_first=True omits the
# first level of each encoded feature.
df_Sdataset = pd.get_dummies(
    df_Sdataset,
    columns=['chest-pain', 'electrocardiographic', 'thal'],
    drop_first=True,
)

In [None]:
# Separate the target column 'heart-disease' from the predictor columns
Y_Sdataset = df_Sdataset['heart-disease']
X_Sdataset = df_Sdataset.drop(columns='heart-disease')

In [None]:
# Stratified 80/20 train/test split with a fixed seed, then re-attach the target
# to each feature frame so every split is also available as one DataFrame.
(xTrain_Sdataset, xTest_Sdataset,
 yTrain_Sdataset, yTest_Sdataset) = train_test_split(
    X_Sdataset,
    Y_Sdataset,
    test_size=0.2,
    random_state=0,
    stratify=Y_Sdataset,
)
testDf_Sdataset = xTest_Sdataset.join(yTest_Sdataset)
trainDf_Sdataset = xTrain_Sdataset.join(yTrain_Sdataset)

In [None]:
# Rebuild the feature/target pairs from the joined train and test frames
# (drop returns a new DataFrame, so the x frames are independent objects).
yTrain_Sdataset = trainDf_Sdataset['heart-disease']
xTrain_Sdataset = trainDf_Sdataset.drop(columns='heart-disease')
yTest_Sdataset = testDf_Sdataset['heart-disease']
xTest_Sdataset = testDf_Sdataset.drop(columns='heart-disease')

In [None]:
# Standardization
# The scaler object created for the Cleveland data is reused; fit_transform
# refits it on the Statlog training split. The test split is then scaled with
# those training statistics (transform) instead of being refitted on itself.
xTrain_Sdataset[:] = scaler.fit_transform(xTrain_Sdataset)
xTest_Sdataset[:] = scaler.transform(xTest_Sdataset)

In [None]:
# Box-cox transformation
# Box-Cox requires strictly positive input, hence the min() > 0 guard below.
# NOTE(review): this cell runs after the Standardization cell, which centres
# every column of xTrain_Sdataset on zero mean; a zero-mean column cannot have
# min() > 0, so the loop body is effectively never executed. If Box-Cox is
# intended it probably has to run before standardization — confirm.
# NOTE(review): only the training minimum is checked; the test column is
# transformed without its own positivity check.
lambdas = {}  # feature name -> lambda fitted on the training split (rebinds the Cleveland dict)
continuous = ['age', 'rest-bp', 'serum-chol', 'max-heart-rate', 'oldpeak']

for feature in continuous:
  if xTrain_Sdataset[feature].min() > 0:
    # Without lmbda, boxcox returns (transformed values, fitted lambda)
    xTrain_Sdataset[feature], lambdas[feature] = boxcox(xTrain_Sdataset[feature])
    # Reuse the lambda fitted on the training split for the test split
    xTest_Sdataset[feature] = boxcox(xTest_Sdataset[feature], lmbda=lambdas[feature])

In [None]:
# Decision Tree grid search on the Statlog training split, reusing the
# estimator and parameter grid defined for the Cleveland dataset
optimizedDT_Sdataset, optimizedDTparameters_Sdataset = tuneParameters(
    DTmodel,
    DTparameters,
    xTrain_Sdataset,
    yTrain_Sdataset,
)
print(optimizedDTparameters_Sdataset)

{'criterion': 'gini', 'max_depth': 4, 'max_leaf_nodes': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [None]:
# Accuracy of the tuned Decision Tree on the Statlog test split.
# The predictions stay in predDT_Sdataset for the metric cells that follow.
predDT_Sdataset = optimizedDT_Sdataset.predict(xTest_Sdataset)
accDT_Sdataset = accuracy_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predDT_Sdataset),
)
print(accDT_Sdataset)

In [None]:
# Precision of the tuned Decision Tree on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
precDT_Sdataset = precision_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predDT_Sdataset),
)
print(precDT_Sdataset)

In [None]:
# Recall of the tuned Decision Tree on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
recallDT_Sdataset = recall_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predDT_Sdataset),
)
print(recallDT_Sdataset)

In [None]:
# F1-score of the tuned Decision Tree on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
f1DT_Sdataset = f1_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predDT_Sdataset),
)
print(f1DT_Sdataset)

In [None]:
# Random Forest grid search on the Statlog training split, reusing the
# estimator and parameter grid defined for the Cleveland dataset
optimizedRF_Sdataset, optimizedRFparameters_Sdataset = tuneParameters(
    RFmodel,
    RFparameters,
    xTrain_Sdataset,
    yTrain_Sdataset,
)
print(optimizedRFparameters_Sdataset)

In [None]:
# Accuracy of the tuned Random Forest on the Statlog test split.
# The predictions stay in predRF_Sdataset for the metric cells that follow.
predRF_Sdataset = optimizedRF_Sdataset.predict(xTest_Sdataset)
accRF_Sdataset = accuracy_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predRF_Sdataset),
)
print(accRF_Sdataset)

In [None]:
# Precision of the tuned Random Forest on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
precRF_Sdataset = precision_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predRF_Sdataset),
)
print(precRF_Sdataset)

In [None]:
# Recall of the tuned Random Forest on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
recallRF_Sdataset = recall_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predRF_Sdataset),
)
print(recallRF_Sdataset)

In [None]:
# F1-score of the tuned Random Forest on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
f1RF_Sdataset = f1_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predRF_Sdataset),
)
print(f1RF_Sdataset)

In [None]:
# SVM grid search on the Statlog training split, reusing the estimator and
# parameter grid defined for the Cleveland dataset
optimizedSVM_Sdataset, optimizedSVMparameters_Sdataset = tuneParameters(
    SVMmodel,
    SVMparameters,
    xTrain_Sdataset,
    yTrain_Sdataset,
)
print(optimizedSVMparameters_Sdataset)

In [None]:
# Accuracy of the tuned SVM on the Statlog test split.
# The predictions stay in predSVM_Sdataset for the metric cells that follow.
predSVM_Sdataset = optimizedSVM_Sdataset.predict(xTest_Sdataset)
accSVM_Sdataset = accuracy_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predSVM_Sdataset),
)
print(accSVM_Sdataset)

In [None]:
# Precision of the tuned SVM on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
precSVM_Sdataset = precision_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predSVM_Sdataset),
)
print(precSVM_Sdataset)

In [None]:
# Recall of the tuned SVM on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
recallSVM_Sdataset = recall_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predSVM_Sdataset),
)
print(recallSVM_Sdataset)

In [None]:
# F1-score of the tuned SVM on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
f1SVM_Sdataset = f1_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predSVM_Sdataset),
)
print(f1SVM_Sdataset)

In [None]:
# KNN grid search on the Statlog training split, reusing the estimator and
# parameter grid defined for the Cleveland dataset
optimizedKNN_Sdataset, optimizedKNNparameters_Sdataset = tuneParameters(
    KNNmodel,
    KNNparameters,
    xTrain_Sdataset,
    yTrain_Sdataset,
)
print(optimizedKNNparameters_Sdataset)

In [None]:
# Accuracy of the tuned KNN on the Statlog test split.
# The predictions stay in predKNN_Sdataset for the metric cells that follow.
predKNN_Sdataset = optimizedKNN_Sdataset.predict(xTest_Sdataset)
accKNN_Sdataset = accuracy_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predKNN_Sdataset),
)
print(accKNN_Sdataset)

In [None]:
# Precision of the tuned KNN on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
precKNN_Sdataset = precision_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predKNN_Sdataset),
)
print(precKNN_Sdataset)

In [None]:
# Recall of the tuned KNN on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
recallKNN_Sdataset = recall_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predKNN_Sdataset),
)
print(recallKNN_Sdataset)

In [None]:
# F1-score of the tuned KNN on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
f1KNN_Sdataset = f1_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predKNN_Sdataset),
)
print(f1KNN_Sdataset)

In [None]:
# Logistic Regression grid search on the Statlog training split, reusing the
# estimator and parameter grid defined for the Cleveland dataset
optimizedLR_Sdataset, optimizedLRparameters_Sdataset = tuneParameters(
    LRmodel,
    LRparameters,
    xTrain_Sdataset,
    yTrain_Sdataset,
)
print(optimizedLRparameters_Sdataset)

In [None]:
# Accuracy of the tuned Logistic Regression on the Statlog test split.
# The predictions stay in predLR_Sdataset for the metric cells that follow.
predLR_Sdataset = optimizedLR_Sdataset.predict(xTest_Sdataset)
accLR_Sdataset = accuracy_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predLR_Sdataset),
)
print(accLR_Sdataset)

In [None]:
# Precision of the tuned Logistic Regression on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
precLR_Sdataset = precision_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predLR_Sdataset),
)
print(precLR_Sdataset)

In [None]:
# Recall of the tuned Logistic Regression on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
recallLR_Sdataset = recall_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predLR_Sdataset),
)
print(recallLR_Sdataset)

In [None]:
# F1-score of the tuned Logistic Regression on the Statlog test split
# NOTE(review): default pos_label=1 — confirm label 1 means heart disease here.
f1LR_Sdataset = f1_score(
    y_true=np.ravel(yTest_Sdataset),
    y_pred=np.ravel(predLR_Sdataset),
)
print(f1LR_Sdataset)