<a href="https://colab.research.google.com/github/FW0912/ResearchHeartDiseasePrediction/blob/main/Research_Heart_Disease_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from scipy.stats import boxcox
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Install library for fetching dataset
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.6-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.6


In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset from UCI (heart disease dataset)
heart_disease = fetch_ucirepo(id=45)
X = heart_disease.data.features

# Collapse target (num) values 2, 3, 4 into 1 so the model only predicts 0 and 1:
#   0 : No heart disease predicted
#   1 : Heart disease predicted
# replace() is called without inplace=True and its result is bound to Y, so the
# frame handed out by ucimlrepo is not written to (the in-place version raised
# a SettingWithCopyWarning).
Y = heart_disease.data.targets.replace({2 : 1, 3 : 1, 4 : 1})

     num
0      0
1      1
2      1
3      0
4      0
..   ...
298    1
299    1
300    1
301    1
302    0

[303 rows x 1 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Y.replace({2 : 1, 3 : 1, 4 : 1}, inplace=True)


In [None]:
# Attach the target column to the feature columns (row-aligned on the index)
# so that features and label live in one dataframe.
df = X.join(Y, how='left')
print(df)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   1       145   233    1        2      150      0      2.3   
1     67    1   4       160   286    0        2      108      1      1.5   
2     67    1   4       120   229    0        2      129      1      2.6   
3     37    1   3       130   250    0        0      187      0      3.5   
4     41    0   2       130   204    0        2      172      0      1.4   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   45    1   1       110   264    0        0      132      0      1.2   
299   68    1   4       144   193    1        0      141      0      3.4   
300   57    1   4       130   131    0        0      115      1      1.2   
301   57    0   2       130   236    0        2      174      0      0.0   
302   38    1   3       138   175    0        0      173      0      0.0   

     slope   ca  thal  num  
0        3  0.0   6.0    0  
1        2  3.0   3.0    1  


In [None]:
# Data preprocessing

# Count missing cells across the whole dataframe.
# The recorded run prints 6: six missing cells in total.
missingCellCount = df.isna().sum().sum()
print(missingCellCount)

# Keep only the rows that have no missing value in any column
cleandf = df.dropna(axis=0, how='any')
print(cleandf)

6
     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   1       145   233    1        2      150      0      2.3   
1     67    1   4       160   286    0        2      108      1      1.5   
2     67    1   4       120   229    0        2      129      1      2.6   
3     37    1   3       130   250    0        0      187      0      3.5   
4     41    0   2       130   204    0        2      172      0      1.4   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
297   57    0   4       140   241    0        0      123      1      0.2   
298   45    1   1       110   264    0        0      132      0      1.2   
299   68    1   4       144   193    1        0      141      0      3.4   
300   57    1   4       130   131    0        0      115      1      1.2   
301   57    0   2       130   236    0        2      174      0      0.0   

     slope   ca  thal  num  
0        3  0.0   6.0    0  
1        2  3.0   3.0    1 

In [None]:
# cleandf = pd.read_csv('heart.csv')
# cleandf

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [None]:
# Changing categorical variables into objects
continuous = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
categoricalFeatures = [feature for feature in cleandf.columns if feature not in continuous]

# astype() with a column -> dtype mapping returns a new dataframe. Rebinding
# cleandf to it avoids assigning into the result of dropna(), which is what
# raised the SettingWithCopyWarning.
cleandf = cleandf.astype({feature: 'object' for feature in categoricalFeatures})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleandf[categoricalFeatures] = cleandf[categoricalFeatures].astype('object')


In [None]:
# Summary statistics with one row per described column
# (in the recorded output only the continuous columns appear).
cleandf.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,297.0,54.542088,9.049736,29.0,48.0,56.0,61.0,77.0
trestbps,297.0,131.693603,17.762806,94.0,120.0,130.0,140.0,200.0
chol,297.0,247.350168,51.997583,126.0,211.0,243.0,276.0,564.0
thalach,297.0,149.599327,22.941562,71.0,133.0,153.0,166.0,202.0
oldpeak,297.0,1.055556,1.166123,0.0,0.0,0.8,1.6,6.2


In [None]:
# Do one-hot encoding for variables that need it.
# drop_first=True drops one level per variable to avoid redundant columns.
# dtype=int keeps the dummy columns as 0/1 integers regardless of the pandas
# version (newer pandas would otherwise produce boolean columns).
encodedDf = pd.get_dummies(cleandf, columns=['cp', 'restecg', 'thal'], drop_first=True, dtype=int)

# The remaining categorical columns are already numerically coded; they were
# turned into objects earlier, so cast them back to integers in one step.
categoricalUnencodedFeatures = ['sex', 'fbs', 'exang', 'slope', 'ca', 'num']
# categoricalUnencodedFeatures = ['sex', 'fbs', 'exang', 'slope', 'ca', 'target']
encodedDf = encodedDf.astype({feature: int for feature in categoricalUnencodedFeatures})

# Only the last expression of a cell is displayed, so show the preview here.
encodedDf.head()

  encodedDf = pd.get_dummies(cleandf, columns=['cp', 'restecg', 'thal'], drop_first = True)
  encodedDf = pd.get_dummies(cleandf, columns=['cp', 'restecg', 'thal'], drop_first = True)
  encodedDf = pd.get_dummies(cleandf, columns=['cp', 'restecg', 'thal'], drop_first = True)


Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,num,cp_2,cp_3,cp_4,restecg_1,restecg_2,thal_6.0,thal_7.0
0,63,1,145,233,1,150,0,2.3,3,0,0,0,0,0,0,1,1,0
1,67,1,160,286,0,108,1,1.5,2,3,1,0,0,1,0,1,0,0
2,67,1,120,229,0,129,1,2.6,2,2,1,0,0,1,0,1,0,1
3,37,1,130,250,0,187,0,3.5,3,0,0,0,1,0,0,0,0,0
4,41,0,130,204,0,172,0,1.4,1,0,0,1,0,0,0,1,0,0


In [None]:
# Separate the label column ('num') from the predictors
# (for the heart.csv variant the label column is 'target' instead)
yDf = encodedDf['num']
xDf = encodedDf.drop(columns='num')
# xDf = encodedDf.drop('target', axis=1)
# yDf = encodedDf['target']

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class proportions, and fix the seed for repeatability.
xTrain, xTest, yTrain, yTest = train_test_split(
    xDf,
    yDf,
    test_size=0.2,
    random_state=0,
    stratify=yDf,
)

In [None]:
# Shift 'oldpeak' by a small constant in both splits (the column contains exact zeros).
# NOTE(review): presumably so that a later Box-Cox transform sees strictly
# positive values - confirm against the cells that follow.
# Caution: re-running this cell applies the shift again.
oldpeakShift = 0.001
for _split in (xTrain, xTest):
  _split['oldpeak'] = _split['oldpeak'] + oldpeakShift

In [None]:
# Function for parameter tuning
def tuneParameters(model, parameters, xTrain, yTrain, n_splits=3, scoring='accuracy', random_state=0):
  """Grid-search `parameters` for `model` with stratified, shuffled k-fold CV.

  Parameters
  ----------
  model : estimator passed to GridSearchCV.
  parameters : parameter grid passed to GridSearchCV.
  xTrain, yTrain : training features and labels the search is fitted on.
  n_splits : number of stratified folds (default 3).
  scoring : metric GridSearchCV uses to rank candidates (default 'accuracy').
  random_state : seed for shuffling the folds (default 0).

  Returns
  -------
  (best_estimator_, best_params_) of the fitted grid search.
  """
  cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  grid = GridSearchCV(model, parameters, cv=cv, scoring=scoring)
  grid.fit(xTrain, yTrain)
  return grid.best_estimator_, grid.best_params_

In [None]:
# Decision Tree: base estimator with default hyper-parameters.
# random_state=0 fixes the estimator's seed so repeated runs give the same tree.
DTmodel = DecisionTreeClassifier(random_state=0)

In [None]:
# Decision Tree parameter tuning
# 'log_loss' is intentionally not listed under 'criterion': for
# DecisionTreeClassifier it is an alias of 'entropy', so it only added a third
# of redundant fits (and is rejected by scikit-learn releases older than 1.1).
DTparameters = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [1, 2, 3, 4, 5],
    'min_samples_split' : [2, 3, 4, 5],
    'min_samples_leaf' : [1, 2, 3, 4, 5],
    'max_leaf_nodes' : [2, 3, 4, 5]
}

# Cross-validated grid search on the training split; keep the refitted best
# tree and the winning parameter combination.
optimizedDT, optimizedDTparameters = tuneParameters(DTmodel, DTparameters, xTrain, yTrain)
print(optimizedDTparameters)

{'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [None]:
# Score the tuned decision tree on the held-out test split
predDT = optimizedDT.predict(xTest)
accDT = accuracy_score(
    y_true=np.ravel(yTest),   # flatten labels to 1-D
    y_pred=np.ravel(predDT),  # flatten predictions to 1-D
)
print(accDT)

0.7833333333333333


In [None]:
# Random Forest: base estimator with default hyper-parameters.
# random_state=0 fixes the estimator's seed so repeated runs give the same forest.
RFmodel = RandomForestClassifier(random_state=0)

In [None]:
# Random Forest hyper-parameter search space
RFparameters = dict(
    n_estimators=[10, 30, 50, 70, 100],
    criterion=['gini', 'entropy'],
    max_depth=[2, 3, 4],
    min_samples_split=[2, 3, 4, 5],
    min_samples_leaf=[1, 2, 3],
    bootstrap=[True, False],
)

# Run the cross-validated grid search on the training split, then unpack the
# best estimator and the parameter combination that produced it.
rfSearchResult = tuneParameters(RFmodel, RFparameters, xTrain, yTrain)
optimizedRF, optimizedRFparameters = rfSearchResult
print(optimizedRFparameters)

{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 30}


In [None]:
# Score the tuned random forest on the held-out test split
predRF = optimizedRF.predict(xTest)
accRF = accuracy_score(
    y_true=np.ravel(yTest),   # flatten labels to 1-D
    y_pred=np.ravel(predRF),  # flatten predictions to 1-D
)
print(accRF)

0.85
