# Background information
#### Context (from Kaggle.com)

This data set dates from 1988 and consists of four databases: Cleveland, Hungary, Switzerland, and Long Beach V. It contains 76 attributes, including the predicted attribute, but all published experiments refer to using a subset of 14 of them. The "target" field refers to the presence of heart disease in the patient. It is integer-valued: 0 = no disease and 1 = disease.

#### Attribute Information: 
1. age 
2. sex 
3. chest pain type (4 values) 
4. resting blood pressure 
5. serum cholesterol in mg/dl 
6. fasting blood sugar > 120 mg/dl 
7. resting electrocardiographic results (values 0,1,2) 
8. maximum heart rate achieved 
9. exercise induced angina 
10. oldpeak = ST depression induced by exercise relative to rest 
11. the slope of the peak exercise ST segment 
12. number of major vessels (0-3) colored by fluoroscopy 
13. thal: 0 = normal; 1 = fixed defect; 2 = reversible defect

The names and social security numbers of the patients were recently removed from the database and replaced with dummy values.

Link to dataset: https://www.kaggle.com/johnsmith88/heart-disease-dataset

- Ben Wijnen, 21-04-2020
- Department Clinical epidemiology and Medical Technology Assessment 
- ben.wijnen@mumc.nl


In [None]:
## Install every required package that is not installed yet.
## installed.packages() scans all library directories and is slow, so it is
## queried once and compared against the full list of required packages.
required_packages <- c(
  "ggplot2", "caret", "rpart", "rpart.plot", "dplyr", "randomForest",
  "e1071", "purrr", "DescTools", "tidyr", "glmnet"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(glmnet)
library(tidyr)
library(DescTools)
library(purrr)
library(ggplot2)
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)
library(dplyr)
library(e1071)


In [None]:
## Load dataset
# For a clean run, restart the R session (kernel) instead of calling
# rm(list = ls()): that call only removes objects from the global environment,
# leaves attached packages and changed options in place, and silently wipes
# the workspace of anyone who runs this file.
options(scipen = 999)  # large penalty on scientific notation when printing numbers
dataset <- read.csv("heart.csv", header = TRUE)  # path is relative to the working directory


In [None]:
## Explore dataset
# Number of missing values per column. vapply() enforces exactly one integer
# per column, whereas the return type of sapply() depends on its input.
vapply(dataset, function(column) sum(is.na(column)), integer(1))
# Compact overview of the column types and the first values of each column
str(dataset)

In [None]:
## Convert the categorical columns to factors
factor_variables <- c("sex", "thal", "target", "cp", "exang")

# Replace each listed column by its factor version; other columns are untouched
dataset[factor_variables] <- map(dataset[factor_variables], factor)
                                            

In [None]:
# Descriptive statistics
summary(dataset)

# Print the name and the frequency table of every categorical (factor) column.
# seq_along() makes the loop a no-op when there are no factor columns, whereas
# 1:ncol(temp) would iterate over c(1, 0) and fail.
temp <- dataset %>% keep(is.factor)
for (i in seq_along(temp)) {
  print(names(temp)[i])
  print(Freq(temp[[i]]))
}

In [None]:
## Histograms per numeric variable to check distributions and outliers
dataset %>%
  keep(is.numeric) %>%
  # Long format: one row per (variable, value) pair, so that every variable
  # gets its own panel. pivot_longer() replaces the superseded gather().
  pivot_longer(cols = everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_histogram(bins = 30)  # ggplot2's default, stated explicitly to avoid the message

In [None]:
## Check for predictors with zero or near-zero variance.
## saveMetrics = TRUE returns the per-column metrics table rather than only
## the positions of the flagged columns.
dataset %>% nearZeroVar(saveMetrics = TRUE)


In [None]:
##### SET Y #######
# Store the outcome under the generic name `Y` (appended as the last column) ...
dataset[["Y"]] <- dataset[["target"]]
# ... and drop the original column so the outcome is not also used as a predictor
dataset[["target"]] <- NULL

In [None]:
## Create train/test set
set.seed(123456789)  # fixed seed so the random split is reproducible
# Row indices of the train set: 70% of the rows, rounded down explicitly
dt <- sort(sample(nrow(dataset), floor(0.7 * nrow(dataset))))
train <- dataset[dt, ]
test <- dataset[-dt, ]  # all remaining rows

# Cross-validation settings passed to the caret models
ctrl <- trainControl(
  method = "repeatedcv",
  number = 5,   # five folds
  repeats = 3   # repeated three times
)

In [None]:
# Run models
## Generalised linear model through caret (method "glm") with all predictors.
## Y is a two-level factor, so this is a classification (logistic) fit.
lm <- train(
  Y ~ .,
  data = train,
  method = "glm",
  preProcess = "center",  # predictors are centred only, not scaled
  trControl = ctrl
)
lm                  # resampling summary
plot(varImp(lm))    # variable importance


In [None]:
## Run LASSO
# caret's default "glmnet" grid also tunes alpha (the elastic-net mixing
# parameter), which turns the model into an elastic net. alpha is fixed at 1
# here so the penalty is a pure LASSO (L1); only lambda is tuned by
# cross-validation over the grid below.
lasso_grid <- expand.grid(
  alpha = 1,
  lambda = 10^seq(-4, 0, length.out = 25)
)
LassoFit <- train(Y ~ ., data = train,
                  method = "glmnet",
                  family = "binomial",
                  tuneGrid = lasso_grid,
                  trControl = ctrl)
LassoFit
# Coefficients of the final model at the selected lambda
coef(LassoFit$finalModel, LassoFit$bestTune$lambda)
plot(varImp(LassoFit, scale = FALSE))



In [None]:
## Run k-nearest neighbours
# Predictors are centred and scaled before fitting
knnFit <- train(
  Y ~ .,
  data = train,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = ctrl
)

knnFit                  # resampling results for the candidate values of k
plot(knnFit)            # resampled performance against k
plot(varImp(knnFit))    # variable importance


In [None]:
## Run a single decision tree with rpart.
## Y is a factor here, so rpart grows a classification tree.
tree <- rpart(
  Y ~ .,
  data = train,
  control = rpart.control(cp = 0.01, minsplit = 15)
)
# The plot title is Dutch for "Y with all variables"
rpart.plot(tree, main = "Y met alle variabelen")

In [None]:
## Run random forest through caret (method "rf")
rf <- train(
  Y ~ .,
  data = train,
  method = "rf",
  preProcess = "center",  # predictors are centred only, not scaled
  trControl = ctrl
)
rf          # resampling results per candidate tuning value
plot(rf)    # resampled performance against the tuning parameter
# Variable importance of the final forest
varImpPlot(rf$finalModel)


In [None]:
## Check performance on test set
# By default confusionMatrix() treats the FIRST factor level as the positive
# class. The levels of Y are "0" (no disease) and "1" (disease), so without
# `positive = "1"` sensitivity and specificity would describe "no disease".

pred_lm <- predict(lm, newdata = test)
print("Confusion matrix LM")
confusionMatrix(pred_lm, test$Y, positive = "1")

pred_knn <- predict(knnFit, newdata = test)
print("Confusion matrix kNN")
confusionMatrix(pred_knn, test$Y, positive = "1")

pred_lasso <- predict(LassoFit, newdata = test)
print("Confusion matrix Lasso")
confusionMatrix(pred_lasso, test$Y, positive = "1")

pred_rf <- predict(rf, newdata = test)
print("Confusion matrix Random Forest")
confusionMatrix(pred_rf, test$Y, positive = "1")