# Machine Learning Models

## Import libraries and data

In [1]:
set.seed(1313)
#to have reproducible results

In [2]:
#Import data 
testing <-  read.csv(file.path('Data/testing.csv'))
training <-  read.csv(file.path('Data/training.csv'))

testing$HeartDisease <- as.factor(testing$HeartDisease)
testing$Sex <- as.factor(testing$Sex)
testing$ChestPainType <- as.factor(testing$ChestPainType)
testing$FastingBS <- as.factor(testing$FastingBS)
testing$RestingECG <- as.factor(testing$RestingECG)
testing$ExerciseAngina <- as.factor(testing$ExerciseAngina)
testing$ST_Slope <- as.factor(testing$ST_Slope)

row.names(testing) <- NULL

training$HeartDisease <- as.factor(training$HeartDisease)
training$Sex <- as.factor(training$Sex)
training$ChestPainType <- as.factor(training$ChestPainType)
training$FastingBS <- as.factor(training$FastingBS)
training$RestingECG <- as.factor(training$RestingECG)
training$ExerciseAngina <- as.factor(training$ExerciseAngina)
training$ST_Slope <- as.factor(training$ST_Slope)
  
row.names(training) <- NULL

In [3]:
head(training)

Unnamed: 0_level_0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
Unnamed: 0_level_1,<int>,<fct>,<fct>,<int>,<int>,<fct>,<fct>,<int>,<fct>,<dbl>,<fct>,<fct>
1,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
2,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
3,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
4,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
5,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
6,39,M,NAP,120,339,0,Normal,170,N,0.0,Up,0


## Logistic Regression

In [4]:
lr.heartdisease <- glm(HeartDisease~., data = training, family = "binomial")
lr.heartdisease


Call:  glm(formula = HeartDisease ~ ., family = "binomial", data = training)

Coefficients:
     (Intercept)               Age              SexM  ChestPainTypeATA  
      -5.2198222         0.0386164         2.0703913        -1.4032914  
ChestPainTypeNAP   ChestPainTypeTA         RestingBP       Cholesterol  
      -1.7038252        -2.6096155        -0.0001980         0.0042979  
      FastingBS1  RestingECGNormal      RestingECGST             MaxHR  
       0.5517711        -0.2558801         0.0139842        -0.0004554  
 ExerciseAnginaY           Oldpeak      ST_SlopeFlat        ST_SlopeUp  
       1.2766455         0.5486903         1.6326244        -0.8596617  

Degrees of Freedom: 526 Total (i.e. Null);  511 Residual
Null Deviance:	    727.7 
Residual Deviance: 312.1 	AIC: 344.1

In [5]:
lr.heartdisease$coefficients

In [6]:
#McFadden's pseudo r-squared - good enough
with(summary(lr.heartdisease), 1 - deviance/null.deviance)

Based on the LR model, it appears that that Sex, ChestPainType, FastingBS, ExerciseAngina, Oldpeak, and ST_Slope are the most significant in predicting whether a person has heart disease. Let's check the model performance metrics.

In [7]:
#Predictions on the train set - for lm
predictTrain =  predict(lr.heartdisease, data=training, type = "response")

# Confusion matrix on test set
table_train = table(training$HeartDisease, predictTrain >= 0.5)
table_train

#Predictions on the test set
predictTest =  predict(lr.heartdisease, newdata = testing, type = "response")

# Confusion matrix on test set
table_test = table(testing$HeartDisease, predictTest >= 0.5)
table_test

   
    FALSE TRUE
  0   251   32
  1    29  215

   
    FALSE TRUE
  0    76   18
  1    14   67

In [22]:
#TRAIN
#Accuracy: out of all the predictions, what percentage is correctly made.
Acc_train = (251+215)/nrow(training)
#Precision: out of all the positive predicted, what percentage is truly positive.
Prec_train = 215/(215+29) 
#Recall: Out of the total positive, what percentage are predicted positive. = TPR
Recall_train = 215/(215+32)
#F1: harmonic mean of precision and recall
F1_train = 2 * (Prec_train * Recall_train) / (Prec_train + Recall_train)

#TEST
#Accuracy: out of all the predictions, what percentage is correctly made.
Acc_test = (76+67)/nrow(testing) 
#Precision: out of all the positive predicted, what percentage is truly positive.
Prec_test = 76/(76+14) 
#Recall: Out of the total positive, what percentage are predicted positive. = TPR
Recall_test = 76/(76+18)
#F1: harmonic mean of precision and recall
F1_test = 2 * (Prec_test * Recall_test) / (Prec_test + Recall_test)

In [23]:
print(paste0('Acc LR Train/Test ', round(Acc_train,2),' ', round(Acc_test,2)))
print(paste0('Precison LR Train/Test ', round(Prec_train,2),' ', round(Prec_test,2)))
print(paste0('Recall LR Train/Test ', round(Recall_train,2),' ', round(Recall_test,2)))
print(paste0('F1 LR Train/Test ', round(F1_train,2),' ', round(F1_test,2)))

[1] "Acc LR Train/Test 0.88 0.82"
[1] "Precison LR Train/Test 0.88 0.84"
[1] "Recall LR Train/Test 0.87 0.81"
[1] "F1 LR Train/Test 0.88 0.83"


## Random Forest

In [8]:
library("randomForest")
set.seed(1313)
rf.heartdisease <-randomForest(HeartDisease~.,data=training, importance=TRUE)
rf.heartdisease

randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.




Call:
 randomForest(formula = HeartDisease ~ ., data = training, importance = TRUE) 
               Type of random forest: classification
                     Number of trees: 500
No. of variables tried at each split: 3

        OOB estimate of  error rate: 13.47%
Confusion matrix:
    0   1 class.error
0 249  34   0.1201413
1  37 207   0.1516393

**Check performance**

In [11]:
#Predictions on the train set
predictTrain =  predict(rf.heartdisease, data=training, type = "prob")

# Confusion matrix on test set
table(training$HeartDisease, predictTrain[,2] >= 0.5)

#Predictions on the test set
predictTest =  predict(rf.heartdisease, newdata = testing, type = "prob")

# Confusion matrix on test set
table(testing$HeartDisease, predictTest[,2] >= 0.5)

   
    FALSE TRUE
  0   249   34
  1    37  207

   
    FALSE TRUE
  0    77   17
  1    13   68

In [12]:
#TRAIN
#Accuracy: out of all the predictions, what percentage is correctly made.
Acc_train = (249+207)/nrow(training)
#Precision: out of all the positive predicted, what percentage is truly positive.
Prec_train = 207/(207+37) 
#Recall: Out of the total positive, what percentage are predicted positive. = TPR
Recall_train = 207/(207+34)
#F1: harmonic mean of precision and recall
F1_train = 2 * (Prec_train * Recall_train) / (Prec_train + Recall_train)

#TEST
#Accuracy: out of all the predictions, what percentage is correctly made.
Acc_test = (77+68)/nrow(testing) 
#Precision: out of all the positive predicted, what percentage is truly positive.
Prec_test = 68/(68+13) 
#Recall: Out of the total positive, what percentage are predicted positive. = TPR
Recall_test = 68/(68+17)
#F1: harmonic mean of precision and recall
F1_test = 2 * (Prec_test * Recall_test) / (Prec_test + Recall_test)

In [13]:
print(paste0('Acc RF Train/Test ', round(Acc_train,2),' ,', round(Acc_test,2)))
print(paste0('Precison RF Train/Test ', round(Prec_train,2),' ,', round(Prec_test,2)))
print(paste0('Recall RF Train/Test ', round(Recall_train,2),', ', round(Recall_test,2)))
print(paste0('F1 RF Train/Test ', round(F1_train,2),' ', round(F1_test,2)))

[1] "Acc RF Train/Test 0.87 ,0.83"
[1] "Precison RF Train/Test 0.85 ,0.84"
[1] "Recall RF Train/Test 0.86, 0.8"
[1] "F1 RF Train/Test 0.85 0.82"


## Support Vector Machine

In [14]:
library(e1071)
svm.heartdisease <- svm(HeartDisease ~. , data = training, type = "C-classification", probability = TRUE)
svm.heartdisease


Call:
svm(formula = HeartDisease ~ ., data = training, type = "C-classification", 
    probability = TRUE)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  radial 
       cost:  1 

Number of Support Vectors:  218


In [16]:
#Predictions on the train set - for svm
predictTrain =  attr(predict(svm.heartdisease, training, probability=TRUE),"probabilities")

# Confusion matrix on test set
table_train = table(training$HeartDisease, predictTrain[,2] >= 0.5)
table_train

#Predictions on the test set
predictTest =  attr(predict(svm.heartdisease, testing, probability=TRUE),"probabilities")

# Confusion matrix on test set
table_test = table(testing$HeartDisease, predictTest[,2] >= 0.5)
table_test

   
    FALSE TRUE
  0   253   30
  1    28  216

   
    FALSE TRUE
  0    78   16
  1    11   70

In [17]:
#TRAIN
#Accuracy: out of all the predictions, what percentage is correctly made.
Acc_train = (253+216)/nrow(training)
#Precision: out of all the positive predicted, what percentage is truly positive.
Prec_train = 216/(216+28) 
#Recall: Out of the total positive, what percentage are predicted positive. = TPR
Recall_train = 216/(216+30)
#F1: harmonic mean of precision and recall
F1_train = 2 * (Prec_train * Recall_train) / (Prec_train + Recall_train)

#TEST
#Accuracy: out of all the predictions, what percentage is correctly made.
Acc_test = (78+70)/nrow(testing) 
#Precision: out of all the positive predicted, what percentage is truly positive.
Prec_test = 70/(70+11) 
#Recall: Out of the total positive, what percentage are predicted positive. = TPR
Recall_test = 70/(70+16)
#F1: harmonic mean of precision and recall
F1_test = 2 * (Prec_test * Recall_test) / (Prec_test + Recall_test)

In [18]:
print(paste0('Acc SVM Train/Test ', round(Acc_train,2),', ', round(Acc_test,2)))
print(paste0('Precison SVM Train/Test ', round(Prec_train,2),' ,', round(Prec_test,2)))
print(paste0('Recall SVM Train/Test ', round(Recall_train,2),' ,', round(Recall_test,2)))
print(paste0('F1 SVM Train/Test ', round(F1_train,2),' ,', round(F1_test,2)))

[1] "Acc SVM Train/Test 0.89, 0.85"
[1] "Precison SVM Train/Test 0.89 ,0.86"
[1] "Recall SVM Train/Test 0.88 ,0.81"
[1] "F1 SVM Train/Test 0.88 ,0.84"


## Predictions

In [4]:
testing[1,]

Unnamed: 0_level_0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
Unnamed: 0_level_1,<int>,<fct>,<fct>,<int>,<int>,<fct>,<fct>,<int>,<fct>,<dbl>,<fct>,<fct>
1,37,M,ASY,140,207,0,Normal,130,Y,1.5,Flat,1


In [5]:
predict(lr.heartdisease,testing[1,], type ="response") # for class 1
predict(rf.heartdisease, testing[1,], type="prob")
attr(predict(svm.heartdisease, testing[1,], probability = TRUE), "probabilities")

Unnamed: 0,0,1
1,0.04,0.96


Unnamed: 0,0,1
1,0.07389532,0.9261047


In [21]:
testing[15,]

Unnamed: 0_level_0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
Unnamed: 0_level_1,<int>,<fct>,<fct>,<int>,<int>,<fct>,<fct>,<int>,<fct>,<dbl>,<fct>,<fct>
15,43,F,NAP,150,254,0,Normal,175,N,0,Up,0


In [6]:
predict(lr.heartdisease,testing[15,], type ="response") # for class 1
predict(rf.heartdisease, testing[15,], type="prob")
attr(predict(svm.heartdisease, testing[15,], probability = TRUE), "probabilities")

Unnamed: 0,0,1
15,0.994,0.006


Unnamed: 0,0,1
15,0.9854863,0.01451368


**Save the models**

In [23]:
saveRDS(lr.heartdisease, "Models/lr.rds")
saveRDS(rf.heartdisease, "Models/rf.rds")
saveRDS(svm.heartdisease, "Models/svm.rds")