## Installing required packages

In [None]:
# Packages used throughout this notebook.
packages <- c("caret", "randomForest", "e1071", "kernlab", "class")

# Install only the packages that are not available yet, so re-running this
# cell does not download and rebuild everything again.
missing_packages <- packages[
  !vapply(packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages, dependencies = TRUE)
}

# Attach every package. invisible() hides the list returned by lapply(),
# which would otherwise be shown as the cell's result.
invisible(lapply(packages, library, character.only = TRUE))

# Fix the RNG seed so the sampling, partitioning and resampling below are
# repeatable when the notebook is run top to bottom.
set.seed(123)

Installing packages into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘prodlim’, ‘recipes’, ‘ipred’, ‘themis’


Loading required package: ggplot2

Loading required package: lattice

randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:ggplot2’:

    margin



Attaching package: ‘kernlab’


The following object is masked from ‘package:ggplot2’:

    alpha




## Read the data

In [None]:
# Load the dataset from its fixed location and report that it was read.
data <- read.csv(file = "/content/CardiacPrediction.csv")
print("Data loaded successfully.")

# The outcome is used as a class label below, so store it as a factor.
data[["CoronaryHeartDisease"]] <- factor(data[["CoronaryHeartDisease"]])

[1] "Data loaded successfully."


## Equalizing the classes

In [None]:
# Split the rows by outcome class.
class_1 <- subset(data, CoronaryHeartDisease == 1)
class_0 <- subset(data, CoronaryHeartDisease == 0)

# Down-sample class 0 to the size of class 1. The size is derived from the
# data instead of being hard-coded, so the cell keeps producing balanced
# classes if the input file changes. min() guards against sample() failing
# when class 0 has fewer rows than class 1 (all of class 0 is kept then).
n_per_class <- min(nrow(class_1), nrow(class_0))
class_0_sample <- class_0[sample(nrow(class_0), n_per_class), ]

# Stack both classes and shuffle the row order.
balanced_data <- rbind(class_1, class_0_sample)
balanced_data <- balanced_data[sample(nrow(balanced_data)), ]

## Splitting the data

In [None]:
# Draw a 75% partition on the outcome column; the selected rows form the
# training set and the remaining rows form the test set.
train_index <- createDataPartition(
  y = balanced_data$CoronaryHeartDisease,
  p = 0.75,
  list = FALSE
)
train_data <- balanced_data[train_index, ]
test_data <- balanced_data[-train_index, ]

## Cross Validation

In [None]:
# Resampling scheme shared by every train() call below: 10-fold
# cross-validation.
ctrl <- trainControl(
  method = "cv",
  number = 10
)

## Training and Evaluation

### Logistic Regression

In [None]:
# Fit caret's "glm" method on all columns of the training partition,
# selecting on cross-validated accuracy.
model <- train(
  CoronaryHeartDisease ~ .,
  data = train_data,
  method = "glm",
  trControl = ctrl,
  metric = "Accuracy"
)

In [None]:
# Predict the held-out rows with the model fitted above and cross-tabulate
# the predicted labels against the true outcome.
# NOTE(review): no `positive` argument is given, so the first factor level
# ("0") is reported as the positive class -- confirm this is intended.
predictions <- predict(object = model, newdata = test_data)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 275  70
         1 102 307
                                          
               Accuracy : 0.7719          
                 95% CI : (0.7402, 0.8014)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.5438          
                                          
 Mcnemar's Test P-Value : 0.01809         
                                          
            Sensitivity : 0.7294          
            Specificity : 0.8143          
         Pos Pred Value : 0.7971          
         Neg Pred Value : 0.7506          
             Prevalence : 0.5000          
         Detection Rate : 0.3647          
   Detection Prevalence : 0.4576          
      Balanced Accuracy : 0.7719          
                                          
       'Positive' Class : 0               
                              

### XGBoost

In [None]:
# The "xgbTree" method used below needs the xgboost package. Install it only
# when it is not already available, so re-running the cell is cheap.
if (!requireNamespace("xgboost", quietly = TRUE)) {
  install.packages("xgboost")
}

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



In [None]:
# Fit caret's "xgbTree" method on all columns of the training partition,
# selecting on cross-validated accuracy.
model <- train(
  CoronaryHeartDisease ~ .,
  data = train_data,
  method = "xgbTree",
  trControl = ctrl,
  metric = "Accuracy"
)



In [None]:
# Score the test partition with the current model, then compare predicted
# and observed classes.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = test_data)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 279  65
         1  98 312
                                          
               Accuracy : 0.7838          
                 95% CI : (0.7527, 0.8127)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.5676          
                                          
 Mcnemar's Test P-Value : 0.0122          
                                          
            Sensitivity : 0.7401          
            Specificity : 0.8276          
         Pos Pred Value : 0.8110          
         Neg Pred Value : 0.7610          
             Prevalence : 0.5000          
         Detection Rate : 0.3700          
   Detection Prevalence : 0.4562          
      Balanced Accuracy : 0.7838          
                                          
       'Positive' Class : 0               
                              

### SVM

In [None]:
# Fit caret's "svmRadial" method on all columns of the training partition,
# selecting on cross-validated accuracy.
model <- train(
  CoronaryHeartDisease ~ .,
  data = train_data,
  method = "svmRadial",
  trControl = ctrl,
  metric = "Accuracy"
)

In [None]:
# Predict the test partition and tabulate predictions against the truth.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = test_data)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 277  62
         1 100 315
                                         
               Accuracy : 0.7851         
                 95% CI : (0.7541, 0.814)
    No Information Rate : 0.5            
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.5703         
                                         
 Mcnemar's Test P-Value : 0.003649       
                                         
            Sensitivity : 0.7347         
            Specificity : 0.8355         
         Pos Pred Value : 0.8171         
         Neg Pred Value : 0.7590         
             Prevalence : 0.5000         
         Detection Rate : 0.3674         
   Detection Prevalence : 0.4496         
      Balanced Accuracy : 0.7851         
                                         
       'Positive' Class : 0              
                                         

### Random Forest

In [None]:
# Fit caret's "rf" method on all columns of the training partition,
# selecting on cross-validated accuracy.
model <- train(
  CoronaryHeartDisease ~ .,
  data = train_data,
  method = "rf",
  trControl = ctrl,
  metric = "Accuracy"
)

In [None]:
# Predict the test partition and tabulate predictions against the truth.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = test_data)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 262  56
         1 115 321
                                          
               Accuracy : 0.7732          
                 95% CI : (0.7416, 0.8026)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.5464          
                                          
 Mcnemar's Test P-Value : 9.191e-06       
                                          
            Sensitivity : 0.6950          
            Specificity : 0.8515          
         Pos Pred Value : 0.8239          
         Neg Pred Value : 0.7362          
             Prevalence : 0.5000          
         Detection Rate : 0.3475          
   Detection Prevalence : 0.4218          
      Balanced Accuracy : 0.7732          
                                          
       'Positive' Class : 0               
                              

### KNN

In [None]:
# k-nearest neighbours classifies by distance between rows, so predictors
# measured on large numeric scales would otherwise dominate the neighbour
# search. Centre and scale the predictors; caret estimates the transformation
# inside each resampling fold and re-applies it when predict() is called on
# the fitted model.
model <- train(
  CoronaryHeartDisease ~ .,
  data = train_data,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = ctrl,
  metric = "Accuracy"
)

In [None]:
# Predict the test partition and tabulate predictions against the truth.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = test_data)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 211 176
         1 166 201
                                          
               Accuracy : 0.5464          
                 95% CI : (0.5101, 0.5824)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : 0.005963        
                                          
                  Kappa : 0.0928          
                                          
 Mcnemar's Test P-Value : 0.626496        
                                          
            Sensitivity : 0.5597          
            Specificity : 0.5332          
         Pos Pred Value : 0.5452          
         Neg Pred Value : 0.5477          
             Prevalence : 0.5000          
         Detection Rate : 0.2798          
   Detection Prevalence : 0.5133          
      Balanced Accuracy : 0.5464          
                                          
       'Positive' Class : 0               
                              

## Feature Selection (Mean Decrease Gini Impurity)

In [None]:
# Random forest (100 trees, 2 candidate variables per split) fitted on the
# training partition; its importance scores drive the selections below.
rf_model <- randomForest(
  CoronaryHeartDisease ~ .,
  data = train_data,
  ntree = 100,
  mtry = 2
)

In [None]:
# Pull the per-variable importance scores out of the fitted forest.
variable_importance <- importance(rf_model)
variable_importance_df <- as.data.frame(variable_importance)

# Keep the variables whose mean decrease in Gini exceeds 25, leaving out the
# SEQN column.
above_cutoff <- variable_importance_df$MeanDecreaseGini > 25
important_vars <- rownames(variable_importance_df)[above_cutoff]
important_vars <- important_vars[!(important_vars %in% "SEQN")]

# Restrict both partitions to the outcome plus the retained predictors.
keep_cols <- c("CoronaryHeartDisease", important_vars)
train_data_rf <- train_data[, keep_cols]
test_data_rf <- test_data[, keep_cols]

# train_data_rf

## Re-Training the models

In [None]:
# Make sure the outcome is a factor in both reduced partitions.
train_data_rf[["CoronaryHeartDisease"]] <-
  as.factor(train_data_rf[["CoronaryHeartDisease"]])
test_data_rf[["CoronaryHeartDisease"]] <-
  as.factor(test_data_rf[["CoronaryHeartDisease"]])

### Logistic Regression

In [None]:
# Refit caret's "glm" method on the reduced training columns.
model <- train(
  CoronaryHeartDisease ~ .,
  data = train_data_rf,
  method = "glm",
  trControl = ctrl,
  metric = "Accuracy"
)

In [None]:
# Predict the reduced test partition and tabulate against the truth.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = test_data_rf)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data_rf[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 273  68
         1 104 309
                                          
               Accuracy : 0.7719          
                 95% CI : (0.7402, 0.8014)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.5438          
                                          
 Mcnemar's Test P-Value : 0.007614        
                                          
            Sensitivity : 0.7241          
            Specificity : 0.8196          
         Pos Pred Value : 0.8006          
         Neg Pred Value : 0.7482          
             Prevalence : 0.5000          
         Detection Rate : 0.3621          
   Detection Prevalence : 0.4523          
      Balanced Accuracy : 0.7719          
                                          
       'Positive' Class : 0               
                              

### SVM

In [None]:
# Refit caret's "svmRadial" method on the reduced training columns.
model <- train(
  CoronaryHeartDisease ~ .,
  data = train_data_rf,
  method = "svmRadial",
  trControl = ctrl,
  metric = "Accuracy"
)

In [None]:
# Predict the reduced test partition and tabulate against the truth.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = test_data_rf)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data_rf[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 257  59
         1 120 318
                                          
               Accuracy : 0.7626          
                 95% CI : (0.7306, 0.7926)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.5252          
                                          
 Mcnemar's Test P-Value : 7.305e-06       
                                          
            Sensitivity : 0.6817          
            Specificity : 0.8435          
         Pos Pred Value : 0.8133          
         Neg Pred Value : 0.7260          
             Prevalence : 0.5000          
         Detection Rate : 0.3408          
   Detection Prevalence : 0.4191          
      Balanced Accuracy : 0.7626          
                                          
       'Positive' Class : 0               
                              

### Random Forest

In [None]:
# Refit caret's "rf" method on the reduced training columns.
model <- train(
  CoronaryHeartDisease ~ .,
  data = train_data_rf,
  method = "rf",
  trControl = ctrl,
  metric = "Accuracy"
)

In [None]:
# Predict the reduced test partition and tabulate against the truth.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = test_data_rf)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data_rf[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 276  68
         1 101 309
                                          
               Accuracy : 0.7759          
                 95% CI : (0.7444, 0.8052)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.5517          
                                          
 Mcnemar's Test P-Value : 0.01383         
                                          
            Sensitivity : 0.7321          
            Specificity : 0.8196          
         Pos Pred Value : 0.8023          
         Neg Pred Value : 0.7537          
             Prevalence : 0.5000          
         Detection Rate : 0.3660          
   Detection Prevalence : 0.4562          
      Balanced Accuracy : 0.7759          
                                          
       'Positive' Class : 0               
                              

## Selecting more variables

In [None]:
# Importance scores from the forest fitted earlier, as a data frame.
variable_importance <- importance(rf_model)
variable_importance_df <- as.data.frame(variable_importance)

# Lower the cutoff to a mean decrease in Gini above 20; SEQN is still
# excluded.
passes_cutoff <- variable_importance_df$MeanDecreaseGini > 20
candidate_vars <- rownames(variable_importance_df)[passes_cutoff]
important_vars <- candidate_vars[!(candidate_vars %in% "SEQN")]

# Outcome plus retained predictors, for both partitions.
selected_cols <- c("CoronaryHeartDisease", important_vars)
train_data_rf <- train_data[, selected_cols]
test_data_rf <- test_data[, selected_cols]

# train_data_rf

### Logistic Regression

In [None]:
# Refit caret's "glm" method on the current reduced training columns.
model <- train(
  CoronaryHeartDisease ~ .,
  data = train_data_rf,
  method = "glm",
  trControl = ctrl,
  metric = "Accuracy"
)

# Evaluate on the matching reduced test partition.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = test_data_rf)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data_rf[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 275  70
         1 102 307
                                          
               Accuracy : 0.7719          
                 95% CI : (0.7402, 0.8014)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.5438          
                                          
 Mcnemar's Test P-Value : 0.01809         
                                          
            Sensitivity : 0.7294          
            Specificity : 0.8143          
         Pos Pred Value : 0.7971          
         Neg Pred Value : 0.7506          
             Prevalence : 0.5000          
         Detection Rate : 0.3647          
   Detection Prevalence : 0.4576          
      Balanced Accuracy : 0.7719          
                                          
       'Positive' Class : 0               
                              

### SVM

In [None]:
# Refit caret's "svmRadial" method on the current reduced training columns.
model <- train(
  CoronaryHeartDisease ~ .,
  data = train_data_rf,
  method = "svmRadial",
  trControl = ctrl,
  metric = "Accuracy"
)

# Evaluate on the matching reduced test partition.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = test_data_rf)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data_rf[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 263  68
         1 114 309
                                          
               Accuracy : 0.7586          
                 95% CI : (0.7264, 0.7888)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.5172          
                                          
 Mcnemar's Test P-Value : 0.0008511       
                                          
            Sensitivity : 0.6976          
            Specificity : 0.8196          
         Pos Pred Value : 0.7946          
         Neg Pred Value : 0.7305          
             Prevalence : 0.5000          
         Detection Rate : 0.3488          
   Detection Prevalence : 0.4390          
      Balanced Accuracy : 0.7586          
                                          
       'Positive' Class : 0               
                              

## Selecting almost all the variables

In [None]:
# Importance scores from the forest fitted earlier, as a data frame.
variable_importance <- importance(rf_model)
variable_importance_df <- as.data.frame(variable_importance)

# Cutoff lowered again: mean decrease in Gini above 15, SEQN excluded.
is_important <- variable_importance_df$MeanDecreaseGini > 15
important_vars <- rownames(variable_importance_df)[is_important]
important_vars <- important_vars[!is.element(important_vars, "SEQN")]

# Outcome plus retained predictors, for both partitions.
wanted_cols <- c("CoronaryHeartDisease", important_vars)
train_data_rf <- train_data[, wanted_cols]
test_data_rf <- test_data[, wanted_cols]

# train_data_rf

### SVM

In [None]:
# Refit caret's "svmRadial" method on the current reduced training columns.
model <- train(
  CoronaryHeartDisease ~ .,
  data = train_data_rf,
  method = "svmRadial",
  trControl = ctrl,
  metric = "Accuracy"
)

# Evaluate on the matching reduced test partition.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = test_data_rf)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data_rf[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 263  58
         1 114 319
                                          
               Accuracy : 0.7719          
                 95% CI : (0.7402, 0.8014)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.5438          
                                          
 Mcnemar's Test P-Value : 2.744e-05       
                                          
            Sensitivity : 0.6976          
            Specificity : 0.8462          
         Pos Pred Value : 0.8193          
         Neg Pred Value : 0.7367          
             Prevalence : 0.5000          
         Detection Rate : 0.3488          
   Detection Prevalence : 0.4257          
      Balanced Accuracy : 0.7719          
                                          
       'Positive' Class : 0               
                              

### LR

In [None]:
# Refit caret's "glm" method on the current reduced training columns.
model <- train(
  CoronaryHeartDisease ~ .,
  data = train_data_rf,
  method = "glm",
  trControl = ctrl,
  metric = "Accuracy"
)

# Evaluate on the matching reduced test partition.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = test_data_rf)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data_rf[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 276  73
         1 101 304
                                          
               Accuracy : 0.7692          
                 95% CI : (0.7375, 0.7989)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.5385          
                                          
 Mcnemar's Test P-Value : 0.04067         
                                          
            Sensitivity : 0.7321          
            Specificity : 0.8064          
         Pos Pred Value : 0.7908          
         Neg Pred Value : 0.7506          
             Prevalence : 0.5000          
         Detection Rate : 0.3660          
   Detection Prevalence : 0.4629          
      Balanced Accuracy : 0.7692          
                                          
       'Positive' Class : 0               
                              

## Feature Selection (Lasso Regression)

In [None]:
# glmnet provides cv.glmnet(), used for the lasso fit below. Install it only
# when it is not already available, then attach it.
if (!requireNamespace("glmnet", quietly = TRUE)) {
  install.packages("glmnet")
}
library(glmnet)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Loading required package: Matrix

Loaded glmnet 4.1-8



In [None]:
# Numeric coding of the outcome factor (1/2). Kept for backward
# compatibility; the fit below uses the factor itself.
train_data_y_numeric <- as.numeric(train_data$CoronaryHeartDisease)

# Candidate predictors: every column except the outcome and SEQN, which the
# importance-based selections in this notebook also exclude.
predictor_names <- setdiff(
  names(train_data),
  c("CoronaryHeartDisease", "SEQN")
)

# Build the design matrix from the predictors only. Writing the response as
# `train_data$CoronaryHeartDisease ~ .` leaves the outcome column inside `.`,
# which leaks the label into the predictors.
design <- model.matrix(
  ~ .,
  data = train_data[, predictor_names, drop = FALSE]
)

# The "assign" attribute maps each design column to its source term
# (0 = intercept). Drop the intercept column because glmnet adds its own.
term_of_column <- attr(design, "assign")
is_predictor_col <- term_of_column != 0
x <- design[, is_predictor_col, drop = FALSE]
term_of_column <- term_of_column[is_predictor_col]

# The outcome is a two-level class label, so fit a lasso-penalised logistic
# regression (alpha = 1) instead of the default gaussian family.
lasso_model <- cv.glmnet(
  x,
  train_data$CoronaryHeartDisease,
  family = "binomial",
  alpha = 1
)

# Find the optimal lambda value
best_lambda <- lasso_model$lambda.min

# Extract selected features. The first coefficient is glmnet's intercept;
# the rest line up one-to-one with the columns of `x`.
selected_features <- coef(lasso_model, s = best_lambda)
is_nonzero <- selected_features[-1] != 0

# Map the surviving design columns back to data-frame columns by name, not
# by position, so a factor that expands to several dummy columns still
# selects the right source column.
selected_vars <- predictor_names[unique(term_of_column[is_nonzero])]
if (length(selected_vars) == 0) {
  stop("Lasso selected no predictors at lambda.min.", call. = FALSE)
}

# Reduce dimensionality. The outcome column is always kept because the
# models below are fitted with `CoronaryHeartDisease ~ .`.
reduced_cols <- c("CoronaryHeartDisease", selected_vars)
reduced_train_data <- train_data[, reduced_cols, drop = FALSE]
reduced_test_data <- test_data[, reduced_cols, drop = FALSE]
# reduced_test_data

## Re-Training the models

### LR

In [None]:
# Fit caret's "glm" method on the lasso-reduced training columns.
model <- train(
  CoronaryHeartDisease ~ .,
  data = reduced_train_data,
  method = "glm",
  trControl = ctrl,
  metric = "Accuracy"
)

# Evaluate on the lasso-reduced test partition.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = reduced_test_data)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = reduced_test_data[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 281  77
         1  96 300
                                          
               Accuracy : 0.7706          
                 95% CI : (0.7389, 0.8001)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.5411          
                                          
 Mcnemar's Test P-Value : 0.1712          
                                          
            Sensitivity : 0.7454          
            Specificity : 0.7958          
         Pos Pred Value : 0.7849          
         Neg Pred Value : 0.7576          
             Prevalence : 0.5000          
         Detection Rate : 0.3727          
   Detection Prevalence : 0.4748          
      Balanced Accuracy : 0.7706          
                                          
       'Positive' Class : 0               
                              

### SVM

In [None]:
# Fit caret's "svmRadial" method on the lasso-reduced training columns.
model <- train(
  CoronaryHeartDisease ~ .,
  data = reduced_train_data,
  method = "svmRadial",
  trControl = ctrl,
  metric = "Accuracy"
)

# Evaluate on the lasso-reduced test partition.
# NOTE(review): the positive class defaults to the first level ("0").
predictions <- predict(object = model, newdata = reduced_test_data)
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = reduced_test_data[["CoronaryHeartDisease"]]
)
conf_matrix

Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 262  66
         1 115 311
                                        
               Accuracy : 0.7599        
                 95% CI : (0.7278, 0.79)
    No Information Rate : 0.5           
    P-Value [Acc > NIR] : < 2e-16       
                                        
                  Kappa : 0.5199        
                                        
 Mcnemar's Test P-Value : 0.00036       
                                        
            Sensitivity : 0.6950        
            Specificity : 0.8249        
         Pos Pred Value : 0.7988        
         Neg Pred Value : 0.7300        
             Prevalence : 0.5000        
         Detection Rate : 0.3475        
   Detection Prevalence : 0.4350        
      Balanced Accuracy : 0.7599        
                                        
       'Positive' Class : 0             
                                        