In [None]:
# Install the CRAN packages this analysis depends on, but only those that are
# not already available, so re-running the script does not reinstall
# everything each time.
# `grid` is deliberately absent: it ships with base R and cannot be installed
# from CRAN.
required_packages <- c(
  "caret", "e1071", "caTools", "ggplot2", "dplyr", "corrplot", "gridExtra"
)
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

In [None]:
library(ggplot2)
library(grid)
library(lattice)
library(gridExtra)

library(caTools)
library(caret)
library(e1071)

library(dplyr)
library(corrplot)

In [5]:
# Load the raw data from the CSV file next to this script; the first row of
# the file holds the column names.
dataset <- read.csv(file = "./dataset.csv", header = TRUE)

In [6]:
# Quick overview of the loaded data: its dimensions (rows, columns) first,
# then per-column summary statistics.
dim(x = dataset)
summary(object = dataset)

In [7]:
# Drop the columns considered useless for the analysis. Columns that are not
# present in `dataset` are simply ignored.
useless_columns <- c(
  "EmployeeNumber", "StandardHours", "Over18", "EmployeeCount"
)
dataset <- dataset[!(names(dataset) %in% useless_columns)]

In [None]:
# Correlation heat map of the numeric columns.
corr_dataset <- dataset

# Keep only the numeric columns; `cor()` cannot handle anything else.
is_numeric_column <- vapply(corr_dataset, is.numeric, logical(1))
numeric_data <- corr_dataset[is_numeric_column]

cor_matrix <- cor(numeric_data)

# Shrink the plot margins so the (large) matrix fits; adjust if needed.
par(mar = c(2, 2, 2, 2))
corrplot(
  cor_matrix,
  method = "color",
  order = "AOE",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 90,
  number.cex = 0.4
)

In [None]:
# Draw the distribution (histogram with a kernel-density overlay) of every
# numeric column of `filtered_dataset` and arrange the plots on a 4-column
# grid.
#
# Only numeric columns are plotted: `geom_histogram()` needs a continuous x
# aesthetic and errors on character/factor columns.
numeric_columns <- names(filtered_dataset)[
  vapply(filtered_dataset, is.numeric, logical(1))
]

# Build the plots as a *list* of ggplot objects. Appending with `c()` would
# flatten each plot into its internal components.
plot_list <- lapply(numeric_columns, function(column_name) {
  # `.data[[column_name]]` maps the column itself; a bare string inside
  # `aes()` would be treated as a constant value.
  ggplot(filtered_dataset, aes(x = .data[[column_name]])) +
    geom_histogram(
      aes(y = after_stat(density)),
      bins = 30, fill = "skyblue", color = "black", alpha = 0.7
    ) +
    geom_density(color = "red", linewidth = 1) +
    labs(
      title = paste("Distribution of", column_name),
      x = column_name,
      y = "Density"
    )
})

# `grid.arrange()` takes a list of plots through `grobs =`; it cannot lay out
# an empty list, so skip the call when there is nothing to draw.
if (length(plot_list) > 0) {
  grid.arrange(grobs = plot_list, ncol = 4)
}

## Naive Bayes classifier

In [None]:
# Splitting the dataset into the Training set (70%) and Test set (30%)

# `nb_dataset` is otherwise only assigned in a later cell; define it here when
# it does not exist yet so the script also runs top to bottom.
if (!exists("nb_dataset")) {
  nb_dataset <- filtered_dataset
}

set.seed(7)

# `sample.split()` expects the vector of class labels, not the data frame.
# Given a data frame it returns one flag per *column*, which is then silently
# recycled over the rows. Passing the label vector yields one flag per row
# and keeps the class proportions similar in both subsets.
split_ratio <- sample.split(nb_dataset$Attrition, SplitRatio = 0.7)

# `split_ratio` is a logical mask: TRUE rows go to training, the rest to test.
training_set <- nb_dataset[split_ratio, , drop = FALSE]
testing_set <- nb_dataset[!split_ratio, , drop = FALSE]

In [None]:
# Training the Naive Bayes model and predicting the Test set results
nb_model <- naiveBayes(Attrition ~ ., data = training_set)

prediction_results <- predict(nb_model, newdata = testing_set)

# caret's `confusionMatrix()` reads a table as rows = predictions and
# columns = reference (true) classes. Building it the other way round leaves
# the accuracy unchanged but transposes the class-wise statistics
# (sensitivity/specificity, predictive values).
matrix <- table(
  Prediction = prediction_results,
  Reference = testing_set$Attrition
)
cm <- confusionMatrix(matrix)

print(cm)

In [None]:
# Automated test to check the overall accuracy of the model: repeat the
# split / train / predict cycle once per seed and report the mean accuracy
# as a percentage.
nb_dataset <- filtered_dataset

test_seeds <- c(7, 10, 32, 64, 82, 100, 200, 312, 152, 123, 23)

seed_len <- length(test_seeds)
# One accuracy value per seed, preallocated.
seed_accuracies <- numeric(seed_len)

for (seed_index in seq_along(test_seeds)) {
  curr_seed <- test_seeds[seed_index]
  # Seed with the *current* seed. Passing the whole `test_seeds` vector would
  # seed every iteration identically and evaluate the same split each time.
  set.seed(curr_seed)

  # Split on the label vector (one flag per row), not on the data frame.
  split_ratio <- sample.split(nb_dataset$Attrition, SplitRatio = 0.7)

  training_set <- nb_dataset[split_ratio, , drop = FALSE]
  testing_set <- nb_dataset[!split_ratio, , drop = FALSE]

  nb_model <- naiveBayes(Attrition ~ ., data = training_set)

  prediction_results <- predict(nb_model, newdata = testing_set)

  # Rows = predictions, columns = reference, as `confusionMatrix()` expects.
  matrix <- table(
    Prediction = prediction_results,
    Reference = testing_set$Attrition
  )
  cm <- confusionMatrix(matrix)
  seed_accuracies[seed_index] <- cm$overall[["Accuracy"]]
}
overall_accuracy <- mean(seed_accuracies) * 100

# Redraws the correlation heat map with smaller coefficient labels; this plot
# is independent of the accuracy computation above.
corrplot(
  cor_matrix,
  method = "color",
  order = "AOE",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 90,
  number.cex = 0.2
)
print(paste("Overall Accuracy:", sprintf("%.2f%%", overall_accuracy)))