In [None]:
# Load required libraries
# Install the necessary packages if they are not already installed
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("randomForest")) install.packages("randomForest")
if (!require("caret")) install.packages("caret")
if (!require("pROC")) install.packages("pROC")
if (!require("openxlsx")) install.packages("openxlsx")
if (!require("DMwR2")) install.packages("DMwR2")
if (!require("smotefamily")) install.packages("smotefamily")

In [None]:
# Load required libraries
library(tidyverse)
library(randomForest)
library(caret)
library(pROC)
library(openxlsx)
library(DMwR2)  # For SMOTE

In [None]:
library(smotefamily)

In [41]:
# Step 1: Data Preprocessing
# Load the datasets
case_data <- read.csv("PC_case.csv")
control_data <- read.csv("PC_control.csv")
genes_data <- read.csv("geneList.csv")


# Assign column names
colnames(case_data) <- c("chromosome", "start", "end", "variation", "patient_id")
colnames(control_data) <- c("chromosome", "start", "end", "variation", "patient_id")
colnames(genes_data) <- c("gene_id", "placeholder", "chromosome", "gene_start", "gene_end")

In [42]:
# Combine case and control datasets
case_data$Group <- "Case"
control_data$Group <- "Control"
cnv_data <- rbind(case_data, control_data)

In [43]:
# Feature engineering: Create CNV_Length
cnv_data$CNV_Length <- cnv_data$end - cnv_data$start

In [44]:
# Convert necessary columns to factors
cnv_data$chromosome <- as.factor(cnv_data$chromosome)
cnv_data$variation <- as.factor(cnv_data$variation)
cnv_data$Group <- as.factor(cnv_data$Group)

In [45]:
# Encode target variable
cnv_data$Target <- ifelse(cnv_data$Group == "Case", 1, 0)

In [46]:
# One-hot encoding for categorical features
dummy_vars <- dummyVars(~ chromosome + variation, data = cnv_data)
encoded_features <- as.data.frame(predict(dummy_vars, newdata = cnv_data))

In [47]:
# Combine features and target
final_data <- cbind(encoded_features, CNV_Length = cnv_data$CNV_Length, Target = cnv_data$Target)

In [48]:
# Split data into training and testing sets
set.seed(42)
train_index <- createDataPartition(final_data$Target, p = 0.7, list = FALSE)
train_data <- final_data[train_index, ]
test_data <- final_data[-train_index, ]

In [50]:
# Balance the training data using SMOTE from smotefamily
smote_train <- smotefamily::SMOTE(
  train_data[, -which(names(train_data) == "Target")],
  train_data$Target,
  K = 5,
  dup_size = 50
)

# Structure the output to match what you expect
smote_train <- as.data.frame(smote_train$data)
colnames(smote_train)[ncol(smote_train)] <- "Target" # Rename the last column to "Target"

In [51]:
# Train Random Forest model with hyperparameter tuning
rf_grid <- expand.grid(mtry = c(2, 5, 10), ntree = c(100, 200, 300), maxnodes = c(10, 20, 30))
best_model <- NULL
best_auc <- 0

In [52]:
# Ensure the Target variable is a factor for classification
smote_train$Target <- as.factor(smote_train$Target)
test_data$Target <- as.factor(test_data$Target)

In [53]:
# Convert Target to numeric only for AUC
test_data_numeric_target <- as.numeric(as.character(test_data$Target))

In [None]:
# for (i in 1:nrow(rf_grid)) {
#   rf_model <- randomForest(
#     Target ~ .,
#     data = smote_train,
#     mtry = rf_grid$mtry[i],
#     ntree = rf_grid$ntree[i],
#     maxnodes = rf_grid$maxnodes[i]
#   )

#   # Predict on test set
#   pred_probs <- predict(rf_model, test_data, type = "prob")[, 2]
#   auc <- auc(roc(test_data$Target, pred_probs))

#   if (auc > best_auc) {
#     best_auc <- auc
#     best_model <- rf_model
#   }
# }

best_auc <- 0
best_model <- NULL

for (i in 1:nrow(rf_grid)) {
  rf_model <- randomForest(
    Target ~ .,
    data = smote_train,
    mtry = rf_grid$mtry[i],
    ntree = rf_grid$ntree[i],
    maxnodes = rf_grid$maxnodes[i]
  )

  # Predict probabilities on the test set
  pred_probs <- predict(rf_model, test_data, type = "prob")[, 2]
  auc <- auc(roc(as.numeric(as.character(test_data$Target)), pred_probs))

  # Save the best model
  if (auc > best_auc) {
    best_auc <- auc
    best_model <- rf_model
  }
}

cat("Best AUC-ROC:", best_auc, "\n")

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases

Setting levels: control = 0, case = 1

Setting direction: controls < cases



In [None]:
# Evaluate the best model
pred <- predict(best_model, test_data, type = "class")
conf_matrix <- confusionMatrix(as.factor(pred), as.factor(test_data$Target))
roc_curve <- roc(test_data$Target, predict(best_model, test_data, type = "prob")[, 2])

In [None]:
# Print results
print(conf_matrix)
cat("AUC-ROC:", auc(roc_curve), "\n")

In [None]:
# Feature importance
importance <- data.frame(Feature = names(best_model$importance), Importance = best_model$importance)
importance <- importance[order(-importance$Importance), ]

In [None]:
# Plot ROC Curve
plot(roc_curve, main = "ROC Curve for Random Forest Model", col = "blue", lwd = 2)

In [None]:
# Save results to Excel
write.xlsx(
  list(
    "Confusion Matrix" = as.data.frame(conf_matrix$table),
    "Feature Importance" = importance,
    "ROC AUC" = data.frame(AUC = auc(roc_curve))
  ),
  file = "CNV_Analysis_Results_ML.xlsx"
)

cat("Results saved to 'CNV_Analysis_Results_ML.xlsx'\n")