In [1]:
system("sudo apt install python3-venv python3-pip python3-dev")

In [2]:
# Install and load required packages
install.packages(c("keras", "tensorflow", "ggplot2", "caret"))
library(keras)
library(tensorflow)
library(ggplot2)
library(caret)
library(readxl)
# Install TensorFlow (run once)
install_tensorflow()

Installing packages into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘listenv’, ‘parallelly’, ‘future’, ‘globals’, ‘shape’, ‘future.apply’, ‘numDeriv’, ‘progressr’, ‘SQUAREM’, ‘diagram’, ‘lava’, ‘prodlim’, ‘RcppTOML’, ‘here’, ‘png’, ‘proxy’, ‘iterators’, ‘clock’, ‘gower’, ‘hardhat’, ‘ipred’, ‘sparsevctrs’, ‘timeDate’, ‘reticulate’, ‘tfruns’, ‘zeallot’, ‘config’, ‘tfautograph’, ‘e1071’, ‘foreach’, ‘ModelMetrics’, ‘plyr’, ‘pROC’, ‘recipes’, ‘reshape2’


Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:tensorflow’:

    train




Using Python: /usr/bin/python3.10
Creating virtual environment 'r-tensorflow' ... 


+ /usr/bin/python3.10 -m venv /root/.virtualenvs/r-tensorflow



Done!
Installing packages: pip, wheel, setuptools


+ /root/.virtualenvs/r-tensorflow/bin/python -m pip install --upgrade pip wheel setuptools



Virtual environment 'r-tensorflow' successfully created.
Using virtual environment 'r-tensorflow' ...


+ /root/.virtualenvs/r-tensorflow/bin/python -m pip install --upgrade --no-user 'tensorflow==2.16.*'




Installation complete.



In [3]:
# Load your data
df <- read_excel('final.xlsx')
df <- df[2:17]  # Select relevant columns

In [4]:

# Convert target to factor and numeric
df$mq_type <- as.factor(df$mq_type)
y <- as.numeric(df$mq_type) - 1  # Convert to 0,1,2 for deep_mq, impact_mq, shallow_mq

In [5]:
# Remove constant columns (like Arrival Time)
constant_cols <- sapply(df, function(x) length(unique(x)) == 1)
df <- df[, !constant_cols]

In [6]:
# Split data into training and test sets
set.seed(123)
train_index <- createDataPartition(y, p = 0.8, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

In [7]:
# Scale numeric features (excluding the target variable)
preprocess_params <- preProcess(train_data[, -which(names(train_data) == "mq_type")],
                              method = c("center", "scale"))
train_scaled <- predict(preprocess_params, train_data[, -which(names(train_data) == "mq_type")])
test_scaled <- predict(preprocess_params, test_data[, -which(names(test_data) == "mq_type")])


In [8]:
# Convert to matrices for keras
x_train <- as.matrix(train_scaled)
x_test <- as.matrix(test_scaled)


In [9]:
library(caret)
dummy <- dummyVars(" ~ .", data = data.frame(mq_type = train_data$mq_type))
y_train <- predict(dummy, newdata = data.frame(mq_type = train_data$mq_type))
y_test <- predict(dummy, newdata = data.frame(mq_type = test_data$mq_type))


In [10]:
colnames(df)

In [11]:
colnames(df)
colnames(df) <- gsub(" ", "_", colnames(df))
colnames(df) <- gsub("/", "_", colnames(df))
colnames(df)

In [12]:
# Install if not already installed
install.packages("randomForest")
library(randomForest)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

randomForest 4.7-1.2

Type rfNews() to see new features/changes/bug fixes.


Attaching package: ‘randomForest’


The following object is masked from ‘package:ggplot2’:

    margin




In [13]:
# Prepare data
df$mq_type <- as.factor(df$mq_type)  # Make sure target is a factor
df <- df[, !sapply(df, function(x) length(unique(x)) == 1)]  # Remove constant columns

In [14]:
# Split into train and test
set.seed(123)
train_idx <- sample(1:nrow(df), 0.8 * nrow(df))
train_data <- df[train_idx, ]
test_data <- df[-train_idx, ]

In [15]:
# Train a simple Random Forest
model_rf <- randomForest(mq_type ~ ., data = train_data, ntree = 100)

In [16]:
# Predict
pred <- predict(model_rf, newdata = test_data)

In [17]:
# Confusion matrix
confusion <- table(Predicted = pred, Actual = test_data$mq_type)
print(confusion)

            Actual
Predicted    deep_mq impact_mq shallow_mq
  deep_mq          0         0          0
  impact_mq        1        14          0
  shallow_mq       0         0          0


In [18]:

# Accuracy
accuracy <- sum(diag(confusion)) / sum(confusion)
cat("Accuracy:", accuracy, "\n")

Accuracy: 0.9333333 


In [20]:
# Check class distribution in your training data
table(train_data$mq_type)


   deep_mq  impact_mq shallow_mq 
         8         49          2 

In [25]:
install.packages("smotefamily")
library(smotefamily)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘FNN’, ‘dbscan’, ‘igraph’




In [26]:
# Prepare data: Features (X) and Target (y)
X <- train_data[, -which(names(train_data) == "mq_type")]
y <- as.numeric(train_data$mq_type)  # Convert factor to numeric (1,2,3)

In [31]:
# 1. Check class sizes first
class_counts <- table(train_data$mq_type)
print(class_counts)


   deep_mq  impact_mq shallow_mq 
         8         49          2 


In [32]:
# 2. Find the smallest class size
min_class_size <- min(class_counts)

# 3. Set K adaptively (must be < smallest class size)
safe_K <- min(5, min_class_size - 1)  # Ensures K < min_class_size


In [33]:
# 5. Apply SMOTE with safe parameters
smote_result <- SMOTE(
  X = X,
  target = y,
  K = safe_K,           # Dynamic K based on smallest class
  dup_size = 1          # How many synthetic samples per real sample
)


In [34]:
# 6. Convert back to original format
smote_train <- data.frame(smote_result$data)
smote_train$mq_type <- factor(
  smote_train$class,
  levels = 1:length(levels(train_data$mq_type)),
  labels = levels(train_data$mq_type)
)
smote_train$class <- NULL

# Verify new distribution
table(smote_train$mq_type)


   deep_mq  impact_mq shallow_mq 
         8         49          4 

In [35]:
# 2. Train Random Forest on SMOTE-balanced data
set.seed(123)
model_rf <- randomForest(
  mq_type ~ .,
  data = smote_train,
  ntree = 100,
  importance = TRUE  # To check variable importance later
)

In [36]:
# 3. Predict on original (unmodified) test set
pred <- predict(model_rf, newdata = test_data)


In [37]:
# 4. Evaluate performance
conf_matrix <- table(Predicted = pred, Actual = test_data$mq_type)
cat("\nConfusion Matrix:\n")
print(conf_matrix)


Confusion Matrix:
            Actual
Predicted    deep_mq impact_mq shallow_mq
  deep_mq          0         0          0
  impact_mq        1        14          0
  shallow_mq       0         0          0


In [38]:
metrics <- confusionMatrix(pred, test_data$mq_type)
cat("\nOverall Accuracy:", metrics$overall["Accuracy"], "\n")
cat("\nClass-wise Metrics:\n")
print(metrics$byClass)



Overall Accuracy: 0.9333333 

Class-wise Metrics:
                  Sensitivity Specificity Pos Pred Value Neg Pred Value
Class: deep_mq              0           1            NaN      0.9333333
Class: impact_mq            1           0      0.9333333            NaN
Class: shallow_mq          NA           1             NA             NA
                  Precision Recall        F1 Prevalence Detection Rate
Class: deep_mq           NA      0        NA 0.06666667      0.0000000
Class: impact_mq  0.9333333      1 0.9655172 0.93333333      0.9333333
Class: shallow_mq        NA     NA        NA 0.00000000      0.0000000
                  Detection Prevalence Balanced Accuracy
Class: deep_mq                       0               0.5
Class: impact_mq                     1               0.5
Class: shallow_mq                    0                NA
