In [None]:
# Load every R package this analysis depends on.
# kknn is the engine used by the k-nearest-neighbours model further down.
# It is installed only when it is missing, so re-running the notebook does not
# reinstall it (and does not need network access) every time.

if (!requireNamespace("kknn", quietly = TRUE)) {
  install.packages("kknn")
}

library(kknn)
library(tidyverse)
library(tidymodels)
library(repr)
library(dplyr)

In [None]:
# Read the sonar data set from its hosted CSV file and keep it in sonar_data
# The data set was originally downloaded from:
# "https://www.kaggle.com/datasets/armanakbari/connectionist-bench-sonar-mines-vs-rocks"

sonar_url <- "https://raw.githubusercontent.com/DannyPirouz/DSCI_100_Project/main/sonar.all-data.csv"

sonar_data <- sonar_url |>
  read_csv()

In [None]:
# Count the missing values across the whole table (0 means nothing is missing)

sonar_data |>
  is.na() |>
  sum()

In [None]:
# Turn the "Label" column into a factor and give its levels readable names:
# "R" becomes "Rock" and "M" becomes "Mine"

sonar_data <- sonar_data |>
  mutate(Label = fct_recode(as.factor(Label), "Rock" = "R", "Mine" = "M"))

sonar_data |>
  head(5)
# After this step the data is wrangled and clean

In [None]:
# Tally the "Rock" and "Mine" observations in the entire data set, together
# with the percentage of all rows that each label accounts for

count_table <- sonar_data |>
  group_by(Label) |>
  summarize(Count = n(), .groups = "drop") |>
  mutate(Percentage = 100 * Count / nrow(sonar_data))

count_table
# It is good that the percentages are close to 50%

In [None]:
# Split the data: 75% of the rows form the training set, the rest the testing set
# initial_split() picks rows at random, so the seed is fixed first to make the
# split reproducible; strata = Label requests a split stratified by label
set.seed(1)

sonar_split <- sonar_data |>
  initial_split(prop = 0.75, strata = Label)

sonar_train <- sonar_split |> training()
sonar_test <- sonar_split |> testing()

sonar_train |>
  head(5)

In [None]:
# Tally the "Rock" and "Mine" observations in the training data, together
# with the percentage of training rows that each label accounts for

label_proportions_table <- sonar_train |>
  group_by(Label) |>
  summarize(Count = n(), .groups = "drop") |>
  mutate(Percentage = 100 * Count / nrow(sonar_train))

label_proportions_table

# We notice that our "Label" proportions were preserved when we split the data 

In [None]:
# Mean of every frequency column over the training set, reshaped to one row
# per frequency with the columns Frequency_Number and Mean_Frequency_Value
# (the bar plot further down relies on these two column names).
# summarize(across()) is used instead of the superseded purrr::map_df(); it
# yields the same one-row table of column means and matches the grouped
# summary computed for each Label.

freq_means_per_column <- sonar_train |>
  summarize(across(Freq_1:Freq_60, mean)) |>
  pivot_longer(
    cols = Freq_1:Freq_60,
    names_to = "Frequency_Number",
    values_to = "Mean_Frequency_Value"
  )

head(freq_means_per_column, 5)

In [None]:
# Mean of every frequency column within each "Label", reshaped so that every
# row holds one (Label, frequency) pair and its mean value

freq_means_grouped_by_label <- sonar_train |>
  group_by(Label) |>
  summarize(across(Freq_1:Freq_60, \(x) mean(x, na.rm = TRUE))) |>
  pivot_longer(
    cols = Freq_1:Freq_60,
    names_to = "Frequency Number",
    values_to = "Mean Frequency Value"
  )

# Show the first and the last five rows of the reshaped table
freq_means_grouped_by_label |> head(5)
freq_means_grouped_by_label |> tail(5)

In [None]:
# Maximum of every frequency column over the training set, reshaped to one
# row per frequency.
# The value column is named "Maximum Frequency Value"; the previous name began
# with a stray space, which made the column awkward to refer to.
# summarize(across()) is used instead of the superseded purrr::map_df().

freq_max_per_column <- sonar_train |>
  summarize(across(Freq_1:Freq_60, max)) |>
  pivot_longer(
    cols = Freq_1:Freq_60,
    names_to = "Frequency Number",
    values_to = "Maximum Frequency Value"
  )

head(freq_max_per_column, 5)

In [None]:
# Minimum of every frequency column over the training set, reshaped to one
# row per frequency.
# The value column is named "Minimum Frequency Value"; the previous name began
# with a stray space, which made the column awkward to refer to.
# summarize(across()) is used instead of the superseded purrr::map_df().

freq_min_per_column <- sonar_train |>
  summarize(across(Freq_1:Freq_60, min)) |>
  pivot_longer(
    cols = Freq_1:Freq_60,
    names_to = "Frequency Number",
    values_to = "Minimum Frequency Value"
  )

head(freq_min_per_column, 5)

In [None]:
# Histograms of selected frequencies, one panel per label, to see what the
# distributions look like

# The data set does not state the units of the energy represented by each
# frequency, so the axis labels carry no units

hist_18 <- sonar_train |>
  ggplot(aes(x = Freq_18, fill = Label)) +
  geom_histogram(binwidth = 0.04, position = "identity") +
  facet_grid(rows = vars(Label)) +
  labs(
    title = "Distribution of Frequency 18 Labeled as Mine or Rock",
    x = "Frequency 18",
    y = "Count",
    fill = "Label"
  ) +
  theme(text = element_text(size = 12))

hist_18

# These two histograms have very similar distributions as they both have their modes at around 0.025. 
# Since the distributions are very similar, it is very unlikely that this frequency would be a driving force.

In [None]:
# Histogram of Frequency 12 in the training set, one panel per label
hist_12 <- sonar_train |>
  ggplot(aes(x = Freq_12, fill = Label)) +
  geom_histogram(binwidth = 0.04, position = "identity") +
  facet_grid(rows = vars(Label)) +
  labs(
    title = "Distribution of Frequency 12 Labeled as Mine or Rock",
    x = "Frequency 12",
    y = "Count",
    fill = "Label"
  ) +
  theme(text = element_text(size = 12))

hist_12

# These two histograms have different distributions. 
# The mine distribution resembles a bell-shape with the average around 0.3.
# The rock distribution is very right-skewed with a mode at around 0.1.
# Since the distributions are different, this suggests that this frequency could be a driving factor.

In [None]:
# Scatter plots of pairs of frequencies come next, to look for groupings or
# patterns
# The data is standardized for those plots: a recipe scales and then centers
# every predictor, is trained on the training set with prep(), and is then
# applied to that same training set with bake()

set.seed(3)

sonar_train_recipe <- prep(
  recipe(Label ~ ., data = sonar_train) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())
)

scaled_sonar_train <- sonar_train_recipe |>
  bake(new_data = sonar_train)

In [None]:
# Standardized Frequency 1 against standardized Frequency 2, coloured by label
scatterplot_1_and_2 <- scaled_sonar_train |>
  ggplot(aes(x = Freq_1, y = Freq_2, colour = Label)) +
  geom_point() +
  labs(
    title = "Frequency 1 vs Frequency 2",
    x = "Frequency 1 (standardized)",
    y = "Frequency 2 (standardized)"
  ) +
  theme(text = element_text(size = 12))

scatterplot_1_and_2

# The plot looks to have a positive relationship with the mines dominating the higher frequency values. 
# As Frequency 1 increases, Frequency 2 also increases.
# The relationship doesn't seem to be very strong, as the data points are fairly spread out.

In [None]:
# Standardized Frequency 28 against standardized Frequency 39, coloured by label
scatterplot_28_and_39 <- scaled_sonar_train |>
  ggplot(aes(x = Freq_28, y = Freq_39, colour = Label)) +
  geom_point() +
  labs(
    title = "Frequency 28 vs Frequency 39",
    x = "Frequency 28 (standardized)",
    y = "Frequency 39 (standardized)"
  ) +
  theme(text = element_text(size = 12))

scatterplot_28_and_39

# There is no relationship between these two frequencies. The data points are all over the place.
# This suggests that these two frequencies have nothing to do with each other. 

In [None]:
# Bar plot of the average value of every frequency in the training set
# The bars are ordered by their mean value via fct_reorder()

sonar_bar_mean <- freq_means_per_column |>
  ggplot(aes(
    x = Mean_Frequency_Value,
    y = fct_reorder(Frequency_Number, Mean_Frequency_Value, .desc = TRUE)
  )) +
  geom_bar(stat = "identity") +
  labs(
    title = "Average Frequency Value per Frequency",
    x = "Frequency Value",
    y = "Frequency Number"
  ) +
  theme(text = element_text(size = 12))

sonar_bar_mean

# We see that frequency 60 has the lowest average frequency close to 0 and frequency 26 has the highest just above 0.7.
# Not many of the frequencies have similar frequency averages. 

In [None]:
# The data has already been split into a training set and a testing set.
# This cell tunes the number of neighbours (k) with cross-validation on the
# training set.

set.seed(3)

# Untrained preprocessing recipe for the workflow.
# A workflow expects a recipe that has NOT been through prep(): it trains the
# recipe itself on the data it is fitted to. sonar_train_recipe was already
# prepped for the scatter plots, so the same steps are declared again here
# without prep().
knn_recipe <- recipe(Label ~ ., data = sonar_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Model specification; tune() marks the number of neighbours as the value
# to search over
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")

# Cross-validation folds: 5 folds, stratified by Label
number_vfold <- vfold_cv(sonar_train, v = 5, strata = Label)

# The range of k values being tested: every integer from 1 to 25
k_vals <- tibble(neighbors = seq(from = 1, to = 25, by = 1))

# Put everything in a workflow, evaluate every k on the folds and collect
# the resulting metrics
knn_fit <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(knn_spec) |>
  tune_grid(resamples = number_vfold, grid = k_vals) |>
  collect_metrics()

# Keep only the accuracy rows
accuracy <- knn_fit |>
  filter(.metric == "accuracy")

# Plot the mean cross-validation accuracy against k to find the best k
cross_val_plot <- ggplot(accuracy, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbours", y = "Accuracy")

cross_val_plot

# In the authors' run the accuracy was highest at k = 3 and k = 4.
# k = 3 is used because there are 2 labels, so an odd k cannot give an even
# split of the neighbours' votes.

head(accuracy, 5)

# The highest accuracy reported for that run is 86.35618%

In [None]:
set.seed(4)

# With k = 3 chosen, build the final workflow and fit it on the training set

# Untrained preprocessing recipe for the workflow.
# A workflow expects a recipe that has NOT been through prep(): it trains the
# recipe itself when fit() is called. sonar_train_recipe was already prepped
# for the scatter plots, so the same steps are declared again here without
# prep().
knn_recipe <- recipe(Label ~ ., data = sonar_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

new_knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 3) |>
  set_engine("kknn") |>
  set_mode("classification")

new_knn_fit <- workflow() |>
  add_recipe(knn_recipe) |>
  add_model(new_knn_spec) |>
  fit(data = sonar_train)

# Predict the testing set and attach the predictions to the testing rows

sonar_test_predictions <- predict(new_knn_fit, sonar_test) |>
  bind_cols(sonar_test)

head(sonar_test_predictions, 5)
tail(sonar_test_predictions, 5)

In [None]:
set.seed(4)

# Accuracy of the predictions on the testing set

sonar_metrics <- metrics(
  sonar_test_predictions,
  truth = Label,
  estimate = .pred_class
) |>
  filter(.metric == "accuracy")

sonar_metrics

# Confusion matrix of the true labels against the predicted labels

sonar_conf_mat <- conf_mat(
  sonar_test_predictions,
  truth = Label,
  estimate = .pred_class
)

sonar_conf_mat

# We see that we have an accuracy of ~85% and that rocks are more likely
# to be predicted as mines than mines are to be predicted as rocks.

In [None]:
set.seed(4)

# Classify one observation whose label is unknown
# Its 60 measurements are listed in order (Freq_1 first, Freq_60 last), named
# Freq_1 ... Freq_60 and turned into a one-row tibble

unknown_label <- c(
  0.029293548, 0.038125161, 0.043919355, 0.053570968, 0.073985161,
  0.107596129, 0.123939355, 0.135519355, 0.182131613, 0.210321290,
  0.239072258, 0.252205161, 0.269441935, 0.287030968, 0.307488387,
  0.368141935, 0.409109677, 0.442067097, 0.493798710, 0.567036774,
  0.629974839, 0.650916774, 0.668149032, 0.692389677, 0.706011613,
  0.731724516, 0.728425806, 0.714849677, 0.656634839, 0.586917419,
  0.498092903, 0.438041290, 0.410323226, 0.399729032, 0.392028387,
  0.383221290, 0.355252903, 0.328312258, 0.315471613, 0.297972903,
  0.273301290, 0.261722581, 0.240483871, 0.210916774, 0.194581935,
  0.158237419, 0.118338065, 0.088425806, 0.050437419, 0.020080645,
  0.015726452, 0.013165161, 0.010220645, 0.010878065, 0.008923871,
  0.007916774, 0.007650323, 0.007411613, 0.007715484, 0.006610968
) |>
  setNames(paste0("Freq_", seq_len(60))) |>
  as_tibble_row()

predict(new_knn_fit, new_data = unknown_label)