In [3]:
library(tidyverse)
library(repr)
library(tidymodels)
library(RCurl)
# Limit repr's table/matrix display in notebook output to at most 6 rows
options(repr.matrix.max.rows = 6)
# Seed the random number generator so randomized steps are repeatable
set.seed(1)

── [1mAttaching packages[22m ──────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.0     [32m✔[39m [34mpurrr  [39m 1.0.1
[32m✔[39m [34mtibble [39m 3.1.8     [32m✔[39m [34mdplyr  [39m 1.1.0
[32m✔[39m [34mtidyr  [39m 1.3.0     [32m✔[39m [34mstringr[39m 1.5.0
[32m✔[39m [34mreadr  [39m 2.1.3     [32m✔[39m [34mforcats[39m 1.0.0
── [1mConflicts[22m ─────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
── [1mAttaching packages[22m ─────────────────────────────────────────────────────── tidymodels 1.0.0 ──

[32m✔[39m [34mbroom       [39m 1.0.3     [32m✔[39m [34mrsample     [39m 1.1.1
[32m✔[39m [34mdials       [39m 1.1.0     [32m✔[39m [34mtune        [39m 1.0.1
[32m✔[39m [34minfer       [39m 1.0.4     

In [4]:
# Load the forest dataset from the web into the notebook:
#   1. fetch the raw CSV text with RCurl::getURL()
#   2. parse that text into a data frame with read.csv()
forest_url <- "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-06/forest.csv"
x <- getURL(forest_url)
data <- read.csv(text = x)

In [5]:
# Overview of the original dataset
#tidydata

entity,code,year,net_forest_conversion
<chr>,<chr>,<int>,<dbl>
Algeria,DZA,1990,-8800
Algeria,DZA,2000,33900
Algeria,DZA,2010,7600
⋮,⋮,⋮,⋮
Zimbabwe,ZWE,2000,-46070
Zimbabwe,ZWE,2010,-46070
Zimbabwe,ZWE,2015,-46070


**Table 1.1** - Original dataset

In [None]:
# Tidy the data if needed, only keep relevant columns
#tidy_data

**Table 1.2** - Tidy dataset

In [None]:
# Split the data 75:25 into training and testing sets, stratified on the
# response column.
# NOTE(review): PREDICTED looks like a placeholder for the response column,
# and tidy_data must be created by the tidying cell first - confirm both.

set.seed(1) # fixed seed so the random split is reproducible
split_data <- tidy_data |>
  initial_split(prop = 0.75, strata = PREDICTED)

train_data <- training(split_data) # training set (75%)
test_data <- testing(split_data) # testing set (25%)

In [None]:
# Preliminary data analysis

**Table 1.3** - Preliminary data analysis

In [6]:
# Preliminary data visualization

**Figure 1.4** - Preliminary data visualization

In [None]:
# Building the classifier/regressor

# Model specification: k-nearest neighbours on the "kknn" engine in
# classification mode. `neighbors` is left as tune() so that the value of k
# can be chosen later by cross-validation.
knn_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("classification")

# Preprocessing recipe fitted on the training data: scale, then center,
# every predictor.
# NOTE(review): PREDICTED, PREDICTOR1 and PREDICTOR2 look like placeholders
# for real column names - confirm before running.
data_recipe <- recipe(PREDICTED ~ PREDICTOR1 + PREDICTOR2, data = train_data) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

In [None]:
# Try 16 candidate k values: every odd number from 1 to 31 inclusive
k_vals <- tibble(neighbors = seq(from = 1, to = 31, by = 2))

# 5-fold cross-validation on the training set, stratified on the response
data_vfold <- vfold_cv(train_data, v = 5, strata = PREDICTED)

# Bundle preprocessing and model so they are tuned together
tuning_workflow <- workflow() |>
  add_recipe(data_recipe) |>
  add_model(knn_spec)

# Evaluate every candidate k on every fold and collect the summary metrics
knn_results <- tuning_workflow |>
  tune_grid(resamples = data_vfold, grid = k_vals) |>
  collect_metrics()

# Keep only the accuracy rows (one per candidate k)
accuracies <- filter(knn_results, .metric == "accuracy")

In [None]:
# Cross-validated accuracy for each candidate k, best (highest mean) first
arrange(accuracies, desc(mean))

**Table 2.1** - Accuracy of the different k values

In [None]:
# Plot the cross-validated accuracy estimate against the number of neighbors
accuracy_vs_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  theme(text = element_text(size = 15))

# The assignment above is silent; evaluate the object so the cell actually
# renders the figure.
accuracy_vs_k
     

**Figure 2.2** - Estimated accuracy versus the number of neighbors (k)

In [None]:
# Calculate accuracy of the model using the best k and cross-validation

# Take k from the tuning results rather than hard-coding it: pick the
# neighbors value with the highest mean cross-validated accuracy.
# with_ties = FALSE keeps a single row if several k values tie.
best_k <- accuracies |>
  slice_max(mean, n = 1, with_ties = FALSE) |>
  pull(neighbors)

# Re-specify the model with the chosen k fixed (no tune() placeholder)
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = best_k) |>
  set_engine("kknn") |>
  set_mode("classification")

# Refit on each cross-validation fold with the fixed k
knn_fit <- workflow() |>
  add_recipe(data_recipe) |>
  add_model(knn_spec) |>
  fit_resamples(resamples = data_vfold)

# Summarise the per-fold metrics into one row per metric
accuracy_value <- knn_fit |>
  collect_metrics()

In [None]:
# Display the cross-validated metrics collected from the resampled fits
accuracy_value

**Table 2.3** - Accuracy of the model

In [None]:
# Evaluate the model on the held-out test data

# Fit the full workflow (recipe + model) on the whole training set
knn_fit <- workflow() |>
  add_recipe(data_recipe) |>
  add_model(knn_spec) |>
  fit(train_data)

# Predict the test set and attach the predictions to the test rows so the
# true labels sit beside .pred_class.
# (Previously bound to `match_test`, which is not defined anywhere.)
data_test_predictions <- predict(knn_fit, test_data) |>
  bind_cols(test_data)

# Keep only the accuracy row of the test-set metrics
accuracy_only <- data_test_predictions |>
  metrics(truth = PREDICTED, estimate = .pred_class) |>
  filter(.metric == "accuracy")

# Confusion matrix of true versus predicted classes
confusion <- data_test_predictions |>
  conf_mat(truth = PREDICTED, estimate = .pred_class)

In [None]:
# Display the accuracy row of the metrics computed on the testing data
accuracy_only

**Table 2.4** - Accuracy of model on testing data

In [None]:
# A table of the model's predictions on the testing data
# (name corrected from the misspelled `data_test_predications`)
data_test_predictions

In [None]:
# Display the confusion matrix built with conf_mat() from the test predictions
confusion

In [None]:
# Data visualization 

In [None]:
# Forward Selection?

In [None]:
# Discussion