In [4]:
# loading packages
library(repr)
library(tidyverse)
library(tidymodels)
# library(themis)
# set.seed(31)

stroke <- read_csv("data/stroke-data.csv")
stroke_clean <- stroke %>%
    select(age, hypertension, heart_disease, avg_glucose_level, bmi, stroke) %>%
    mutate(stroke = as_factor(stroke),
           bmi = as.numeric(bmi)) 

Parsed with column specification:
cols(
  id = [32mcol_double()[39m,
  gender = [31mcol_character()[39m,
  age = [32mcol_double()[39m,
  hypertension = [32mcol_double()[39m,
  heart_disease = [32mcol_double()[39m,
  ever_married = [31mcol_character()[39m,
  work_type = [31mcol_character()[39m,
  Residence_type = [31mcol_character()[39m,
  avg_glucose_level = [32mcol_double()[39m,
  bmi = [31mcol_character()[39m,
  smoking_status = [31mcol_character()[39m,
  stroke = [32mcol_double()[39m
)

“Problem with `mutate()` input `bmi`.
[34mℹ[39m NAs introduced by coercion
[34mℹ[39m Input `bmi` is `as.numeric(bmi)`.”
“NAs introduced by coercion”


In [11]:
stroke_split <- initial_split(stroke_clean, prop = 0.75, strata = stroke)
stroke_train <- training(stroke_split)
stroke_test <- testing(stroke_split)

In [12]:
for (i in 1:10) {
    up_recipe <- recipe(stroke ~ ., data = stroke_train) %>%
        step_scale(all_predictors()) %>%
        step_center(all_predictors()) %>%
        step_upsample(stroke, over_ratio = i * 0.1, skip = FALSE) %>%
        prep()
    upsampled_stroke <- bake(up_recipe, stroke_train)
    
    stroke_recipe <- recipe(stroke ~ avg_glucose_level + age + hypertension, data = upsampled_stroke) %>%
      step_scale(all_predictors()) %>%
      step_center(all_predictors())
    
    tune_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
      set_engine("kknn") %>%
      set_mode("classification")
    
    knn_results <- workflow() %>%
      add_recipe(stroke_recipe) %>%
      add_model(knn_spec) %>%
      tune_grid(resamples = stroke_vfold, grid = k_vals) %>%
      collect_metrics() %>% # Plot accuracies vs neighbours
      filter(.metric == "accuracy") %>%
      filter(mean == max(mean)) %>%
      slice(1) %>%
      pull(neighbors)
    
    kmin <- 
    
    knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = kmin) %>%
      set_engine("kknn") %>%
      set_mode("classification")

    # workflow + fit data
    knn_fit <- workflow() %>%
      add_recipe(stroke_recipe) %>%
      add_model(knn_spec) %>%
      fit(data = upsampled_stroke)

    stroke_test_predictions1 <- predict(knn_fit, stroke_test) %>%
      bind_cols(stroke_test) %>%
      metrics(truth = stroke, estimate = .pred_class)
    print(i)
    print(stroke_test_predictions1)
}

[1] 1
[90m# A tibble: 2 x 3[39m
  .metric  .estimator .estimate
  [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m          [3m[90m<dbl>[39m[23m
[90m1[39m accuracy binary         0.955
[90m2[39m kap      binary         0    
[1] 2
[90m# A tibble: 2 x 3[39m
  .metric  .estimator .estimate
  [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m          [3m[90m<dbl>[39m[23m
[90m1[39m accuracy binary       0.115  
[90m2[39m kap      binary       0.007[4m0[24m[4m6[24m
[1] 3
[90m# A tibble: 2 x 3[39m
  .metric  .estimator .estimate
  [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m          [3m[90m<dbl>[39m[23m
[90m1[39m accuracy binary        0.044[4m6[24m
[90m2[39m kap      binary        0     
[1] 4
[90m# A tibble: 2 x 3[39m
  .metric  .estimator .estimate
  [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m          [3m[90m<dbl>[39m[23m
[90m1[39m accuracy binary        0.044[4m6[24m
[90m2[39m kap      binary        0     
[1] 5
[