In [50]:
# libraries
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)
install.packages("kknn")
library(kknn)

Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [51]:
#setting seed
set.seed(2020)

In [52]:
# importing in data and adding column names
data <- read_csv("https://raw.githubusercontent.com/BeesKneezz/dsci_100_2023_group_7/main/data/processed.cleveland.data", col_names = FALSE)
colnames(data) <- c("age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "thal", "num")
data

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): X12, X13
[32mdbl[39m (12): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X14

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>
63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,2
67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0,3
57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0,1
38,1,3,138,175,0,0,173,0,0.0,1,?,3.0,0


In [53]:
# filtering / wrangling data
data_wrangled <- data |>  
    #   chol: serum cholestoral in mg/dl
    #   trestbps: resting blood pressure (in mm Hg on admission to the hospital)
    #   thalach: maximum heart rate achieved
    #   num: diagnosis of heart disease (angiographic disease status)
    #    -- Value 0: < 50% diameter narrowing
    #    -- Value 1: > 50% diameter narrowing
    select(chol, trestbps, thalach, num) |>
    mutate(num = as_factor(num))
data_wrangled$num[data_wrangled$num != '0'] <- '1'
data_wrangled <- droplevels(data_wrangled)
# levels(data_wrangled$num) <- c(levels(data_wrangled$num), 'Diagnosed', 'Not diagnosed')
# data_wrangled$num[data_wrangled$num != '0'] <- 'Diagnosed'
# data_wrangled$num[data_wrangled$num == '0'] <- 'Not diagnosed'
data_wrangled

chol,trestbps,thalach,num
<dbl>,<dbl>,<dbl>,<fct>
233,145,150,0
286,160,108,1
229,120,129,1
⋮,⋮,⋮,⋮
131,130,115,1
236,130,174,1
175,138,173,0


In [54]:
# splitting data sets
data_split <- initial_split(data_wrangled, prop = 0.75, strata = num)
data_train <- training(data_split)
data_test <- testing(data_split)

In [55]:
recipe <- recipe(num ~ ., data = data_train) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

model <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("classification") 

vfold <- vfold_cv(data_train, v = 5, strata = num)

fit <- workflow() |>
    add_recipe(recipe) |> 
    add_model(model) |>
    tune_grid(resamples = vfold, grid = tibble(neighbors = seq(from = 1, to = 100, by = 5)))

In [57]:
metrics <- fit |>
    collect_metrics() |>
    filter(.metric == 'accuracy')

tuned_K <-  metrics |>
    filter(mean == max(mean)) |>
    select(neighbors) |>
    pull()

metrics
tuned_K

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
1,accuracy,binary,0.5990426,5,0.02482683,Preprocessor1_Model01
6,accuracy,binary,0.6556653,5,0.02240458,Preprocessor1_Model02
11,accuracy,binary,0.6739438,5,0.01453390,Preprocessor1_Model03
⋮,⋮,⋮,⋮,⋮,⋮,⋮
86,accuracy,binary,0.6477646,5,0.01977668,Preprocessor1_Model18
91,accuracy,binary,0.6479622,5,0.01786999,Preprocessor1_Model19
96,accuracy,binary,0.6612033,5,0.02117171,Preprocessor1_Model20


In [58]:
tuned_model <- nearest_neighbor(weight_func = "rectangular", neighbors = tuned_K) |>
    set_engine("kknn") |>
    set_mode("classification") 

fit <- workflow() |>
    add_recipe(recipe) |> 
    add_model(tuned_model) |>
    fit_resamples(resamples = vfold)

metrics <- fit |>
    collect_metrics() |>
    filter(.metric == 'accuracy')

metrics

.metric,.estimator,mean,n,std_err,.config
<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
accuracy,binary,0.6868863,5,0.01468857,Preprocessor1_Model1
