In [1]:
#loading data and libraries
library(tidyverse)
library(repr)
library(tidymodels)
# Cap the number of rows shown when a table is displayed in the notebook.
options(repr.matrix.max.rows = 6)

# Download the tennis data set into the working directory, then parse it.
# show_col_types = FALSE suppresses readr's column-specification message.
download.file(
  url = "https://drive.google.com/uc?export=download&id=1fOQ8sy_qMkQiQEAO6uFdRX4tLI8EpSTn",
  destfile = "tennis_data.csv"
)
tennis_data <- read_csv(file = "tennis_data.csv", show_col_types = FALSE)
# tennis_data  # uncomment to preview the raw table

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.6     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.0.0 ──

[32m✔[39m [34mbroom       [39m 1.0.0     [32m✔[39m [34mrsample     [39m 1.0.0
[32m✔[39m [34mdials       [39m 1.0.0     [32m✔[39m [34mtune        [39m 1.0.0
[32m✔[39m [34minfer       [39m 1.0.2     [32m✔[39m [34mworkflows   [39m 1.0.0
[32m✔

In [2]:
# Data preparation: build one table per match outcome (winner / loser) and
# split each into 75% training and 25% testing rows.
set.seed(2132)

# Keep only the columns of interest; drop every row that has a missing value.
winner_data <- tennis_data |>
  select(winner_age, winner_ht, winner_rank_points, minutes) |>
  na.omit()
loser_data <- tennis_data |>
  select(loser_age, loser_ht, loser_rank_points, minutes) |>
  na.omit()

# Winner split, stratified on the response (ranking points).
# NOTE: this split must stay ahead of the loser split so the seeded random
# draws happen in the same order.
winner_split <- initial_split(
  winner_data,
  prop = 0.75,
  strata = winner_rank_points
)
winner_train <- training(winner_split)
winner_test <- testing(winner_split)
# winner_train  # uncomment to preview

# Loser split, stratified on the response (ranking points).
loser_split <- initial_split(
  loser_data,
  prop = 0.75,
  strata = loser_rank_points
)
loser_train <- training(loser_split)
loser_test <- testing(loser_split)
# loser_train  # uncomment to preview

In [3]:
# Tune the number of neighbours (k) for the winner k-NN regression, which
# predicts winner_rank_points from winner_ht.
# Recorded result: best k = 540, v = 5 folds, mean RMSE = 2078.592
set.seed(5342)

# k-NN regression model; `neighbors` is left as a tuning parameter.
win_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("regression")

# Preprocessing: scale, then center, every predictor.
win_recipe <- recipe(winner_rank_points ~ winner_ht, data = winner_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

win_workflow <- workflow() |>
  add_recipe(win_recipe) |>
  add_model(win_spec)

# 5-fold cross-validation on the training rows, stratified on the response.
win_vfold <- vfold_cv(winner_train, v = 5, strata = winner_rank_points)

# Candidate k values: every whole number from 535 to 545.
gridvals <- tibble(neighbors = seq(from = 535, to = 545, by = 1))

# Fit each candidate k across the folds and keep only the RMSE rows.
win_results <- tune_grid(
  win_workflow,
  resamples = win_vfold,
  grid = gridvals
) |>
  collect_metrics() |>
  filter(.metric == "rmse")

# Row(s) whose cross-validated mean RMSE is the smallest.
win_min <- win_results |>
  filter(mean == min(mean))
win_min

# Mean RMSE plotted against k, to inspect the choice visually.
k_win_selec <- ggplot(
  data = win_results,
  mapping = aes(x = neighbors, y = mean)
) +
  geom_point() +
  geom_line()
k_win_selec

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
540,rmse,standard,2078.592,5,70.83819,Preprocessor1_Model06


In [4]:
# Tune the number of neighbours (k) for the loser k-NN regression, which
# predicts loser_rank_points from loser_ht.
# Recorded result: best k = 384, v = 5 folds, mean RMSE = 1323.389
set.seed(4892)

# k-NN regression model; `neighbors` is left as a tuning parameter.
lose_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("regression")

# Preprocessing: scale, then center, every predictor.
lose_recipe <- recipe(loser_rank_points ~ loser_ht, data = loser_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

lose_workflow <- workflow() |>
  add_recipe(lose_recipe) |>
  add_model(lose_spec)

# 5-fold cross-validation on the training rows, stratified on the response.
lose_vfold <- vfold_cv(loser_train, v = 5, strata = loser_rank_points)

# Candidate k values: every whole number from 370 to 390.
gridvals <- tibble(neighbors = seq(from = 370, to = 390, by = 1))

# Fit each candidate k across the folds and keep only the RMSE rows.
lose_results <- tune_grid(
  lose_workflow,
  resamples = lose_vfold,
  grid = gridvals
) |>
  collect_metrics() |>
  filter(.metric == "rmse")

# Row(s) whose cross-validated mean RMSE is the smallest.
lose_min <- lose_results |>
  filter(mean == min(mean))
lose_min

# Mean RMSE plotted against k, to inspect the choice visually.
k_lose_selec <- ggplot(
  data = lose_results,
  mapping = aes(x = neighbors, y = mean)
) +
  geom_point() +
  geom_line()
k_lose_selec

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
384,rmse,standard,1323.389,5,100.9358,Preprocessor1_Model15
