In [19]:
library(tidyverse)
library(tidymodels)
library(hms)

In [20]:
# Read the per-minute session counts, express each timestamp as minutes since
# midnight, and keep only the rows that land on a ten-minute mark.
session_df_by_minute_in_minute <- read_csv("session_df_by_minute.csv") |>
  mutate(
    hms = as_hms(tinterval),                    # clock time of the timestamp
    as_minutes = 60 * hour(hms) + minute(hms),  # minutes since midnight
    minutes = minute(hms)                       # minute within the hour
  ) |>
  filter(minutes %in% seq(0, 50, by = 10)) |>
  select(as_minutes, player_count)

[1mRows: [22m[34m249013[39m [1mColumns: [22m[34m2[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[32mdbl[39m  (1): player_count
[34mdttm[39m (1): tinterval

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [21]:
# Seed the RNG so the random train/test partition below is reproducible.
set.seed(69420)

# 60% of the rows go to training and the rest to testing, stratified on the
# response.
sessions_split <- initial_split(
  session_df_by_minute_in_minute,
  prop = 0.6,
  strata = player_count
)
sessions_training <- training(sessions_split)
sessions_testing <- testing(sessions_split)

# K-nearest-neighbours regression with equal ("rectangular") neighbour weights;
# the number of neighbours is left as a tuning parameter.
sessions_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("regression")

# Predict player_count from time of day; the predictor is scaled and then
# centred using statistics estimated from the training set.
sessions_recipe <- recipe(player_count ~ as_minutes, data = sessions_training) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Bundle preprocessing and model so they are tuned and fitted together.
sessions_workflow <- workflow() |>
  add_recipe(sessions_recipe) |>
  add_model(sessions_spec)



In [27]:
# Five-fold cross-validation on the training set, stratified on the response.
sessions_vfold <- vfold_cv(sessions_training, v = 5, strata = player_count)

# Candidate neighbourhood sizes: every integer K from 1 to 200.
gridvals <- tibble(neighbors = seq(1, 200, by = 1))

# Evaluate the workflow for every candidate K across the folds and collect the
# per-K summary metrics.
training_results <- sessions_workflow |>
  tune_grid(resamples = sessions_vfold, grid = gridvals) |>
  collect_metrics()

# K with the lowest cross-validated RMSE. `with_ties = FALSE` guarantees a
# single value even when several K share the minimum; a plain
# `filter(mean == min(mean))` would return every tied K and hand a vector to
# `nearest_neighbor(neighbors = ...)` downstream.
training_min <- training_results |>
  filter(.metric == "rmse") |>
  slice_min(mean, n = 1, with_ties = FALSE) |>
  pull(neighbors)

In [28]:
# Model specification with the number of neighbours fixed at the value chosen
# by cross-validation.
session_best_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = training_min
) |>
  set_engine("kknn") |>
  set_mode("regression")

# Refit the preprocessing recipe and the tuned model on the full training set.
session_fit <- workflow() |>
  add_recipe(sessions_recipe) |>
  add_model(session_best_spec) |>
  fit(data = sessions_training)

In [35]:
# Score the fitted workflow on the held-out test set: attach the predictions
# to the test rows and compare them with the observed player counts.
session_summary <- predict(session_fit, sessions_testing) |>
  bind_cols(sessions_testing) |>
  metrics(truth = player_count, estimate = .pred)

In [36]:
# Display the test-set performance metrics computed above.
session_summary

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
rmse,standard,0.6603468
rsq,standard,0.1184708
mae,standard,0.4233886


In [37]:
# Test-set predictions paired with the time of day (as_minutes) of each row.
time_prediction <- predict(session_fit, sessions_testing) |>
  bind_cols(sessions_testing) |>
  select(as_minutes, .pred)

# `plot_time_prediction` was referenced without ever being created, which
# fails with "object 'plot_time_prediction' not found". Build the plot first:
# predicted player count as a function of time of day.
plot_time_prediction <- time_prediction |>
  ggplot(aes(x = as_minutes, y = .pred)) +
  geom_line() +
  labs(
    x = "Time of day (minutes since midnight)",
    y = "Predicted player count"
  )

plot_time_prediction

In [None]:
Methods and Plan

Why is this method appropriate?

Which assumptions are required, if any, to apply the method selected?

What are the potential limitations or weaknesses of the method selected?

How are you going to compare and select the model?

How are you going to process the data to apply the model? For example: Are you splitting the data? How? How many splits? What proportions will you use for the splits? At what stage will you split? Will there be a validation set? Will you use cross validation?
