# 

In [37]:
# Data import and wrangling helpers used below (read_csv, select, drop_na, ...).
library(tidyverse)
# Notebook display support; provides the repr.* options set below.
library(repr)
# Modelling toolkit used below (data splitting, recipes, workflows, tuning, metrics).
library(tidymodels)
# Limit how many rows of a data frame are shown in notebook output.
options(repr.matrix.max.rows = 6)

In [7]:
# Remote locations of the two raw CSV files: one row per player, and one row
# per play session (start/end times keyed by hashedEmail).
PLAYERS_DATA_URL <- "https://raw.githubusercontent.com/Bumblet3a/dsci-individual-project/refs/heads/main/players.csv"
SESSION_DATA_URL <- "https://raw.githubusercontent.com/Bumblet3a/dsci-individual-project/refs/heads/main/sessions.csv"

# Save local copies in the working directory, then parse them from disk.
download.file(PLAYERS_DATA_URL, "players.csv")
download.file(SESSION_DATA_URL, "sessions.csv")

players <- read_csv("players.csv")
head(players)
sessions <- read_csv("sessions.csv")

[1mRows: [22m[34m196[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, Age
[33mlgl[39m (1): subscribe

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


experience,subscribe,hashedEmail,played_hours,name,gender,Age
<chr>,<lgl>,<chr>,<dbl>,<chr>,<chr>,<dbl>
Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,Male,9
Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9,3.8,Christian,Male,17
Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28,0.0,Blake,Male,17
Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4fa7a5a659ff443a0eb5,0.7,Flora,Female,21
Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb0af4d48fcce2420f3e,0.1,Kylie,Male,21
Amateur,True,f58aad5996a435f16b0284a3b267f973f9af99e7a89bee0430055a44fa92f977,0.0,Adrian,Female,17


[1mRows: [22m[34m1535[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (3): hashedEmail, start_time, end_time
[32mdbl[39m (2): original_start_time, original_end_time

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [62]:
# Full outer join of the session log and the player table, reduced to the
# columns used for modelling, with incomplete rows removed.
#
# The join key is spelled out so the result cannot silently change if the two
# tables ever gain another column with a shared name.
#
# NOTE(review): all four retained columns come from `players`, so the join
# contributes nothing except repeating each player's row once per session.
# Those duplicates can land on both sides of a later train/test split and
# inflate the test metrics -- confirm this weighting is intended; otherwise
# build the modelling table from `players` alone.
player_sessions <- merge(sessions, players, by = "hashedEmail", all = TRUE) |>
  select(Age, experience, subscribe, played_hours) |>
  drop_na(Age:played_hours)
player_sessions

Age,experience,subscribe,played_hours
<dbl>,<chr>,<lgl>,<dbl>
20,Regular,TRUE,1.5
20,Regular,TRUE,1.5
17,Veteran,TRUE,0.0
⋮,⋮,⋮,⋮
23,Amateur,TRUE,56.1
17,Amateur,TRUE,0.1
20,Beginner,TRUE,1.1


In [73]:
# Tune the number of neighbours for a K-nearest-neighbours regression of
# played_hours on the remaining columns, using 5-fold cross-validation.

# The split and the fold assignment are random; fix the seed so the reported
# metrics and the selected K are reproducible. The value itself is arbitrary.
set.seed(2024)

# 75/25 train/test split, stratified on the outcome.
play_sessions_split <- initial_split(
  player_sessions,
  prop = 0.75,
  strata = played_hours
)
play_sessions_training <- training(play_sessions_split)
play_sessions_testing <- testing(play_sessions_split)

# Unweighted ("rectangular") KNN; `neighbors` is left open for tuning.
play_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("regression")

# All other columns are predictors; only Age is scaled and centred.
play_recipe <- recipe(played_hours ~ ., data = play_sessions_training) |>
  step_scale(Age) |>
  step_center(Age)
play_recipe

# Cross-validation folds are drawn from the training set only.
play_vfold <- vfold_cv(play_sessions_training, v = 5, strata = played_hours)


play_workflow <- workflow() |>
  add_recipe(play_recipe) |>
  add_model(play_spec)

# Candidate values of K: 1, 11, 21, ..., 81.
gridvals <- tibble(neighbors = seq(from = 1, to = 81, by = 10))

# One row per (K, metric) with the mean and standard error across folds.
play_results <- play_workflow |>
  tune_grid(resamples = play_vfold, grid = gridvals) |>
  collect_metrics()

play_results



[36m──[39m [1mRecipe[22m [36m──────────────────────────────────────────────────────────────────────[39m



── Inputs 

Number of variables by role

outcome:   1
predictor: 3



── Operations 

[36m•[39m Scaling for: [34mAge[39m

[36m•[39m Centering for: [34mAge[39m



neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
1,rmse,standard,109.20611,5,1.79856,Preprocessor1_Model1
1,rsq,standard,0.09432,5,0.01801,Preprocessor1_Model1
11,rmse,standard,85.51235,5,2.49178,Preprocessor1_Model2
⋮,⋮,⋮,⋮,⋮,⋮,⋮
71,rsq,standard,0.7433,5,0.02769,Preprocessor1_Model8
81,rmse,standard,42.6988,5,2.34702,Preprocessor1_Model9
81,rsq,standard,0.7376,5,0.02789,Preprocessor1_Model9


In [79]:
# Pick the tuning result with the lowest cross-validated RMSE.
# `with_ties = FALSE` guarantees exactly one row even when several values of K
# share the minimum, so the `neighbors` value pulled from it is a single number.
play_min <- play_results |>
  filter(.metric == "rmse") |>
  slice_min(mean, n = 1, with_ties = FALSE)
play_min

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
61,rmse,standard,41.87,5,2.184,Preprocessor1_Model7


In [87]:
# Refit the KNN regression with the selected K on the full training set and
# report RMSE, R-squared and MAE on the held-out test set.

# Named `best_k` rather than `min` so the base function min() is not shadowed.
best_k <- play_min |>
  pull(neighbors)

# Same model family as during tuning, now with the number of neighbours fixed.
play_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = best_k
) |>
  set_engine("kknn") |>
  set_mode("regression")

play_fit <- workflow() |>
  add_recipe(play_recipe) |>
  add_model(play_spec) |>
  fit(data = play_sessions_training)

# Attach predictions to the test rows, then compare them with the observed
# played_hours.
play_summ <- play_fit |>
  predict(play_sessions_testing) |>
  bind_cols(play_sessions_testing) |>
  metrics(truth = played_hours, estimate = .pred)
play_summ

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
rmse,standard,35.5625
rsq,standard,0.8346
mae,standard,20.248
