# 

In [1]:
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39

In [2]:
PLAYERS_DATA_URL = "https://raw.githubusercontent.com/Bumblet3a/dsci-individual-project/refs/heads/main/players.csv"
SESSION_DATA_URL = "https://raw.githubusercontent.com/Bumblet3a/dsci-individual-project/refs/heads/main/sessions.csv"

download.file(PLAYERS_DATA_URL, "players.csv")
download.file(SESSION_DATA_URL, "sessions.csv")

players <- read_csv("players.csv")
head(players)

[1mRows: [22m[34m196[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, Age
[33mlgl[39m (1): subscribe

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


experience,subscribe,hashedEmail,played_hours,name,gender,Age
<chr>,<lgl>,<chr>,<dbl>,<chr>,<chr>,<dbl>
Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,Male,9
Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9,3.8,Christian,Male,17
Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28,0.0,Blake,Male,17
Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4fa7a5a659ff443a0eb5,0.7,Flora,Female,21
Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb0af4d48fcce2420f3e,0.1,Kylie,Male,21
Amateur,True,f58aad5996a435f16b0284a3b267f973f9af99e7a89bee0430055a44fa92f977,0.0,Adrian,Female,17


In [3]:
players_filtered <- select(players, Age, experience, played_hours, subscribe)|>
mutate(subscribe = as_factor(subscribe))|>
drop_na(Age)
players_filtered

Age,experience,played_hours,subscribe
<dbl>,<chr>,<dbl>,<fct>
9,Pro,30.3,TRUE
17,Veteran,3.8,TRUE
17,Veteran,0.0,FALSE
⋮,⋮,⋮,⋮
22,Veteran,0.3,FALSE
17,Amateur,0.0,FALSE
17,Amateur,2.3,FALSE


In [4]:
players_scaled <- players_filtered|>
mutate(Age = scale(Age, center = TRUE),
       played_hours = scale(played_hours, center = TRUE))
players_scaled

Age,experience,played_hours,subscribe
"<dbl[,1]>",<chr>,"<dbl[,1]>",<fct>
-1.8657878,Pro,0.85603962,TRUE
-0.5701714,Veteran,-0.07385234,TRUE
-0.5701714,Veteran,-0.20719534,FALSE
⋮,⋮,⋮,⋮
0.2395889,Veteran,-0.1966683,FALSE
-0.5701714,Amateur,-0.2071953,FALSE
-0.5701714,Amateur,-0.1264877,FALSE


In [5]:

players_split <- initial_split(players_scaled, prop = 0.75, strata = subscribe)
players_testing <- testing(players_split)
players_training <- training(players_split)

In [6]:
players_recipe <- recipe(subscribe ~ ., data = players_scaled)|>
step_scale(Age, played_hours)|>
step_center()

players_model<- nearest_neighbor(weight_func = "rectangular", neighbors = 5)|>
set_engine("kknn")|>
set_mode("classification")

players_fit <- workflow()|>
add_recipe(players_recipe)|>
add_model(players_model)|>
fit(data = players_scaled)

In [7]:
players_prediction <- predict(players_fit, players_testing)|>
bind_cols(players_testing)
players_prediction

.pred_class,Age,experience,played_hours,subscribe
<fct>,"<dbl[,1]>",<chr>,"<dbl[,1]>",<fct>
TRUE,-0.5701714,Veteran,-0.07385234,TRUE
TRUE,-0.5701714,Amateur,-0.20719534,TRUE
TRUE,-0.2462673,Regular,-0.20719534,TRUE
⋮,⋮,⋮,⋮,⋮
TRUE,-0.5701714,Amateur,-0.2071953,FALSE
TRUE,-0.5701714,Amateur,-0.2071953,FALSE
TRUE,-0.5701714,Amateur,-0.1264877,FALSE


In [8]:
players_pred_accuracy <- players_prediction|>
metrics(truth = subscribe, estimate = .pred_class)
players_pred_accuracy

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.7755102
kap,binary,0.2108346


In [12]:
players_mat <- players_prediction|>
conf_mat(truth = subscribe, estimate = .pred_class)
players_mat

          Truth
Prediction FALSE TRUE
     FALSE     2    0
     TRUE     11   36

In [26]:
players_vfold <- vfold_cv(players_training, v = 5, strata = subscribe)


players_resample <- workflow()|>
add_recipe(players_recipe)|>
add_model(players_model)|>
fit_resamples(resamples = players_vfold)|>
collect_metrics()
players_resample

.metric,.estimator,mean,n,std_err,.config
<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
accuracy,binary,0.730197,5,0.02929188,Preprocessor1_Model1
roc_auc,binary,0.6044952,5,0.06408316,Preprocessor1_Model1


In [34]:
players_tune <-  nearest_neighbor(weight_func = "rectangular", neighbors = tune())|>
set_engine("kknn")|>
set_mode("classification")

players_grid = tibble(neighbors = seq(from = 1, to = 10, by=1))

players_tune_results <- workflow()|>
add_recipe(players_recipe)|>
add_model(players_tune)|>
tune_grid(resamples = players_vfold, grid = players_grid)|>
collect_metrics()
players_tune_results

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
1,accuracy,binary,0.6055008,5,0.04521082,Preprocessor1_Model01
1,roc_auc,binary,0.5507035,5,0.04927219,Preprocessor1_Model01
2,accuracy,binary,0.6192939,5,0.04031851,Preprocessor1_Model02
⋮,⋮,⋮,⋮,⋮,⋮,⋮
9,roc_auc,binary,0.6117308,5,0.07812870,Preprocessor1_Model09
10,accuracy,binary,0.7235140,5,0.02129237,Preprocessor1_Model10
10,roc_auc,binary,0.5866844,5,0.07853797,Preprocessor1_Model10
