In [10]:
# Code needed for the Methods/Results sections. Research question: can played hours and age predict subscription status in players.csv?

# Load the required packages and set notebook display options
library(repr)
library(tidyverse)
library(tidymodels)
options(repr.matrix.max.rows = 10) 

# Read the raw player data.
# TODO: change the URL to the actual repo at the end.
player_data <- read_csv("https://raw.githubusercontent.com/Arzmxn/ideal-umbrella/refs/heads/main/players.csv")

# Wrangle/clean: keep the three relevant variables (subscription status,
# played hours, age) and turn the response into a factor.
wrangled_player <- player_data |>
  select(subscribe, played_hours, Age) |>
  mutate(subscribe = as_factor(subscribe)) |>
  # `Age >= 0` evaluates to NA for a missing age, and filter() drops rows
  # whose condition is NA, so this removes the rows with missing Age.
  filter(Age >= 0)

# Summary statistics: mean played hours and mean age over the raw data,
# ignoring missing values.
summary_data <- player_data |>
  summarize(
    played_hours = mean(played_hours, na.rm = TRUE),
    Age = mean(Age, na.rm = TRUE)
  )


[1mRows: [22m[34m196[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, Age
[33mlgl[39m (1): subscribe

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [12]:
# Exploratory visualizations of the wrangled data. Each plot is stored in a
# variable; evaluate the variable in a cell to render it.
options(repr.plot.width = 12, repr.plot.height = 8)

# Fig. 1: age vs. total playtime, coloured by subscription status.
scatter_viz <- wrangled_player |>
  ggplot(aes(x = Age, y = played_hours, color = subscribe)) +
  geom_point() +
  labs(
    x = "Age of Player (Years)",
    y = "Total Playtime (Hours)",
    color = "Subscription Status",
    title = "Fig.1 Scatterplot of Age (yrs) vs Playtime (hrs) with Subscription Status"
  ) +
  theme(text = element_text(size = 15))

# Fig. 2: distribution of age, filled by subscription status.
# bins = 30 is the geom_histogram() default, stated explicitly so the
# "pick a better value" message is not emitted.
histogram_viz_1 <- wrangled_player |>
  ggplot(aes(x = Age, fill = subscribe)) +
  geom_histogram(bins = 30) +
  labs(
    x = "Age of Player (Years)",
    y = "Number of Players",
    fill = "Subscription Status",
    title = "Fig.2 Distribution of Age (yrs) with Subscription Status"
  ) +
  theme(text = element_text(size = 15))

# Fig. 3: distribution of total playtime, filled by subscription status.
histogram_viz_2 <- wrangled_player |>
  ggplot(aes(x = played_hours, fill = subscribe)) +
  geom_histogram(bins = 30) +
  labs(
    x = "Total Playtime (Hours)",
    y = "Number of Players",
    fill = "Subscription Status",
    title = "Fig.3 Distribution of Playtime (hrs) with Subscription Status"
  ) +
  theme(text = element_text(size = 15))

In [35]:
# Data analysis: tune a K-nearest-neighbours classifier that predicts
# `subscribe` from `played_hours` and `Age`, then evaluate it on held-out data.
set.seed(123123123) # fixed seed so the split and the folds are reproducible

# Stratified split: 75% training, 25% testing.
player_split <- initial_split(wrangled_player, prop = 0.75, strata = subscribe)
player_train <- training(player_split)
player_test <- testing(player_split)

# Recipe: model formula plus scaling and centering of both predictors.
player_recipe <- recipe(subscribe ~ played_hours + Age, data = player_train) |>
  step_scale(all_predictors()) |>
  step_center(all_predictors())

# Five-fold cross-validation on the training set, stratified by the response.
player_vfold <- vfold_cv(player_train, v = 5, strata = subscribe)

# Model specification with the number of neighbours left to be tuned.
knn_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
  set_engine("kknn") |>
  set_mode("classification")

# Candidate K values: 1 through 20.
k_vals <- tibble(neighbors = seq(from = 1, to = 20, by = 1))

# Cross-validated metrics for every candidate K.
knn_player_results <- workflow() |>
  add_recipe(player_recipe) |>
  add_model(knn_tune) |>
  tune_grid(resamples = player_vfold, grid = k_vals) |>
  collect_metrics()

# K with the highest mean cross-validated accuracy.
# slice_max() keeps ties by default, which could yield several K values and an
# invalid `neighbors` argument below; n = 1 with with_ties = FALSE guarantees
# exactly one value.
best_k_value <- knn_player_results |>
  filter(.metric == "accuracy") |>
  select(neighbors, mean) |>
  slice_max(mean, n = 1, with_ties = FALSE) |>
  pull(neighbors)

# Model specification using the selected K.
knn_best <- nearest_neighbor(weight_func = "rectangular", neighbors = best_k_value) |>
  set_engine("kknn") |>
  set_mode("classification")

# Fit the final workflow on the full training set.
player_best_fit <- workflow() |>
  add_recipe(player_recipe) |>
  add_model(knn_best) |>
  fit(player_train)

# Predict on the test set and attach the predictions to the true labels.
player_predictions <- predict(player_best_fit, player_test) |>
  bind_cols(player_test)

# Test-set performance of the predicted class against the true class.
player_metrics <- player_predictions |>
  metrics(truth = subscribe, estimate = .pred_class)

player_metrics

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
accuracy,binary,0.73469388
kap,binary,0.06734993


neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
1,accuracy,binary,0.4272906,5,0.05565142,Preprocessor1_Model01
1,roc_auc,binary,0.4195346,5,0.05992703,Preprocessor1_Model01
2,accuracy,binary,0.4206240,5,0.05685503,Preprocessor1_Model02
2,roc_auc,binary,0.4366033,5,0.05168284,Preprocessor1_Model02
3,accuracy,binary,0.5179310,5,0.06254986,Preprocessor1_Model03
⋮,⋮,⋮,⋮,⋮,⋮,⋮
8,roc_auc,binary,0.4861511,5,0.04398101,Preprocessor1_Model08
9,accuracy,binary,0.6956486,5,0.04510738,Preprocessor1_Model09
9,roc_auc,binary,0.5007421,5,0.04983304,Preprocessor1_Model09
10,accuracy,binary,0.7020854,5,0.04488364,Preprocessor1_Model10
