In [None]:
suppressPackageStartupMessages(
    {suppressWarnings({
        library(tidyverse)
        library(repr)
        library(tidymodels)
        library(tidyr)
        library(ggplot2)        
        library(scales)
        library(patchwork) 
        library(purrr)
        library(dplyr)
    })
})

In [None]:
# Notebook display defaults: compact plots, short table previews, and no
# column-type message from readr when a file is loaded.
options(
  repr.plot.width = 6,
  repr.plot.height = 4,
  repr.matrix.max.rows = 7,
  readr.show_col_types = FALSE
)

# Load the players table directly from the project repository.
player_data <- read_csv(
  file = "https://raw.githubusercontent.com/FabianoGLentini/player-subscription-ml/refs/heads/main/data/players.csv"
)


# Data Science Project: Project Final Report


## Introduction: 
For this project, we are analyzing data collected by researchers in Computer Science at UBC. They have set up a Minecraft server that allows them to record consenting players' behaviour and characteristics for study. In particular, we want to answer the question: **Which of a player's `Age`, `gender`, and `played_hours` best predict whether they will `subscribe` to a gaming newsletter, and how does this differ across players' `experience` levels?** In this project, we will focus on the `players.csv` dataset, as it is the most useful for answering our question. The `players.csv` dataset contains 196 observations and 7 variables. This data is already in tidy form because every column is a single variable, every row is a single observation, and every cell is a single value. We can see that there are three types of variables here: character, logical, and double.

**Character Variables:**

`hashedEmail` (email of player that has been converted into a unique string of characters, for privacy)

`name` (first name of player)

`gender` (gender of player)

`experience` (skill level of player)

**Double Variables:**

`played_hours` (number of hours spent on the game by each player)

`Age` (age in years of each player)

**Logical variable:**

`subscribe` (whether or not the player is subscribed to the game newsletter) 

For our question, we want to know if a player's age, gender, and amount of played hours can determine if they will subscribe to the game newsletter or not. We chose these three predictor variables because they are all characteristics of each individual player that can allow us to group them and determine which type of player is most likely to subscribe. Additionally, we want to determine if any difference in experience across players will be a factor in whether or not a player is subscribed.

A potential issue with this data is that, under the `played_hours` variable, many players have 0 hours played. This could cause unexpected results when we begin to wrangle and model the data. Furthermore, some categories of the `gender` variable have very small sample sizes, making it hard to draw reliable conclusions about these categories.

## Methods & Results:

### Set up and intro to data: "TODO should rename later." 

In [None]:
# Wrangle data
# TODO: may need additional wrangling (e.g. merging some `gender` labels)
# because several categories have very few observations.
player_df <- player_data |>
  select(subscribe, gender, played_hours, experience, Age) |>
  # Remove rows with NA values, as they may distort the model.
  drop_na() |>
  mutate(
    # lgl -> fct for analysis and modeling. `TRUE` is deliberately the FIRST
    # level: yardstick's precision()/recall() treat the first factor level as
    # the event of interest by default, and the event we care about is
    # "subscribed". (as_factor() on a logical would order the levels
    # FALSE, TRUE and score the non-subscribers instead.)
    subscribe = factor(subscribe, levels = c(TRUE, FALSE)),
    # chr -> fct for analysis and modeling
    gender = as_factor(gender),
    # Binary "played at all" flag (TRUE when played_hours is non-zero).
    # TODO may not use
    played = factor(played_hours != 0),
    # chr -> fct for analysis and modeling
    experience = as_factor(experience)
  )

# TODO FABIO check if played_hours should be adjusted to use a binary outcome,
# either played or didn't play at all, or if any of the predictors should be
# removed.


### Training and Analysis:

In [None]:
# TODO DELETE: temporary preview of the wrangled data while setting up.
player_df |> head()

In [None]:
# Fix the RNG state so the train/test split is reproducible (don't change).
set.seed(2025)

# Prep for modelling: keep the outcome and the three candidate predictors.
# `experience` (and the derived `played` flag) are left out of the model data.
ply_df <- player_df |>
  select(Age, subscribe, gender, played_hours)

# Split step: 70% training / 30% testing, stratified on the outcome so both
# partitions keep a similar share of subscribers.
player_split <- ply_df |>
  initial_split(prop = 0.70, strata = subscribe)
player_train <- player_split |> training()
player_test <- player_split |> testing()

# TODO FABIO: check bootstraps() and whether it would benefit our model given
# the small data set.

In [None]:
#TODO FABIO write up split/scale/recipe step reasoning
# 'initial split' Use Strata sub to balance the outcome for the bool prediction to avoid 
# imbalance in our test and train data split

# ... why use a 70/30 split instead of 75/25 etc.?

# ~ maybe explain why start with all predictors

In [None]:
# Find best K 



In [None]:
# Full Recipe
# Recipe 01: Age + gender + played_hours
# Built step by step: start from the formula, then add preprocessing in order.
rc_AGH <- recipe(subscribe ~ Age + gender + played_hours, data = player_train)
# Turn factor predictors (gender) into dummy columns.
rc_AGH <- step_dummy(rc_AGH, all_nominal_predictors())
# Remove zero-variance columns left after wrangling (the gender_Other
# category had no rows associated with it).
rc_AGH <- step_zv(rc_AGH, all_predictors())
# Centre and scale every numeric predictor, dummy columns included.
rc_AGH <- step_normalize(rc_AGH, all_numeric_predictors())

In [None]:
# Recipes:
# Scale/Recipe
#
# Every candidate model shares the same preprocessing; only the formula
# (the set of predictors) changes, so the shared steps live in one helper.
# Note: A = Age, G = gender and H = played_hours

# Build a recipe for `model_formula` on the training data:
#   1. dummy-encode nominal predictors,
#   2. drop zero-variance columns (after wrangling, the gender_Other category
#      had no rows associated with it),
#   3. centre and scale all numeric predictors.
make_player_recipe <- function(model_formula, training_data = player_train) {
  recipe(model_formula, data = training_data) |>
    step_dummy(all_nominal_predictors()) |>
    step_zv(all_predictors()) |>
    step_normalize(all_numeric_predictors())
}

# Recipe 01: Age + gender + played_hours
rc_AGH <- make_player_recipe(subscribe ~ Age + gender + played_hours)

# Recipe 02: Age + played_hours
rc_AH <- make_player_recipe(subscribe ~ Age + played_hours)

# Recipe 03: Age + gender
rc_AG <- make_player_recipe(subscribe ~ Age + gender)

# Recipe 04: gender + played_hours
rc_GH <- make_player_recipe(subscribe ~ gender + played_hours)

# Recipe 05: Age
rc_A <- make_player_recipe(subscribe ~ Age)

# Recipe 06: gender
rc_G <- make_player_recipe(subscribe ~ gender)

# Recipe 07: played_hours
rc_H <- make_player_recipe(subscribe ~ played_hours)



In [None]:
#TODO FABIO breackdown hypothesis for each recipe variation ...?

In [None]:
# Spec set up
# K-nearest-neighbours classifier on the "kknn" engine with the "rectangular"
# weight function; `neighbors` is left as a tune() placeholder so K can be
# chosen by cross-validation.
knn_tune <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") |>
  set_mode("classification") |>
  set_engine("kknn")

# TODO FABIO: look for refactor options to reduce code clutter.
# Check mean and standard error through collect_metrics.


In [None]:
# TODO FABIO ~ write up spec use and impl of vfold..?

In [None]:
# K-fold cross-validation
# TODO FABIO: confirm whether this cell needs its own seed or could rely on
# the set.seed() call in the earlier split cell.
set.seed(1234) # Don't change

# Five folds of the training data, stratified on the outcome.
kfolds <- player_train |>
  vfold_cv(v = 5, strata = subscribe)

# Candidate numbers of neighbours to try: 1, 2, ..., 10.
k_vals <- tibble(neighbors = seq(1, 10, by = 1))


In [None]:
#TODO FABIO write reasonin + graph to show fold outcome
# Note: we use 5-fold cross-validation because the data set is small;
# averaging over several folds should improve the performance estimate
# compared with relying on a single validation split.

In [None]:
# Simplifying multi-model testing: https://www.youtube.com/watch?v=YZqbOATpjM4&t=139s
# Workflow step: pair each of the seven recipes with the single tunable KNN
# spec, giving one workflow per predictor combination.
# NOTE(review): this variable shares its name with the workflow_set()
# function; calls still resolve to the function, but a distinct name would be
# clearer.
workflow_set <- workflow_set(
  preproc = list(
    AGH = rc_AGH, AH = rc_AH, AG = rc_AG, GH = rc_GH,
    A = rc_A, G = rc_G, H = rc_H
  ),
  models = list(knn_tune),
  cross = TRUE
)

# Tune workflow: run tune_grid() for every workflow over the shared folds and
# K grid. The seed is passed to workflow_map() instead of calling set.seed().
knn_tuned_set <- workflow_set |>
  workflow_map(
    fn = "tune_grid",
    resamples = kfolds,
    grid = k_vals,
    seed = 22
  )


In [None]:
#TODO FABIO write workflow step use case/what it functionally is doing 

In [None]:

# Wide figure size for plots drawn from here on.
options(repr.plot.width = 15, repr.plot.height = 5)

# Get metrics all in one: re-run the tuning for every workflow, this time
# scoring with accuracy, recall and precision together.
# NOTE(review): this relies on workflow_map() reusing the resamples and grid
# stored on `knn_tuned_set` by the earlier tuning call - confirm.
# The seed is pinned (same value as the first tuning call); without it
# workflow_map() draws a random seed, so the re-run would not be reproducible.
all_metrics_collected <- workflow_map(
  knn_tuned_set,
  metrics = metric_set(accuracy, recall, precision),
  seed = 22
)

# Long table of cross-validated results: one row per workflow, configuration
# and metric.
all_metrics_collected_res <- collect_metrics(all_metrics_collected)

In [None]:
# all_metrics_collected_res
# TODO TMP: collect_metrics() on the whole set does not keep `neighbors`, so
# collect the metrics per workflow from each tuning result instead.
res_All_met_tuned <- all_metrics_collected |>
  rowwise() |>
  mutate(metrics = list(collect_metrics(result))) |>
  # Drop the row-wise grouping explicitly so the later steps (and the ranking
  # below) operate on the whole table rather than row by row.
  ungroup() |>
  unnest(metrics) |>
  select(wflow_id, .metric, mean, neighbors, std_err) |>
  arrange(desc(mean))

# ADD RANKING
# Position in the table sorted by mean, across all metrics together.
# row_number() also copes with an empty table, where seq(1, 0, 1) would error.
ranked_met <- res_All_met_tuned |>
  mutate(rank = row_number())
ranked_met

# Visualize result
options(repr.plot.width = 8, repr.plot.height = 5)

# Mean score against rank for every workflow / K / metric combination;
# colour marks the workflow and shape marks the metric.
plot_top_ten <- ggplot(
  ranked_met,
  aes(x = rank, y = mean, color = wflow_id, shape = .metric)
) +
  geom_point() +
  ylim(c(0, 1))

plot_top_ten

In [None]:

# Autoplot: one plot per metric.
acc_plot <- all_metrics_collected |> autoplot(metric = "accuracy")
prec_plot <- all_metrics_collected |> autoplot(metric = "precision")
recall_plot <- all_metrics_collected |> autoplot(metric = "recall")

# RANKING FOR ALL
# Keep the rows of the collected results for one metric, best mean first.
rank_by_metric <- function(metric_name) {
  all_metrics_collected_res |>
    filter(.metric == metric_name) |>
    arrange(desc(mean))
}

all_acc_rank_result <- rank_by_metric("accuracy") # Accuracy
all_prec_rank_result <- rank_by_metric("precision") # Precision
all_recall_rank_result <- rank_by_metric("recall") # Recall

# Top 10 Ranked Results (first ten rows of each ranking)
ten_acc_rank_result <- slice_head(all_acc_rank_result, n = 10) # Accuracy
ten_prec_rank_result <- slice_head(all_prec_rank_result, n = 10) # Precision
ten_recall_rank_result <- slice_head(all_recall_rank_result, n = 10) # Recall


In [None]:
# # Collect Metrics
# # Recalls
# recall_set <- workflow_map(
#     knn_tuned_set,
#     metrics = metric_set(recall)
# ) 

# # RANKED and autoplot 
# recall_ranked <- rank_results(recall_set, rank_metric = "recall")
# recall_ranked

In [None]:
# Show the three ranking plots side by side.
# Use a wide canvas so the three panels are not squeezed together.
options(repr.plot.width = 15, repr.plot.height = 5)

# `&` applies the coordinate system to every panel, so all three share the
# same 0-1 y-range and can be compared directly. coord_cartesian() zooms
# without dropping any points or error bars.
(acc_plot + prec_plot + recall_plot) &
  coord_cartesian(ylim = c(0, 1))

In [None]:
# FIT MODEL:
# Preview the top-ranked configurations by cross-validated accuracy.
slice(ten_acc_rank_result, 1)
head(ten_acc_rank_result)

# Workflow used for the final model: Age + gender + played_hours.
final_wflow_id <- "AGH_nearest_neighbor"

# Best number of neighbours for that workflow by cross-validated accuracy.
best_k <- all_metrics_collected |>
  extract_workflow_set_result(id = final_wflow_id) |>
  select_best(metric = "accuracy")

# extract_workflow() on its own returns an untrained workflow whose
# `neighbors` is still the tune() placeholder, so plug in the chosen K and
# train on the training set to obtain an actual fitted model.
final_fit <- all_metrics_collected |>
  extract_workflow(id = final_wflow_id) |>
  finalize_workflow(best_k) |>
  fit(data = player_train)

# library(vip)

# final_fit |>
#     extract_fit_parsnip() |>
#     vip(geom = "col")

### Player Type Exploration: "TODO may need renaming"

## Discussion:

### Reference 

##### Data Science programming techniques and approaches:
- **Clustering prediction:** https://www.youtube.com/watch?v=z57i2GVcdww
- **Simplifying multi-model set up + testing:** https://www.youtube.com/watch?v=YZqbOATpjM4
- **Tuning and comparing models using workflowsets:** https://workflowsets.tidymodels.org/articles/tuning-and-comparing-models.html
- **Useful example of report/tutorial for tidy models:** https://optimumsportsperformance.com/blog/k-nearest-neighbor-tidymodels-tutorial/
- **Additional breakdown on workflow_set usage:** https://www.youtube.com/watch?v=R95lWUDtL5A
- **Workflow_set documentation on results:** https://workflowsets.tidymodels.org/reference/collect_metrics.workflow_set
- **Workflow_set tuning and comparing documentation:** https://workflowsets.tidymodels.org/articles/tuning-and-comparing-models

In [None]:
# TODO: temporary scratch cell - opens the help page for
# `scale_continuous_identity`.
help("scale_continuous_identity")