In [1]:
# Note using suppress to reduce constant red warning boxes
suppressPackageStartupMessages(
    {suppressWarnings({
        library(tidyverse)
        library(repr)
        library(tidymodels)
        library(tidyr)
        library(dplyr)
    })
})


In [2]:
# Display settings: plot size, max printed rows, and quiet readr column-type messages
options(
    repr.plot.width = 6,
    repr.plot.height = 4,
    repr.matrix.max.rows = 7,
    readr.show_col_types = FALSE
)

# Read the players table straight from the project repository
player_data <- read_csv(
    "https://raw.githubusercontent.com/FabianoGLentini/player-subscription-ml/refs/heads/main/data/players.csv"
)
# TODO may not need session data
# session_data <- read_csv("")



# Data Science Project: Project Final Report


## Introduction:

## Methods & Results:

### Set up and intro to data: "TODO should rename later." 

In [3]:
# Wrangle data
# Keep only the modelling columns and add an integer row ID as the player identifier
# (hashedEmail and name are not carried over).
player_df <- player_data |>
            select(subscribe, gender, played_hours, experience, Age) |>
            mutate(row_id = row_number(), .before = 1) |> # ID is assigned before NA rows are dropped
            drop_na() # Rows with NA values are removed, as they may distort the model

# TODO FABIO check if player_hours should adjust to use a binary outcome, 
# either played or didn't play at all, or if any of the predictor should be removed.


### Training and Analysis:

In [4]:
# TODO DELETE: temporary preview used while setting up the notebook
player_df |> head()

row_id,subscribe,gender,played_hours,experience,Age
<int>,<lgl>,<chr>,<dbl>,<chr>,<dbl>
1,True,Male,30.3,Pro,9
2,True,Male,3.8,Veteran,17
3,False,Male,0.0,Veteran,17
4,True,Female,0.7,Amateur,21
5,True,Male,0.1,Regular,21
6,True,Female,0.0,Amateur,17


In [5]:
set.seed(2025)

# Split step
# - row_id is only an identifier, so it is excluded before modelling.
# - subscribe is read in as a logical; classification models in tidymodels expect a
#   factor outcome, so it is converted here (levels "FALSE" / "TRUE").
# - strata = subscribe keeps the subscriber proportion similar in both partitions.
player_split <- player_df |>
                select(-row_id) |>
                mutate(subscribe = as.factor(subscribe)) |>
                initial_split(prop = 0.75, strata = subscribe)
player_train <- training(player_split)
player_test <- testing(player_split)

# Scale/Recipe
# Each recipe centres and scales its numeric predictors so that no variable dominates
# the distance calculation of the K-nearest-neighbours model.
# The selector must be *called* (note the parentheses); all_numeric_predictors() is used
# because gender is a character column and step_normalize() only accepts numeric columns.
# NOTE(review): gender is therefore passed through unscaled; add
# step_dummy(all_nominal_predictors()) if explicit encoding is wanted - TODO confirm.
# Note: A = Age, G = gender and H = played_hours
# Recipe 01:
# Age + gender + played_hours
player_recipe_All <- recipe(subscribe ~ Age + gender + played_hours, player_train) |>
                step_normalize(all_numeric_predictors())

# Recipe 02:
# Age + played_hours
player_recipe_AH <- recipe(subscribe ~ Age + played_hours, player_train) |>
                step_normalize(all_numeric_predictors())

# Recipe 03:
# Age + gender 
player_recipe_AG <- recipe(subscribe ~ Age + gender, player_train) |>
                step_normalize(all_numeric_predictors())

# Recipe 04:
# gender + played_hours
player_recipe_GH <- recipe(subscribe ~ gender + played_hours, player_train) |>
                step_normalize(all_numeric_predictors())

# Recipe 05:
# Age
player_recipe_A <- recipe(subscribe ~ Age, player_train) |>
                step_normalize(all_numeric_predictors())

# Recipe 06:
# gender (no numeric predictor, so the normalisation step selects nothing)
player_recipe_G <- recipe(subscribe ~ gender, player_train) |>
                step_normalize(all_numeric_predictors())

# Recipe 07:
# played_hours
player_recipe_H <- recipe(subscribe ~ played_hours, player_train) |>
                step_normalize(all_numeric_predictors())


In [6]:
# TODO FABIO write up split/scale/recipe step reasoning
# 'initial_split': stratify on subscribe so that the proportion of subscribers is
# similar in the training and testing sets (avoids an imbalanced split).

# ... why use a 75/25 split instead of 70/30 etc.?

# ~ maybe explain why we start with all predictors

In [7]:
# Cross-validation folds: 10 folds of the training set, stratified on the outcome
player_vfold <- player_train |>
    vfold_cv(v = 10, strata = subscribe)

In [8]:
# TODO FABIO write reasoning + graph to show fold outcome
# Note: 10-fold cross-validation is used because the data set is small;
# averaging over 10 folds gives a more stable estimate than a single validation split.

In [9]:
# Recipes:
# Scale/Recipe
# All seven recipes share the same preprocessing and differ only in their formula,
# so a single helper builds them (reduces the copy-pasted clutter).

# Build a recipe on the training data for `model_formula` and normalise its numeric
# predictors. The selector is called with parentheses, and all_numeric_predictors()
# is used because gender is a character column that step_normalize() cannot scale.
# NOTE(review): gender is passed through unscaled; add
# step_dummy(all_nominal_predictors()) if explicit encoding is wanted - TODO confirm.
make_player_recipe <- function(model_formula) {
    recipe(model_formula, data = player_train) |>
        step_normalize(all_numeric_predictors())
}

# Note: A = Age, G = gender and H = played_hours
player_recipe_All <- make_player_recipe(subscribe ~ Age + gender + played_hours) # Recipe 01
player_recipe_AH  <- make_player_recipe(subscribe ~ Age + played_hours)          # Recipe 02
player_recipe_AG  <- make_player_recipe(subscribe ~ Age + gender)                # Recipe 03
player_recipe_GH  <- make_player_recipe(subscribe ~ gender + played_hours)       # Recipe 04
player_recipe_A   <- make_player_recipe(subscribe ~ Age)                         # Recipe 05
player_recipe_G   <- make_player_recipe(subscribe ~ gender)                      # Recipe 06
player_recipe_H   <- make_player_recipe(subscribe ~ played_hours)                # Recipe 07

In [10]:
# TODO FABIO break down the hypothesis for each recipe variation ...?

In [11]:
# Model specification: K-nearest-neighbours classifier with unweighted ("rectangular")
# votes; the number of neighbours is left as tune() so it can be chosen by cross-validation.
player_spec <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") |>
    set_mode("classification") |>
    set_engine("kknn")

# TODO FABIO ... search refactor options to reduce code clutter
# Check mean and standard error through collect_metrics


In [12]:
# TODO FABIO ~ write up spec use and impl of vfold..?

In [13]:
# Workflow Step
# Every workflow pairs one recipe with the shared KNN specification, so a small
# helper replaces the seven copy-pasted pipelines.

# Bundle `player_recipe` with player_spec into a workflow.
make_player_wf <- function(player_recipe) {
    workflow() |>
        add_recipe(player_recipe) |>
        add_model(player_spec)
}

# Note: A = Age, G = gender and H = played_hours
player_wf_All <- make_player_wf(player_recipe_All) # Workflow 01: Age + gender + played_hours
player_wf_AH  <- make_player_wf(player_recipe_AH)  # Workflow 02: Age + played_hours
player_wf_AG  <- make_player_wf(player_recipe_AG)  # Workflow 03: Age + gender
player_wf_GH  <- make_player_wf(player_recipe_GH)  # Workflow 04: gender + played_hours
player_wf_A   <- make_player_wf(player_recipe_A)   # Workflow 05: Age
player_wf_G   <- make_player_wf(player_recipe_G)   # Workflow 06: gender
player_wf_H   <- make_player_wf(player_recipe_H)   # Workflow 07: played_hours


In [None]:
#TODO FABIO write workflow step use case/what it functionally is doing 

### Player Type Exploration: "TODO may need renaming"

## Discussion: