In [None]:
# Packages used throughout the notebook: tidyverse for reading, wrangling and
# plotting data, repr for notebook display options, tidymodels for modelling.
library(tidyverse)
library(repr)
library(tidymodels)
# Cap the number of data-frame rows shown in notebook output.
options(repr.matrix.max.rows = 6)
# Run the project's cleanup.R script (its contents are not shown here).
source('cleanup.R')

In [None]:
# Read the player-level table from disk and display it.
players_data <- read_csv(file = "data/players.csv")
players_data

In [None]:
# Read the session-level table from disk and display it.
sessions_data <- read_csv(file = "data/sessions.csv")
sessions_data

I would like to investigate whether a player's subscription to a gaming newsletter can be predicted from their play time and age.

We can first observe the trend through graphs.

In [None]:
# Scatter plot of hours played against age, with points coloured by
# subscription status.
time_age_plot <- ggplot(
  players_data,
  aes(x = Age, y = played_hours, color = subscribe)
) +
  geom_point() +
  labs(x = "Age", y = "Hours Played", color = "subscribed?")
time_age_plot

In [None]:
# Histogram of player ages in bins of width 10, with bars filled by
# subscription status.
age_histogram <- players_data |>
  ggplot(aes(x = Age, fill = subscribe)) +
  geom_histogram(binwidth = 10)
age_histogram

In [None]:
# Mean hours played for each subscription status; missing play times are
# ignored when averaging.
avg_time_sub <- summarize(
  group_by(players_data, subscribe),
  avg_time_played_hrs = mean(played_hours, na.rm = TRUE)
)
avg_time_sub

In [None]:
# Share of all players that falls into each (experience, subscribe)
# combination. Note that `percentage` is a proportion of the whole data set
# (between 0 and 1), not a within-experience rate.
experience_sub <- players_data |>
  group_by(experience, subscribe) |>
  # Drop the grouping explicitly: otherwise the result stays grouped by
  # `experience` and dplyr prints a regrouping message.
  summarize(count = n(), .groups = "drop") |>
  mutate(percentage = count / nrow(players_data)) |>
  select(experience, subscribe, percentage)
experience_sub

In [None]:
# Side-by-side bars of the share of players per experience level, split by
# subscription status. Bar heights come straight from `percentage`.
experience_sub_graph <- experience_sub |>
  ggplot(aes(x = experience, y = percentage)) +
  geom_col(aes(fill = subscribe), position = "dodge")
experience_sub_graph

In [None]:
# Number of players in each subscription class; shows how balanced the
# outcome variable is.
subscribe_count <- players_data |>
  count(subscribe, name = "count")
subscribe_count

In [None]:
# Share of all players that falls into each (gender, subscribe) combination,
# ordered from the most to the least common combination.
# NOTE(review): despite its name, this table is broken down by gender, not
# age. The name is kept because the plotting cell refers to `age_sub`.
age_sub <- players_data |>
  group_by(gender, subscribe) |>
  # Drop the grouping explicitly: otherwise the result stays grouped by
  # `gender` and dplyr prints a regrouping message.
  summarize(count = n(), .groups = "drop") |>
  arrange(desc(count)) |>
  mutate(percentage = count / nrow(players_data)) |>
  select(gender, subscribe, percentage)
age_sub

In [None]:
# Side-by-side bars of the share of players per gender, split by
# subscription status. Bar heights come straight from `percentage`.
age_sub_plot <- age_sub |>
  ggplot(aes(x = gender, y = percentage)) +
  geom_col(aes(fill = subscribe), position = "dodge")
age_sub_plot

# Predictive Model

## Finding the Best K value

In [None]:
# Build the modelling table: keep players whose Age is strictly between 15
# and 25, keep the outcome and the three predictors, and convert the
# categorical columns to factors.
# NOTE(review): Age is used only for filtering here and is not kept as a
# predictor; confirm that this is intended.
players_processed <- players_data |>
  filter(Age > 15 & Age < 25) |>
  select(experience, subscribe, played_hours, gender) |>
  mutate(
    subscribe = as_factor(subscribe),
    experience = as_factor(experience),
    gender = as_factor(gender)
  )
players_processed

In [None]:
# Fix the RNG so the split is reproducible, then hold out 30% of the rows
# for testing, stratifying on the outcome.
set.seed(1000)
players_split <- players_processed |>
  initial_split(prop = 0.70, strata = subscribe)
players_training <- players_split |> training()
players_testing <- players_split |> testing()

In [None]:
# Tune the number of neighbours for a K-nearest-neighbours classifier using
# 5-fold cross-validation on the training set.
set.seed(1000)

# Preprocessing: standardise the single numeric predictor.
sub_recipe <- recipe(
  subscribe ~ played_hours + experience + gender,
  data = players_training
) |>
  step_scale(played_hours) |>
  step_center(played_hours)

# Model specification with the neighbour count left open for tuning.
sub_spec <- nearest_neighbor(neighbors = tune(), weight_func = "rectangular") |>
  set_mode("classification") |>
  set_engine("kknn")

# Resamples and the candidate values K = 1, ..., 10.
sub_vfold <- vfold_cv(players_training, v = 5, strata = subscribe)
vals <- tibble(neighbors = seq(from = 1, to = 10, by = 1))

# Despite its name, `sub_fit` holds the cross-validation metrics row(s) with
# the highest mean accuracy, not a fitted model.
sub_fit <- workflow() |>
  add_model(sub_spec) |>
  add_recipe(sub_recipe) |>
  tune_grid(resamples = sub_vfold, grid = vals) |>
  collect_metrics() |>
  # Keep the two filters separate: the maximum must be taken over the
  # accuracy rows only, not over every metric.
  filter(.metric == "accuracy") |>
  filter(mean == max(mean))
sub_fit

In [None]:
# Refit the classifier on the full training set with a fixed K.
# NOTE(review): K = 9 is hard-coded, presumably read off the tuning output
# above; confirm it still matches `sub_fit` if the data or seed change.
set.seed(3000)
tuned_spec <- nearest_neighbor(neighbors = 9, weight_func = "rectangular") |>
  set_mode("classification") |>
  set_engine("kknn")
tuned_fit <- workflow() |>
  add_model(tuned_spec) |>
  add_recipe(sub_recipe) |>
  fit(data = players_training)

In [None]:
# Evaluate the tuned model on the held-out test set.
set.seed(4000)

# Predicted class for every test row, placed in front of the test columns.
players_predictions <- bind_cols(
  predict(tuned_fit, players_testing),
  players_testing
)
players_predictions

# Summary metrics comparing predicted and true subscription status.
players_metrics <- metrics(
  players_predictions,
  truth = subscribe,
  estimate = .pred_class
)
players_metrics

# Confusion matrix of predicted against true subscription status.
players_conf_mat <- conf_mat(
  players_predictions,
  truth = subscribe,
  estimate = .pred_class
)
players_conf_mat

$Recall = \frac{TP}{TP + FN} = 40 / 44 \approx 0.91$

$Precision = \frac{TP}{TP + FP} = 40 / 54 \approx 0.74$