# 

In [4]:
library(tidyverse)
library(repr)
library(tidymodels)
library(recipes)
options(repr.matrix.max.rows = 6)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 1.1.1 ──

[32m✔[39m [34mbroom       [39m 1.0.6     [32m✔[39m [34mrsample     [39

In [5]:
PLAYERS_DATA_URL = "https://raw.githubusercontent.com/Bumblet3a/dsci-individual-project/refs/heads/main/players.csv"
SESSION_DATA_URL = "https://raw.githubusercontent.com/Bumblet3a/dsci-individual-project/refs/heads/main/sessions.csv"

download.file(PLAYERS_DATA_URL, "players.csv")
download.file(SESSION_DATA_URL, "sessions.csv")

players <- read_csv("players.csv")
head(players)
sessions_data <- read_csv("sessions.csv")

[1mRows: [22m[34m196[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, Age
[33mlgl[39m (1): subscribe

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


experience,subscribe,hashedEmail,played_hours,name,gender,Age
<chr>,<lgl>,<chr>,<dbl>,<chr>,<chr>,<dbl>
Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,Male,9
Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9,3.8,Christian,Male,17
Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28,0.0,Blake,Male,17
Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4fa7a5a659ff443a0eb5,0.7,Flora,Female,21
Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb0af4d48fcce2420f3e,0.1,Kylie,Male,21
Amateur,True,f58aad5996a435f16b0284a3b267f973f9af99e7a89bee0430055a44fa92f977,0.0,Adrian,Female,17


[1mRows: [22m[34m1535[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (3): hashedEmail, start_time, end_time
[32mdbl[39m (2): original_start_time, original_end_time

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [7]:
player_sessions <- merge(sessions_data, players, all = TRUE)|>
select(Age, experience, subscribe, played_hours)|>
drop_na(Age:played_hours)
player_sessions

Age,experience,subscribe,played_hours
<dbl>,<chr>,<lgl>,<dbl>
20,Regular,TRUE,1.5
20,Regular,TRUE,1.5
17,Veteran,TRUE,0.0
⋮,⋮,⋮,⋮
23,Amateur,TRUE,56.1
17,Amateur,TRUE,0.1
20,Beginner,TRUE,1.1


In [8]:
players_scaled <- player_sessions|>
mutate(Age = scale(Age, center = TRUE),
       played_hours = scale(played_hours, center = TRUE))
players_scaled

Age,experience,subscribe,played_hours
"<dbl[,1]>",<chr>,<lgl>,"<dbl[,1]>"
0.1270154,Regular,TRUE,-1.122151
0.1270154,Regular,TRUE,-1.122151
-0.5043878,Veteran,TRUE,-1.140283
⋮,⋮,⋮,⋮
0.7584187,Amateur,TRUE,-0.4621379
-0.5043878,Amateur,TRUE,-1.1390740
0.1270154,Beginner,TRUE,-1.1269858


In [9]:

players_split <- initial_split(players_scaled, prop = 0.75, strata = subscribe)
players_testing <- testing(players_split)
players_training <- training(players_split)

In [18]:

players_model <- nearest_neighbor(weight_func = "rectangular",
                  neighbors = tune()) |>
                set_engine("kknn") |>
                set_mode("regression")


play_recipe <- recipe(played_hours ~ ., data = players_training) |>
 step_scale(Age) |>
 step_center(Age)


play_vfold <- vfold_cv(players_training, v = 5, strata = played_hours)


play_workflow <- workflow() |>
add_recipe(play_recipe) |>
add_model(players_model)


gridvals <- tibble(neighbors = seq(from = 1, to = 81, by = 10))


play_results <- tune_grid(
  play_workflow,
  resamples = play_vfold,
  grid = gridvals) |>
  collect_metrics()

play_results


→ [31m[1mA[22m[39m | [31merror[39m:   [1m[22mFailed to compute `rmse()`.
               [1mCaused by error:[22m
               [1m[22m[33m![39m `truth` should be a numeric vector, not a numeric matrix.

There were issues with some computations   [1m[31mA[39m[22m: x1

There were issues with some computations   [1m[31mA[39m[22m: x2

There were issues with some computations   [1m[31mA[39m[22m: x5



“All models failed. Run `show_notes(.Last.tune.result)` for more information.”


ERROR: [1m[33mError[39m in `estimate_tune_results()`:[22m
[33m![39m All models failed. Run `show_notes(.Last.tune.result)` for more information.


In [19]:
play_min <- play_results|>
filter(.metric == "rmse")|>
slice_min(mean, n = 1)
play_min

ERROR: Error in eval(expr, envir, enclos): object 'play_results' not found


In [None]:
players_pred_accuracy <- players_prediction|>
metrics(truth = subscribe, estimate = .pred_class)
players_pred_accuracy

In [None]:
players_mat <- players_prediction|>
conf_mat(truth = subscribe, estimate = .pred_class)
players_mat

In [None]:
players_vfold <- vfold_cv(players_training, v = 5, strata = subscribe)


players_resample <- workflow()|>
add_recipe(players_recipe)|>
add_model(players_model)|>
fit_resamples(resamples = players_vfold)|>
collect_metrics()
players_resample

In [None]:
players_tune <-  nearest_neighbor(weight_func = "rectangular", neighbors = tune())|>
set_engine("kknn")|>
set_mode("regression")

play_fit <- workflow()|>
add_recipe(play_recipe)|>
add_model(play_spec)|>
fit(data = play_sessions_training)

players_tune_results <- workflow()|>
add_recipe(players_recipe)|>
add_model(players_tune)|>
tune_grid(resamples = players_vfold, grid = players_grid)|>
collect_metrics()
players_tune_results

In [None]:
sessions <- sessions_data |>
    separate(start_time, into = c("Date", "Time"), sep = " ") |>
    separate(Date, into = c("Day", "Month", "Year"), sep = "/") |>
    separate(end_time, into = c("EDate", "ETime"), sep = " ") |>
    separate(EDate, into = c("EDay", "EMonth", "EYear"), sep = "/") |>
    mutate(Day = as.numeric(Day), EDay = as.numeric(EDay),
           Month = as.numeric(Month), EMonth = as.numeric(EMonth), 
           Year = as.numeric(Year), EYear = as.numeric(EYear)) |>
    separate(Time, into = c("Hour", "Minute"), sep = ":") |>
    separate(ETime, into = c("EHour", "EMinute"), sep = ":") |>
    mutate(Hour = as.numeric(Hour), EHour = as.numeric(EHour),
           Minute = as.numeric(Minute), EMinute = as.numeric(EMinute)) |>

    #Filter out rows that contain sessions that lasted more than a day(these data points are probably outliers)

    filter(Day==EDay & Month==EMonth & Year==EYear)



sessions

In [None]:
sessions_time_diff <- sessions |>
    mutate(time_diff = (EHour - Hour) * 60 + (EMinute - Minute))


time_stats <- sessions_time_diff |>
    summarize(average_time = mean(time_diff, na.rm = TRUE),
              median_time = median(time_diff, na.rm = TRUE),
              sd_time = sd(time_diff, na.rm = TRUE))


sessions_visualization <- sessions_time_diff |>
    ggplot(aes(x = time_diff)) +
    geom_histogram(binwidth = 10) +
    labs(title = "Distribution of Session Time Differences",
         x = "Time Difference (minutes)",
         y = "Count")

sessions_visualization
time_stats

# Skewed right distribution where most people had sessions that lasted less than 60 minutes

## Sessions Data Analysis

# Distribution Plot:
To create a distribution plot that visualizes the time differences in players' sessions, a new data frame was created with a column that calculates the time length of sessions by finding the difference between the start and ending hours, multiplying by 60, and adding the difference between the start and ending minutes. By using this column as the x-axis of a historgram with counts on the y-axis, the distribution of the length of player time sessions can be seen. The histogram depicts a skewed right distribution that has a mean play time of around 46 minutes, median time of 26 minutes, and standard deviation of 51 minutes. This large difference between the mean and median values of session time indicates that a majority of session times lie under 30 minutes, while large outlier session times are influencing the mean by increasing the overall average for the session times.