In [25]:
#load libraries
library(tidyverse)
library(dplyr)
library(ggplot2)

In [26]:
#read in data
players <- read_csv("project_data/players.csv")
sessions <- read_csv("project_data/sessions.csv")

[1mRows: [22m[34m196[39m [1mColumns: [22m[34m7[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): experience, hashedEmail, name, gender
[32mdbl[39m (2): played_hours, Age
[33mlgl[39m (1): subscribe

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m1535[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (3): hashedEmail, start_time, end_time
[32mdbl[39m (2): original_start_time, original_end_time

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [27]:
# Inspect structure and variables
glimpse(players)
glimpse(sessions)

Rows: 196
Columns: 7
$ experience   [3m[90m<chr>[39m[23m "Pro", "Veteran", "Veteran", "Amateur", "Regular", "Amate…
$ subscribe    [3m[90m<lgl>[39m[23m TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, T…
$ hashedEmail  [3m[90m<chr>[39m[23m "f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8…
$ played_hours [3m[90m<dbl>[39m[23m 30.3, 3.8, 0.0, 0.7, 0.1, 0.0, 0.0, 0.0, 0.1, 0.0, 1.6, 0…
$ name         [3m[90m<chr>[39m[23m "Morgan", "Christian", "Blake", "Flora", "Kylie", "Adrian…
$ gender       [3m[90m<chr>[39m[23m "Male", "Male", "Male", "Female", "Male", "Female", "Fema…
$ Age          [3m[90m<dbl>[39m[23m 9, 17, 17, 21, 21, 17, 19, 21, 17, 22, 23, 17, 25, 22, 17…
Rows: 1,535
Columns: 5
$ hashedEmail         [3m[90m<chr>[39m[23m "bfce39c89d6549f2bb94d8064d3ce69dc3d7e72b38f431d8a…
$ start_time          [3m[90m<chr>[39m[23m "30/06/2024 18:12", "17/06/2024 23:33", "25/07/202…
$ end_time            [3m[90m<chr>[39m[23m "30/06/2024 18:24"

In [28]:
library(lubridate)
# Add a column for session duration (in minutes) for each session
# - Converts start_time and end_time to date-time format
# - Calculates the difference in minutes between end and start

sessions <- sessions |>
  mutate(
    start_time = ymd_hms(start_time), # Convert start_time to date-time
    end_time = ymd_hms(end_time),     # Convert end_time to date-time
    session_length = as.numeric(difftime(end_time, start_time, units = "mins"))
  )

In [29]:
# Summarize each player's session activity:
# - total_sessions: total number of sessions for each player
# - total_playtime: total minutes played for each player
# - avg_session_length: average session length (in minutes) for each player

player_summary <- sessions |>
  group_by(hashedEmail) |>
  summarise(
    total_sessions = n(),
    total_playtime = sum(session_length, na.rm = TRUE),
    avg_session_length = mean(session_length, na.rm = TRUE)
  )
# Display the first 5 rows in assignment style
player_summary |>
  slice_head(n = 5)

hashedEmail,total_sessions,total_playtime,avg_session_length
<chr>,<int>,<dbl>,<dbl>
0088b5e134c3f0498a18c7ea6b8d77b4b0ff1636fc93355ccc95b45423367832,2,525577.8,262788.9
060aca80f8cfbf1c91553a72f4d5ec8034764b05ab59fe8e1cf0eee9a7b67967,1,0.5,0.5
0ce7bfa910d47fc91f21a7b3acd8f33bde6db57912ce0290fa0437ce0b97f387,1,0.1833333,0.1833333
0d4d71be33e2bc7266ee4983002bd930f69d304288a8663529c875f40f1750f3,13,6.966667,0.5358974
0d70dd9cac34d646c810b1846fe6a85b9e288a76f5dcab9c1ff1a0e7ca200b3a,2,1.166667,0.5833333


In [32]:
players_with_activity <- players |>
  left_join(player_summary, by = "hashedEmail")

# Display the first 5 rows to check the joined data
players_with_activity |>
  slice_head(n = 5)

experience,subscribe,hashedEmail,played_hours,name,gender,Age,total_sessions,total_playtime,avg_session_length
<chr>,<lgl>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<dbl>
Pro,True,f6daba428a5e19a3d47574858c13550499be23603422e6a0ee9728f8b53e192d,30.3,Morgan,Male,9,27,-13106970.0,-485443.2
Veteran,True,f3c813577c458ba0dfef80996f8f32c93b6e8af1fa939732842f2312358a88e9,3.8,Christian,Male,17,3,4.25,1.416667
Veteran,False,b674dd7ee0d24096d1c019615ce4d12b20fcbff12d79d3c5a9d2118eb7ccbb28,0.0,Blake,Male,17,1,0.08333333,0.08333333
Amateur,True,23fe711e0e3b77f1da7aa221ab1192afe21648d47d2b4fa7a5a659ff443a0eb5,0.7,Flora,Female,21,1,0.8333333,0.8333333
Regular,True,7dc01f10bf20671ecfccdac23812b1b415acd42c2147cb0af4d48fcce2420f3e,0.1,Kylie,Male,21,1,0.15,0.15


In [34]:
activity_metrics <- players_with_activity |>
  select(total_sessions, total_playtime, avg_session_length) |>
  na.omit()

activity_metrics_scaled <- scale(activity_metrics)
activity_metrics_scaled |>
  head(5)

total_sessions,total_playtime,avg_session_length
0.3561844,-7.220635764,-6.59581791
-0.2245511,-0.002835715,-0.09525277
-0.2729457,-0.00283801,-0.09527062
-0.2729457,-0.002837597,-0.09526058
-0.2729457,-0.002837973,-0.09526973
