In [None]:
# Data Science Project: Planning Stage (Individual)
## Problem: Predicting Usage of a Video Game Research Server

Broad Question: What player characteristics and behaviours are most predictive of subscribing to a game-related newsletter, and how do these features differ between various player types?

Research Question: Can experience level, age, gender and playtime predict whether a player subscribes to the newsletter?

Response variable: subscribe 

Predictor variables: experience, age, gender, and played_hours

In [None]:
# Preprocessing and exploratory data analysis

In [None]:
# Importing libraries and formatting graphs

In [None]:
# load necessary packages
# NOTE(review): ggplot2, dplyr and readr are core tidyverse packages, so the
# explicit calls below look redundant after library(tidyverse). They are kept
# as-is because attach order determines which functions mask which; confirm
# before removing any of them.
library(tidyverse)
library(lubridate)
library(knitr)
library(ggplot2)
library(scales)
library(dplyr)
library(readr)

# formatting graphs
# Sets the default size used for every plot rendered later in the notebook.
# NOTE(review): presumably these are the repr/IRkernel inline-plot dimensions
# in inches -- confirm against the kernel in use.
options(repr.plot.width = 10, repr.plot.height = 6)

In [None]:
# Importing the dataset

In [None]:
# Read both raw CSV files (paths are relative to the working directory).
# `players` holds one row per player; `sessions` holds the play sessions.
players <- read_csv(file = "players.csv")
sessions <- read_csv(file = "sessions.csv")

In [None]:
# Initial Data Exploration

In [None]:
# Print the size of each table as "<n> rows and <m> columns".
# dim() returns c(rows, columns), so [1] is the row count and [2] the
# column count.
cat("=== DATASET OVERVIEW ===\n")
cat(
  "Players dataset:", dim(players)[1],
  "rows and", dim(players)[2], "columns\n"
)
cat(
  "Sessions dataset:", dim(sessions)[1],
  "rows and", dim(sessions)[2], "columns\n"
)

In [None]:
# data description-players dataset

In [None]:
# Describe the players table: preview, structure, then a per-variable
# data dictionary. Figures that depend on the data (range of played_hours,
# number of missing ages) are computed rather than hard-coded so the text
# stays correct if the CSV changes.
cat("=== PLAYERS DATASET DESCRIPTION ===\n\n")

# display first few rows
head(players)

# summary of the dataset structure
cat("\nDataset Structure:\n")
glimpse(players)

# variable descriptions and types
cat("\n--- Variable Descriptions ---\n\n")
cat("1. experience (categorical): Player's experience level\n")
cat(" - Levels: Beginner, Amateur, Regular, Veteran, Pro\n")
cat(" - Type: Character\n\n")

cat("2. subscribe (binary): Whether player subscribed to newsletter\n")
cat(" - Values: TRUE (subscribed), FALSE (not subscribed)\n")
cat(" - Type: Logical\n")
cat(" - This is our response variable\n\n")

cat("3. hashedEmail (identifier): Unique anonymized identifier for each player\n")
cat(" - Type: Character\n")
cat(" - Links to sessions dataset\n\n")

cat("4. played_hours (numeric): Total hours the player spent in the game\n")
cat(" - Type: Double (numeric)\n")
# Both ends of the range come from the data; the lower bound used to be a
# literal 0.
cat(
  " - Range:", min(players$played_hours, na.rm = TRUE),
  "to", max(players$played_hours, na.rm = TRUE), "hours\n\n"
)

cat("5. name (categorical): Player's name\n")
cat(" - Type: Character\n\n")

cat("6. gender (categorical): Player's gender identity\n")
cat(" - Type: Character\n\n")

cat("7. Age (numeric): Player's age in years\n")
cat(" - Type: Double (numeric)\n")
# Report the actual number of missing ages instead of asserting that some exist.
cat(" - Missing values (NA):", sum(is.na(players$Age)), "\n\n")


In [None]:
# Summary Statistics for Players dataset

In [None]:
# Print summary statistics for the players table: location/spread for the
# two numeric columns, a one-row table of means, and frequency tables for the
# categorical columns. All aggregates drop NAs explicitly.
cat("\n=== SUMMARY STATISTICS ===\n\n")

# summary statistics for numeric variables
cat("\n--- Numeric Variables Summary ---\n\n")

# calculate statistics for played_hours
cat("played_hours:\n")
cat(" Mean:", round(mean(players$played_hours, na.rm = TRUE), 2), "hours\n")
cat(" Median:", round(median(players$played_hours, na.rm = TRUE), 2), "hours\n")
cat(" SD:", round(sd(players$played_hours, na.rm = TRUE), 2), "\n")
cat(" Min:", round(min(players$played_hours, na.rm = TRUE), 2), "hours\n")
cat(" Max:", round(max(players$played_hours, na.rm = TRUE), 2), "hours\n")

# calculate statistics for Age
cat("Age:\n")
cat(" Mean:", round(mean(players$Age, na.rm = TRUE), 2), "years\n")
cat(" Median:", round(median(players$Age, na.rm = TRUE), 2), "years\n")
cat(" SD:", round(sd(players$Age, na.rm = TRUE), 2), "\n")
cat(" Min:", round(min(players$Age, na.rm = TRUE), 2), "years\n")
cat(" Max:", round(max(players$Age, na.rm = TRUE), 2), "years\n")
# The closing parenthesis of sum() must come before the trailing newline
# string; otherwise sum() tries to add a character value and errors.
cat(" Missing values:", sum(is.na(players$Age)), "\n\n")

# table of mean values for all quantitative variables
numeric_summary <- players |>
  summarise(
    mean_played_hours = round(mean(played_hours, na.rm = TRUE), 2),
    mean_age = round(mean(Age, na.rm = TRUE), 2)
  )

cat("Table of Mean Values:\n")
print(numeric_summary)
cat("\n")

# summary for categorical variables
cat("\n--- Categorical Variables Summary ---\n\n")

# experience levels
cat("Experience Distribution:\n")
print(table(players$experience))
cat("\n")

# subscription status (our response variable)
cat("Subscription Distribution:\n")
print(table(players$subscribe))
# subscribe is logical, so its mean is the proportion of TRUE values.
cat(
  " Subscription rate:",
  round(mean(players$subscribe, na.rm = TRUE) * 100, 2), "%\n\n"
)

# gender distribution
cat("Gender Distribution:\n")
print(table(players$gender))
cat("\n")

In [None]:
# checking for missing values
# Count of NA entries per column of the players table.
cat("\nMissing values in each column:\n")
colSums(is.na(players))

# checking for unique values in categorical variables
# Each line lists the distinct values observed in one column. The subscribe
# line uses unique() like its siblings; table() would print bare counts with
# no indication of which count belongs to which status.
cat("\nExperience levels:", unique(players$experience), "\n")
cat("\nGender status:", unique(players$gender), "\n")
cat("\nSubscription status:", unique(players$subscribe), "\n")

In [None]:
# identify potential issues

In [None]:
# data description-sessions dataset

In [None]:
# Describe the sessions table: preview, structure, then a short data
# dictionary and the per-player measures that could be derived from it.
cat("=== SESSIONS DATASET DESCRIPTION ===\n\n")

# display first few rows
# Preview the sessions table itself (this section previously showed the
# players table by mistake).
head(sessions)

# summary of the dataset structure
cat("\nDataset Structure:\n")
glimpse(sessions)

# variable descriptions and types
cat("\nSessions Dataset Variables:\n")
cat("1. hashedEmail: Links to players dataset\n")
cat("2. start_time: Session start timestamp\n")
cat("3. end_time: Session end timestamp\n")
cat("4. original_start_time: Original start time in numeric format\n")
cat("5. original_end_time: Original end time in numeric format\n\n")

cat("This dataset can be used to calculate:\n")
cat(" - Number of sessions per player\n")
cat(" - Patterns of session durations\n")
cat(" - Playing frequency\n\n")

In [None]:
# Data Wrangling

In [None]:
# Build `players_clean`, the modelling table: keep the identifier, the
# response and the four predictors, and convert the categorical columns to
# factors with explicit level order.
# The banner ends with a newline so it does not run into the next message.
cat("=== DATA WRANGLING PLAN ===\n")

# create a clean version of the dataset for modeling
players_clean <- players |>
  select(hashedEmail, subscribe, experience, Age, gender, played_hours) |>
  mutate(
    # factor() matches levels against as.character(subscribe), hence the
    # string levels for a logical column.
    subscribe = factor(subscribe, levels = c("FALSE", "TRUE")),
    # Ordered from least to most experienced.
    experience = factor(
      experience,
      levels = c("Beginner", "Amateur", "Regular", "Veteran", "Pro"),
      ordered = TRUE
    )
  )

cat("Cleaned dataset preview:\n")
head(players_clean)
cat("\n")
# Label both numbers; previously the row and column counts were printed
# back to back with only "variables" after them.
cat(
  "Cleaned dataset has",
  nrow(players_clean), "rows and",
  ncol(players_clean), "variables\n\n"
)

In [None]:
# VISUALIZATION: SUBSCRIPTION BY EXPERIENCE
# 100%-stacked bars: within each experience level, the share of players who
# did / did not subscribe.
players |>
  mutate(
    # Put the levels in their logical order; as a plain character column
    # they would be drawn alphabetically.
    experience = factor(
      experience,
      levels = c("Beginner", "Amateur", "Regular", "Veteran", "Pro")
    )
  ) |>
  ggplot(aes(x = experience, fill = subscribe)) +
  geom_bar(position = "fill") +
  # position = "fill" yields proportions in [0, 1]; format them as percent
  # so the axis matches its "Percentage" label.
  scale_y_continuous(labels = label_percent()) +
  labs(
    title = "Newsletter Subscription Rate by Experience Level",
    x = "Experience Level",
    y = "Percentage",
    fill = "Subscribed"
  )

In [None]:
# VISUALIZATION: AGE DISTRIBUTION
# Boxplot of age per subscription status with the individual players
# overlaid as jittered points. Rows with a missing Age are dropped first.
players |>
  filter(!is.na(Age)) |>
  ggplot(aes(x = subscribe, y = Age, fill = subscribe)) +
  # The jitter layer already draws every observation, so hide the boxplot's
  # own outlier markers to avoid plotting those players twice.
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  # Jitter horizontally only; the default also jitters vertically, which
  # would move points away from their true age.
  geom_jitter(width = 0.2, height = 0, alpha = 0.3) +
  labs(
    title = "Age Distribution by Newsletter Subscription Status",
    x = "Subscribed to Newsletter",
    y = "Age (years)",
    fill = "Subscribed"
  )

In [None]:
# VISUALIZATION: PLAYED HOURS DISTRIBUTION
# Overlaid (not stacked) 30-bin histograms of total hours played, one
# semi-transparent layer per subscription status.
ggplot(data = players, mapping = aes(x = played_hours, fill = subscribe)) +
  geom_histogram(position = "identity", alpha = 0.6, bins = 30) +
  labs(
    fill = "Subscribed",
    y = "Count",
    x = "Hours Played",
    title = "Distribution of Total Hours Played"
  )

# Summary Statistics by Subscription
# Mean / median hours played and group size for each subscription status,
# rendered as a captioned table. NAs are dropped explicitly, matching the
# other aggregates in this notebook, so one missing value cannot turn a
# whole group's mean or median into NA.
players |>
  group_by(subscribe) |>
  summarise(
    mean_hours = round(mean(played_hours, na.rm = TRUE), 2),
    median_hours = round(median(played_hours, na.rm = TRUE), 2),
    n = n()
  ) |>
  kable(caption = "Played Hours by Subscription Status")

In [None]:
# VISUALIZATION: SUBSCRIPTION RATE BY GENDER
# gender summary
# One row per (gender, subscribe) pair with its count, the gender's total and
# the pair's share of that total in percent.
gender_summary <- players |>
  count(gender, subscribe, name = "count") |>
  group_by(gender) |>
  mutate(
    total = sum(count),
    percentage = count / total * 100
  ) |>
  # Drop the grouping so later operations on gender_summary are not silently
  # performed per gender.
  ungroup()

# Horizontal 100%-stacked bars, genders ordered by number of players.
ggplot(
  gender_summary,
  aes(x = reorder(gender, total), y = count, fill = subscribe)
) +
  geom_col(position = "fill") +
  # position = "fill" yields proportions in [0, 1]; format them as percent
  # so the axis matches its "Percentage" label.
  scale_y_continuous(labels = label_percent()) +
  coord_flip() +
  labs(
    title = "Subscription Rate by Gender",
    x = "Gender",
    y = "Percentage",
    fill = "Subscribed"
  )

In [None]:
# VISUALIZATION: AGE VS PLAYED HOURS BY SUBSCRIPTION
# Scatter of hours played against age, coloured by subscription status.
# Players without a recorded Age are excluded before plotting.
players |>
  filter(!is.na(Age)) |>
  ggplot(mapping = aes(x = Age, y = played_hours, color = subscribe)) +
  geom_point(size = 3, alpha = 0.6) +
  labs(
    color = "Subscribed",
    y = "Hours Played",
    x = "Age (in years)",
    title = "Relationship between Age and Played Hours"
  )