In [None]:
library(tidyverse)
library(tidymodels)
library(repr)
library(dplyr)


In [None]:
# Location of the raw players data in the project repository, and the local
# file name the download is saved under.
playersurl <- "https://raw.githubusercontent.com/Alexis-Jang/Group7-DSCI100project/refs/heads/main/DSCI%20100%20Players.csv"
players.csv <- "players.csv"

# Fetch the file in binary mode ("wb") and save it under the local file name.
download.file(url = playersurl, destfile = players.csv, mode = "wb")

# Load the downloaded file and preview its first rows.
players1 <- read.csv(file = players.csv)
head(players1)

The data set "players.csv" has 196 observations (rows) and 7 variables. Below are the variables and their relevant summary statistics:
  
`experience` - character (chr) data type
- Experience level (Amateur, Regular, Pro, Veteran)

`subscribe` - logical (lgl) data type
- Whether or not the player subscribed
  - FALSE: 52
  - TRUE: 144

`hashedEmail` - as a character (chr) data type
- The player's email (hidden)

`played_hours` - as a double (dbl) data type
- The hours spent playing on the server 
    - minimum: 0.000 
    - maximum: 223.100
    - median: 0.100
    - mean: 5.846
    - standard deviation: 28.35734 

`name` - as a character (chr) data type
- The player's name

`gender` - as a character (chr) data type
- The player's gender

`Age` - as a double (dbl) data type
- The player's age
    - minimum: 8.00
    - maximum: 50.00
    - median: 19.00
    - mean: 20.52
    - standard deviation: 6.174667
    - NAs: 2 -> This means that 2 players did not disclose their age


For this project, we will only look at `Age` to see if it can help us predict the `played_hours` of the player. By targeting those with the highest play time, we can find which age of player contributes the most data to the data set.

In [None]:
set.seed(123)

# Wrangling step: the analysis only needs the predictor (Age) and the
# response (played_hours), so every other column is dropped here.
players <- select(players1, Age, played_hours)
head(players)

In [None]:
set.seed(123)

# Only a few rows contain NAs, so those rows are removed before the data is
# split for cross validation.
players_omitted <- players |>
  na.omit()

# Split into 80% training / 20% testing data, stratified on played_hours;
# the training portion is what the model will be trained on.
players_split <- initial_split(
  players_omitted,
  prop = 0.80,
  strata = played_hours
)
players_training <- training(players_split)
players_testing <- testing(players_split)


In [None]:
set.seed(123)
# Visualizing the training data: the age range of 17-20 has a very high play
# time, which we can explore further in our project.
# Bars for observations with the same Age are stacked, so each bar's height
# is the total hours played at that age.
options(repr.plot.width = 8, repr.plot.height = 7)
# The text element must be passed by name (`text = ...`); an unnamed
# element_text() inside theme() does not set the font size.
age_plot <- players_training |>
  ggplot(aes(x = Age, y = played_hours)) +
  geom_col() +
  labs(
    x = "Age (in Years)",
    y = "Hours Played",
    title = "Graph 1.0: Age vs. Hours Played Bar Graph"
  ) +
  theme(text = element_text(size = 15))
age_plot

In [None]:
set.seed(123)
# The tall bars might simply reflect that more players fall in that age
# range. To account for this, compute the mean played hours per age by
# grouping on Age and summarizing.
players_grouped <- players_training |>
  group_by(Age) |>
  summarize(mean_played_time = mean(played_hours))
players_grouped

In [None]:
set.seed(123)
# Bar chart of the mean hours played at each age.
# Now we can see a smaller range of players between 19-20 that seem to have
# the highest play time.
# The title says "Bar Graph" to match the geometry actually drawn.
players_grouped_graph <- players_grouped |>
  ggplot(aes(x = Age, y = mean_played_time)) +
  geom_col() +
  labs(
    x = "Age (in Years)",
    y = "Mean Hours Played",
    title = "Graph 1.2: Age vs. Mean Hours Played Bar Graph"
  )
players_grouped_graph

In [None]:
set.seed(123)
# To choose a regression model we first check whether the relationship looks
# linear. The scatter plot below shows it is not, so KNN regression is used
# instead of linear regression.
players_grouped_scatter <- ggplot(
  players_grouped,
  aes(x = Age, y = mean_played_time)
) +
  geom_point() +
  labs(
    x = "Age (in Years)",
    y = "Mean Hours Played",
    title = "Graph 1.3: Age vs. Mean Hours Played Scatter Plot"
  )
players_grouped_scatter

In [None]:
set.seed(123)
# Recipe for the regression model: played_hours is the response and Age the
# only predictor. Age is standardized by centering and then scaling.
age_recipe <- recipe(played_hours ~ Age, data = players_training)
age_recipe <- step_center(age_recipe, Age)
age_recipe <- step_scale(age_recipe, Age)
age_recipe

In [None]:
set.seed(123)
# KNN regression specification. The number of neighbours is left as tune()
# rather than fixed, so the best K can be chosen later by cross validation.
age_model <- nearest_neighbor(weight_func = "rectangular", neighbors = tune())
age_model <- set_engine(age_model, "kknn")
age_model <- set_mode(age_model, "regression")
age_model

In [None]:
set.seed(123)
# 5-fold cross validation on the training data (stratified on played_hours),
# trying every K from 1 to 15 to find the best number of neighbours.
age_vfold <- vfold_cv(players_training, v = 5, strata = played_hours)
age_grid <- tibble(neighbors = as.numeric(1:15))

# Combine the recipe and the tunable model into one workflow.
age_w <- workflow() |>
  add_recipe(age_recipe) |>
  add_model(age_model)
age_w

# Despite its name, age_workflow holds the collected cross-validation
# metrics for each candidate K, not a workflow object.
age_workflow <- collect_metrics(
  tune_grid(age_w, resamples = age_vfold, grid = age_grid)
)
age_workflow

# Pick the K with the lowest root mean squared error.
# The two filters are kept separate on purpose: min(mean) must be taken over
# the RMSE rows only.
age_rmse_rows <- filter(age_workflow, .metric == "rmse")
age_trained <- filter(age_rmse_rows, mean == min(mean))
age_trained
# thus, we have found that 4 neighbours is the best K

In [None]:
# Next, we fix the model to use the best K found by cross validation
# (K = 4 in our run). K is read from `age_trained` rather than hard-coded so
# the final model always matches the tuning result; if several K tie for the
# lowest RMSE, the first one listed is used.
best_k <- age_trained |>
  slice(1) |>
  pull(neighbors)

age_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = best_k) |>
  set_engine("kknn") |>
  set_mode("regression")

# Refit the recipe + model on the full training set.
age_fit <- workflow() |>
  add_recipe(age_recipe) |>
  add_model(age_spec) |>
  fit(data = players_training)
age_fit

# Now we compute the root mean squared prediction error (RMSPE) by predicting
# on the held-out testing data and keeping only the RMSE metric row.
age_rmspe <- age_fit |>
  predict(players_testing) |>
  bind_cols(players_testing) |>
  metrics(truth = played_hours, estimate = .pred) |>
  filter(.metric == "rmse")
age_rmspe
# We can see that the root mean squared prediction error from our testing
# data is not significantly different from the cross-validation RMSE of our
# training model with the best K.

In [None]:
set.seed(123)
# Visualization of the analysis: attach the fitted model's predictions to the
# training data, then draw the observed points with the predicted values
# overlaid as a blue line.
age_pred <- bind_cols(
  predict(age_fit, players_training),
  players_training
)
head(age_pred)

# The line layer inherits the data and the x mapping from ggplot(); only its
# y aesthetic is switched to the predicted values.
age_plot <- ggplot(age_pred, aes(x = Age, y = played_hours)) +
  geom_point(alpha = 0.4) +
  geom_line(aes(y = .pred), colour = "blue") +
  labs(
    x = "Age (in Years)",
    y = "Hours Played",
    title = "Graph 2.0: Age of Player vs. Hours Played Visualization"
  ) +
  theme(text = element_text(size = 15))
age_plot
