In [None]:
library(tidyverse)
library(tidymodels)
library(repr)
library(dplyr)


In [None]:
# Fetch the players CSV from its GitHub URL into the working directory,
# then load it into a data frame and preview the first rows.
playersurl <- "https://raw.githubusercontent.com/Alexis-Jang/Group7-DSCI100project/refs/heads/main/DSCI%20100%20Players.csv"
players.csv <- "players.csv"

download.file(url = playersurl, destfile = players.csv, mode = "wb")

players1 <- read.csv(file = players.csv)
head(players1)

The players data set contains 196 observations and 7 variables. Below are the variables and their relevant summary statistics:
  
`experience` - character (chr) data type
- Experience level (Amateur, Regular, Pro, Veteran)

`subscribe` - logical (lgl) data type
- Whether or not the player subscribed
  - FALSE: 52
  - TRUE: 144

`hashedEmail` - as a character (chr) data type
- The player's email (hidden)

`played_hours` - as a double (dbl) data type
- The hours spent playing on the server 
    - minimum: 0.000 
    - maximum: 223.100
    - median: 0.100
    - mean: 5.846
    - standard deviation: 28.35734 

`name` - as a character (chr) data type
- The player's name

`gender` - as a character (chr) data type
- The player's gender

`Age` - as a double (dbl) data type
- The player's age
    - minimum: 8.00
    - maximum: 50.00
    - median: 19.00
    - mean: 20.52
    - standard deviation: 6.174667
    - NAs: 2 -> This means that 2 players did not disclose their age

In [None]:
# Narrow the raw data down to the two columns used in the analysis:
# the player's age and the hours they played.
set.seed(123)
players <- select(players1, Age, played_hours)
head(players)

In [None]:
# Split the data 60/40 into training and testing sets, stratified on
# played_hours. Rows with missing values are dropped from each set after
# the split has been made.
set.seed(123)
players_split <- initial_split(players, prop = 0.60, strata = played_hours)

players_training <- players_split |>
  training() |>
  na.omit()

players_testing <- players_split |>
  testing() |>
  na.omit()

In [None]:
# Graph 1.0: bar chart of hours played against age on the training data.
# With stat = "identity" the bar heights are the played_hours values
# themselves, stacked for players who share the same age.
set.seed(123)
options(repr.plot.width = 8, repr.plot.height = 7)
age_plot <- players_training |>
  ggplot(aes(x = Age, y = played_hours)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Age (in Years)",
    y = "Hours Played",
    title = "Graph 1.0: Age vs. Hours Played Bar Graph"
  ) +
  # The element must be passed by name; an unnamed element_text() is not
  # applied to the plot's text.
  theme(text = element_text(size = 15))
age_plot

In [None]:
# Average hours played for each distinct age in the training set.
set.seed(123)
players_grouped <- summarize(
  group_by(players_training, Age),
  mean_played_time = mean(played_hours)
)
players_grouped

In [None]:
# Bar chart of the mean hours played at each age (one bar per age).
set.seed(123)
players_grouped_graph <- players_grouped |>
  ggplot(aes(x = Age, y = mean_played_time)) +
  # geom_col() draws bar heights directly from the y values, i.e. it is the
  # shorthand for geom_bar(stat = "identity").
  geom_col() +
  labs(
    x = "Age (in Years)",
    y = "Mean Hours Played",
    # This is a bar chart, so the title says so (it previously read
    # "Scatter Plot", duplicating the title of the scatter plot cell).
    title = "Age vs. Mean Hours Played Bar Graph"
  )
players_grouped_graph

In [None]:
# Scatter plot of the mean hours played at each age (one point per age).
set.seed(123)
players_grouped_scatter <- ggplot(
  players_grouped,
  aes(x = Age, y = mean_played_time)
) +
  geom_point() +
  labs(
    x = "Age (in Years)",
    y = "Mean Hours Played",
    title = "Age vs. Mean Hours Played Scatter Plot"
  )
players_grouped_scatter

In [None]:
# Preprocessing recipe: predict played_hours from Age, with Age centered
# and then scaled.
set.seed(123)
age_recipe <- recipe(played_hours ~ Age, data = players_training)
age_recipe <- step_center(age_recipe, Age)
age_recipe <- step_scale(age_recipe, Age)
age_recipe

In [None]:
# K-nearest-neighbours regression specification (kknn engine) with
# unweighted ("rectangular") neighbours; the number of neighbours is left
# as a tuning parameter.
set.seed(123)
age_model <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("regression")
age_model

In [None]:
# 5-fold cross-validation on the training set, stratified on played_hours,
# trying k = 1..5 neighbours.
set.seed(123)
age_vfold <- vfold_cv(players_training, v = 5, strata = played_hours)
age_grid <- tibble(neighbors = c(1, 2, 3, 4, 5))

# Bundle the preprocessing recipe with the tunable model specification.
age_w <- add_model(add_recipe(workflow(), age_recipe), age_model)
age_w

# Despite its name, age_workflow holds the collected cross-validation
# metrics for every candidate k, not a workflow object.
age_workflow <- collect_metrics(
  tune_grid(age_w, resamples = age_vfold, grid = age_grid)
)
age_workflow

# Keep the RMSE rows first, then the row(s) with the smallest mean RMSE.
# The two filters stay separate so that min(mean) is taken over RMSE rows
# only.
rmse_rows <- filter(age_workflow, .metric == "rmse")
age_trained <- filter(rmse_rows, mean == min(mean))
age_trained

# Final model: K-nearest-neighbours regression refit on the full training
# set. The number of neighbours is read from the tuning result in
# age_trained rather than hard-coded, so it stays in sync if the data,
# seed, or grid change. If several k tie on RMSE, the first row is used.
best_k <- age_trained$neighbors[1]

age_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = best_k) |>
  set_engine("kknn") |>
  set_mode("regression")

age_fit <- workflow() |>
  add_recipe(age_recipe) |>
  add_model(age_spec) |>
  fit(data = players_training)
age_fit


In [None]:

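# NOTE: `players_testing` only has `Age` and `played_hours`, so `.pred` must be
# added before this plot can run, e.g.
#   bind_cols(predict(age_fit, players_testing), players_testing)
# ggplot(players_testing, aes(x = played_hours, y = .pred)) +
#   geom_point() +
#   geom_abline(slope = 1, intercept = 0, color = "red") +
#   theme_minimal() +
#   labs(x = "Actual Values", y = "Predicted Values")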
