In [None]:
library(tidyverse)
library(repr)
library(tidymodels)
library(GGally)
library(ISLR)
library(dplyr)
# Show at most 6 rows when a table is displayed in notebook output.
options(repr.matrix.max.rows = 6)

In [None]:
Title: Project Final Report


Introduction:
provide some relevant background information on the topic so that someone unfamiliar with it will be prepared to understand the rest of your report
clearly state the question you tried to answer with your project
identify and fully describe the dataset that was used to answer the question

In our project, we use a dataset that contains information on players on Minecraft servers. The dataset contains the following variables: player experience, newsletter subscription status, hashed email, played hours, name, gender, and age. With this information, we try to answer the question: which "kinds" of players are most likely to contribute a large amount of data, so that recruiting efforts can target those players? To answer this, we compared played hours against other variables in the dataset, such as player experience and age.

Methods & Results:
describe the methods you used to perform your analysis from beginning to end that narrates the analysis code.
your report should include code which:
loads data 
wrangles and cleans the data to the format necessary for the planned analysis
performs a summary of the data set that is relevant for exploratory data analysis related to the planned analysis 
creates a visualization of the dataset that is relevant for exploratory data analysis related to the planned analysis
performs the data analysis
creates a visualization of the analysis 
note: all figures should have a figure number and a legend

To perform this analysis, we first load and print the dataset containing the variables we need. Then, we wrangle and clean the data by keeping only the relevant variables and excluding missing values from our calculations. After that, we summarize played hours overall and by group (experience level, age, and their combinations), calculating totals, means, medians, maximums, and minimums. Finally, we plot visualizations of these summaries and interpret our findings.

Discussion:
summarize what you found
discuss whether this is what you expected to find
discuss what impact such findings could have
discuss what future questions this could lead to


With our results, we found that...




References
You may include references if necessary, as long as they all have a consistent citation style.


In [None]:
# Load the player data from the project repository and display it.
players <- as_tibble(
  read_csv(
    "https://raw.githubusercontent.com/Chalkkk/dsci-final-group-project/refs/heads/main/data/players.csv"
  )
)
players

In [None]:
# Keep only the three columns used in the summaries and plots below.
players <- select(players, experience, played_hours, age)
players

In [None]:
# Summary of played hours over every player (missing values are ignored).
hours_played_sum <- summarize(
  players,
  max_played_hours = max(played_hours, na.rm = TRUE),
  min_played_hours = min(played_hours, na.rm = TRUE),
  total_played_hours = sum(played_hours, na.rm = TRUE),
  average_played_hours = mean(played_hours, na.rm = TRUE),
  median_played_hours = median(played_hours, na.rm = TRUE)
)
hours_played_sum

In [None]:
# Total, mean, and median played hours for each experience level.
hours_experience_sum <- group_by(players, experience) |>
  summarize(
    total_played_hours = sum(played_hours, na.rm = TRUE),
    average_played_hours = mean(played_hours, na.rm = TRUE),
    median_played_hours = median(played_hours, na.rm = TRUE)
  )
hours_experience_sum

In [None]:
# Largest and smallest played hours observed within each experience level.
hours_experience_range <- group_by(players, experience) |>
  summarize(
    max_played_hours = max(played_hours, na.rm = TRUE),
    min_played_hours = min(played_hours, na.rm = TRUE)
  )
hours_experience_range

In [None]:
# Total, mean, and median played hours for each age.
hours_age_sum <- group_by(players, age) |>
  summarize(
    total_played_hours = sum(played_hours, na.rm = TRUE),
    average_played_hours = mean(played_hours, na.rm = TRUE),
    median_played_hours = median(played_hours, na.rm = TRUE)
  )
hours_age_sum

In [None]:
# Largest and smallest played hours observed at each age.
hours_age_range <- group_by(players, age) |>
  summarize(
    max_played_hours = max(played_hours, na.rm = TRUE),
    min_played_hours = min(played_hours, na.rm = TRUE)
  )
hours_age_range

In [None]:
# Total, mean, and median played hours for each age / experience combination
# (grouped by age first, then experience).
hours_age_exp_sum <- players |>
  group_by(age, experience) |>
  summarize(
    total_played_hours = sum(played_hours, na.rm = TRUE),
    average_played_hours = mean(played_hours, na.rm = TRUE),
    median_played_hours = median(played_hours, na.rm = TRUE),
    # Return an ungrouped table: without this, the result stays grouped by
    # `age` and summarize() prints a regrouping message.
    .groups = "drop"
  )
hours_age_exp_sum

In [None]:
# Largest and smallest played hours for each age / experience combination
# (grouped by age first, then experience).
hours_age_exp_range <- players |>
  group_by(age, experience) |>
  summarize(
    max_played_hours = max(played_hours, na.rm = TRUE),
    min_played_hours = min(played_hours, na.rm = TRUE),
    # Return an ungrouped table: without this, the result stays grouped by
    # `age` and summarize() prints a regrouping message.
    .groups = "drop"
  )
hours_age_exp_range

In [None]:
# Total, mean, and median played hours for each experience / age combination
# (grouped by experience first, then age).
hours_exp_age_sum <- players |>
  group_by(experience, age) |>
  summarize(
    total_played_hours = sum(played_hours, na.rm = TRUE),
    average_played_hours = mean(played_hours, na.rm = TRUE),
    median_played_hours = median(played_hours, na.rm = TRUE),
    # Return an ungrouped table: without this, the result stays grouped by
    # `experience` and summarize() prints a regrouping message.
    .groups = "drop"
  )
hours_exp_age_sum

In [None]:
# Largest and smallest played hours for each experience / age combination
# (grouped by experience first, then age).
hours_exp_age_range <- players |>
  group_by(experience, age) |>
  summarize(
    max_played_hours = max(played_hours, na.rm = TRUE),
    min_played_hours = min(played_hours, na.rm = TRUE),
    # Return an ungrouped table: without this, the result stays grouped by
    # `experience` and summarize() prints a regrouping message.
    .groups = "drop"
  )
hours_exp_age_range

In [None]:
# Bar chart of mean played hours per experience level. The factor levels are
# listed explicitly so the x-axis follows this order instead of alphabetical.
hours_experience_bars <- hours_experience_sum |>
  mutate(
    experience = factor(
      experience,
      levels = c("Beginner", "Amateur", "Regular", "Pro", "Veteran")
    )
  ) |>
  ggplot(aes(x = experience, y = average_played_hours, fill = experience)) +
  geom_col() +
  labs(
    title = "Average Hours Played by Experience Level",
    x = "Experience Level",
    y = "Average Hours Played"
  )
hours_experience_bars

In [None]:
# Bar chart of mean played hours per age; age is converted to a factor so
# each distinct age gets its own bar.
hours_age_bars <- ggplot(
  hours_age_sum,
  aes(x = factor(age), y = average_played_hours)
) +
  geom_col(fill = "steelblue", colour = "black") +
  labs(
    title = "Average Hours Played by Age",
    x = "Age",
    y = "Average Hours Played"
  )
hours_age_bars

In [None]:
# Set the notebook plot size used for the scatter plot below.
options(repr.plot.width = 10, repr.plot.height = 12)

# Played hours against age, with points coloured by experience level.
# The colour mapping lives on the point layer only, so the black line is a
# single linear fit across all players (drawn without a confidence band).
hour_age_experience_plot <- ggplot(players, aes(x = age, y = played_hours)) +
  geom_point(aes(colour = experience)) +
  geom_smooth(method = "lm", se = FALSE, colour = "black") +
  labs(
    title = "Hours Played vs Age",
    x = "Age",
    y = "Played Hours",
    colour = "Experience Level"
  )

hour_age_experience_plot