In [4]:
# Notebook setup. The attach order is kept as-is because later packages can
# mask functions from earlier ones.
library(repr)
library(tidyverse)
library(tidymodels)
# Cap rendered tables at 6 rows so the printed data frames stay short.
options(repr.matrix.max.rows = 6)

In [5]:
# Download the student-performance archive to a temporary file, read the
# semicolon-delimited `student-mat.csv` member directly from the zip, and
# keep only the five columns used in this analysis.
temp <- tempfile()
download.file(
  url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip",
  destfile = temp
)
data <- select(
  read_delim(unz(temp, "student-mat.csv"), delim = ";"),
  age, absences, G1, G2, G3
)
# The archive is not needed once the table has been read.
unlink(temp)

Parsed with column specification:
cols(
  .default = col_character(),
  age = [32mcol_double()[39m,
  Medu = [32mcol_double()[39m,
  Fedu = [32mcol_double()[39m,
  traveltime = [32mcol_double()[39m,
  studytime = [32mcol_double()[39m,
  failures = [32mcol_double()[39m,
  famrel = [32mcol_double()[39m,
  freetime = [32mcol_double()[39m,
  goout = [32mcol_double()[39m,
  Dalc = [32mcol_double()[39m,
  Walc = [32mcol_double()[39m,
  health = [32mcol_double()[39m,
  absences = [32mcol_double()[39m,
  G1 = [32mcol_double()[39m,
  G2 = [32mcol_double()[39m,
  G3 = [32mcol_double()[39m
)

See spec(...) for full column specifications.



In [6]:
# Fix the RNG seed so the random split below is reproducible.
set.seed(123)

# Put 75% of the rows in the training set and the rest in the test set,
# stratifying the sampling on the response G3.
data_split <- data %>%
  initial_split(prop = 0.75, strata = G3)

training_data <- data_split %>% training()
testing_data <- data_split %>% testing()

# Display both partitions.
training_data
testing_data

age,absences,G1,G2,G3
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
18,6,5,6,6
17,4,5,5,6
15,10,7,8,10
⋮,⋮,⋮,⋮,⋮
21,3,10,8,7
18,0,11,12,10
19,5,8,9,9


age,absences,G1,G2,G3
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
16,6,13,14,14
16,4,8,10,10
16,2,15,15,16
⋮,⋮,⋮,⋮,⋮
18,14,6,5,5
19,0,7,5,0
20,11,9,9,9


In [None]:
# Scatter plot of G3 against age on the training set, overlaid with a
# straight-line least-squares fit (no confidence band).
age_scatter <- ggplot(data = training_data, mapping = aes(x = age, y = G3)) +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm", se = FALSE) +
  labs(
    x = "Student Age",
    y = "Third Period Grade (0-20 scale)"
  ) +
  theme(text = element_text(size = 18))

In [None]:
# Scatter plot of G3 against absences on the training set, overlaid with a
# straight-line least-squares fit (no confidence band).
absences_scatter <- ggplot(
  data = training_data,
  mapping = aes(x = absences, y = G3)
) +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm", se = FALSE) +
  labs(
    x = "Number of School Absences",
    y = "Third Period Grade (0-20 scale)"
  ) +
  theme(text = element_text(size = 18))

In [None]:
# Scatter plot of G3 against G1 on the training set, overlaid with a
# straight-line least-squares fit (no confidence band).
G1_scatter <- ggplot(data = training_data, mapping = aes(x = G1, y = G3)) +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm", se = FALSE) +
  labs(
    x = "First Period Grade (0-20 scale)",
    y = "Third Period Grade (0-20 scale)"
  ) +
  theme(text = element_text(size = 18))

In [None]:
# Scatter plot of G3 against G2 on the training set, overlaid with a
# straight-line least-squares fit (no confidence band).
G2_scatter <- ggplot(data = training_data, mapping = aes(x = G2, y = G3)) +
  geom_point() +
  geom_smooth(formula = y ~ x, method = "lm", se = FALSE) +
  labs(
    x = "Second Period Grade (0-20 scale)",
    y = "Third Period Grade (0-20 scale)"
  ) +
  theme(text = element_text(size = 18))

In [None]:
# Arrange the four predictor-vs-G3 scatter plots in a two-row grid under a
# shared title.
library(gridExtra)
# library() stops with an error if grid cannot be attached; require() would
# only return FALSE and let the failure surface later at textGrob()/gpar().
library(grid)

# Set the repr plot size (width 20, height 10) before drawing the grid.
options(repr.plot.width = 20, repr.plot.height = 10)

grid.arrange(
  age_scatter, absences_scatter, G1_scatter, G2_scatter,
  nrow = 2,
  top = textGrob(
    "Scatter Plots of Predictor Variables",
    # NOTE(review): font = 10 does not look like one of the usual face codes
    # (1 = plain, 2 = bold, 3 = italic, 4 = bold italic) - TODO confirm the
    # intended face; left unchanged to preserve the current rendering.
    gp = gpar(fontsize = 16, font = 10)
  )
)

In [None]:
# Model specification: linear regression using the "lm" engine in
# regression mode.
lm_spec <- set_mode(set_engine(linear_reg(), "lm"), "regression")

# Recipe: predict G3 from age, absences, G1 and G2 (no extra preprocessing
# steps are added).
lm_recipe <- recipe(G3 ~ age + absences + G1 + G2, data = training_data)

# Bundle the model and recipe into a workflow and fit it on the training set.
lm_fit <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(lm_recipe) %>%
  fit(data = training_data)

# Print the fitted workflow, including the estimated coefficients.
lm_fit

# Fitted equation as noted by the author. The numbers are hard-coded here,
# so re-run the cell and check them against the printed fit:
# G3 = 1.44581 - 0.20750*age + 0.04795*absences + 0.14683*G1 + 0.98564*G2

In [None]:
# Predict G3 for the held-out rows, attach the predictions to the test set,
# and summarise performance with metrics() against the true G3.
lm_test_results <- metrics(
  bind_cols(predict(lm_fit, testing_data), testing_data),
  truth = G3,
  estimate = .pred
)
lm_test_results

In [None]:
# Pull the fitted model out of the workflow and tidy it into a table.
coeffs <- lm_fit %>%
  pull_workflow_fit() %>%
  tidy()
coeffs