In [5]:
# import required libraries
library(tidyverse)
library(repr)
library(tidymodels)

In [6]:
# Read the raw DASS survey export into a tibble, then list its column names
# to see which variables are available.
dass_data <- read_csv(file = "dass_data.csv")
names(dass_data)

Parsed with column specification:
cols(
  .default = col_double(),
  country = [31mcol_character()[39m,
  major = [31mcol_character()[39m
)

See spec(...) for full column specifications.



In [7]:
# Narrow the survey down to the columns used for modelling:
#   * the question answers (column names ending in "A", i.e. Q#A)
#   * the TIPI personality items (column names starting with "TIPI")
# Everything else is set aside for now: the demographics, the VCL# columns
# (interesting, but not used yet), and the response-time, duration and
# question-position columns.
dass_selected <- select(dass_data, ends_with("A") | starts_with("TIPI"))
colnames(dass_selected)

In [8]:
# Split the selected columns into a training set (75%) and a test set (25%),
# stratifying on the response Q13A.

# initial_split() assigns rows at random; seed the RNG first so the split --
# and every metric computed from it further down -- is the same on each
# fresh "Restart & Run All".
set.seed(1234)

dass_split <- initial_split(dass_selected, prop = 0.75, strata = Q13A)
dass_train <- training(dass_split)
dass_test <- testing(dass_split)

# Peek at the first rows of each partition.
head(dass_train)
head(dass_test)

Q1A,Q2A,Q3A,Q4A,Q5A,Q6A,Q7A,Q8A,Q9A,Q10A,⋯,TIPI1,TIPI2,TIPI3,TIPI4,TIPI5,TIPI6,TIPI7,TIPI8,TIPI9,TIPI10
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
4,4,2,4,4,4,4,4,2,1,⋯,1,5,7,7,7,7,7,5,1,1
2,3,2,1,3,3,4,2,3,3,⋯,1,1,7,4,6,4,6,1,6,1
1,1,1,1,3,2,2,1,1,1,⋯,7,6,4,5,3,2,6,3,5,2
3,2,4,1,4,4,3,4,4,4,⋯,1,7,5,7,5,7,1,2,1,7
3,1,2,1,3,3,1,3,1,1,⋯,5,3,6,6,3,4,4,7,5,7
3,3,2,2,4,3,1,3,4,3,⋯,6,5,6,6,6,2,5,3,3,3


Q1A,Q2A,Q3A,Q4A,Q5A,Q6A,Q7A,Q8A,Q9A,Q10A,⋯,TIPI1,TIPI2,TIPI3,TIPI4,TIPI5,TIPI6,TIPI7,TIPI8,TIPI9,TIPI10
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
4,1,2,3,4,4,3,4,3,2,⋯,6,5,4,7,5,4,7,7,1,5
3,1,4,1,4,3,1,3,2,4,⋯,2,5,2,2,5,6,5,5,3,2
2,2,3,4,4,2,4,4,4,3,⋯,2,5,3,6,5,5,5,6,3,3
1,1,2,1,3,1,1,3,3,2,⋯,2,1,6,1,7,7,7,2,6,7
1,1,2,3,4,1,3,3,3,4,⋯,2,5,6,5,3,2,6,3,5,5
4,4,3,4,3,4,4,4,4,3,⋯,1,4,5,7,5,7,6,7,1,4


In [9]:
# First attempt: treat Q13A as a numeric response and predict it from all
# other selected columns with K-nearest-neighbours regression.

# Preprocessing: scale and centre every predictor.
# The recipe is declared against dass_train (not the full dass_selected
# table) so that the whole model-building path only ever references the
# training rows, matching the data handed to fit() below.
dass_recipe <- recipe(Q13A ~ ., data = dass_train) %>%
    step_scale(all_predictors()) %>%
    step_center(all_predictors())

# Model: unweighted ("rectangular") KNN with a fixed k = 3 as a starting point.
dass_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 3) %>%
    set_engine("kknn") %>%
    set_mode("regression")

# Bundle preprocessing + model and fit on the training set only.
dass_fit <- workflow() %>%
    add_recipe(dass_recipe) %>%
    add_model(dass_spec) %>%
    fit(data = dass_train)

# Predict on the held-out test set and compute regression metrics
# against the true Q13A values.
dass_summary <- dass_fit %>%
          predict(dass_test) %>%
          bind_cols(dass_test) %>%
          metrics(truth = Q13A, estimate = .pred)

dass_summary

.metric,.estimator,.estimate
<chr>,<chr>,<dbl>
rmse,standard,0.6964609
rsq,standard,0.5945151
mae,standard,0.4983571


In [10]:
# Coarse search for the best number of neighbours (k) using cross-validation.
set.seed(1234)  # fixes the random fold assignment made by vfold_cv() below

# Same unweighted kknn regression model, but with k left open for tuning.
dass_spec_tune <- nearest_neighbor(
        mode = "regression",
        neighbors = tune(),
        weight_func = "rectangular"
    ) %>%
    set_engine("kknn")

# Preprocessing recipe + tunable model.
dass_workflow <- workflow() %>%
    add_model(dass_spec_tune) %>%
    add_recipe(dass_recipe)

# 5-fold cross-validation on the training data, stratified on Q13A.
dass_vfold <- vfold_cv(dass_train, v = 5, strata = Q13A)

# Candidate k values: 1, 11, 21, ..., 91.
gridvals <- tibble(neighbors = seq(1, 100, by = 10))

# Evaluate every candidate k on every fold and summarise the metrics.
dass_results <- tune_grid(dass_workflow, resamples = dass_vfold, grid = gridvals) %>%
    collect_metrics()

# Keep the row with the lowest mean RMSE (ties broken by smaller std_err).
dass_min <- dass_results %>%
    filter(.metric == "rmse") %>%
    arrange(mean, std_err) %>%
    head(n = 1)
dass_min

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
41,rmse,standard,0.6413403,5,0.00468002,Model05


In [12]:
# The wide-range search pointed at k = 41, so run a finer search around it
# (every k from 30 to 50) to pin down the best value.
set.seed(1234)  # fixes the random fold assignment made by vfold_cv() below

# Unweighted kknn regression model with k left open for tuning.
dass_spec_tune <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) %>%
    set_engine("kknn") %>%
    set_mode("regression")

# Preprocessing recipe + tunable model.
dass_workflow <- workflow() %>%
    add_recipe(dass_recipe) %>%
    add_model(dass_spec_tune)

# NOTE(review): only 2 folds here versus 5 in the coarse search -- presumably
# to keep the 21-value grid affordable; confirm this is intentional.
dass_vfold <- vfold_cv(dass_train, v = 2, strata = Q13A)

gridvals <- tibble(neighbors = seq(from = 30, to = 50, by = 1)) # changed here

# Evaluate every candidate k on every fold and summarise the metrics.
# (Fixed: the function is collect_metrics(); the misspelled name made this
# cell fail on a fresh run.)
dass_results <- dass_workflow %>% tune_grid(resamples = dass_vfold, grid = gridvals) %>%
                    collect_metrics()

# Keep the row with the lowest mean RMSE (ties broken by smaller std_err).
dass_min <- dass_results %>%
   filter(.metric == "rmse") %>%
   arrange(mean, std_err) %>% 
   slice(1)
dass_min

neighbors,.metric,.estimator,mean,n,std_err,.config
<dbl>,<chr>,<chr>,<dbl>,<int>,<dbl>,<chr>
42,rmse,standard,0.6448181,2,7.457832e-05,Model13


In [None]:
# Even with the best k found by tuning, the RMSE is still not ideal.
# There are two distinct groups of predictors -- the Q# questions and the TIPI
# personality items -- so the next step is to split them up and see whether
# that improves the model.

# NOTE(review): this selection is still identical to dass_selected (both the
# Q#A and the TIPI columns are kept); the split described above has not been
# written yet -- TODO narrow it to the intended group.
dass_TP <- dass_data %>%
    select(ends_with("A") | starts_with("TIPI"))
# List the columns of dass_TP itself to check the selection.
names(dass_TP)