In [None]:
library(tidyverse)
library(repr)
library(tidymodels)
options(repr.matrix.max.rows = 6)

In [None]:
# Load the adult dataset from the UCI machine-learning repository URL.
# Column names are supplied explicitly, so the first line of the file is read
# as data rather than as a header.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
names <- c(
  "age", "workclass", "fnlwgt", "education", "education_num",
  "marital_status", "occupation", "relationship", "race", "sex",
  "capital_gain", "capital_loss", "hours_per_week", "native_country",
  "income"
)
data_raw <- read_csv(url, col_names = names)
data_raw

In [None]:
# Wrangle data: keep only United-States rows and the columns used for
# modelling, then encode income as a factor (0 when income is "<=50K",
# 1 otherwise).
data <- data_raw %>%
  filter(native_country == "United-States") %>%
  select(age, education_num, hours_per_week, income) %>%
  mutate(
    income = ifelse(income == "<=50K", 0, 1),
    income = as_factor(income)
  )
data

In [None]:
# Split into training data (75%) and testing data (25%), stratified on the
# target variable. The seed is fixed so the split is reproducible.
set.seed(1)

income_split <- initial_split(data, prop = 0.75, strata = income)
income_training <- training(income_split)
income_testing <- testing(income_split)

## Preliminary Exploratory Data Analysis

In [None]:
# Histograms for the numerical predictor variables in the training set,
# filled and faceted by the income class.
# The number of bins follows Sturges' formula, ceiling(log2(n)) + 1, where n
# is the number of rows in the data being plotted.
bins_num <- ceiling(log2(nrow(income_training))) + 1
theme_set(theme_grey())

# `income` is mapped as a column of the plotted data (not as an external
# vector), so the fill and the facets always stay aligned with the rows.
ggplot(income_training, aes(x = age, fill = income)) +
  geom_histogram(bins = bins_num) +
  facet_grid(rows = vars(income))
ggplot(income_training, aes(x = education_num, fill = income)) +
  geom_histogram(bins = bins_num) +
  facet_grid(rows = vars(income))
ggplot(income_training, aes(x = hours_per_week, fill = income)) +
  geom_histogram(bins = bins_num) +
  facet_grid(rows = vars(income))

In [None]:
# Bar plot of the class counts of the target variable (income).
# Drawn from the training set only, like the other exploratory plots, so the
# test set is not inspected before evaluation.
ggplot(income_training, aes(x = income)) +
  geom_bar() +
  xlab("Income") +
  ylab("Frequency")

## Methods

In [None]:
# finalize methods

## Expected Outcomes

In [None]:
# Recipe: income is the target; age, education_num and hours_per_week are the
# predictors. All predictors are scaled and then centered.
income_recipe <- recipe(
  income ~ age + education_num + hours_per_week,
  data = income_training
) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())
# Upsampling of the target is currently disabled:
#     step_upsample(income, over_ratio = 1, skip = FALSE)

In [None]:
# K-nearest-neighbours classification spec with a fixed k of 20 and
# rectangular weighting (no cross-validation at this stage).
knn_spec <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = 20
) %>%
  set_engine("kknn") %>%
  set_mode("classification")

# Bundle the recipe and the model into a workflow and fit it on the
# training set.
income_fit <- fit(
  workflow() %>%
    add_recipe(income_recipe) %>%
    add_model(knn_spec),
  data = income_training
)

In [None]:
# Predict on the testing set and attach the predictions to the test rows
# (prediction columns first, then the original columns).
income_test_predictions <- bind_cols(
  predict(income_fit, income_testing),
  income_testing
)
# income_test_predictions

In [None]:
# Compare the predicted class against the true income label on the test set.
income_prediction_accuracy <- metrics(
  income_test_predictions,
  truth = income,
  estimate = .pred_class
)
income_prediction_accuracy

## Cross Validation

In [None]:
# Cross-validation setup: 5 folds of the training set, stratified by income.
income_vfold <- vfold_cv(income_training, v = 5, strata = income)

# KNN classification spec whose number of neighbours is left as a tuning
# parameter.
knn_tune <- nearest_neighbor(
  weight_func = "rectangular",
  neighbors = tune()
) %>%
  set_engine("kknn") %>%
  set_mode("classification")

In [None]:
# Perform cross-validation

# Values of k used in the cross-validation
k_vals <- tibble(neighbors = seq(from = 1, to = 20, by = 1))

# Tune the workflow over the folds and collect the resampled metrics.
# k_vals is passed as the grid so that exactly the k values listed above are
# evaluated; an integer grid would instead leave the choice of candidate
# values to tune_grid().
income_fit_resample <- workflow() %>%
  add_recipe(income_recipe) %>%
  add_model(knn_tune) %>%
  tune_grid(resamples = income_vfold, grid = k_vals) %>%
  collect_metrics()
income_fit_resample

In [None]:
# Keep only the accuracy rows of the cross-validation metrics.
accuracies <- filter(income_fit_resample, .metric == "accuracy")
# accuracies

# Mean accuracy estimate against the number of neighbours.
k_breaks <- seq(0, max(accuracies$neighbors), by = 1)
accuracy_versus_k <- ggplot(accuracies, aes(x = neighbors, y = mean)) +
  geom_point() +
  geom_line() +
  labs(x = "Neighbors", y = "Accuracy Estimate") +
  scale_x_continuous(breaks = k_breaks) + # one tick per integer k
  scale_y_continuous(limits = c(0, 1)) # show the full 0-1 accuracy range
accuracy_versus_k

In [None]:
# Find the highest mean accuracy and the k that achieves it: sort ascending
# by mean accuracy and keep the last row.
max_accuracy <- accuracies %>%
  select(neighbors, mean) %>%
  arrange(mean) %>%
  slice_tail(n = 1)
max_accuracy