Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
134 lines (94 sloc) 3.95 KB
library(tidyverse)
library(caret)
# Package for easy timing in R
library(tictoc)
# Demo of timer function --------------------------------------------------
# Run the next 5 lines at once
tic()
Sys.sleep(3)
timer_info <- toc()
runtime <- timer_info$toc - timer_info$tic
runtime
# Get data ----------------------------------------------------------------
# Accelerometer Biometric Competition Kaggle competition data
# https://www.kaggle.com/c/accelerometer-biometric-competition/data
train <- read_csv("train.csv")
# YOOGE!
dim(train)
# knn modeling ------------------------------------------------------------
model_formula <- as.formula(Device ~ X + Y + Z)
# Values to use:
#
# Note: How we space the different n_values and k_values using multiplicative
# instead of additive differences. We do this because the difference in runtime
# between n=1000 and n=1001 is probabily insignificant, but the difference
# between n=1000 and n=10000 is more meaningful. i.e. we focus on multiplicative
# differences
n_values <- seq(from=log10(1000), to=log10(nrow(train)), length=100) %>%
10^. %>%
round()
k_values <- seq(from=log10(2), to=log10(10^5), length=10) %>%
10^. %>%
round()
# See values
n_values
k_values
# Save runtimes here
runtime_dataframe <- expand.grid(n_values, k_values) %>%
as_tibble() %>%
rename(n=Var1, k=Var2) %>%
mutate(runtime = n*k)
runtime_dataframe
# Time knn here -----------------------------------------------------------
# This took 3000s = 50min total to run; you can skip this for-loop and load() the
# saved runtime_dataframe in the next "Plot your results" section
total_time <- 0
for(i in 1:nrow(runtime_dataframe)){
n <- runtime_dataframe$n[i]
k <- runtime_dataframe$k[i]
# Create subset of train data of size n. Note we use slice() instead of
# sample_n() because reading off first n rows is much faster than sampling n
# rows at random
train_subset <- train %>%
slice(1:n)
# We only time the knn3() function call. Note that data = train_subset
tic()
model_knn <- caret::knn3(model_formula, data = train_subset, k = k)
timer_info <- toc()
runtime_dataframe$runtime[i] <- timer_info$toc - timer_info$tic
total_time <- total_time + runtime_dataframe$runtime[i]
}
# Save the results so we don't need to run them each time
save(runtime_dataframe, file="runtime_dataframe.RData")
# Plot your results ---------------------------------------------------------
# Save time by loading previously run results here:
load("runtime_dataframe.RData")
# Think of creative ways to improve this barebones plot. Note: you don't have to
# necessarily use geom_point
runtime_plot <- ggplot(runtime_dataframe, aes(x=n, col=as.factor(k), y=runtime)) +
geom_line() +
geom_point(size=0.5) +
# scale_x_log10() +
labs(x="Training set size n", y="Runtime (in seconds)", col="k",
title="Runtime of caret::knn3() as a function of n and k")
ggsave(runtime_plot, filename="solutions_mcpherson.png", width=16/2.5, height = 9/2.5)
runtime_plot
# Inspect in an interactive plot
library(plotly)
ggplotly(runtime_plot)
# Runtime complexity ------------------------------------------------------
# Can you write out the rough Big-O runtime algorithmic complexity as a function
# of:
# -n: number of points in training set
# -k: number of neighbors to consider
# -d: number of predictors used? In this case d is fixed at 3
# Looking at the above plot, it seems that the
# -runtime is sort of linear with respect to n, but also sort of O(n^2)?!? This
# isn't what I expected! I expected a strong O(n^2). Perhaps n isn't big enough
# yet to really see a strong polynomial trend?
# -depending on k, there seem to be different slopes to the lines, but the order
# of the slope magnitude doesn't seem to be monotonic in k
# -We kept d=3 fixed. As d increases, I presume the time it takes to compute
# distances would increase as well
# What is computational complexity?
# https://stats.stackexchange.com/questions/219655/k-nn-computational-complexity