Skip to content

Commit

Permalink
Complete rewrite of resample_data to fix function.
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronrudkin committed Oct 21, 2017
1 parent 7fb8045 commit ad9a857
Showing 1 changed file with 59 additions and 35 deletions.
94 changes: 59 additions & 35 deletions R/resample_data.R
Expand Up @@ -28,59 +28,83 @@
#'
#'
#' @export
resample_data <- function(data, N, ID_labels = NULL) {
# setup
resample_data = function(data, N, ID_labels=NULL) {
# User didn't provide an N or an ID label, it's clear they just want a regular bootstrap
if (missing(N) & is.null(ID_labels)) {
N <- nrow(data)
return(bootstrap_single_level(data, nrow(data), ID_label=NULL))
}
k <- length(N) ## number of levels

# checks
if (!is.null(ID_labels) & (k != length(ID_labels))) {
# Error handling
if (!is.null(ID_labels) & (length(N) != length(ID_labels))) {
stop(
"If you provide more than one ID_labels to resample data for multilevel data, please provide a vector for N of the same length representing the number to resample at each level."
)
}

# Case 1: Single Level
if (k == 1) {
data <- bootstrap_single_level(data = data, N = N)
} else {
# Case 2: Multi Level

data <- bootstrap_single_level(data, ID_label = ID_labels[1], N = N[1])
if (any(!ID_labels %in% names(data))) {
stop(
"One or more of the ID labels you provided are not columns in the data frame provided."
)
}

for (i in 2:k) {
group_by_set <- ID_labels[1:(i - 1)]
group_by_list <- as.list(data[, group_by_set, drop = FALSE])
new_data_list <- split(data, group_by_list)
new_data_list <-
lapply(new_data_list,
bootstrap_single_level,
ID_label = ID_labels[i],
N = N[i])
data <- do.call(rbind, new_data_list)
}
if(length(N) > 10) {
stop(
"Multi-level bootstrap with more than 10 levels is not advised."
)
}
rownames(data) <- NULL
return(data)
}

# Single level bootstrap with explicit bootstrapping on a particular cluster variable
if(length(N)==1)
{
return(bootstrap_single_level(data,
N[1],
ID_label=ID_labels[1]))
} else {
# Do the current bootstrap level
current_boot_values = unique(data[, ID_labels[1]])
sampled_boot_values = sample(1:length(current_boot_values), N[1], replace=TRUE)
app = 0

# Iterate over each thing chosen at the current level
results_all = lapply(sampled_boot_values, function(i) {
new_results = resample_data(
data[data[, ID_labels[1]] == i, ],
N=N[2:length(N)],
ID_labels=ID_labels[2:length(ID_labels)]
)
})
#res = rbindlist(results_all)
res = do.call(rbind, results_all)
rownames(res) = NULL
# Return to preceding level
return(res)
}
}

bootstrap_single_level <-
function(data, ID_label = NULL, N) {
bootstrap_single_level <- function(data, ID_label = NULL, N) {
if(dim(data)[1] == 0) {
stop("Data being bootstrapped has no rows.")
}
if (is.null(ID_label)) {
boot_indicies <- sample(1:nrow(data), N, replace = TRUE)
} else {
# Simple bootstrap
boot_indices <- sample(1:nrow(data), N, replace = TRUE)
} else if(!ID_label %in% colnames(data)) {
stop("ID label provided is not a column in the data being bootstrapped.")
} else {
# Bootstrapping unique values of ID_label (i.e. cluster selection when data
# are observations, not clusters
boot_ids <-
sample(unique(data[, ID_label]), size = N, replace = TRUE)
boot_indicies <- unlist(lapply(boot_ids, function(i) {
# Need to do the unlist-apply approach to ensure each row
# is appropriately duplicated. Faster than other ways to map
# cluster ids to row ids.
boot_indices <- unlist(lapply(boot_ids, function(i) {
which(data[, ID_label] == i)
}))
}
new_data <- data[boot_indicies, , drop = FALSE]
return(new_data)
}

# Grab the relevant rows
new_data <- data[boot_indices, , drop = FALSE]

return(new_data)
}

0 comments on commit ad9a857

Please sign in to comment.