Complete rewrite of resample_data to fix function.

DeclareDesign · Oct 21, 2017 · ad9a857 · ad9a857
1 parent 7fb8045
commit ad9a857
Showing 1 changed file with 59 additions and 35 deletions.
diff --git a/R/resample_data.R b/R/resample_data.R
@@ -28,59 +28,83 @@
 #'
 #'
 #' @export
-resample_data <- function(data, N, ID_labels = NULL) {
-  # setup
+resample_data = function(data, N, ID_labels=NULL) {
+  # User didn't provide an N or an ID label, it's clear they just want a regular bootstrap
   if (missing(N) & is.null(ID_labels)) {
     N <- nrow(data)
+    return(bootstrap_single_level(data, nrow(data), ID_label=NULL))
   }
-  k <- length(N) ## number of levels
 
-  # checks
-  if (!is.null(ID_labels) & (k != length(ID_labels))) {
+  # Error handling
+  if (!is.null(ID_labels) & (length(N) != length(ID_labels))) {
     stop(
       "If you provide more than one ID_labels to resample data for multilevel data, please provide a vector for N of the same length representing the number to resample at each level."
     )
   }
 
-  # Case 1: Single Level
-  if (k == 1) {
-    data <- bootstrap_single_level(data = data, N = N)
-  } else {
-    # Case 2: Multi Level
-
-    data <- bootstrap_single_level(data, ID_label = ID_labels[1], N = N[1])
+  if (any(!ID_labels %in% names(data))) {
+    stop(
+      "One or more of the ID labels you provided are not columns in the data frame provided."
+    )
+  }
 
-    for (i in 2:k) {
-      group_by_set <- ID_labels[1:(i - 1)]
-      group_by_list <- as.list(data[, group_by_set, drop = FALSE])
-      new_data_list <- split(data, group_by_list)
-      new_data_list <-
-        lapply(new_data_list,
-               bootstrap_single_level,
-               ID_label = ID_labels[i],
-               N = N[i])
-      data <- do.call(rbind, new_data_list)
-    }
+  if(length(N) > 10) {
+    stop(
+      "Multi-level bootstrap with more than 10 levels is not advised."
+    )
   }
-  rownames(data) <- NULL
-  return(data)
-}
 
+  # Single level bootstrap with explicit bootstrapping on a particular cluster variable
+  if(length(N)==1)
+  {
+    return(bootstrap_single_level(data,
+                                  N[1],
+                                  ID_label=ID_labels[1]))
+  } else {
+    # Do the current bootstrap level
+    current_boot_values = unique(data[, ID_labels[1]])
+    sampled_boot_values = sample(1:length(current_boot_values), N[1], replace=TRUE)
+    app = 0
 
+    # Iterate over each thing chosen at the current level
+    results_all = lapply(sampled_boot_values, function(i) {
+      new_results = resample_data(
+        data[data[, ID_labels[1]] == i, ],
+        N=N[2:length(N)],
+        ID_labels=ID_labels[2:length(ID_labels)]
+      )
+    })
+    #res = rbindlist(results_all)
+    res = do.call(rbind, results_all)
+    rownames(res) = NULL
+    # Return to preceding level
+    return(res)
+  }
+}
 
-bootstrap_single_level <-
-  function(data, ID_label = NULL, N) {
+bootstrap_single_level <- function(data, ID_label = NULL, N) {
+    if(dim(data)[1] == 0) {
+      stop("Data being bootstrapped has no rows.")
+    }
     if (is.null(ID_label)) {
-      boot_indicies <- sample(1:nrow(data), N, replace = TRUE)
-    } else {
+      # Simple bootstrap
+      boot_indices <- sample(1:nrow(data), N, replace = TRUE)
+    } else if(!ID_label %in% colnames(data)) {
+      stop("ID label provided is not a column in the data being bootstrapped.")
+    }  else {
+      # Bootstrapping unique values of ID_label (i.e. cluster selection when data
+      # are observations, not clusters
       boot_ids <-
         sample(unique(data[, ID_label]), size = N, replace = TRUE)
-      boot_indicies <- unlist(lapply(boot_ids, function(i) {
+      # Need to do the unlist-apply approach to ensure each row
+      # is appropriately duplicated. Faster than other ways to map
+      # cluster ids to row ids.
+      boot_indices <- unlist(lapply(boot_ids, function(i) {
         which(data[, ID_label] == i)
       }))
     }
-    new_data <- data[boot_indicies, , drop = FALSE]
-    return(new_data)
-  }
-
+    # Grab the relevant rows
+    new_data <- data[boot_indices, , drop = FALSE]
 
+    return(new_data)
+}