Merge pull request #20 from DeclareDesign/variable_creation

Tests, error handling, and resample rewrite
DeclareDesign · Oct 25, 2017 · 38a2f02 · 38a2f02
2 parents 18dd32d + 937b1a4
commit 38a2f02
Show file tree

Hide file tree

Showing 5 changed files with 388 additions and 72 deletions.
diff --git a/R/fabricate.R b/R/fabricate.R
@@ -60,9 +60,23 @@ fabricate <-
     all_levels <- check_all_levels(options)
 
     if (!missing(data) && !"data.frame"  %in% class(data)) {
-      stop(
-        "Please provide a data object to the data argument, e.g. a data.frame, tibble, or sf object."
-      )
+      if(is.null(dim(data))) {
+          stop(
+            "User provided data must be a data frame. Provided data was low dimensional."
+          )
+      }
+      if(!"data" %in% names(sys.call())) {
+        stop(
+          "The data argument must be a data object. The argument call, ", deparse(substitute(data)), ", was not a data object (e.g. a data.frame, tibble, sf object, or convertible matrix)."
+        )
+      }
+      tryCatch({
+        data = as.data.frame(data)
+      }, error=function(e) {
+        stop(
+          "User provided data could not convert to a data frame."
+        )
+      })
     }
 
 
@@ -92,9 +106,26 @@ fabricate <-
       if(missing(N)) N <- NULL
       if(missing(ID_label)) ID_label <- NULL
 
-      ID_label <- substitute(ID_label)
-      if (!is.null(ID_label)) {
-        ID_label <- as.character(ID_label)
+      if(is.symbol(substitute(ID_label))) {
+        ID_label <- substitute(ID_label)
+        if (!is.null(ID_label)) {
+          ID_label <- as.character(ID_label)
+        }
+      } else if(!is.null(ID_label)) {
+        if(is.vector(ID_label) & length(ID_label) > 1) {
+          # Vector of length n>1, error
+          stop("Provided ID_label must be a character vector of length 1 or variable name.")
+        } else if(is.vector(ID_label) & is.numeric(ID_label[1])) {
+          # Numeric ID_label
+          warning("Provided ID_label is numeric and will be prefixed with the character \"X\"")
+          ID_label <- as.character(ID_label)
+        } else if(is.vector(ID_label) & is.character(ID_label[1])) {
+          # Valid ID_label
+          ID_label <- as.character(ID_label)
+        } else if(!is.null(dim(ID_label))) {
+          # Higher dimensional ID_label
+          stop("Provided ID_label must be a character vector or variable name, not a data frame or matrix.")
+        }
       }
 
       fabricate_data_single_level(
@@ -121,12 +152,36 @@ fabricate_data_single_level <- function(data = NULL,
 
   if (!is.null(N)) {
     if (length(N) != 1) {
-      stop(
-        "At the top level, ",
-        ID_label,
-        ", you must provide a single number to N."
-      )
+      if(is.null(ID_label)) {
+        stop(
+          "At the top level, you must provide a single number to N."
+          )
+      } else {
+        stop(
+          "At the top level, ",
+          ID_label,
+          ", you must provide a single number to N."
+        )
+      }
     }
+
+    if(is.numeric(N) & any(!N%%1 == 0)) {
+      stop(paste0(
+        "The provided N must be an integer number. Provided N was of type ",
+        typeof(N)
+      ))
+    }
+
+    if(!is.numeric(N)) {
+      tryCatch({
+        N = as.numeric(N)
+      }, error=function(e) {
+        stop(
+          "The provided value for N must be an integer number."
+        )
+      })
+    }
+
     data <- data.frame()
     existing_ID <- FALSE
   } else if(!is.null(data)){
@@ -181,7 +236,6 @@ check_all_levels <- function(options){
 
   if (length(options) == 0)  return(FALSE)
 
-
   is_function <- sapply(options, function(i) {
       is_lang(get_expr(i))
   })

diff --git a/R/resample_data.R b/R/resample_data.R
@@ -28,59 +28,83 @@
 #'
 #'
 #' @export
-resample_data <- function(data, N, ID_labels = NULL) {
-  # setup
+resample_data = function(data, N, ID_labels=NULL) {
+  # User didn't provide an N or an ID label, it's clear they just want a regular bootstrap
   if (missing(N) & is.null(ID_labels)) {
     N <- nrow(data)
+    return(bootstrap_single_level(data, nrow(data), ID_label=NULL))
   }
-  k <- length(N) ## number of levels
 
-  # checks
-  if (!is.null(ID_labels) & (k != length(ID_labels))) {
+  # Error handling
+  if (!is.null(ID_labels) & (length(N) != length(ID_labels))) {
     stop(
       "If you provide more than one ID_labels to resample data for multilevel data, please provide a vector for N of the same length representing the number to resample at each level."
     )
   }
 
-  # Case 1: Single Level
-  if (k == 1) {
-    data <- bootstrap_single_level(data = data, N = N)
-  } else {
-    # Case 2: Multi Level
-
-    data <- bootstrap_single_level(data, ID_label = ID_labels[1], N = N[1])
+  if (any(!ID_labels %in% names(data))) {
+    stop(
+      "One or more of the ID labels you provided are not columns in the data frame provided."
+    )
+  }
 
-    for (i in 2:k) {
-      group_by_set <- ID_labels[1:(i - 1)]
-      group_by_list <- as.list(data[, group_by_set, drop = FALSE])
-      new_data_list <- split(data, group_by_list)
-      new_data_list <-
-        lapply(new_data_list,
-               bootstrap_single_level,
-               ID_label = ID_labels[i],
-               N = N[i])
-      data <- do.call(rbind, new_data_list)
-    }
+  if(length(N) > 10) {
+    stop(
+      "Multi-level bootstrap with more than 10 levels is not advised."
+    )
   }
-  rownames(data) <- NULL
-  return(data)
-}
 
+  # Single level bootstrap with explicit bootstrapping on a particular cluster variable
+  if(length(N)==1)
+  {
+    return(bootstrap_single_level(data,
+                                  N[1],
+                                  ID_label=ID_labels[1]))
+  } else {
+    # Do the current bootstrap level
+    current_boot_values = unique(data[, ID_labels[1]])
+    sampled_boot_values = sample(1:length(current_boot_values), N[1], replace=TRUE)
+    app = 0
 
+    # Iterate over each thing chosen at the current level
+    results_all = lapply(sampled_boot_values, function(i) {
+      new_results = resample_data(
+        data[data[, ID_labels[1]] == i, ],
+        N=N[2:length(N)],
+        ID_labels=ID_labels[2:length(ID_labels)]
+      )
+    })
+    #res = rbindlist(results_all)
+    res = do.call(rbind, results_all)
+    rownames(res) = NULL
+    # Return to preceding level
+    return(res)
+  }
+}
 
-bootstrap_single_level <-
-  function(data, ID_label = NULL, N) {
+bootstrap_single_level <- function(data, ID_label = NULL, N) {
+    if(dim(data)[1] == 0) {
+      stop("Data being bootstrapped has no rows.")
+    }
     if (is.null(ID_label)) {
-      boot_indicies <- sample(1:nrow(data), N, replace = TRUE)
-    } else {
+      # Simple bootstrap
+      boot_indices <- sample(1:nrow(data), N, replace = TRUE)
+    } else if(!ID_label %in% colnames(data)) {
+      stop("ID label provided is not a column in the data being bootstrapped.")
+    }  else {
+      # Bootstrapping unique values of ID_label (i.e. cluster selection when data
+      # are observations, not clusters
       boot_ids <-
         sample(unique(data[, ID_label]), size = N, replace = TRUE)
-      boot_indicies <- unlist(lapply(boot_ids, function(i) {
+      # Need to do the unlist-apply approach to ensure each row
+      # is appropriately duplicated. Faster than other ways to map
+      # cluster ids to row ids.
+      boot_indices <- unlist(lapply(boot_ids, function(i) {
         which(data[, ID_label] == i)
       }))
     }
-    new_data <- data[boot_indicies, , drop = FALSE]
-    return(new_data)
-  }
-
+    # Grab the relevant rows
+    new_data <- data[boot_indices, , drop = FALSE]
 
+    return(new_data)
+}
diff --git a/R/variable_creation_functions.R b/R/variable_creation_functions.R
@@ -62,12 +62,18 @@ draw_discrete <-
                           "ordered",
                           "count")) {
       stop(
-        "Please choose either 'binary' (or 'bernoulli'), 'binomial', 'categorical', 'ordered' or 'count' as a data type."
+        "Please choose either 'binary' (or 'bernoulli'), 'binomial', 'categorical', 'ordered', or 'count' as a data type."
+      )
+    }
+
+    if (N %% length(x) & type != "categorical") {
+      stop(
+        "N is not an even multiple of the length of the vector x."
       )
     }
 
     if (mode(x) != "numeric") {
-      stop("Please provide a numeric vector to x.")
+      stop("\"x\" must be a number or vector of numbers.")
     }
 
     if (link == "logit") {
@@ -85,16 +91,49 @@ draw_discrete <-
       }
       if (link == "identity")
         if (!all(0 <= x & x <= 1)) {
-          warning("The identity link requires values between 0 and 1, inclusive.")
+          stop("The identity link requires probability values between 0 and 1, inclusive.")
         }
 
       out <- rbinom(N, k, prob)
 
     } else if (type == "binomial")   {
       if (link == "identity")
         if (!all(0 <= x & x <= 1)) {
-          warning("The identity link requires values between 0 and 1, inclusive.")
+          stop("The identity link requires values between 0 and 1, inclusive.")
+        }
+
+      if(is.vector(k) & length(k)>1) {
+        if(N %% length(k)) {
+          stop(
+            "\"N\" is not an even multiple of the length of the number of trials, \"k\"."
+          )
+        }
+        if(!all(is.numeric(k) & (is.integer(k) | !k%%1))) {
+          stop(
+            "All numbers of trials should be integer numbers."
+          )
         }
+      }
+      if(!is.null(dim(k))) {
+        stop(
+          "Number of trials must be an integer or vector, not higher-dimensional."
+        )
+      }
+      if(is.null(k) | is.na(k)) {
+        stop(
+          "Number of trials must be specified, not null or NA."
+        )
+      }
+      if(is.numeric(k) & !is.integer(k) & k%%1) {
+        stop(
+          "Number of trials must be an integer."
+        )
+      }
+      if(k <= 0) {
+        stop(
+          "Number of trials must be a positive integer."
+        )
+      }
 
       out <- rbinom(N, k, prob)
 
@@ -103,21 +142,60 @@ draw_discrete <-
         x <- x + rnorm(N)
       }
 
-      # out <- cut(x, breaks, labels = break_labels) - 1
+      if (is.null(breaks) | any(is.na(breaks))) {
+        stop("You must specify numeric breaks for ordered data.")
+      }
+      if (any(!is.numeric(breaks))) {
+        stop("All breaks specified for ordered data must be numeric.")
+      }
+      if (is.matrix(breaks) | is.data.frame(breaks)) {
+        stop("Numeric breaks must be a vector.")
+      }
+      if (length(breaks) < 3) {
+        stop("Numeric breaks for ordered data must be of at least length 3.")
+      }
+      if (!all(sort(breaks) == breaks)) {
+        stop("Numeric breaks must be in ascending order.")
+      }
+      if(any(breaks[1] > x) | any(breaks[length(breaks)] < x)) {
+        stop("Numeric break endpoints should be outside min/max of x data range.")
+      }
+      if(is.vector(break_labels) & !is.logical(break_labels) & all(!is.na(break_labels)) & length(break_labels) != length(breaks)-1) {
+        stop("Break labels should be of length one less than breaks.")
+      }
+
       out <- cut(x, breaks, labels = break_labels)
 
     } else if (type == "count") {
       if (link != "identity") {
         stop("Count data does not accept link functions.")
       }
 
-      out <- rpois(N, lambda = x)
+      if (any(x < 0)) {
+        stop(
+          "All provided count values must be non-negative."
+        )
+      }
 
-      ## Categorical
+      out <- rpois(N, lambda = x)
 
     } else if (type == "categorical") {
-      if (is.null(dim(x)))
-        stop("For a categorical distribution a matrix of probabilities should be provided")
+      if (link != "identity") {
+        stop("Categorical data does not accept link functions.")
+      }
+
+      if (is.null(dim(x))) {
+        if (is.vector(x) & all(is.numeric(x)) & length(x)>1) {
+          x <- matrix(rep(x, N), byrow=TRUE, ncol=length(x), nrow=N)
+          warning(
+            "For a categorical (multinomial) distribution, a matrix of probabilities should be provided. Data generated by interpreting vector of category probabilities, identical for each observation."
+          )
+        } else {
+          stop(
+            "For a categorical (multinomial) distribution, a matrix of probabilities should be provided"
+          )
+        }
+      }
       if (!all(apply(x, 1, min) > 0)) {
         stop(
           "For a categorical (multinomial) distribution, the elements of x should be positive and sum to a positive number."