diff --git a/DESCRIPTION b/DESCRIPTION index f31b42a..9d94a6b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: fabricatr Type: Package Title: Imagine your data before you collect it -Version: 1.0.0 +Version: 1.0.1 Authors@R: c(person("Graeme", "Blair", email = "graeme.blair@ucla.edu", role = c("aut", "cre")), person("Jasper", "Cooper", email = "jjc2247@columbia.edu", role = c("aut")), person("Alexander", "Coppock", email = "alex.coppock@yale.edu", role = c("aut")), @@ -22,6 +22,7 @@ Suggests: dplyr, knitr, rmarkdown, - data.table -FasterWith: data.table + data.table, + mvnfast +FasterWith: data.table, mvnfast VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 885dd0f..dbc814e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,12 +1,16 @@ # Generated by roxygen2: do not edit by hand export(ALL) +export(add_level) +export(cross_level) export(draw_binary) export(draw_binary_icc) export(draw_discrete) export(draw_normal_icc) export(fabricate) +export(join) export(level) +export(modify_level) export(resample_data) importFrom(rlang,eval_tidy) importFrom(rlang,get_expr) @@ -16,6 +20,7 @@ importFrom(rlang,lang_modify) importFrom(rlang,lang_name) importFrom(rlang,quo) importFrom(rlang,quo_name) +importFrom(rlang,quo_text) importFrom(rlang,quos) importFrom(stats,median) importFrom(stats,pnorm) diff --git a/R/cross_classify_helpers.R b/R/cross_classify_helpers.R new file mode 100644 index 0000000..bb3ed1d --- /dev/null +++ b/R/cross_classify_helpers.R @@ -0,0 +1,135 @@ +join_dfs = function(dfs, variables, N, sigma=NULL, rho=0) { + # Error handling + if(is.data.frame(dfs)) { + stop("You need at least two data frames.") + } + if(length(dfs) != length(variables)) { + stop("You must define which variables to join on.") + } + if(length(variables) < 2) { + stop("You must define at least two variables to join on.") + } + + # Create the data list -- the subset from the dfs of the variables we're + # joining on -- for each df in dfs, map it to a variable. Subset the df to + # that variable. Unlist and unname, creating a vector. Plonk that in a + # data_list + data_list = Map(function(x, y) { + unname(unlist(x[y])) + }, dfs, variables) + + # Do the joint draw + result = joint_draw_ecdf(data_list=data_list, + N=N, + sigma=sigma, + rho=rho) + + # result now contains a matrix of indices. Each column of this matrix is + # the indices for each df of dfs. Subset by row the df. This will return + # a list of new dfs. We need to cbind these dfs to make the merged data. + merged_data = do.call("cbind", + Map(function(df, indices) { df[indices, ] }, + dfs, + result)) + + # Cleanup: remove row names + rownames(merged_data) = NULL + # Re-write the column names to be the original column names from the original + # dfs. + colnames(merged_data) = unname(unlist(lapply(dfs, colnames))) + + merged_data +} + +joint_draw_ecdf = function (data_list, N, ndim=length(data_list), + sigma=NULL, rho=0, use_f = TRUE) { + + # We don't modify data_list, but this is useful to ensure the + # argument is evaluated anyway + force(ndim) + + # Error handling for N + if(is.null(N) || is.na(N) || !is.atomic(N) || length(N) > 1 || N <= 0) { + stop("N must be a single integer that is positive.") + } + + # Error handling for rho, if specified + if(is.null(sigma)) { + if(is.atomic(rho) & length(rho)==1) { + if(ndim>2 & rho<0) { + stop("The correlation matrix must be positive semi-definite. Specifically, ", + "if the number of variables being drawn from jointly is 3 or more, ", + "then the correlation coefficient rho must be non-negative.") + } + + if(rho == 0) { + # Uncorrelated draw would be way faster; just sample each column + return(lapply(seq_along(data_list), + function(vn) { + sample.int(length(data_list[[vn]]), N, replace=TRUE) + })) + } + sigma = matrix(rho, ncol=ndim, nrow=ndim) + diag(sigma) = 1 + } else { + stop("If specified, rho should be a single number") + } + } + + # Error handling for sigma + if(ncol(sigma) != ndim | nrow(sigma) != ndim | any(diag(sigma) != 1)) { + stop("The correlation matrix must be square, with the number of dimensions ", + "equal to the number of dimensions you are drawing from. In addition, ", + "the diagonal of the matrix must be equal to 1.") + } + + # Can we use the fast package or are we stuck with the slow one? + use_f = use_f && requireNamespace("mvnfast", quietly = TRUE) + + # Standard normal = all dimensions mean 0. + mu = rep(0, ndim) + + # Possible options for the joint normal draw + if(!use_f) { + # Below code is a reimplementation of the operative parts of rmvnorm from + # the mvtnorm package so that we don't induce a dependency + + # Right cholesky decomposition (i.e. LR = sigma s.t. L is lower triang, R + # is upper triang.) + right_chol = chol(sigma, pivot=TRUE) + # Order columns by the pivot attribute -- induces numerical stability? + right_chol = right_chol[, order(attr(right_chol, "pivot"))] + # Generate standard normal data and right-multiply by decomposed matrix + # with right_chol to make it correlated. + correlated_sn <- matrix(rnorm(N * ndim), + nrow = N) %*% right_chol + + } else { + # Using mvnfast + correlated_sn = mvnfast::rmvn(N, ncores = getOption("mc.cores", 2L), mu, sigma) + } + + # Z-scores to quantiles + quantiles = pnorm(correlated_sn) + colnames(quantiles) = names(data_list) + + # Quantiles to inverse eCDF. + result = lapply( + seq_along(data_list), + function(vn) { + # What would the indices of the quantiles be if our data was ordered -- + # if the answer is below 0, set it to 1. round will ensure the tie- + # breaking behaviour is random with respect to outcomes + ordered_indices = pmax(1, + round(quantiles[, vn] * length(data_list[[vn]])) + ) + + # Now get the order permutation vector and map that to the ordered indices + # to get the indices in the original space + indices = order(data_list[[vn]])[ordered_indices] + }) + + + # Set up response + result +} diff --git a/R/draw_binary_icc.R b/R/draw_binary_icc.R index 3033e4f..92a3389 100644 --- a/R/draw_binary_icc.R +++ b/R/draw_binary_icc.R @@ -6,12 +6,14 @@ #' Generation, and Estimation of Intracluster Correlation Coefficient (ICC) #' for Binary Data". #' -#' @param x A number or vector of numbers, one probability per cluster. +#' @param x A number or vector of numbers, one probability per cluster. If none +#' is provided, will default to 0.5. #' @param N (Optional) A number indicating the number of observations to be #' generated. Must be equal to length(clusters) if provided. #' @param clusters A vector of factors or items that can be coerced to #' clusters; the length will determine the length of the generated data. -#' @param rho A number indicating the desired RCC. +#' @param rho A number indicating the desired ICC, if none is provided will +#' default to 0. #' @return A vector of binary numbers corresponding to the observations from #' the supplied cluster IDs. #' @examples @@ -22,7 +24,7 @@ #' @importFrom stats rbinom #' #' @export -draw_binary_icc = function(x = 0.5, N = NULL, clusters, rho = 0.5) { +draw_binary_icc = function(x = 0.5, N = NULL, clusters, rho = 0) { # Let's not worry about how clusters are provided tryCatch({ clusters = as.numeric(as.factor(clusters)) diff --git a/R/draw_normal_icc.R b/R/draw_normal_icc.R index ee74ba4..4d30b0f 100644 --- a/R/draw_normal_icc.R +++ b/R/draw_normal_icc.R @@ -5,14 +5,16 @@ #' used in this function is specified at the following URL: #' \url{https://stats.stackexchange.com/questions/263451/create-synthetic-data-with-a-given-intraclass-correlation-coefficient-icc} #' -#' @param x A number or vector of numbers, one mean per cluster. +#' @param x A number or vector of numbers, one mean per cluster. If none is +#' provided, will default to 0. #' @param N (Optional) A number indicating the number of observations to be #' generated. Must be equal to length(clusters) if provided. #' @param clusters A vector of factors or items that can be coerced to #' clusters; the length will determine the length of the generated data. #' @param sd A number or vector of numbers, indicating the standard deviation of #' each cluster's error terms -#' @param rho A number indicating the desired RCC. +#' @param rho A number indicating the desired ICC. If none is provided, +#' will default to 0. #' @return A vector of numbers corresponding to the observations from #' the supplied cluster IDs. #' @examples @@ -26,8 +28,7 @@ draw_normal_icc = function(x = 0, N = NULL, clusters, sd = 1, - rho = 0.5) { - + rho = 0) { # Let's not worry about how clusters are provided tryCatch({ clusters = as.numeric(as.factor(clusters)) diff --git a/R/fabricate.R b/R/fabricate.R index 1cd97db..a10c6f2 100644 --- a/R/fabricate.R +++ b/R/fabricate.R @@ -1,15 +1,32 @@ - - - - #' Fabricate data #' -#' \code{fabricate} helps you simulate a dataset before you collect it. You can either start with your own data and add simulated variables to it (by passing \code{data} to \code{fabricate()}) or start from scratch by defining \code{N}. Create hierarchical data with multiple levels of data such as citizens within cities within states using \code{level()}. You can use any R function to create each variable. We provide several built-in options to easily draw from binary and count outcomes, \code{\link{draw_binary}} and \code{\link{draw_discrete}}. +#' \code{fabricate} helps you simulate a dataset before you collect it. You can +#' either start with your own data and add simulated variables to it (by passing +#' \code{data} to \code{fabricate()}) or start from scratch by defining +#' \code{N}. Create hierarchical data with multiple levels of data such as +#' citizens within cities within states using \code{add_level()} or modify +#' existing hierarchical data using \code{modify_level()}. You can use any R +#' function to create each variable. We provide several built-in options to +#' easily draw from binary and count outcomes, \code{\link{draw_binary}}, +#' \code{\link{draw_binary_icc}}, and \code{\link{draw_discrete}}. #' -#' @param data (optional) user-provided data that forms the basis of the fabrication, i.e. you can add variables to existing data. Provide either \code{N} or \code{data} (\code{N} is the number of rows of the data if \code{data} is provided). -#' @param N (optional) number of units to draw. If provided as \code{fabricate(N = 5)}, this determines the number of units in the single-level data. If provided in \code{level}, i.e. \code{fabricate(cities = level(N = 5))}, \code{N} determines the number of units in a specific level of a hierarchical dataset. +#' @param data (optional) user-provided data that forms the basis of the +#' fabrication, i.e. you can add variables to existing data. Provide either +#' \code{N} or \code{data} (\code{N} is the number of rows of the data if +#' \code{data} is provided). +#' @param N (optional) number of units to draw. If provided as +#' \code{fabricate(N = 5)}, this determines the number of units in the +#' single-level data. If provided in \code{add_level}, i.e. +#' \code{fabricate(cities = add_level(N = 5))}, \code{N} determines the number +#' of units in a specific level of a hierarchical dataset. #' @param ID_label (optional) variable name for ID variable, i.e. citizen_ID -#' @param ... Variable or level-generating arguments, such as \code{my_var = rnorm(N)}. For \code{fabricate}, you may also pass \code{level()} arguments, which define a level of a multi-level dataset. See examples. +#' @param ... Variable or level-generating arguments, such as +#' \code{my_var = rnorm(N)}. For \code{fabricate}, you may also pass +#' \code{add_level()} or \code{modify_level()} arguments, which define a level +#' of a multi-level dataset. See examples. +#' @param nest (Default TRUE) Boolean determining whether data in an \code{add_level()} call will be nested under the current working data frame or create a separate hierarchy of levels. See our vignette for cross-classified, non-nested data for details. +#' @param working_environment_ Internal argument, not intended for end-user use. +#' @param data_arguments Internal argument, not intended for end-user use. #' #' @return data.frame #' @@ -35,215 +52,666 @@ #' # Draw a two-level hierarchical dataset #' # containing cities within regions #' df <- fabricate( -#' regions = level(N = 5), -#' cities = level(N = 2, pollution = rnorm(N, mean = 5))) +#' regions = add_level(N = 5), +#' cities = add_level(N = 2, pollution = rnorm(N, mean = 5))) #' head(df) #' #' # Start with existing data and add variables to hierarchical data -#' # note: do not provide N when adding variables to an existing level +#' # at levels which are already present in the existing data. +#' # Note: do not provide N when adding variables to an existing level #' df <- fabricate( #' data = df, -#' regions = level(watershed = sample(c(0, 1), N, replace = TRUE)), -#' cities = level(runoff = rnorm(N)) +#' regions = modify_level(watershed = sample(c(0, 1), N, replace = TRUE)), +#' cities = modify_level(runoff = rnorm(N)) #' ) #' -#' @importFrom rlang quos quo_name eval_tidy lang_name lang_modify lang_args is_lang get_expr +#' @importFrom rlang quos quo_name eval_tidy lang_name lang_modify lang_args +#' is_lang get_expr #' #' @export -fabricate <- - function(data, - N, - ID_label, - ...) { - options <- quos(...) - - all_levels <- check_all_levels(options) - - if (!missing(data) && !"data.frame" %in% class(data)) { - if(is.null(dim(data))) { - stop( - "User provided data must be a data frame. Provided data was low dimensional." - ) - } - if(!"data" %in% names(sys.call())) { - stop( - "The data argument must be a data object. The argument call, ", deparse(substitute(data)), ", was not a data object (e.g. a data.frame, tibble, sf object, or convertible matrix)." - ) - } - tryCatch({ - data = as.data.frame(data) - }, error=function(e) { - stop( - "User provided data could not convert to a data frame." - ) - }) +fabricate <- function(data = NULL, ..., N = NULL, ID_label = NULL) +{ + # Store all data generation arguments in a quosure for future evaluation + # A quosure contains unevaluated formulae and function calls. + data_arguments = quos(...) + + # Fabricatr expects either a single-level function call + # or a series of level calls. You can't mix and match. + # This helper function will be TRUE if calls are all levels, FALSE + # if there are no calls or they are not levels. + all_levels = check_all_levels(data_arguments) + + # User must provide exactly one of: + # 1) One or more level calls (with or without importing their own data) + # 2) Import their own data and do not involve level calls + # 3) Provide an N without importing their own data + if(sum((!missing(data) && !is.null(data) & !all_levels), + (!is.null(N) & !missing(N)), + all_levels) != 1) { + stop( + "You must do exactly one of: \n", + "1) One or more level calls, with or without existing data \n", + "2) Import existing data and optionally, add new variables without adding a level \n", + "3) Provide an \"N\" without importing data and optionally, add new variables" + ) + } + + # Create a blank working environment. + working_environment = new.env(parent = emptyenv()) + + # User provided level calls + if(all_levels) { + + # Ensure the user provided a name for each level call. + if(is.null(names(data_arguments)) | any(names(data_arguments) == "")) { + stop("You must provide a name for each level you create.") + } + + # User provided data, if any, should be preloaded into working environment + if(!is.null(data) & !missing(data)) { + # Ensure data is sane. + data = handle_data(data) + working_environment$imported_data_ = data } + # Each of data_arguments is a level call + for(i in seq_along(data_arguments)) { + # Add two variables to the argument of the current level call + # one to pass the working environment so far + # one to pass the ID_label the user intends for the level - # check if all the options are level calls - if (all_levels) { - for (i in seq_along(options)) { - # Pop the data from the previous level in the current call - # Do this if there existing data to start with or - # and beginning with the second level - if (i > 1 | !missing(data)) { - options[[i]] <- lang_modify(options[[i]], data_internal_ = data) - } + data_arguments[[i]] = lang_modify(data_arguments[[i]], + working_environment_ = working_environment, + ID_label = names(data_arguments)[i]) - # Also do a sweet switcheroo with the level names - options[[i]] <- - lang_modify(options[[i]], ID_label_ = names(options)[i]) + # Execute the level build and pass it back to the current working + # environment. + working_environment = eval_tidy(data_arguments[[i]]) + } - # update the current data - data <- eval_tidy(options[[i]]) + # Return the results from the working environment + return(report_results(working_environment)) + } - } + # User did not pass data -- they passed N + if(is.null(data) | missing(data)) { + # Single level -- maybe the user provided an ID_label, maybe they didn't. + # Sanity check and/or construct an ID label for the new data. + ID_label = handle_id(ID_label, data) - return(data) + # Is the N argument passed here sane? Let's check + N = handle_n(N, add_level = TRUE) - } else { - if(missing(data)) data <- NULL - if(missing(N)) N <- NULL - if(missing(ID_label)) ID_label <- NULL - - if(is.symbol(substitute(ID_label))) { - ID_label <- substitute(ID_label) - if (!is.null(ID_label)) { - ID_label <- as.character(ID_label) - } - } else if(!is.null(ID_label)) { - if(is.vector(ID_label) & length(ID_label) > 1) { - # Vector of length n>1, error - stop("Provided ID_label must be a character vector of length 1 or variable name.") - } else if(is.vector(ID_label) & is.numeric(ID_label[1])) { - # Numeric ID_label - warning("Provided ID_label is numeric and will be prefixed with the character \"X\"") - ID_label <- as.character(ID_label) - } else if(is.vector(ID_label) & is.character(ID_label[1])) { - # Valid ID_label - ID_label <- as.character(ID_label) - } else if(!is.null(dim(ID_label))) { - # Higher dimensional ID_label - stop("Provided ID_label must be a character vector or variable name, not a data frame or matrix.") - } - } + # Creating a working environment that's empty (user passed no data) + data_arguments[["working_environment_"]] = working_environment - fabricate_data_single_level( - data = data, - N = N, - ID_label = ID_label, - existing_ID = !is.null(data) & is.null(ID_label), - options = options + # Run the level adder, report the results, and return + return( + report_results( + add_level(N = N, ID_label = ID_label, data_arguments = data_arguments) ) - } + ) } + # Confirm data can be a data frame + data = handle_data(data) + + # Single level -- maybe the user provided an ID_label, maybe they didn't. + # Sanity check and/or construct an ID label for the new data. + ID_label = handle_id(ID_label, data) -#' @importFrom rlang quos eval_tidy -fabricate_data_single_level <- function(data = NULL, - N = NULL, - ID_label = NULL, - ..., - existing_ID = FALSE, - options=quos(...)) { - if (is.null(data) == is.null(N)) { - stop("Please supply either a data.frame or N and not both.") + # User passed data, not N + # First, let's dynamically get N from the number of rows + N = nrow(data) + + # Now, let's load the data into our working environment + working_environment$imported_data_ = data + data_arguments[["working_environment_"]] = working_environment + + # Run the level adder, report the results, and return + return( + report_results( + add_level(N = N, + ID_label = ID_label, + data_arguments = data_arguments, + nest=FALSE) + ) + ) +} + +#' @importFrom rlang quos eval_tidy quo lang_modify +#' +#' @rdname fabricate +#' @export +add_level = function(N = NULL, ID_label = NULL, + working_environment_ = NULL, + ..., + data_arguments=quos(...), + nest = TRUE) { + + # Copy the working environment out of the data_arguments quosure and into + # the root. This happens when we have a single non-nested fabricate call + # and we don't want to double-quosure the working environmented. + if("working_environment_" %in% names(data_arguments)) { + working_environment_ = data_arguments[["working_environment_"]] + data_arguments[["working_environment_"]] = NULL } - if (!is.null(N)) { - if (length(N) != 1) { - stop( - "At the top level, ", - ifelse(!is.null(ID_label), - paste0(ID_label, ", "), - ""), - "you must provide a single number to N" - ) - } else if(is.numeric(N) & any(!N%%1 == 0)) { - stop( - "The provided N must be an integer number. Provided N was of type ", - typeof(N) - ) + # Pass-through mapper to nest_level. + # This needs to be done after we read the working environment and + # before we check N or do the shelving procedure. + if(nest && + ("data_frame_output_" %in% names(working_environment_) | + "imported_data_" %in% names(working_environment_))) { + return(nest_level(N=N, ID_label=ID_label, + working_environment_=working_environment_, + data_arguments=data_arguments)) + } + + # Check to make sure the N here is sane + N = handle_n(N, add_level=TRUE) + + # User is adding a new level, but already has a working data frame. + # Shelf the working data frame and move on + if("data_frame_output_" %in% names(working_environment_)) { + # Construct the shelved version + package_df = list(data_frame_output_ = working_environment_$data_frame_output_, + level_ids_ = working_environment_$level_ids_, + variable_names_ = names(working_environment_$data_frame_output_)) + + # Append it to the existing shelf + if("shelved_df" %in% names(working_environment_)) { + working_environment_$shelved_df = append(working_environment_$shelved_df, + list(package_df)) + } else { + # Create a shelf just for this + working_environment_$shelved_df = list(package_df) } - if(!is.numeric(N)) { - tryCatch({ - N = as.numeric(N) - }, error=function(e) { - stop( - "The provided value for N must be an integer number." - ) - }) + # Clear the current work-space. + working_environment_$data_frame_output_ = + working_environment_$level_ids_ = + working_environment_$variable_names_ = NULL + } + + # User is adding a new level, but we need to sneak in the imported data first. + # When this is done, trash the imported data, because the working data frame + # contains it. + if("imported_data_" %in% names(working_environment_)) { + num_obs_imported = nrow(working_environment_$imported_data_) + working_data_list = as.list(working_environment_$imported_data_) + working_environment_$variable_names_ = names(working_environment_$imported_data_) + working_environment_$imported_data_ = NULL + + # User didn't specify an N, so get it from the current data. + if(is.null(N)) { + N = num_obs_imported } + } else { + working_data_list = list() + } - data <- data.frame() - existing_ID <- FALSE - } else if(!is.null(data)){ - N <- nrow(data) + # Staple in an ID column onto the data list. + if(!is.null(ID_label)) { + # It's possible the working data frame already has the ID label, if so, + # don't do anything. + if(is.null(names(working_data_list)) || !ID_label %in% names(working_data_list)) { + # First, add the column to the working data frame + working_data_list[[ID_label]] = generate_id_pad(N) + + # Next, add the ID_label to the level ids tracker + # Why does this not need to return? Because environments are passed by + # reference + add_level_id(working_environment_, ID_label) + add_variable_name(working_environment_, ID_label) + } else { + # If the ID label was specified but already exists, we should still log + # it as a level ID + add_level_id(working_environment_, ID_label) + } + } else { + stop("Please specify a name for the level call you are creating.") } - if (!existing_ID) { - if(is.null(ID_label)) ID_label <- "ID" - data <- genID(data, ID_label, N) + # Loop through each of the variable generating arguments + for(i in names(data_arguments)) { + # Evaluate the formula in an environment consisting of: + # 1) The current working data list + # 2) A list that tells everyone what N means in this context. + working_data_list[[i]] = eval_tidy(data_arguments[[i]], + append(working_data_list, list(N=N))) + + # Write the variable name to the list of variable names + add_variable_name(working_environment_, i) + + # Nuke the current data argument -- if we have the same variable name + # created twice, this is OK, because it'll only nuke the current one. + data_arguments[[i]] = NULL } + # Before handing back data, ensure it's actually rectangular + working_data_list = check_rectangular(working_data_list, N) - fab(data, options) -} + # Coerce our working data list into a working data frame + working_environment_$data_frame_output_ = data.frame(working_data_list, + stringsAsFactors=FALSE, + row.names=NULL) -# make IDs that are nicely padded -genID <- function(data, ID, N=nrow(data)){ - fmt <- paste0("%0", nchar(N), "d") - data[1:N, ID] <- sprintf(fmt, 1:N) - data + # In general the reference should be unchanged, but for single-level calls + # there won't be a working environment to reference. + return(working_environment_) } -fab <- function(data, args) { - N <- nrow(data) - for (i in names(args)) { - if(i == "") next #Unnamed args are meaningless? - - # this was changed to move costly data.frame operations inside the loop - # because previously, if you did rnorm(N) in a level - # it literally did a vector of length N, rather than doing - # it for all of the values of the level above it, i.e. if this - # this is the second level and N = 2, it made a vector of length 2 - # even though there were 5 units in the higher level so there should - # have been a vector of 5*2 = 10 - # NB: this is still not super safe; if the expression returns a thing - # of length not exactly to N it's just going to repeat it as it does - # data.frame(data_list). Usually this shouldn't be a problem but we may - # want a warning or error - data_list <- as.list(data) - data_list[[i]] <- - eval_tidy(args[[i]], append(data_list, list(N=N))) - - #TODO Factor this out of loop? It's expensive - data <- data.frame(data_list, stringsAsFactors = FALSE, row.names=NULL) - args[[i]] <- NULL - } - - return(data) +#' +#' @importFrom rlang quos eval_tidy quo lang_modify +#' +nest_level = function(N = NULL, ID_label = NULL, + working_environment_ = NULL, + ..., + data_arguments=quos(...)) { + + # Check to make sure we have a data frame to nest on. + if(is.null(dim(working_environment_$data_frame_output_))) { + if("imported_data_" %in% names(working_environment_)) { + working_environment_$data_frame_output_ = data.frame(working_environment_$imported_data_) + working_environment_$variable_names_ = names(working_environment_$imported_data_) + working_environment_$imported_data_ = NULL + } else { + stop("You can't nest a level if there is no level to nest inside") + } + } + + # Check to make sure the N here is sane + # Pass the working environment because N might not be a singleton here + N = handle_n(N, add_level=FALSE, + working_environment = working_environment_, + parent_frame_levels=2) + + # We need to expand the size of the current working data frame by copying it + # Let's start by getting the size of the current working data frame + past_level_N = nrow(working_environment_$data_frame_output_) + # And now make an index set 1:past_level_N + indices = seq_len(past_level_N) + + # We're now going to modify the index set to take into account the expansion + # If N is a single number, then we repeat each index N times + # If N is of length past_level_N, then we repeat each index N_i times. + # For r's rep, the each / times arguments have odd behaviour that necessitates + # this approach + if(length(N)==1) rep_indices = rep(indices, each=N) + else rep_indices = rep(indices, times=N) + + # Update N to the new length. + inner_N = N # Length specified for this level + N = length(rep_indices) # Length of overall data frame + + # Expand the data frame by duplicating the indices and then coerce the data + # frame to a list -- we do this to basically make variables accessible in the + # namespace. + working_data_list = as.list(working_environment_$data_frame_output_[rep_indices, , drop=FALSE]) + + # Everything after here is non-unique to nest_level versus add_level -- need + # to think about how to refactor this out. + + # Staple in an ID column onto the data list. + if(!is.null(ID_label) && (is.null(names(working_data_list)) || + !ID_label %in% names(working_data_list))) { + # First, add the column to the working data frame + working_data_list[[ID_label]] = generate_id_pad(N) + + add_level_id(working_environment_, ID_label) + add_variable_name(working_environment_, ID_label) + } + + # Loop through each of the variable generating arguments + for(i in names(data_arguments)) { + # Evaluate the formula in an environment consisting of: + # 1) The current working data list + # 2) A list that tells everyone what N means in this context. + working_data_list[[i]] = eval_tidy(data_arguments[[i]], + append(working_data_list, list(N=N))) + + # User provided a fixed-length data variable whose length is the length of + # the inner-most level for a given outer level. See example: + # fabricate(countries = add_level(N=20), + # cities = nest_level(N=2, capital=c(TRUE, FALSE))) + # We need to expand this to each setting of the outer level. + # Only evaluate if inner_N is a single number + if(length(inner_N) == 1 && length(working_data_list[[i]]) == inner_N) { + working_data_list[[i]] = rep(working_data_list[[i]], (N/inner_N)) + } + + # Write the variable name to the list of variable names + add_variable_name(working_environment_, i) + + # Nuke the current data argument -- if we have the same variable name + # created twice, this is OK, because it'll only nuke the current one. + data_arguments[[i]] = NULL + } + + # Before handing back data, ensure it's actually rectangular + working_data_list = check_rectangular(working_data_list, N) + + # Overwrite the working data frame. + working_environment_[["data_frame_output_"]] = data.frame(working_data_list, + stringsAsFactors=FALSE, + row.names=NULL) + + return(working_environment_) } -check_all_levels <- function(options){ +#' @importFrom rlang quos eval_tidy quo lang_modify +#' +#' @rdname fabricate +#' @export +modify_level = function(N = NULL, + ID_label = NULL, + working_environment_ = NULL, + ..., + data_arguments=quos(...)) { + + # Need to supply an ID_label, otherwise we have no idea what to modify. + if(is.null(ID_label)) { + stop("You can't modify a level without a known level ID variable. If you", + "are adding nested data, please use add_level") + } - if (length(options) == 0) return(FALSE) + # First, establish that if we have no working data frame, we can't continue + if(is.null(dim(working_environment_$data_frame_output_))) { + if("imported_data_" %in% names(working_environment_)) { + working_environment_$data_frame_output_ = data.frame(working_environment_$imported_data_) + working_environment_$variable_names_ = names(working_environment_$imported_data_) + working_environment_$imported_data_ = NULL + } else { + stop( + "You can't modify a level if there is no working data frame to ", + "modify: you must either load pre-existing data or generate some data ", + "before modifying." + ) + } + } - is_function <- sapply(options, function(i) { - is_lang(get_expr(i)) - }) + # There are two possibilities. One is that we are modifying the lowest level + # of data. In which case, we simply add variables, like if someone called + # add_level with a dataset. To check if that's the world we're in, check if + # we have any duplicates in the ID label: + if(!anyDuplicated(working_environment_$data_frame_output_[[ID_label]])) { + # There is no subsetting going on, but modify_level was used anyway. + N = nrow(working_environment_$data_frame_output_) + + # Coerce the working data frame into a list + working_data_list = as.list(working_environment_$data_frame_output_) + + # Now loop over the variable creation. + for(i in names(data_arguments)) { + # Evaluate the formula in an environment consisting of: + # 1) The current working data list + # 2) A list that tells everyone what N means in this context. + # Store it in the current environment + working_data_list[[i]] = eval_tidy(data_arguments[[i]], + append(working_data_list, list(N=N))) + + # Write the variable name to the list of variable names + add_variable_name(working_environment_, i) + + # Nuke the current data argument -- if we have the same variable name + # created twice, this is OK, because it'll only nuke the current one. + data_arguments[[i]] = NULL + } - is_level <- "level" == sapply(options[is_function], lang_name) ## function names + # Before handing back data, ensure it's actually rectangular + working_data_list = check_rectangular(working_data_list, N) - if(length(is_level) == 0) return(FALSE) + # Overwrite the working data frame. + working_environment_[["data_frame_output_"]] = data.frame(working_data_list, + stringsAsFactors=FALSE, + row.names=NULL) - if (any(is_level) != all(is_level)) { + # Return results + return(working_environment_) + } + + # If we're here, then at least some subsetting is used in the modify call + # first, subset to unique observations, then generate new data, then re-expand. + # To do this, we need a mapping between observations and unique observations. + # First, get the unique values of the level: + unique_values_of_level = unique(working_environment_$data_frame_output_[[ID_label]]) + + # Pre-allocate the mapping vector + index_maps = numeric(length(working_environment_$data_frame_output_[[ID_label]])) + # Iterate along the unique values of the level + for(i in seq_along(unique_values_of_level)) { + # Any obs that matches the level matching this i will be a duplicate of this i. + index_maps[ + working_environment_$data_frame_output_[[ID_label]] == unique_values_of_level[i] + ] = i + } + + # Now, which variables are we going to write to (do we need to subset)? + write_variables = unname(unlist(get_symbols_from_quosure(data_arguments))) + # Remove the ID label from the variables we are going to write to. + write_variables = setdiff(write_variables, ID_label) + # Let's also remove anything that doesn't seem to be a valid variable + write_variables = write_variables[write_variables %in% names(working_environment_$data_frame_output_)] + + # Level unique variables: + level_unique_variables = get_unique_variables_by_level( + data = working_environment_$data_frame_output_, + ID_label = ID_label, + superset=write_variables) + + # Error if we try to write using a variable that's not unique to the level. + if(length(level_unique_variables) != length(write_variables) & + length(write_variables) != 0) { stop( - "Arguments passed to ... must either all be calls to level() or have no calls to level()." + "Your modify_level command attempts to generate a new variable at the level \"", + ID_label, + "\" but requires reading from the existing variable(s) [", + paste(setdiff(write_variables, level_unique_variables), collapse=", "), + "] which are not defined at the level \"", ID_label, "\"\n\n", + "To prevent this error, you may modify the data at the level of interest, ", + "or change the definition of your new variables." ) } - is_level[1] && length(is_level) == length(options) + # Our subset needs these columns -- the level variable, all the unique + # variables we are going to use to write, and then in case the latter is "", + # remove that dummy obs.: + merged_set = unique(c(ID_label, setdiff(level_unique_variables, ""))) + + # And these rows: + row_indices_keep = !duplicated(working_environment_$data_frame_output_[[ID_label]]) + + # Now subset it: + working_subset = working_environment_$data_frame_output_[row_indices_keep, + merged_set, + drop=FALSE] + + # Set the N variable correctly moving forward: + super_N = nrow(working_environment_$data_frame_output_) + N = nrow(working_subset) + + # Get the subset into a list: + working_data_list = as.list(working_subset) + # And our original working data frame: + super_working_data_list = as.list(working_environment_$data_frame_output_) + + # Now loop + for(i in names(data_arguments)) { + # Evaluate the formula in an environment consisting of: + # 1) The current working data list + # 2) A list that tells everyone what N means in this context. + # Store it in the currently working data list + working_data_list[[i]] = eval_tidy(data_arguments[[i]], + append(working_data_list, list(N=N))) + + # Write the variable name to the list of variable names + add_variable_name(working_environment_, i) + + # Expand the variable and store it in the actual, expanded working data list + # Why do we keep these in parallel? Because subsequent variables might need + # the non-expanded version to generate new variables. + super_working_data_list[[i]] = working_data_list[[i]][index_maps] + + # Nuke the current data argument -- if we have the same variable name + # created twice, this is OK, because it'll only nuke the current one. + data_arguments[[i]] = NULL + } + + # Before handing back data, ensure it's actually rectangular + super_working_data_list = check_rectangular(super_working_data_list, super_N) + + # Overwrite the working data frame. + working_environment_$data_frame_output_ = data.frame(super_working_data_list, + stringsAsFactors=FALSE, + row.names=NULL) + + # Return results + return(working_environment_) +} + +#' Creates cross-classified (partially non-nested, joined data) with a fixed +#' correlation structure. +#' +#' @param N (required) The number of observations in the resulting data frame +#' @param by The result of a call to \code{join()} which specifies how the cross-classified data will be created +#' @param ... A variable or series of variables to add to the resulting data frame after the cross-classified data is created. +#' @param ID_label Internal keyword used to sepcify the name of the ID variable created for the new level. If left empty, this will be the name the level is assigned to as part of a \code{fabricate()} call. +#' @param working_environment_ Internal keyword not for end user use. +#' @param data_arguments Internal keyword not for end user use. +#' @importFrom rlang quos quo_text +#' @export +cross_level = function(N = NULL, + ID_label = NULL, + working_environment_ = NULL, + by = NULL, + ..., + data_arguments=quos(...)) { + + if(any(!c("data_frame_output_", "shelved_df") %in% + names(working_environment_))) { + stop("You require at least two separate level hierarchies to create ", + "cross-classified data.") + } + + if(is.null(by) || !length(by$variable_names)) { + stop("You must specify a join structure to create cross-classified data.") + } + + # Move the current working data frame into a package + package_df = list(data_frame_output_ = working_environment_$data_frame_output_, + level_ids_ = working_environment_$level_ids_, + variable_names_ = names(working_environment_$data_frame_output_)) + + # Stuff it in the shelved df + working_environment_$shelved_df = append(working_environment_$shelved_df, + list(package_df)) + + # Clear the active working data frame. + working_environment_$data_frame_output_ = + working_environment_$level_ids_ = + working_environment_$variable_names_ = NULL + + # Loop over the variable name + variable_names = by$variable_names + data_frame_indices = integer(length(variable_names)) + + if(anyDuplicated(variable_names)) { + stop("Variables names for joining cross-classified data must be unique. ", + "Currently, you are joining on a variable named \"", + variable_names[anyDuplicated(variable_names)[1]], + "\" more than once.") + } + + # Figure out which dfs we're joining on which variables + for(i in seq_along(variable_names)) { + for(j in seq_along(working_environment_$shelved_df)) { + if(variable_names[i] %in% + working_environment_$shelved_df[[j]]$variable_names_) { + + # If we've already found this one, that's bad news for us... + if(data_frame_indices[i]) { + stop("Variable name ", + variable_names[i], + " is ambiguous and appears in at least two level hierarchies.") + } + + data_frame_indices[i] = j + } + } + + # If we didn't find this one, that's bad news for us... + if(!data_frame_indices[i]) { + stop("Variable name ", + variable_names[i], + " was not found in any of the level hierarchies.") + } + } + + if(anyDuplicated(data_frame_indices)) { + stop("You can't join a level hierarchy to itself.") + } + + # Actually fetch the df objects + data_frame_objects = sapply(data_frame_indices, + function(x) { + working_environment_$shelved_df[[x]]$data_frame_output_ + }, + simplify = FALSE + ) + + # Do the join. + out = join_dfs(data_frame_objects, variable_names, N, by$sigma, by$rho) + working_environment_$variable_names_ = names(out) + + # Staple in an ID column onto the data list. + if(!is.null(ID_label) && (!ID_label %in% names(out))) { + out[, ID_label] = generate_id_pad(N) + + add_level_id(working_environment_, ID_label) + add_variable_name(working_environment_, ID_label) + } + + # Overwrite the working data frame. + working_environment_$data_frame_output_ = out + + if(length(data_arguments)) { + working_environment_ = modify_level(ID_label = ID_label, + working_environment_ = working_environment_, + data_arguments = data_arguments) + } + + # Return results + return(working_environment_) } +#' Helper function handling specification of which variables to join a +#' cross-classified data on, and what kind of correlation structure needed +#' @param ... A series of two or more variable names, unquoted, to join on in order to create cross-classified data. +#' @param rho A fixed (Spearman's rank) correlation coefficient between the variables being joined on: note that if it is not possible to make a correlation matrix from this coefficient (i.e. if you are joining on three or more variables and rho is negative) then the \code{cross_level()} call will fail. +#' @param sigma A matrix with dimensions equal to the number of variables you are joining on, specifying the correlation for the resulting joined data. Only one of rho and sigma should be provided. +#' @param data_arguments Internal, not for end-user use. +#' @export +join = function(..., rho=0, sigma=NULL, data_arguments=quos(...)) { + variable_names = unlist(lapply(data_arguments, function(x) { quo_text(x) })) + return(list(variable_names = variable_names, + rho = rho, + sigma = sigma)) +} + +#' Deprecated level call function maintained to provide useful error for +#' previous fabricatr code. +#' @keywords internal +#' @export +level = function(N = NULL, ID_label = NULL, ...) { + stop("Level calls are currently deprecated; use add_level and modify_level") + # Stub, this doesn't do anything yet -- may in the future dispatch to the + # relevant levels. +} + +# Dummy helper function that just extracts the working data frame from the +# environment. This exists because we may in the future want to return something +# that is not a data frame. +report_results = function(working_environment) { + return(working_environment$data_frame_output_) +} diff --git a/R/helper_functions.R b/R/helper_functions.R new file mode 100644 index 0000000..8525270 --- /dev/null +++ b/R/helper_functions.R @@ -0,0 +1,355 @@ +get_symbols_from_expression = function(l_arg) { + # We have some sort of language expression in R, let's extract + # the symbols it's going to refer to + + if(is.symbol(l_arg)) { + # If it's a symbol, return the symbol + return(unname(l_arg)) + } else if(is.language(l_arg)) { + # If it's a language call, then we need to unpack some more + # Extract the language from the language call + recurse = lang_args(l_arg) + # Iterate through each part of the language, recursively calling this function + # Results are a list, so unlist and unname to flatten + temp = unname(unlist(lapply(recurse, function(i) { get_symbols_from_expression(i) }))) + return(temp) + } else { + # It's something else? This might happen if the base level call + # is numeric or whatever. We are only interested in variable nanes. + } +} + +get_symbols_from_quosure = function(quosure) { + # Given a quosure, what symbols will that quosure attempt to read when it + # is evaluated? + meta_results = lapply(quosure, function(i) { + # For each term in the quosure, get the language call out of the term: + expression = get_expr(i) + # Get the arguments out of that language call + thing = lang_args(expression) + # Now, for each argument try to extract the symbols + results = lapply(thing, function(x) { get_symbols_from_expression(x) }) + + # We are going to unlist, convert to characters (this is necessary to coerce + # results into a vector), and then remove duplicates + return(unique( + as.character( + unlist( + results)))) + }) + + return(meta_results) +} + +get_unique_variables_by_level <- function(data, ID_label, superset=NULL) { + # Superset contains a vector of character strings that contain variables + # the modify level call is going to write. Some of these may be columns + # in the data frame, others might not be. If superset is specified, + # then we definitely only want to check those variables + if(!is.null(superset)) { + names_to_check = intersect(colnames(data), superset) + } else { + names_to_check = colnames(data)[-which(colnames(data)==ID_label)] + } + + # It turns out the call isn't going to use any variables at all! + if(!length(names_to_check)) { return("") } + + # Iterate through each column of interest + # Per column, split that column's data into a list. The split indices come from the level indicator. + # Now, run a function which checks the unique length of each tranch + # Unlist the result to get a vector of TRUE or FALSE for each tranch of the list. + # If all tranches are TRUE, then the column has unique values based on the level's level. + # Take the results per column, unlist those, strip the names (if any) from the variables. + # Now extract the column names for the columns for which this was true. Return as a vector. + + # Performance is around 22% faster than existing code for small dataset + level_variables = names_to_check[ + unname(unlist(lapply(names_to_check, + function(i) { + all(unlist( + lapply( + split(data[, i], data[, ID_label]), + function(x) { + length(unique(x))==1 + } + ) + )) + } + ) + )) + ] + return(level_variables) +} + + +# Checks if an ID label is sane, warns or errors if not. +# Generates an ID label if there isn't one provided. +handle_id = function(ID_label, data=NULL) { + # If the user passed a symbol, we should evaluate the symbol forcibly and error if + # they were assuming NSE substitution of an undefined symbol. + tryCatch(force(ID_label), + error = function(e) { + stop("The ID_label provided is a reference to an undefined variable. Please enclose ID_label in quotation marks if you intended to provide ID_label as a character vector.") + }) + + # User passed a non-symbol non-null ID_label + if(!is.null(ID_label)) { + if(is.vector(ID_label) & length(ID_label) > 1) { + # Vector of length n>1, error + stop("Provided ID_label must be a character vector of length 1 or variable name.") + } else if(is.vector(ID_label) & is.numeric(ID_label[1])) { + # Numeric ID_label -- this is OK but variable names can't be numeric + warning("Provided ID_label is numeric and will be prefixed with the character \"X\"") + ID_label <- as.character(ID_label) + } else if(is.vector(ID_label) & is.character(ID_label[1])) { + # Valid ID_label + ID_label <- as.character(ID_label) + } else if(!is.null(dim(ID_label))) { + # Higher dimensional ID_label + stop("Provided ID_label must be a character vector or variable name, not a data frame or matrix.") + } + } + + # At the end of all this, we still don't have an ID label + if(is.null(ID_label)) { + if(is.null(data) | missing(data)) { + ID_label = "ID" + } else { + # We need to come up with an ID, but there's some data, so we're not sure + tries = 0 + # "ID" isn't taken + if (!"ID" %in% names(data)) { + ID_label = "ID" + } else { + # "ID" is taken, so we're going to try some backups + while(tries < 5) { + tries = tries + 1 + candidate_label = paste0("fab_ID_", tries) + # This backup is available + if(!candidate_label %in% names(data)) { + ID_label = candidate_label + break + } + } + + # We tried all our backup IDs and still couldn't find a valid ID + if(tries >= 5 & is.null(ID_label)) { + stop( + "No ID label specified for level and supply of default ID_labels -- ID, fab_ID_1, fab_ID_2, fab_ID_3, fab_ID_4, fab_ID_5 -- all used for data columns. Please specify an ID_label for this level." + ) + } + } + } + } + + # Return the resulting ID_label + return(ID_label) +} + +# Checks if a supplied N is sane for the context it's in +handle_n = function(N, add_level=TRUE, working_environment=NULL, + parent_frame_levels=1) { + # Error handling for user-supplied N + + # First, evaluate the N in the context of the working environment's working data frame + # Why do we need to do this? Because N could be a function of variables. + if(!is.null(working_environment) & "data_frame_output_" %in% names(working_environment)) { + # Why do we substitute N in parent.frame()? Because if we substitute in the current + # frame, we just get the symbol used for N from the outside functions, which would just be N + # This ensures we get the expression passed to N in the outer function call. + temp_N_expr = substitute(N, parent.frame(parent_frame_levels)) + N = eval_tidy(temp_N_expr, data=working_environment$data_frame_output_) + } + + # User provided an unevaluated function + if(typeof(N) == "closure") { + stop("If you use a function to define N, you must evaluate that function rather than passing it in closure form.") + } + + # If they provided an N + if(!is.null(N)) { + # If this is an add_level operation, N must be a single number + if(add_level) { + if(length(N) > 1) { + stop( + "When adding a new level, the specified N must be a single number." + ) + } + } else { + if(length(N) > 1) { + # User specified more than one N; presumably this is one N for each level of the + # last level variable + + # What's the last level variable? + name_of_last_level = working_environment$level_ids_[length(working_environment$level_ids_)] + + # What are the unique values? + unique_values_of_last_level = unique( + working_environment$data_frame_output_[[name_of_last_level]] + ) + + if(length(N) != length(unique_values_of_last_level)) { + stop( + "N must be either a single number or a vector of length ", + length(unique_values_of_last_level), + " (one value for each possible level of ", + name_of_last_level, + ")" + ) + } + } + # If this is not an add_level operation, there are other options + + } + + # If any N is non-numeric or non-integer or negative or zero, fail. + if(all(is.numeric(N)) && any(N%%1 | N<=0)) { + stop( + "Provided N must be a single positive integer." + ) + } + + # Coerce to numeric or fail + if(!is.numeric(N)) { + tryCatch({ + N = as.numeric(N) + }, error=function(e) { + stop( + "Provided values for N must be integer numbers" + ) + }, warning=function(e) { + stop( + "Provided values for N must be integer numbers" + ) + }) + } + } + + return(N) +} + +# Checks if the user-provided data is sane +# errors if not. +handle_data = function(data) { + if(!is.null(data) & !missing(data) & !"data.frame" %in% class(data)) { + # User provided data, but it's not 2D + if(is.null(dim(data))) { + stop( + "User provided data must be a data frame. Provided data was low dimensional." + ) + } + + # User provided data, but it's not a data frame, and they didn't provide it explicitly, + # so this is probably a mess-up with an implicit argument + if(!"data" %in% names(sys.call()) && + !"data" %in% names(sys.call(-1))) { + stop( + "The data argument must be a data object. The argument call, ", + deparse(substitute(data)), + ", was not a data object (e.g. a data.frame, tibble, sf object, or ", + "convertible matrix)." + ) + } + + # Convert user data to a data frame + tryCatch({ + data = data.frame(data) + }, error=function(e) { + # We can't make it a data frame -- this should probably never happen, + # since it relies on something with a dim attribute not converting to + # a data frame. + stop( + "User provided data could not convert to a data frame." + ) + }) + } + return(data) +} + +# Function to check if every argument in a quosure options +# is a level call. +check_all_levels <- function(options){ + # Passing the options quosures + # There were no levels, or indeed arguments, at all + if (length(options) == 0) return(FALSE) + + # get_expr returns the expression for an item in a quosure + # is_lang checks if it's a function + is_function <- sapply(options, function(i) { + is_lang(get_expr(i)) + }) + + # lang_name gets function name from a quosure + func_names = sapply(options[is_function], lang_name) + + # Check to see if the function names are one of the valid level operations + is_level = sapply(func_names, function(i) { i %in% c("level", + "add_level", + "nest_level", + "modify_level", + "cross_level") }) + + # Return false if we have no level calls + if(length(is_level) == 0) return(FALSE) + + # If some calls are levels and some aren't, we're unhappy + if (any(is_level) != all(is_level)) { + stop( + "Arguments passed to ... must either all be calls to create or modify levels, or else none of them must be." + ) + } + + # Confirm they're all levels + is_level[1] && length(is_level) == length(options) +} + + +# Generates IDs from 1:N with zero left padding for visual display. +generate_id_pad <- function(N){ + # Left-Pad ID variable with zeroes + format_left_padded <- paste0("%0", nchar(N), "d") + + # Add it to the data frame. + return(sprintf(format_left_padded, 1:N)) +} + +# Try to overwrite R's recycling of vector operations to ensure the initial +# data is rectangular -- needs an N to ensure that constants do get recycled. +check_rectangular = function(working_data_list, N) { + for(i in seq_along(working_data_list)) { + if(length(working_data_list[[i]]) == 1) { + # Variable is a constant -- repeat it N times + working_data_list[[i]] = rep(working_data_list[[i]], N) + } else if(length(working_data_list[[i]]) != N) { + # Variable is not of length N. Oops. + stop("Variable lengths must all be equal to N.") + } + } + return(working_data_list) +} + +# Add a level ID to a working environment +add_level_id = function(working_environment_, ID_label) { + # Add or create level ID list + if("level_ids_" %in% names(working_environment_)) { + working_environment_$level_ids_ = append(working_environment_$level_ids_, ID_label) + } else { + working_environment_$level_ids_ = c(ID_label) + } + + return() +} + +# Add a variable name to a working environment +add_variable_name = function(working_environment_, variable_name) { + # Add or create variable name list. + if("variable_names_" %in% names(working_environment_)) { + working_environment_$variable_names_ = append(working_environment_$variable_names_, + variable_name) + } else { + working_environment_$variable_names_ = c(variable_name) + } + + return() +} diff --git a/R/level.R b/R/level.R deleted file mode 100644 index 0153ed1..0000000 --- a/R/level.R +++ /dev/null @@ -1,84 +0,0 @@ - -#' Fabricate a Level of Data for Hierarchical Data -#' -#' @importFrom rlang quos eval_tidy quo lang_modify -#' -#' @rdname fabricate -#' @export -level <- - function(N = NULL, - ...) { - - # handle data that is sent from higher levels of the hierarchy - # this is done internally through data_internal_, which is passed through - # the ...; users can send data to any level through data, but is handled - # differently - dots <- quos(...) - if ("data_internal_" %in% names(dots)) { - data_internal_ <- eval_tidy(dots[["data_internal_"]]) - dots[["data_internal_"]] <- NULL - } else { - data_internal_ <- NULL - } - - if ("ID_label_" %in% names(dots)) { - ID_label <- eval_tidy(dots[["ID_label_"]]) - dots[["ID_label_"]] <- NULL - } else { - stop("Please provide a name for the level, by specifying `your_level_name = level()` in fabricate.") - } - - if (is.null(data_internal_)) { - - ## if data is not provided to fabricate, this part handles the case - ## of the top level, where data must be created for the first time - return(fabricate_data_single_level(N=N, ID_label=ID_label, options=dots)) - - } else { - # at the second level, after data is created, or at the top level if data is provided - # to fabricate, there are two case: - # 1. ID_label does not yet exist, in which case we create the level defined by ID_label by expanding dataset based on N - # 2. ID label already exists, in which case we add variables to an existing level - - # if there is no ID variable, expand the dataset based on the commands in N - if (!ID_label %in% colnames(data_internal_)) { - N <- eval(substitute(N), envir = data_internal_) - - data_internal_ <- - expand_data_by_ID(data = data_internal_, - ID_label = ID_label, - N = N) - - # now that data_internal_ is the right size, pass to "mutate", i.e., simulate data - return(fabricate_data_single_level(data_internal_, NULL, ID_label, options=dots)) - - } else { - # otherwise assume you are adding variables to an existing level - # defined by the level ID variable that exists in the data_internal_ - - # get the set of variable names that are unique within the level you are adding vars to - # so the new vars can be a function of existing ones - level_variables <- - get_unique_variables_by_level(data = data_internal_, ID_label = ID_label) - - # construct a dataset with only those variables at this level - data <- - unique(data_internal_[, unique(c(ID_label, level_variables)), - drop = FALSE]) - - data <- fabricate_data_single_level(data, NULL, ID_label, existing_ID = TRUE, options=dots) - - return(merge( - data_internal_[, colnames(data_internal_)[!(colnames(data_internal_) %in% - level_variables)], drop = FALSE], - data, - by = ID_label, - all = TRUE, - sort = FALSE - )) - - } - - } - - } diff --git a/R/resample_data.R b/R/resample_data.R index a939e00..cebf19b 100644 --- a/R/resample_data.R +++ b/R/resample_data.R @@ -18,8 +18,8 @@ #' # N specifies a number of clusters to return #' #' clustered_survey <- fabricate( -#' clusters = level(N=25), -#' cities = level(N=runif(25, 1, 5), population=runif(n = N, min=50000, max=1000000)) +#' clusters = add_level(N=25), +#' cities = add_level(N=round(runif(25, 1, 5)), population=runif(n = N, min=50000, max=1000000)) #' ) #' #' # Specify the name of the cluster variable one of two ways @@ -34,8 +34,8 @@ #' #' my_data <- #' fabricate( -#' cities = level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), -#' citizens = level(N = 3, income = round(elevation * rnorm(n = N, mean = 5))) +#' cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), +#' citizens = add_level(N = 3, income = round(elevation * rnorm(n = N, mean = 5))) #' ) #' #' # Specify the levels you wish to resample one of two ways: @@ -57,7 +57,9 @@ resample_data = function(data, N, ID_labels=NULL) { # Mask internal outer_level and use_dt arguments from view. - .resample_data_internal(data, N, ID_labels) + df = .resample_data_internal(data, N, ID_labels) + rownames(df) = NULL + return(df) } #' Magic number constant to allow users to specify "ALL" for passthrough resampling @@ -198,10 +200,14 @@ resample_single_level <- function(data, ID_label = NULL, N) { stop("ID label provided (", ID_label, ") is not a column in the data being resampled.") } - if(length(N) > 1 | !is.numeric(N) | N%%1 | (N<=0 & N!=ALL)) { + if(length(N) > 1) { stop("For a single resample level, N should be a single positive integer. N was ", N) } + if(!is.numeric(N) || (N%%1 | (N<=0 & N!=ALL))) { + stop("For a single resample level, N should be a positive integer. N was ", N) + } + # Split data by cluster ID, storing all row indices associated with that cluster ID # nrow passes through transparently to dim, so this is slightly faster indices_split = split(seq_len(dim(data)[1]), data[[ID_label]]) diff --git a/R/utilities.R b/R/utilities.R deleted file mode 100644 index 3f6e5f4..0000000 --- a/R/utilities.R +++ /dev/null @@ -1,41 +0,0 @@ -get_unique_variables_by_level <- function(data, ID_label) { - ## identify variables that do not vary within ID_label - ## maybe there is a faster way to do this? - level_variables <- - sapply(colnames(data)[!colnames(data) %in% ID_label], function(i) - max(tapply(data[, i], list(data[, ID_label]), - function(x) - length(unique(x)))) == 1) - return(names(level_variables)[level_variables]) -} - -expand_data_by_ID <- function(data, ID_label, N) { - if (typeof(N) %in% c("integer", "double") && - length(N) == 1) { - data <- data[rep(1:nrow(data), each = N), , drop = FALSE] - } else if (typeof(N) %in% c("integer", "double") && - length(N) > 1) { - # check that the vector that is N is the right length, i.e the length of data_internal_ - if (length(N) != nrow(data)) { - stop( - paste0( - "If you provide a vector to N for level ", - ID_label, - ", it must be the length of the dataset at the level above it ", - "in the hierarchy" - ) - ) - } - data <- data[rep(1:nrow(data), times = N), , drop = FALSE] - } else if (class(N) == "function") { - data <- data[rep(1:nrow(data), times = N()), , drop = FALSE] - } else { - stop( - paste0( - "Please provide level ", - ID_label, - " with N that is a vector, scalar, or function that generates a vector." - ) - ) - } -} diff --git a/R/variable_creation_functions.R b/R/variable_creation_functions.R index 848b33d..84eb7c3 100644 --- a/R/variable_creation_functions.R +++ b/R/variable_creation_functions.R @@ -1,13 +1,25 @@ -#' Draw discrete variables including binary, binomial count, poisson count, ordered, and categorical +#' Draw discrete variables including binary, binomial count, poisson count, +#' ordered, and categorical #' -#' Drawing discrete data based on probabilities or latent traits is a common task that can be cumbersome. \code{draw_binary} is an alias for \code{draw_discrete(type = "binary")} that allows you to draw binary outcomes more easily. +#' Drawing discrete data based on probabilities or latent traits is a common +#' task that can be cumbersome. \code{draw_binary} is an alias for +#' \code{draw_discrete(type = "binary")} that allows you to draw binary +#' outcomes more easily. #' -#' @param x vector representing either the latent variable used to draw the count outcome (if link is "logit" or "probit") or the probability for the count outcome (if link is "identity"). For cartegorical distributions x is a matrix with as many columns as possible outcomes. -#' @param N number of units to draw. Defaults to the length of the vector \code{x} -#' @param type type of discrete outcome to draw, one of 'binary' (or 'bernoulli'), 'binomial', 'categorical', 'ordered' or 'count' -#' @param link link function between the latent variable and the probability of a postiive outcome, i.e. "logit", "probit", or "identity". For the "identity" link, the latent variable must be a probability. +#' @param x vector representing either the latent variable used to draw the +#' count outcome (if link is "logit" or "probit") or the probability for the +#' count outcome (if link is "identity"). For cartegorical distributions x is +#' a matrix with as many columns as possible outcomes. +#' @param N number of units to draw. Defaults to the length of the vector +#' \code{x} +#' @param type type of discrete outcome to draw, one of 'binary' +#' (or 'bernoulli'), 'binomial', 'categorical', 'ordered' or 'count' +#' @param link link function between the latent variable and the probability of +#' a postiive outcome, i.e. "logit", "probit", or "identity". For the "identity" +#' link, the latent variable must be a probability. #' @param breaks vector of breaks to cut an ordered latent outcome -#' @param break_labels vector of labels for the breaks for an ordered latent outcome (must be the same length as breaks) +#' @param break_labels vector of labels for the breaks for an ordered latent +#' outcome (must be the same length as breaks) #' @param k the number of trials (zero or more) #' #' @importFrom stats pnorm rnorm rpois @@ -33,7 +45,8 @@ #' #' fabricate(N = 3, #' x = 5*rnorm(N), -#' ordered = draw_discrete(x, type = "ordered", breaks = c(-Inf, -1, 1, Inf))) +#' ordered = draw_discrete(x, type = "ordered", +#' breaks = c(-Inf, -1, 1, Inf))) #' #' fabricate(N = 3, #' x = c(0,5,100), @@ -48,7 +61,7 @@ draw_discrete <- type = "binary", link = "identity", breaks = c(-Inf, 0, Inf), - break_labels = FALSE, + break_labels = NULL, k = 1) { if (!link %in% c("logit", "probit", "identity")) { @@ -62,7 +75,8 @@ draw_discrete <- "ordered", "count")) { stop( - "Please choose either 'binary' (or 'bernoulli'), 'binomial', 'categorical', 'ordered', or 'count' as a data type." + "Please choose either 'binary' (or 'bernoulli'), 'binomial',", + "'categorical', 'ordered', or 'count' as a data type." ) } @@ -91,7 +105,8 @@ draw_discrete <- } if (link == "identity") if (!all(0 <= x & x <= 1)) { - stop("The identity link requires probability values between 0 and 1, inclusive.") + stop("The identity link requires probability values between 0 and 1,", + "inclusive.") } out <- rbinom(N, k, prob) @@ -105,7 +120,8 @@ draw_discrete <- if(is.vector(k) & length(k)>1) { if(N %% length(k)) { stop( - "\"N\" is not an even multiple of the length of the number of trials, \"k\"." + "\"N\" is not an even multiple of the length of the number of + trials, \"k\"." ) } if(!all(is.numeric(k) & (is.integer(k) | !k%%1))) { @@ -116,7 +132,8 @@ draw_discrete <- } if(!is.null(dim(k))) { stop( - "Number of trials must be an integer or vector, not higher-dimensional." + "Number of trials must be an integer or vector,", + " not higher-dimensional." ) } if(is.null(k) | is.na(k)) { @@ -151,21 +168,34 @@ draw_discrete <- if (is.matrix(breaks) | is.data.frame(breaks)) { stop("Numeric breaks must be a vector.") } - if (length(breaks) < 3) { - stop("Numeric breaks for ordered data must be of at least length 3.") - } if (is.unsorted(breaks)) { stop("Numeric breaks must be in ascending order.") } - if(any(breaks[1] > x) | any(breaks[length(breaks)] < x)) { - stop("Numeric break endpoints should be outside min/max of x data range.") + + # Pre-pend -Inf + if(any(breaks[1] > x)) { + breaks = c(-Inf, breaks) } - if(is.vector(break_labels) & !is.logical(break_labels) & all(!is.na(break_labels)) & length(break_labels) != length(breaks)-1) { - stop("Break labels should be of length one less than breaks.") + # Post-pend Inf + if(any(breaks[length(breaks)] < x)) { + breaks = c(breaks, Inf) } - out <- cut(x, breaks, labels = break_labels) + if(!is.null(break_labels) && + (is.vector(break_labels) & + !is.logical(break_labels) & + all(!is.na(break_labels)) & + length(break_labels) != length(breaks)-1)) { + stop("Break labels should be of length one less than breaks. ", + "Currently you have ", length(break_labels), " bucket labels and ", + length(breaks)-1, " buckets of data.") + } + if(!is.null(break_labels)) { + out <- break_labels[findInterval(x, breaks)] + } else { + out <- findInterval(x, breaks) + } } else if (type == "count") { if (link != "identity") { stop("Count data does not accept link functions.") @@ -188,17 +218,21 @@ draw_discrete <- if (is.vector(x) & all(is.numeric(x)) & length(x)>1) { x <- matrix(rep(x, N), byrow=TRUE, ncol=length(x), nrow=N) warning( - "For a categorical (multinomial) distribution, a matrix of probabilities should be provided. Data generated by interpreting vector of category probabilities, identical for each observation." + "For a categorical (multinomial) distribution, a matrix of ", + "probabilities should be provided. Data generated by interpreting ", + "vector of category probabilities, identical for each observation." ) } else { stop( - "For a categorical (multinomial) distribution, a matrix of probabilities should be provided" + "For a categorical (multinomial) distribution, a matrix of ", + "probabilities should be provided" ) } } if (!all(apply(x, 1, min) > 0)) { stop( - "For a categorical (multinomial) distribution, the elements of x should be positive and sum to a positive number." + "For a categorical (multinomial) distribution, the elements of x ", + "should be positive and sum to a positive number." ) } diff --git a/README.Rmd b/README.Rmd index 7d4d5eb..5182a42 100644 --- a/README.Rmd +++ b/README.Rmd @@ -35,12 +35,12 @@ Once the package is installed, it is easy to generate new data, or modify your o library(fabricatr) house_candidates = fabricate( - parties = level( + parties = add_level( N = 2, party_ideology = c(0.5, -0.5), in_power = c(1, 0), party_incumbents = c(241, 194)), - representatives = level( + representatives = add_level( N = party_incumbents, member_ideology = rnorm(N, party_ideology), terms_served = draw_discrete(N = N, x = 3, type = "count"), diff --git a/README.md b/README.md index 59d90da..10e9b11 100644 --- a/README.md +++ b/README.md @@ -24,12 +24,12 @@ Once the package is installed, it is easy to generate new data, or modify your o library(fabricatr) house_candidates = fabricate( - parties = level( + parties = add_level( N = 2, party_ideology = c(0.5, -0.5), in_power = c(1, 0), party_incumbents = c(241, 194)), - representatives = level( + representatives = add_level( N = party_incumbents, member_ideology = rnorm(N, party_ideology), terms_served = draw_discrete(N = N, x = 3, type = "count"), diff --git a/docs/articles/advanced_features.html b/docs/articles/advanced_features.html index 8d4b08c..c745f6b 100644 --- a/docs/articles/advanced_features.html +++ b/docs/articles/advanced_features.html @@ -107,11 +107,11 @@

Aaron Rudkin

More complicated level creation with variable numbers of observations

-

level() can be used to create more complicated patterns of nesting. For example, when creating lower level data, it is possible to use a different N for each of the values of the higher level data:

+

add_level() can be used to create more complicated patterns of nesting. For example, when creating lower level data, it is possible to use a different N for each of the values of the higher level data:

variable_data <-
   fabricate(
-    cities = level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
-    citizens = level(N = c(2, 4), age = runif(N, 18, 70))
+    cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
+    citizens = add_level(N = c(2, 4), age = runif(N, 18, 70))
   )
 variable_data
@@ -160,13 +160,13 @@

-

Here, each city has a different number of citizens. And the value of N used to create the age variable automatically updates as needed. The result is a dataset with 6 citizens, 2 in the first city and 4 in the second. As long as N is either a number, or a vector of the same length of the current lowest level of the data, level() will know what to do.

+

Here, each city has a different number of citizens. And the value of N used to create the age variable automatically updates as needed. The result is a dataset with 6 citizens, 2 in the first city and 4 in the second. As long as N is either a number, or a vector of the same length of the current lowest level of the data, add_level() will know what to do.

It is also possible to provide a function to N, enabling a random number of citizens per city:

my_data <-
   fabricate(
-    cities = level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
-    citizens = level(N = sample(1:6, size = 2, replace = TRUE), 
-                     age = runif(N, 18, 70))
+    cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
+    citizens = add_level(N = sample(1:6, size = 2, replace = TRUE), 
+                         age = runif(N, 18, 70))
   )
 my_data
@@ -218,50 +218,43 @@

Here, each city is given a random number of citizens between 1 and 6. Since the sample() function returns a vector of length 2, this is like specifying 2 separate Ns as in the example above.

Finally, it is possible to define N on the basis of higher level variables themselves. Consider the following example:

variable_n_function = fabricate(
-  cities = level(N = 5, population = runif(N, 10, 200)),
-  citizens = level(N = round(population * 0.3))
+  cities = add_level(N = 5, population = runif(N, 10, 200)),
+  citizens = add_level(N = round(population * 0.3))
 )
 head(variable_n_function)

- - - - - - - @@ -275,10 +268,10 @@

Averages within higher levels of hierarchy

You may want to include the mean value of a variable within a group defined by a higher level of the hierarchy, for example the average income of citizens within city. You can do this with ave():

ave_example = fabricate(
-    cities = level(N = 2),
-    citizens = level(N = 1:2, 
-                     income = rnorm(N), 
-                     income_mean_city = ave(income, cities))
+    cities = add_level(N = 2),
+    citizens = add_level(N = 1:2, 
+                         income = rnorm(N), 
+                         income_mean_city = ave(income, cities))
     ) 
 ave_example
cities population citizens
1 1 90 001
1.1 1 90 002
1.2 1 90 003
1.3 1 90 004
1.4 1 90 005
1.5 1 90 006
@@ -320,8 +313,8 @@

my_data <- fabricate( - cities = level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), - citizens = level(N = c(2, 3), age = runif(N, 18, 70)) + cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), + citizens = add_level(N = c(2, 3), age = runif(N, 18, 70)) ) %>% group_by(cities) %>% mutate(pop = n()) @@ -374,8 +367,8 @@

my_data <- 
-data_frame(Y = sample(1:10, 2)) %>%
-  fabricate(lower_level = level(N = 3, Y2 = Y + rnorm(N)))
+data_frame(Y = sample(1:10, 2)) %>% 
+  fabricate(lower_level = add_level(N = 3, Y2 = Y + rnorm(N)))
 my_data
diff --git a/docs/articles/getting_started.R b/docs/articles/getting_started.R index 30d5ee3..d9c9883 100644 --- a/docs/articles/getting_started.R +++ b/docs/articles/getting_started.R @@ -65,6 +65,13 @@ my_data <- my_data_2 <- resample_data(my_data, N = c(3, 5), ID_labels = c("cities", "citizens")) my_data_2 +## ------------------------------------------------------------------------ +fabricate( + cities = level(N = 2), + citizens = level( + N = 1:2, income = rnorm(N), income_mean_city = ave(income, cities)) + ) + ## ------------------------------------------------------------------------ fabricate( cities = level(N = 2), diff --git a/docs/articles/getting_started.html b/docs/articles/getting_started.html index a81f925..0f93217 100644 --- a/docs/articles/getting_started.html +++ b/docs/articles/getting_started.html @@ -172,6 +172,7 @@

+ @@ -182,6 +183,7 @@

+ @@ -191,6 +193,7 @@

+ @@ -200,6 +203,7 @@

+ @@ -209,6 +213,7 @@

+ @@ -218,6 +223,7 @@

+ @@ -227,6 +233,7 @@

+ @@ -240,10 +247,10 @@

The most powerful use of fabricatr is to create hierarchical (“nested”) data. In the example below, we create 5 countries, each of which has 10 provinces:

country_data <-
   fabricate(
-    countries = level(N = 5, 
+    countries = add_level(N = 5, 
                       gdp_per_capita = runif(N, min=10000, max=50000),
                       life_expectancy = 50 + runif(N, 10, 20) + ((gdp_per_capita > 30000) * 10)),
-    provinces = level(N = 10,
+    provinces = add_level(N = 10,
                       has_nat_resources = draw_discrete(x=0.3, N=N, type="bernoulli"),
                       has_manufacturing = draw_discrete(x=0.7, N=N, type="bernoulli"))
   )
@@ -308,7 +315,7 @@ 

depth mag stationsID fatalities insurance_cost
562 4.8 410001 611 1.0e+09
650 4.2 150002 290 3.0e+08
42 5.4 430003 427 7.8e+08
626 4.1 190004 470 5.2e+08
649 4.0 110005 456 7.2e+08
195 4.0 120006 390 7.0e+08
-

Several things can be observed in this example. First, fabricate knows that your second level() command will be nested under the first level of data. Each level gets its own ID variable, in addition to the variables you create. Second, the meaning of the variable “N” changes. During the level() call for countries, N is 5. During the level() call for provinces, N is 10. And the resulting data, of course, has 50 observations.

+

Several things can be observed in this example. First, fabricate knows that your second add_level() command will be nested under the first level of data. Each level gets its own ID variable, in addition to the variables you create. Second, the meaning of the variable “N” changes. During the add_level() call for countries, N is 5. During the add_level() call for provinces, N is 10. And the resulting data, of course, has 50 observations.

Finally, the province-level variables are created using the draw_discrete() function. This is a function provided by fabricatr to make simulating discrete random variables simple. When you simulate your own data, you can use fabricatr’s functions, R’s built-ins, or any custom functions you wish. draw_discrete() is explained in our tutorial on variable generation using fabricatr

@@ -319,11 +326,11 @@

citizen_data <- 
   fabricate(
     data = country_data,
-    citizens = level(N=10,
-                     salary = rnorm(N, 
-                                    mean = gdp_per_capita + 
-                                      has_nat_resources * 5000 + 
-                                      has_manufacturing * 5000,
+    citizens = add_level(N=10,
+                         salary = rnorm(N, 
+                                        mean = gdp_per_capita +
+                                          has_nat_resources * 5000 + 
+                                          has_manufacturing * 5000,
                                     sd = 10000)))
 head(citizen_data)
@@ -406,101 +413,100 @@

Modifying existing levels

-

Suppose you have hierarchical data, and wish to simulate variables at a higher level of aggregation. For example, imagine you import a dataset containing citizens within countries, but you wish to simulate additional country-level variables. In fabricatr, you can do this using the level() command.

+

Suppose you have hierarchical data, and wish to simulate variables at a higher level of aggregation. For example, imagine you import a dataset containing citizens within countries, but you wish to simulate additional country-level variables. In fabricatr, you can do this using the modify_level() command.

Let’s use our country-province data from earlier:

new_country_data <-
   fabricate(
     data = country_data,
-    countries = level(avg_temp = runif(N, 30, 80))
+    countries = modify_level(avg_temp = runif(N, 30, 80))
   )
 
 head(new_country_data)

+ + - - + + - - + + - - + + - - + + - - + + - - + + - -
countriesgdp_per_capitalife_expectancy provinces has_nat_resources has_manufacturinggdp_per_capitalife_expectancy avg_temp
139,39873 01 1 139,39873 33
139,39873 02 1 139,39873 33
139,39873 03 0 139,39873 33
139,39873 04 0 139,39873 33
139,39873 05 1 039,39873 33
139,39873 06 0 139,39873 33
-

How does level() know whether to modify your data or add a new level? level() uses contextual information – if the name you provide to your level() call is already a field that exists in your data set, level() will treat this as a request to modify this level of data. If, on the other hand, you provide a name not used in the data set, level() will assume you mean to add nested data under the existing data.

-

We can observe that the new variable is created at the level of aggregation you chose – countries. Also, although N is not specified anywhere, level() knows how large N should be based on the number of countries it finds in the dataset. It is important, then, to ensure that the level() command is correctly assigned to the level of interest.

+

We can observe that the new variable is created at the level of aggregation you chose – countries. Also, although N is not specified anywhere, modify_level() knows how large N should be based on the number of countries it finds in the dataset. It is important, then, to ensure that the modify_level() command is correctly assigned to the level of interest.

We can also modify more than one level. Recalling our country-province-citizen data from above, the following process is possible:

new_citizen_data <-
   fabricate(
     data = citizen_data,
-    countries = level(avg_temp = runif(N, 30, 80)),
-    provinces = level(conflict_zone = draw_discrete(N, 
-                                                    x=0.2 + has_nat_resources * 0.3,
-                                                    type="binary"),
-                      infant_mortality = runif(N, 0, 10) + 
-                        conflict_zone * 10 + 
-                        (avg_temp > 70) * 10),
-    citizens = level(college_degree = draw_discrete(N, 
-                                                    x=0.4 - (0.3 * conflict_zone), 
-                                                    type="binary"))
+    countries = modify_level(avg_temp = runif(N, 30, 80)),
+    provinces = modify_level(conflict_zone = draw_discrete(N, 
+                                                           x=0.2 + has_nat_resources * 0.3,
+                                                           type="binary"),
+                             infant_mortality = runif(N, 0, 10) + 
+                               conflict_zone * 10 + 
+                               (avg_temp > 70) * 10),
+    citizens = modify_level(college_degree = draw_discrete(N, 
+                                                           x=0.4 - (0.3 * conflict_zone),
+                                                           type="binary"))
   )
-

Before assessing what this tells us about level(), let’s consider what the data simulated does. It creates a new variable at the country level, for a country level average temperature. Subsequently, it creates a province level binary indicator for whether the province is an active conflict site. Provinces that have natural resources are more likely to be in conflict in this simulation, drawing on conclusions from literature on “resource curses”. The infant mortality rate for the province is able to depend both on province level data we have just generated, and country-level data: it is higher in high-temperature areas (reflecting literature on increased disease burden near the equator) and also higher in conflict zones. Citizens access to education is also random, but depends on whether they live in a conflict area.

-

There are a lot of things to learn from this example. First, it’s possible to modify multiple levels. Any new variable created will automatically propagate to the lower level data according – by setting an average temperature for a country, all provinces, and all citizens of those provinces, have the value for the country. Values created from one level() call can be used in subsequent variables of the same call, or subsequent calls.

+

Before assessing what this tells us about modify_level(), let’s consider what the data simulated does. It creates a new variable at the country level, for a country level average temperature. Subsequently, it creates a province level binary indicator for whether the province is an active conflict site. Provinces that have natural resources are more likely to be in conflict in this simulation, drawing on conclusions from literature on “resource curses”. The infant mortality rate for the province is able to depend both on province level data we have just generated, and country-level data: it is higher in high-temperature areas (reflecting literature on increased disease burden near the equator) and also higher in conflict zones. Citizens access to education is also random, but depends on whether they live in a conflict area.

+

There are a lot of things to learn from this example. First, it’s possible to modify multiple levels. Any new variable created will automatically propagate to the lower level data according – by setting an average temperature for a country, all provinces, and all citizens of those provinces, have the value for the country. Values created from one modify_level() call can be used in subsequent variables of the same call, or subsequent calls.

Again, we see the use of draw_discrete(). Using this function is covered in our tutorial on generating discrete random variables, linked below.

diff --git a/docs/articles/resampling.html b/docs/articles/resampling.html index a2ba605..8925dce 100644 --- a/docs/articles/resampling.html +++ b/docs/articles/resampling.html @@ -180,8 +180,8 @@

Consider this example, which takes a dataset containing 2 cities of 3 citizens, and resamples it into a dataset of 3 cities, each containing 5 citizens.

my_data <-
   fabricate(
-    cities = level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
-    citizens = level(N = 3, age = runif(N, 18, 70))
+    cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
+    citizens = add_level(N = 3, age = runif(N, 18, 70))
   )
 
 my_data_2 <- resample_data(my_data, 
@@ -289,6 +289,116 @@ 

resample_data() will first select the cities to be resampled. Then, for each city, it will continue by selecting the citizens to be resampled. If a higher level unit is used more than once (for example, the same city being chosen twice), and a lower level is subsequently resampled, the choices of which units to keep for the lower level will differ for each copy of the higher level. In this example, if city 1 is chosen twice, then the sets of five citizens chosen for each copy of the city 1 will differ.

+

You can also specify the levels you wish to resample from using the name arguents to the N parameter, like in this example which does exactly the same thing as the previous example, but specifies the level names in a different way:

+
my_data <-
+  fabricate(
+    cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
+    citizens = add_level(N = 3, age = runif(N, 18, 70))
+  )
+
+my_data_2 <- resample_data(my_data, 
+                           N = c(cities=3, citizens=5))
+my_data_2
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
citieselevationcitizensage
21014639
21014551
21014551
21014639
21014551
21014441
21014639
21014551
21014441
21014551
11421164
11421266
11421343
11421343
11421266

diff --git a/docs/articles/variable_generation.html b/docs/articles/variable_generation.html index d0c0bb5..ccfac67 100644 --- a/docs/articles/variable_generation.html +++ b/docs/articles/variable_generation.html @@ -108,10 +108,9 @@

Aaron Rudkin

Fabricating discrete random variables.

fabricatr provides a convenient helper function, draw_discrete(), which you can use to generate discrete random variables far more easily than using R’s built-in data generation mechanisms. Below, we introduce you to the types of data you can generate using draw_discrete().

- -
-

-Binary and binomial outcomes

+
+

+Binary and binomial outcomes

The simplest possible type of data, and draw_discrete()’s default, is a binary random variable (also called a bernoulli random variable). Generating a binary random variable requires only one parameter x which specifies the probability that outcomes drawn from this variable are equal to 1. By default, draw_discrete() will generate N = length(x) draws. N can also be specified explicitly. Consider these examples:

draw_discrete_ex = fabricate(N = 3, p = c(0, .5, 1), 
                              binary_1 = draw_discrete(x = p),
@@ -136,9 +135,9 @@ 

type = "bernoulli", link = "probit"))

-
-

-Ordered outcomes

+
+

+Ordered outcomes

Some researchers may be interested in generating ordered outcomes – for example, Likert scale outcomes. This can be done using the “ordered” type. Ordered variables require a vector of breakpoints, supplied as the argument breaks – points at which the underlying latent variable switches from category to category. The first break should always be below the lower bound of the data, while the final break should always be above the upper bound of the data.

In the following example, each of three observations has a latent variable x which is continuous and unbounded. The variable ordered transforms x into three numeric categories: 1, 2, and 3. All values of x below -1 result in ordered 1; all values of x between -1 and 1 result in ordered 2; all values of x above 1 result in ordered 3:

ordered_example = fabricate(N = 3, 
@@ -158,17 +157,17 @@ 

) )

-
-

-Count outcomes

+
+

+Count outcomes

draw_discrete() supports Poisson-distributed count outcomes. These require that the user specify the parameter x, equal to the Poisson distribution mean (often referred to as lambda).

count_outcome_example = fabricate(N = 3, 
                                   x = c(0, 5, 100), 
                                   count = draw_discrete(x, type = "count"))
-
-

-Categorical data

+
+

+Categorical data

draw_discrete() can also generate non-ordered, categorical data. Users must provide a vector of probabilities for each category (or a matrix, if each observation should have separate probabilities), as well as setting type to be “categorical”.

If probabilities do not sum to exactly one, they will be normalized, but negative probabilities will cause an error.

In the first example, each unit has a different set of probabilities and the probabilities are provided as a matrix:

@@ -192,6 +191,225 @@

## category probabilities, identical for each observation.

“categorical” variables can also use link functions, for example to generate multinomial probit data.

+
+
+

+Fabricating cluster-correlated random variables.

+

We also provide helper functions to generate cluster-correlated random variables with fixed intra-cluster correlation (ICC) values. Our two functions draw_binary_icc() and draw_normal_icc() allow you to generate both discrete binary data with fixed ICCs and normal data with fixed ICCs.

+
+

+Binary data with fixed ICCs

+

draw_binary_icc() takes three required arguments: x, a probability or vector of probabilities which determine the chance a given observation will be a 1; cluster_ids, a map of units to clusters (required to generate the correlation structure); and rho, the fixed intra-cluster correlation (from 0 to 1). Users may optionally specify N; if it is not specified, draw_binary_icc() will determine it based on the length of the cluster_ids vector.

+

Consider the following example, which models whether individuals smoke:

+
# 100 individual population, 10 each in each of 10 clusters
+cluster_ids = rep(1:10, 10)
+
+# Individuals have a 20% chance of smoking, but clusters are highly correlated
+# in their tendency to smoke
+smoker = draw_binary_icc(x = 0.2,
+                         cluster_ids = cluster_ids,
+                         rho = 0.7)
+
+# Observe distribution of smokers and non-smokers
+table(smoker)
+ + + + + + + + + +
01
8317
+

We see that approximately 20% of the population smokes, in line with our specification, but what patterns of heterogeneity do we see by cluster?

+
table(cluster_ids, smoker)
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
01
100
100
100
100
100
19
100
91
37
100
+

We observe that 7 clusters have no smokers at all, two clusters are overwhelming smokers, and one cluster is overwhelmingly non-smokers.

+

We can also specify separate means for each cluster; but it is worth noting that the higher the ICC, the more the cluster mean will depart from the nominal cluster mean.

+

If you do not specify a vector of probabilities or a correlation coefficient, the default values are probability 0.5 for each cluster and ICC (rho) of 0.5. If you do not specify cluster IDs, the function will return an error.

+
+
+

+Normal data with fixed ICCs

+

draw_normal_icc() takes four required arguments: x, a mean or vector of means, one for each cluster; cluster_ids, a map of units to clusters (required to generate the correlation structure); rho, the fixed intra-cluster correlation coefficient; and sd, a standard deviation or vector of standard deviations, one for each cluster. Users can optionally specify N, a number of units, but if it is not supplied draw_normal_icc() will determine it based on the length of the cluster_ids vector.

+

If sd is not supplied, each cluster will be assumed to have a within-cluster standard deviation of 1. If x is not supplied, each cluster will be assumed to be mean zero. If rho is not supplied, it will be set to 0.5.

+

Here, we model student academic performance by cluster:

+
# 100 students, 10 each in 10 clusters
+cluster_ids = rep(1:10, 10)
+
+numeric_grade = draw_normal_icc(x = 80,
+                               cluster_ids = cluster_ids,
+                               rho = 0.5,
+                               sd = 15)
+
+letter_grade = draw_discrete(x = numeric_grade,
+                             type = "ordered",
+                             breaks = c(-Inf, 60, 70, 80, 90, Inf),
+                             break_labels = c("F", "D", "C", "B", "A"))
+
+mean(numeric_grade)
+

82.77

+

The mean grade matches the population mean. Now let’s look at the relationship between cluster and letter grade to observe the cluster pattern:

+
table(letter_grade, cluster_ids)
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ABCDF
62200
22420
20404
03052
91000
10441
62110
32401
51121
23410
+

It is obvious upon inspection that some clusters are higher performing than others despite having identical cluster means in expectation.

+
+
+

+Technical Appendix

+

When generating binary data with a fixed ICC, we follow this formula, where \(i\) is a cluster and \(j\) is a unit in a cluster:

+

\[ +\begin{aligned} + z_i &\sim \text{Bern}(p_i) \\ + u_{ij} &\sim \text{Bern}(\sqrt{\rho}) \\ + x_{ij} &= + \begin{cases} + x_{ij} \sim \text{Bern}(p_i) & \quad \text{if } u_{ij} = 1 \\ + z_i & \quad \text{if } u_{ij} = 0 + \end{cases} +\end{aligned} +\]

+

In expectation, this guarantees an intra-cluster correlation of \(\rho\) and a cluster proportion of \(p_i\). This approach derives from Hossain, Akhtar and Chakraborti, Hrishikesh. “ICCBin: Facilitates Clustered Binary Data Generation, and Estimation of Intracluster Correlation Coefficient (ICC) for Binary Data”, available on https://cran.r-project.org/web/packages/ICCbin/index.html or https://github.com/akhtarh/ICCbin

+

When generating normal data with a fixed ICC, we follow this formula, again with \(i\) as a cluster and \(j\) as a unit in the cluster:

+

\[ +\begin{aligned} + \sigma^2_{\alpha i} &= \frac{(\rho * \sigma^2_{\epsilon i})}{(1 - \rho)} \\ + \alpha_i &\sim \mathcal{N}(0, \sigma^2_{\alpha i}) \\ + \mu_{ij} &\sim \mathcal{N}(\mu_i, \sigma^2_{\epsilon i}) \\ + x_{ij} &= \mu_{ij} + \alpha_i +\end{aligned} +\]

+

In expectation, this approach guarantees an intra-cluster correlation of \(\rho\), a cluster mean of \(\mu_{i}\), and a cluster-level variance in error terms of \(\sigma^2_{\epsilon i}\). This approach is specified on https://stats.stackexchange.com/questions/263451/create-synthetic-data-with-a-given-intraclass-correlation-coefficient-icc.

+
+
@@ -200,12 +418,22 @@

Contents

diff --git a/docs/index.html b/docs/index.html index 28b9b7e..94f791b 100644 --- a/docs/index.html +++ b/docs/index.html @@ -120,12 +120,12 @@

library(fabricatr)
 
 house_candidates = fabricate(
-  parties = level(
+  parties = add_level(
     N = 2, 
     party_ideology = c(0.5, -0.5), 
     in_power = c(1, 0), 
     party_incumbents = c(241, 194)),
-  representatives = level(
+  representatives = add_level(
     N = party_incumbents, 
     member_ideology = rnorm(N, party_ideology),
     terms_served = draw_discrete(N = N, x = 3, type = "count"),
diff --git a/docs/reference/draw_binary_icc.html b/docs/reference/draw_binary_icc.html
new file mode 100644
index 0000000..198e3c2
--- /dev/null
+++ b/docs/reference/draw_binary_icc.html
@@ -0,0 +1,198 @@
+
+
+
+  
+  
+
+
+
+fabricatr - Draw binary data with fixed intra-cluster correlation. — draw_binary_icc
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  
+  
+
+
+
+
+
+
+  
+
+  
+    
+
+ + + +
+ +
+
+ + + +

Data is generated to ensure inter-cluster correlation 0, intra-cluster +correlation in expectation \(\rho\). Algorithm taken from Hossein, +Akhtar. "ICCbin: An R Package Facilitating Clustered Binary Data +Generation, and Estimation of Intracluster Correlation Coefficient (ICC) +for Binary Data".

+ + +
draw_binary_icc(x = 0.5, N = NULL, cluster_ids, rho = 0.5)
+ +

Arguments

+ + + + + + + + + + + + + + + + + + +
x

A number or vector of numbers, one probability per cluster.

N

(Optional) A number indicating the number of observations to be +generated. Must be equal to length(cluster_ids) if provided.

cluster_ids

A vector of factors or items that can be coerced to +clusters; the length will determine the length of the generated data.

rho

A number indicating the desired RCC.

+ +

Value

+ +

A vector of binary numbers corresponding to the observations from +the supplied cluster IDs.

+ + +

Examples

+
cluster_ids = rep(1:5, 10) +draw_binary_icc(cluster_ids = cluster_ids)
#> [1] 0 1 0 0 0 1 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 1 1 1 0 0 1 1 0 0 0 1 1 0 0 1 1 1 +#> [39] 1 0 0 1 1 0 0 0 0 1 0 0
draw_binary_icc(x = 0.5, cluster_ids = cluster_ids, rho = 0.5)
#> [1] 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 +#> [39] 1 1 1 0 1 1 1 0 1 1 1 1
+
+
+ +
+ +
+ + +
+

Site built with pkgdown.

+
+ +
+
+ + + diff --git a/docs/reference/draw_discrete.html b/docs/reference/draw_discrete.html index 9c95dcc..3f0a0d0 100644 --- a/docs/reference/draw_discrete.html +++ b/docs/reference/draw_discrete.html @@ -6,7 +6,8 @@ -fabricatr - Draw discrete variables including binary, binomial count, poisson count, ordered, and categorical — draw_discrete +fabricatr - Draw discrete variables including binary, binomial count, poisson count, +ordered, and categorical — draw_discrete @@ -119,15 +120,19 @@
-

Drawing discrete data based on probabilities or latent traits is a common task that can be cumbersome. draw_binary is an alias for draw_discrete(type = "binary") that allows you to draw binary outcomes more easily.

+

Drawing discrete data based on probabilities or latent traits is a common +task that can be cumbersome. draw_binary is an alias for +draw_discrete(type = "binary") that allows you to draw binary +outcomes more easily.

draw_discrete(x, N = length(x), type = "binary", link = "identity",
-  breaks = c(-Inf, 0, Inf), break_labels = FALSE, k = 1)
+  breaks = c(-Inf, 0, Inf), break_labels = NULL, k = 1)
 
 draw_binary(x, N = length(x), link = "identity")
@@ -136,19 +141,26 @@

Ar x -

vector representing either the latent variable used to draw the count outcome (if link is "logit" or "probit") or the probability for the count outcome (if link is "identity"). For cartegorical distributions x is a matrix with as many columns as possible outcomes.

+

vector representing either the latent variable used to draw the +count outcome (if link is "logit" or "probit") or the probability for the +count outcome (if link is "identity"). For cartegorical distributions x is +a matrix with as many columns as possible outcomes.

N -

number of units to draw. Defaults to the length of the vector x

+

number of units to draw. Defaults to the length of the vector +x

type -

type of discrete outcome to draw, one of 'binary' (or 'bernoulli'), 'binomial', 'categorical', 'ordered' or 'count'

+

type of discrete outcome to draw, one of 'binary' +(or 'bernoulli'), 'binomial', 'categorical', 'ordered' or 'count'

link -

link function between the latent variable and the probability of a postiive outcome, i.e. "logit", "probit", or "identity". For the "identity" link, the latent variable must be a probability.

+

link function between the latent variable and the probability of +a postiive outcome, i.e. "logit", "probit", or "identity". For the "identity" +link, the latent variable must be a probability.

breaks @@ -156,7 +168,8 @@

Ar break_labels -

vector of labels for the breaks for an ordered latent outcome (must be the same length as breaks)

+

vector of labels for the breaks for an ordered latent +outcome (must be the same length as breaks)

k @@ -180,37 +193,38 @@

Examp #> 3 3 1.0 1

fabricate(N = 3, x = 10*rnorm(N), - binary = draw_discrete(x, type = "bernoulli", link = "probit"))
#> ID x binary -#> 1 1 2.55317055 1 -#> 2 2 -24.37263611 0 -#> 3 3 -0.05571287 1
+ binary = draw_discrete(x, type = "bernoulli", link = "probit"))
#> ID x binary +#> 1 1 -5.6 0 +#> 2 2 -5.4 0 +#> 3 3 2.3 1
fabricate(N = 3, p = c(0, .5, 1), binomial = draw_discrete(p, type = "binomial", k = 10))
#> ID p binomial #> 1 1 0.0 0 -#> 2 2 0.5 4 +#> 2 2 0.5 3 #> 3 3 1.0 10
fabricate(N = 3, x = 5*rnorm(N), - ordered = draw_discrete(x, type = "ordered", breaks = c(-Inf, -1, 1, Inf)))
#> ID x ordered -#> 1 1 -9.109088 1 -#> 2 2 -1.236627 1 -#> 3 3 -1.220998 1
+ ordered = draw_discrete(x, type = "ordered", + breaks = c(-Inf, -1, 1, Inf)))
#> ID x ordered +#> 1 1 -7.0 1 +#> 2 2 1.3 3 +#> 3 3 -2.2 1
fabricate(N = 3, x = c(0,5,100), count = draw_discrete(x, type = "count"))
#> ID x count #> 1 1 0 0 -#> 2 2 5 4 -#> 3 3 100 119
+#> 2 2 5 6 +#> 3 3 100 111
# Categorical fabricate(N = 6, p1 = runif(N), p2 = runif(N), p3 = runif(N), - cat = draw_discrete(cbind(p1, p2, p3), type = "categorical"))
#> ID p1 p2 p3 cat -#> 1 1 0.67838043 0.53021246 0.6364656 1 -#> 2 2 0.73531960 0.69582388 0.4790245 2 -#> 3 3 0.19595673 0.68855600 0.4321713 2 -#> 4 4 0.98053967 0.03123033 0.7064338 3 -#> 5 5 0.74152153 0.22556253 0.9485766 1 -#> 6 6 0.05144628 0.30083081 0.1803388 2
+ cat = draw_discrete(cbind(p1, p2, p3), type = "categorical"))
#> ID p1 p2 p3 cat +#> 1 1 0.219 0.407 0.37 2 +#> 2 2 0.665 0.858 0.92 2 +#> 3 3 0.390 0.518 0.68 3 +#> 4 4 0.046 0.979 0.67 3 +#> 5 5 0.617 0.017 0.76 1 +#> 6 6 0.598 0.673 0.54 1
-

fabricate helps you simulate a dataset before you collect it. You can either start with your own data and add simulated variables to it (by passing data to fabricate()) or start from scratch by defining N. Create hierarchical data with multiple levels of data such as citizens within cities within states using level(). You can use any R function to create each variable. We provide several built-in options to easily draw from binary and count outcomes, draw_binary and draw_discrete.

-

Fabricate a Level of Data for Hierarchical Data

+

fabricate helps you simulate a dataset before you collect it. You can +either start with your own data and add simulated variables to it (by passing +data to fabricate()) or start from scratch by defining +N. Create hierarchical data with multiple levels of data such as +citizens within cities within states using add_level() or modify +existing hierarchical data using modify_level(). You can use any R +function to create each variable. We provide several built-in options to +easily draw from binary and count outcomes, draw_binary, +draw_binary_icc, and draw_discrete.

-
fabricate(data, N, ID_label, ...)
+    
fabricate(data = NULL, N = NULL, ID_label = NULL, ...)
 
-level(N = NULL, ...)
+add_level(N = NULL, ID_label = NULL, working_environment_ = NULL, ..., + data_arguments = quos(...), new_hierarchy = FALSE) + +modify_level(N = NULL, ID_label = NULL, working_environment_ = NULL, ..., + data_arguments = quos(...))

Arguments

- + - + @@ -148,7 +166,23 @@

Ar

- + + + + + + + + + + + + +
data

(optional) user-provided data that forms the basis of the fabrication, i.e. you can add variables to existing data. Provide either N or data (N is the number of rows of the data if data is provided).

(optional) user-provided data that forms the basis of the +fabrication, i.e. you can add variables to existing data. Provide either +N or data (N is the number of rows of the data if +data is provided).

N

(optional) number of units to draw. If provided as fabricate(N = 5), this determines the number of units in the single-level data. If provided in level, i.e. fabricate(cities = level(N = 5)), N determines the number of units in a specific level of a hierarchical dataset.

(optional) number of units to draw. If provided as +fabricate(N = 5), this determines the number of units in the +single-level data. If provided in add_level, i.e. +fabricate(cities = add_level(N = 5)), N determines the number +of units in a specific level of a hierarchical dataset.

ID_label
...

Variable or level-generating arguments, such as my_var = rnorm(N). For fabricate, you may also pass level() arguments, which define a level of a multi-level dataset. See examples.

Variable or level-generating arguments, such as +my_var = rnorm(N). For fabricate, you may also pass +add_level() or modify_level() arguments, which define a level +of a multi-level dataset. See examples.

working_environment_

Internal argument, not intended for end-user use.

data_arguments

Internal argument, not intended for end-user use.

new_hierarchy

Reserved argument for future functionality to add +cross-classified data. Not yet implemented.

@@ -174,12 +208,12 @@

Examp height_ft = runif(N, 3.5, 8) ) head(df)
#> ID height_ft -#> 1 001 6.945201 -#> 2 002 6.963537 -#> 3 003 7.958205 -#> 4 004 7.867344 -#> 5 005 5.251322 -#> 6 006 5.575339
+#> 1 001 4.2 +#> 2 002 7.7 +#> 3 003 7.9 +#> 4 004 6.7 +#> 5 005 7.5 +#> 6 006 7.9
# Start with existing data df <- fabricate( data = df, @@ -189,21 +223,22 @@

Examp # Draw a two-level hierarchical dataset # containing cities within regions df <- fabricate( - regions = level(N = 5), - cities = level(N = 2, pollution = rnorm(N, mean = 5))) + regions = add_level(N = 5), + cities = add_level(N = 2, pollution = rnorm(N, mean = 5))) head(df)

#> regions cities pollution -#> 1 1 01 5.239065 -#> 2 1 02 5.236321 -#> 3 2 03 4.740881 -#> 4 2 04 5.649046 -#> 5 3 05 3.782359 -#> 6 3 06 5.841970
+#> 1 1 01 5.3 +#> 2 1 02 6.4 +#> 3 2 03 4.8 +#> 4 2 04 5.7 +#> 5 3 05 5.1 +#> 6 3 06 5.0
# Start with existing data and add variables to hierarchical data -# note: do not provide N when adding variables to an existing level +# at levels which are already present in the existing data. +# Note: do not provide N when adding variables to an existing level df <- fabricate( data = df, - regions = level(watershed = sample(c(0, 1), N, replace = TRUE)), - cities = level(runoff = rnorm(N)) + regions = modify_level(watershed = sample(c(0, 1), N, replace = TRUE)), + cities = modify_level(runoff = rnorm(N)) )
#> ID Y_pre -#> 2 2 0.3764993 -#> 4 4 1.2412631 -#> 5 5 0.6120909 -#> 2.1 2 0.3764993 -#> 3 3 1.1387077 -#> 2.2 2 0.3764993 -#> 2.3 2 0.3764993 -#> 2.4 2 0.3764993 -#> 1 1 0.1329921 -#> 2.5 2 0.3764993
+bootsrapped_data
#> ID Y_pre +#> 1 2 -0.38 +#> 2 2 -0.38 +#> 3 5 -0.87 +#> 4 1 1.52 +#> 5 4 -0.55 +#> 6 2 -0.38 +#> 7 5 -0.87 +#> 8 1 1.52 +#> 9 3 1.83 +#> 10 3 1.83
# Resample by a single level of a hierarchical dataset (e.g. resampling clusters of observations) # N specifies a number of clusters to return clustered_survey <- fabricate( - clusters = level(N=25), - cities = level(N=runif(25, 1, 5), population=runif(n = N, min=50000, max=1000000)) + clusters = add_level(N=25), + cities = add_level(N=round(runif(25, 1, 5)), population=runif(n = N, min=50000, max=1000000)) ) # Specify the name of the cluster variable one of two ways cluster_resample <- resample_data(clustered_survey, N = 5, ID_labels = "clusters") cluster_resample
#> clusters cities population -#> 21 10 21 372291.19 -#> 22 10 22 772723.08 -#> 23 10 23 60328.84 -#> 24 10 24 109980.47 -#> 29 14 29 773192.59 -#> 30 14 30 783853.01 -#> 15 07 15 329063.25 -#> 13 06 13 667030.88 -#> 14 06 14 201083.08 -#> 4 02 04 835365.36 -#> 5 02 05 739475.76 -#> 6 02 06 398962.57 -#> 7 02 07 547257.00
+#> 1 14 36 980082 +#> 2 14 37 673751 +#> 3 14 38 135356 +#> 4 14 39 649016 +#> 5 07 14 667784 +#> 6 07 15 197530 +#> 7 07 16 532894 +#> 8 07 17 131577 +#> 9 16 42 448512 +#> 10 16 43 520782 +#> 11 16 44 385069 +#> 12 08 18 527875 +#> 13 24 66 521043
cluster_resample_2 <- resample_data(clustered_survey, N=c(clusters = 5)) cluster_resample_2
#> clusters cities population -#> 16 08 16 896720.6 -#> 60 25 60 495511.1 -#> 61 25 61 892055.5 -#> 62 25 62 687021.1 -#> 8 03 08 496385.0 -#> 38 18 38 239050.7 -#> 39 18 39 662802.0 -#> 40 18 40 762727.9 -#> 41 18 41 307428.9 -#> 13 06 13 667030.9 -#> 14 06 14 201083.1
+#> 1 23 62 306210 +#> 2 23 63 513874 +#> 3 23 64 98509 +#> 4 23 65 52353 +#> 5 03 06 648891 +#> 6 19 48 239037 +#> 7 19 49 739971 +#> 8 15 40 882929 +#> 9 15 41 933767 +#> 10 18 47 391962
# Resample a hierarchical dataset on multiple levels my_data <- fabricate( - cities = level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), - citizens = level(N = 3, income = round(elevation * rnorm(n = N, mean = 5))) + cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), + citizens = add_level(N = 3, income = round(elevation * rnorm(n = N, mean = 5))) ) # Specify the levels you wish to resample one of two ways: my_data_2 <- resample_data(my_data, N = c(3, 5), ID_labels = c("cities", "citizens")) my_data_2
#> cities elevation citizens income -#> 1 2 1927.282 5 10398 -#> 2 2 1927.282 4 7181 -#> 3 2 1927.282 5 10398 -#> 4 2 1927.282 5 10398 -#> 5 2 1927.282 4 7181 -#> 6 1 1133.349 1 7082 -#> 7 1 1133.349 1 7082 -#> 8 1 1133.349 3 5500 -#> 9 1 1133.349 3 5500 -#> 10 1 1133.349 2 4927 -#> 11 1 1133.349 1 7082 -#> 12 1 1133.349 1 7082 -#> 13 1 1133.349 2 4927 -#> 14 1 1133.349 2 4927 -#> 15 1 1133.349 2 4927
+#> 1 1 1701 3 10505 +#> 2 1 1701 3 10505 +#> 3 1 1701 1 8210 +#> 4 1 1701 3 10505 +#> 5 1 1701 1 8210 +#> 6 1 1701 2 8789 +#> 7 1 1701 2 8789 +#> 8 1 1701 2 8789 +#> 9 1 1701 3 10505 +#> 10 1 1701 3 10505 +#> 11 1 1701 3 10505 +#> 12 1 1701 1 8210 +#> 13 1 1701 1 8210 +#> 14 1 1701 2 8789 +#> 15 1 1701 3 10505
my_data_3 <- resample_data(my_data, N = c(cities=3, citizens=5)) my_data_3
#> cities elevation citizens income -#> 1 2 1927.282 4 7181 -#> 2 2 1927.282 6 5911 -#> 3 2 1927.282 5 10398 -#> 4 2 1927.282 6 5911 -#> 5 2 1927.282 5 10398 -#> 6 2 1927.282 5 10398 -#> 7 2 1927.282 5 10398 -#> 8 2 1927.282 4 7181 -#> 9 2 1927.282 6 5911 -#> 10 2 1927.282 5 10398 -#> 11 1 1133.349 3 5500 -#> 12 1 1133.349 2 4927 -#> 13 1 1133.349 3 5500 -#> 14 1 1133.349 2 4927 -#> 15 1 1133.349 3 5500
+#> 1 2 1040 4 6786 +#> 2 2 1040 6 5508 +#> 3 2 1040 4 6786 +#> 4 2 1040 5 4613 +#> 5 2 1040 4 6786 +#> 6 2 1040 6 5508 +#> 7 2 1040 4 6786 +#> 8 2 1040 6 5508 +#> 9 2 1040 5 4613 +#> 10 2 1040 6 5508 +#> 11 1 1701 1 8210 +#> 12 1 1701 2 8789 +#> 13 1 1701 1 8210 +#> 14 1 1701 2 8789 +#> 15 1 1701 3 10505
# Transparently pass through all units at a given level # This example will resample 2 citizens at each of the cities: passthrough_resample_data <- resample_data(my_data, N = c(cities=ALL, citizens=2)) passthrough_resample_data
#> cities elevation citizens income -#> 1 1 1133.349 2 4927 -#> 2 1 1133.349 2 4927 -#> 3 2 1927.282 5 10398 -#> 4 2 1927.282 6 5911
+#> 1 1 1701 1 8210 +#> 2 1 1701 1 8210 +#> 3 2 1040 4 6786 +#> 4 2 1040 5 4613
diff --git a/man/cross_level.Rd b/man/cross_level.Rd new file mode 100644 index 0000000..17cc716 --- /dev/null +++ b/man/cross_level.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fabricate.R +\name{cross_level} +\alias{cross_level} +\title{Creates cross-classified (partially non-nested, joined data) with a fixed +correlation structure.} +\usage{ +cross_level(N = NULL, ID_label = NULL, working_environment_ = NULL, + by = NULL, ..., data_arguments = quos(...)) +} +\arguments{ +\item{N}{(required) The number of observations in the resulting data frame} + +\item{ID_label}{Internal keyword used to sepcify the name of the ID variable created for the new level. If left empty, this will be the name the level is assigned to as part of a \code{fabricate()} call.} + +\item{working_environment_}{Internal keyword not for end user use.} + +\item{by}{The result of a call to \code{join()} which specifies how the cross-classified data will be created} + +\item{...}{A variable or series of variables to add to the resulting data frame after the cross-classified data is created.} + +\item{data_arguments}{Internal keyword not for end user use.} +} +\description{ +Creates cross-classified (partially non-nested, joined data) with a fixed +correlation structure. +} diff --git a/man/draw_binary_icc.Rd b/man/draw_binary_icc.Rd index 7b2b519..4486dfe 100644 --- a/man/draw_binary_icc.Rd +++ b/man/draw_binary_icc.Rd @@ -4,10 +4,11 @@ \alias{draw_binary_icc} \title{Draw binary data with fixed intra-cluster correlation.} \usage{ -draw_binary_icc(x = 0.5, N = NULL, clusters, rho = 0.5) +draw_binary_icc(x = 0.5, N = NULL, clusters, rho = 0) } \arguments{ -\item{x}{A number or vector of numbers, one probability per cluster.} +\item{x}{A number or vector of numbers, one probability per cluster. If none +is provided, will default to 0.5.} \item{N}{(Optional) A number indicating the number of observations to be generated. Must be equal to length(clusters) if provided.} @@ -15,7 +16,8 @@ generated. Must be equal to length(clusters) if provided.} \item{clusters}{A vector of factors or items that can be coerced to clusters; the length will determine the length of the generated data.} -\item{rho}{A number indicating the desired RCC.} +\item{rho}{A number indicating the desired ICC, if none is provided will +default to 0.} } \value{ A vector of binary numbers corresponding to the observations from diff --git a/man/draw_discrete.Rd b/man/draw_discrete.Rd index 8a23400..a9f0adf 100644 --- a/man/draw_discrete.Rd +++ b/man/draw_discrete.Rd @@ -3,30 +3,42 @@ \name{draw_discrete} \alias{draw_discrete} \alias{draw_binary} -\title{Draw discrete variables including binary, binomial count, poisson count, ordered, and categorical} +\title{Draw discrete variables including binary, binomial count, poisson count, +ordered, and categorical} \usage{ draw_discrete(x, N = length(x), type = "binary", link = "identity", - breaks = c(-Inf, 0, Inf), break_labels = FALSE, k = 1) + breaks = c(-Inf, 0, Inf), break_labels = NULL, k = 1) draw_binary(x, N = length(x), link = "identity") } \arguments{ -\item{x}{vector representing either the latent variable used to draw the count outcome (if link is "logit" or "probit") or the probability for the count outcome (if link is "identity"). For cartegorical distributions x is a matrix with as many columns as possible outcomes.} +\item{x}{vector representing either the latent variable used to draw the +count outcome (if link is "logit" or "probit") or the probability for the +count outcome (if link is "identity"). For cartegorical distributions x is +a matrix with as many columns as possible outcomes.} -\item{N}{number of units to draw. Defaults to the length of the vector \code{x}} +\item{N}{number of units to draw. Defaults to the length of the vector +\code{x}} -\item{type}{type of discrete outcome to draw, one of 'binary' (or 'bernoulli'), 'binomial', 'categorical', 'ordered' or 'count'} +\item{type}{type of discrete outcome to draw, one of 'binary' +(or 'bernoulli'), 'binomial', 'categorical', 'ordered' or 'count'} -\item{link}{link function between the latent variable and the probability of a postiive outcome, i.e. "logit", "probit", or "identity". For the "identity" link, the latent variable must be a probability.} +\item{link}{link function between the latent variable and the probability of +a postiive outcome, i.e. "logit", "probit", or "identity". For the "identity" +link, the latent variable must be a probability.} \item{breaks}{vector of breaks to cut an ordered latent outcome} -\item{break_labels}{vector of labels for the breaks for an ordered latent outcome (must be the same length as breaks)} +\item{break_labels}{vector of labels for the breaks for an ordered latent +outcome (must be the same length as breaks)} \item{k}{the number of trials (zero or more)} } \description{ -Drawing discrete data based on probabilities or latent traits is a common task that can be cumbersome. \code{draw_binary} is an alias for \code{draw_discrete(type = "binary")} that allows you to draw binary outcomes more easily. +Drawing discrete data based on probabilities or latent traits is a common +task that can be cumbersome. \code{draw_binary} is an alias for +\code{draw_discrete(type = "binary")} that allows you to draw binary +outcomes more easily. } \examples{ fabricate(N = 3, @@ -47,7 +59,8 @@ fabricate(N = 3, fabricate(N = 3, x = 5*rnorm(N), - ordered = draw_discrete(x, type = "ordered", breaks = c(-Inf, -1, 1, Inf))) + ordered = draw_discrete(x, type = "ordered", + breaks = c(-Inf, -1, 1, Inf))) fabricate(N = 3, x = c(0,5,100), diff --git a/man/draw_normal_icc.Rd b/man/draw_normal_icc.Rd index 9275822..8b0e57e 100644 --- a/man/draw_normal_icc.Rd +++ b/man/draw_normal_icc.Rd @@ -4,10 +4,11 @@ \alias{draw_normal_icc} \title{Draw normal data with fixed intra-cluster correlation.} \usage{ -draw_normal_icc(x = 0, N = NULL, clusters, sd = 1, rho = 0.5) +draw_normal_icc(x = 0, N = NULL, clusters, sd = 1, rho = 0) } \arguments{ -\item{x}{A number or vector of numbers, one mean per cluster.} +\item{x}{A number or vector of numbers, one mean per cluster. If none is +provided, will default to 0.} \item{N}{(Optional) A number indicating the number of observations to be generated. Must be equal to length(clusters) if provided.} @@ -18,7 +19,8 @@ clusters; the length will determine the length of the generated data.} \item{sd}{A number or vector of numbers, indicating the standard deviation of each cluster's error terms} -\item{rho}{A number indicating the desired RCC.} +\item{rho}{A number indicating the desired ICC. If none is provided, +will default to 0.} } \value{ A vector of numbers corresponding to the observations from diff --git a/man/fabricate.Rd b/man/fabricate.Rd index 756b5b8..e0c0ba0 100644 --- a/man/fabricate.Rd +++ b/man/fabricate.Rd @@ -1,30 +1,57 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/fabricate.R, R/level.R +% Please edit documentation in R/fabricate.R \name{fabricate} \alias{fabricate} -\alias{level} +\alias{add_level} +\alias{modify_level} \title{Fabricate data} \usage{ -fabricate(data, N, ID_label, ...) +fabricate(data = NULL, ..., N = NULL, ID_label = NULL) -level(N = NULL, ...) +add_level(N = NULL, ID_label = NULL, working_environment_ = NULL, ..., + data_arguments = quos(...), nest = TRUE) + +modify_level(N = NULL, ID_label = NULL, working_environment_ = NULL, ..., + data_arguments = quos(...)) } \arguments{ -\item{data}{(optional) user-provided data that forms the basis of the fabrication, i.e. you can add variables to existing data. Provide either \code{N} or \code{data} (\code{N} is the number of rows of the data if \code{data} is provided).} +\item{data}{(optional) user-provided data that forms the basis of the +fabrication, i.e. you can add variables to existing data. Provide either +\code{N} or \code{data} (\code{N} is the number of rows of the data if +\code{data} is provided).} + +\item{...}{Variable or level-generating arguments, such as +\code{my_var = rnorm(N)}. For \code{fabricate}, you may also pass +\code{add_level()} or \code{modify_level()} arguments, which define a level +of a multi-level dataset. See examples.} -\item{N}{(optional) number of units to draw. If provided as \code{fabricate(N = 5)}, this determines the number of units in the single-level data. If provided in \code{level}, i.e. \code{fabricate(cities = level(N = 5))}, \code{N} determines the number of units in a specific level of a hierarchical dataset.} +\item{N}{(optional) number of units to draw. If provided as +\code{fabricate(N = 5)}, this determines the number of units in the +single-level data. If provided in \code{add_level}, i.e. +\code{fabricate(cities = add_level(N = 5))}, \code{N} determines the number +of units in a specific level of a hierarchical dataset.} \item{ID_label}{(optional) variable name for ID variable, i.e. citizen_ID} -\item{...}{Variable or level-generating arguments, such as \code{my_var = rnorm(N)}. For \code{fabricate}, you may also pass \code{level()} arguments, which define a level of a multi-level dataset. See examples.} +\item{working_environment_}{Internal argument, not intended for end-user use.} + +\item{data_arguments}{Internal argument, not intended for end-user use.} + +\item{nest}{(Default TRUE) Boolean determining whether data in an \code{add_level()} call will be nested under the current working data frame or create a separate hierarchy of levels. See our vignette for cross-classified, non-nested data for details.} } \value{ data.frame } \description{ -\code{fabricate} helps you simulate a dataset before you collect it. You can either start with your own data and add simulated variables to it (by passing \code{data} to \code{fabricate()}) or start from scratch by defining \code{N}. Create hierarchical data with multiple levels of data such as citizens within cities within states using \code{level()}. You can use any R function to create each variable. We provide several built-in options to easily draw from binary and count outcomes, \code{\link{draw_binary}} and \code{\link{draw_discrete}}. - -Fabricate a Level of Data for Hierarchical Data +\code{fabricate} helps you simulate a dataset before you collect it. You can +either start with your own data and add simulated variables to it (by passing +\code{data} to \code{fabricate()}) or start from scratch by defining +\code{N}. Create hierarchical data with multiple levels of data such as +citizens within cities within states using \code{add_level()} or modify +existing hierarchical data using \code{modify_level()}. You can use any R +function to create each variable. We provide several built-in options to +easily draw from binary and count outcomes, \code{\link{draw_binary}}, +\code{\link{draw_binary_icc}}, and \code{\link{draw_discrete}}. } \examples{ @@ -48,16 +75,17 @@ df <- fabricate( # Draw a two-level hierarchical dataset # containing cities within regions df <- fabricate( - regions = level(N = 5), - cities = level(N = 2, pollution = rnorm(N, mean = 5))) + regions = add_level(N = 5), + cities = add_level(N = 2, pollution = rnorm(N, mean = 5))) head(df) # Start with existing data and add variables to hierarchical data -# note: do not provide N when adding variables to an existing level +# at levels which are already present in the existing data. +# Note: do not provide N when adding variables to an existing level df <- fabricate( data = df, - regions = level(watershed = sample(c(0, 1), N, replace = TRUE)), - cities = level(runoff = rnorm(N)) + regions = modify_level(watershed = sample(c(0, 1), N, replace = TRUE)), + cities = modify_level(runoff = rnorm(N)) ) } diff --git a/man/join.Rd b/man/join.Rd new file mode 100644 index 0000000..48536b5 --- /dev/null +++ b/man/join.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fabricate.R +\name{join} +\alias{join} +\title{Helper function handling specification of which variables to join a +cross-classified data on, and what kind of correlation structure needed} +\usage{ +join(..., rho = 0, sigma = NULL, data_arguments = quos(...)) +} +\arguments{ +\item{...}{A series of two or more variable names, unquoted, to join on in order to create cross-classified data.} + +\item{rho}{A fixed (Spearman's rank) correlation coefficient between the variables being joined on: note that if it is not possible to make a correlation matrix from this coefficient (i.e. if you are joining on three or more variables and rho is negative) then the \code{cross_level()} call will fail.} + +\item{sigma}{A matrix with dimensions equal to the number of variables you are joining on, specifying the correlation for the resulting joined data. Only one of rho and sigma should be provided.} + +\item{data_arguments}{Internal, not for end-user use.} +} +\description{ +Helper function handling specification of which variables to join a +cross-classified data on, and what kind of correlation structure needed +} diff --git a/man/level.Rd b/man/level.Rd new file mode 100644 index 0000000..a4592ee --- /dev/null +++ b/man/level.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/fabricate.R +\name{level} +\alias{level} +\title{Deprecated level call function maintained to provide useful error for +previous fabricatr code.} +\usage{ +level(N = NULL, ID_label = NULL, ...) +} +\description{ +Deprecated level call function maintained to provide useful error for +previous fabricatr code. +} +\keyword{internal} diff --git a/man/resample_data.Rd b/man/resample_data.Rd index 3f024e3..7ebf656 100644 --- a/man/resample_data.Rd +++ b/man/resample_data.Rd @@ -31,8 +31,8 @@ bootsrapped_data # N specifies a number of clusters to return clustered_survey <- fabricate( - clusters = level(N=25), - cities = level(N=runif(25, 1, 5), population=runif(n = N, min=50000, max=1000000)) + clusters = add_level(N=25), + cities = add_level(N=round(runif(25, 1, 5)), population=runif(n = N, min=50000, max=1000000)) ) # Specify the name of the cluster variable one of two ways @@ -47,8 +47,8 @@ cluster_resample_2 my_data <- fabricate( - cities = level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), - citizens = level(N = 3, income = round(elevation * rnorm(n = N, mean = 5))) + cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), + citizens = add_level(N = 3, income = round(elevation * rnorm(n = N, mean = 5))) ) # Specify the levels you wish to resample one of two ways: diff --git a/tests/testthat/test-crossclassified.R b/tests/testthat/test-crossclassified.R new file mode 100644 index 0000000..8a570a1 --- /dev/null +++ b/tests/testthat/test-crossclassified.R @@ -0,0 +1,149 @@ +context("Fabricate") + +test_that("Cross-classified data", { + # Example draw setup + students = fabricate( + primary_schools = add_level(N = 100, + ps_quality = runif(n=N, 1, 100), + ps_hasband = draw_binary(0.5, N=N), + ps_testscores = ps_quality * 5 + rnorm(N, 30, 5)), + secondary_schools = add_level(N = 50, + ss_quality = runif(n=N, 1, 100), + ss_hascomputers = draw_binary(ss_quality/100, N=N), + ss_testscores = ss_quality * 5 + rnorm(N, 30, 5), + nest = FALSE), + students = cross_level(N = 1000, + by = join(ps_quality, ss_quality, rho=0.5), + student_score = ps_testscores * 5 + ss_testscores * 10 + rnorm(N, 10, 5), + student_score_2 = student_score * 2, + extracurricular = ps_hasband + ss_hascomputers + ) + ) + + # Within a reasonable "tolerance" + expect_gte(cor(students$ps_quality, students$ss_quality), 0.3) + expect_lte(cor(students$ps_quality, students$ss_quality), 0.7) + + # Uncorrelated + students_uncorr = fabricate( + primary_schools = add_level(N = 100, + ps_quality = runif(n=N, 1, 100), + ps_hasband = draw_binary(0.5, N=N), + ps_testscores = ps_quality * 5 + rnorm(N, 30, 5)), + secondary_schools = add_level(N = 50, + ss_quality = runif(n=N, 1, 100), + ss_hascomputers = draw_binary(ss_quality/100, N=N), + ss_testscores = ss_quality * 5 + rnorm(N, 30, 5), + nest = FALSE), + students = cross_level(N = 1000, + by = join(ps_quality, ss_quality, rho=0), + student_score = ps_testscores * 5 + ss_testscores * 10 + rnorm(N, 10, 5), + student_score_2 = student_score * 2, + extracurricular = ps_hasband + ss_hascomputers + ) + ) + + # Again, within tolerance + expect_gte(cor(students_uncorr$ps_quality, students_uncorr$ss_quality), -0.1) + expect_lte(cor(students_uncorr$ps_quality, students_uncorr$ss_quality), 0.1) + + + # Specifying sigma in lieu of rho + test_next = fabricate( + l1 = add_level(N = 50, j1 = rnorm(N)), + l2 = add_level(N = 50, j2 = rnorm(N), nest=FALSE), + joined = cross_level(N = 200, + by = join(j1, j2, sigma=matrix(c(1, 0.5, 0.5, 1), ncol=2))) + ) + + expect_gte(cor(test_next$j1, test_next$j2), 0.3) + expect_lte(cor(test_next$j1, test_next$j2), 0.7) +}) + +test_that("Code path without mvnfast", { + # Need to directly call joint_draw_ecdf because we don't let users voluntarily + # override the use_f argument + dl = list(j1 = rnorm(100), + j2 = rnorm(500)) + result = fabricatr:::joint_draw_ecdf(dl, N = 100, rho = 0.3, use_f = FALSE) + data = cbind(dl$j1[result[[1]]], + dl$j2[result[[2]]]) + expect_gte(cor(data[, 1], data[, 2]), 0.1) + expect_lte(cor(data[, 1], data[, 2]), 0.5) +}) + +test_that("Deliberate failures in join_dfs", { + df1 = fabricate(N=100, j1 = rnorm(100)) + df2 = fabricate(N=100, j2 = rnorm(100)) + df3 = fabricate(N=100, j3 = rnorm(100)) + + expect_error(fabricatr:::join_dfs(df1, c("j1"), N=100, rho=0.5)) + expect_error(fabricatr:::join_dfs(list(df1, df2), c("j1"), N=100, rho=0.5)) + expect_error(fabricatr:::join_dfs(list(df1), c("j1"), N=100, rho=0.5)) + expect_error(fabricatr:::join_dfs(list(df1, df2), c("j1", "j2"), N=-1, rho=0.5)) + expect_error(fabricatr:::join_dfs(list(df1, df2), c("j1"), N=c(3, 10), rho=0.5)) + expect_error(fabricatr:::join_dfs(list(df1, df2, df3), c("j1", "j2", "j3"), N=100, rho=-0.5)) + expect_error(fabricatr:::join_dfs(list(df1, df2), c("j1", "j2"), N=100, rho=c(0.5, 0.3))) + + expect_error(fabricatr:::join_dfs(list(df1, df2), c("j1", "j2"), + N=100, + sigma=matrix(c(1, 0.3, 0.3, 0.3, 1, 0.3, 0.3, 0.3, 1), + ncol = 3 + ))) +}) + +test_that("Deliberate failures in cross_level", { + expect_error( + fabricate( + l1 = add_level(N = 50, j1 = rnorm(N)), + l2 = add_level(N = 50, j2 = rnorm(N), nest=FALSE), + joined = cross_level(N = 200, + by = join(j1, + j_error, + sigma=matrix(c(1, 0.5, 0.5, 1), + ncol=2) + ) + ) + ) + ) + + expect_error( + fabricate( + l1 = add_level(N = 50, j1 = rnorm(N)), + l2 = add_level(N = 50, j_var = rnorm(N), j1 = runif(N, 1, 3), nest=FALSE), + joined = cross_level(N = 200, + by = join(j1, j_var, sigma=matrix(c(1, 0.5, 0.5, 1), ncol=2))) + ) + ) + + expect_error( + fabricate( + l1 = add_level(N = 50, j1 = rnorm(N)), + l2 = add_level(N = 50, j2 = rnorm(N), nest=FALSE), + joined = cross_level(N = 200) + ) + ) + + expect_error( + fabricate( + l1 = add_level(N = 50), + joined = cross_level(N = 200) + ) + ) + + expect_error( + fabricate( + l1 = add_level(N = 50, v1 = rnorm(N), v2 = rnorm(N), v3 = rnorm(N)), + l2 = add_level(N = 30, v4 = rnorm(N), nest=FALSE), + joined = cross_level(N = 100, by=join(v1, v2)) + ) + ) + + expect_error( + fabricate( + l1 = add_level(N = 50, v1 = rnorm(N), v2 = rnorm(N), v3 = rnorm(N)), + l2 = add_level(N = 30, v4 = rnorm(N), nest=FALSE), + joined = cross_level(N = 100, by=join(v1, v4, v1)) + ) + ) +}) diff --git a/tests/testthat/test-fabrication.R b/tests/testthat/test-fabrication.R index 33b207b..cb93181 100644 --- a/tests/testthat/test-fabrication.R +++ b/tests/testthat/test-fabrication.R @@ -26,84 +26,96 @@ test_that("Fabricate", { ID_label = "ID" ) - fabricate(regions = level(N = 5, gdp = rnorm(N))) + fabricate(regions = add_level(N = 5, gdp = rnorm(N))) fabricate( - regions = level(N = 5, gdp = rnorm(N)), - cities = level(N = sample(1:5), subways = gdp + 10) + regions = add_level(N = 5, gdp = rnorm(N)), + cities = add_level(N = sample(1:5), subways = gdp + 10) ) - fabricate(regions = level(N = 5), - cities = level(N = sample(1:5), subways = rnorm(N, mean = 5))) + fabricate(regions = add_level(N = 5), + cities = add_level(N = sample(1:5), subways = rnorm(N, mean = 5))) fabricate( - regions = level(N = 5, gdp = runif(N)), - cities = level(N = sample(1:5), subways = rnorm(N, mean = 5)) + regions = add_level(N = 5, gdp = runif(N)), + cities = add_level(N = sample(1:5), subways = rnorm(N, mean = 5)) ) # User provides matrix, test conversion. - fabricate(data = matrix(rep(c(1, 2, 3), 3), byrow=TRUE, ncol=3, nrow=3)) + fabricate(data = matrix(rep(c(1, 2, 3), 3), + byrow=TRUE, + ncol=3, + nrow=3)) }) -test_that("use a function to choose N of a level", { +test_that("choose N of a level based on data from higher levels", { fabricate( - regions = level(N = 2, gdp = runif(N)), - cities = level( - N = function(x) - return(round(gdp) * 10 + 1), + regions = add_level(N = 2, gdp = runif(N)), + cities = add_level( + N = round(gdp) * 10 + 1, subways = rnorm(N, mean = 5) ) ) }) +test_that("Import data, single level var modification, with/without ID", { + expect_equal( + ncol(fabricate(datasets::BOD, dd = demand * 2, ID_label="Time")), + 3) + + expect_equal( + ncol(fabricate(datasets::BOD, dd = demand * 2)), + 4) + + expect_equal( + ncol(fabricate(datasets::BOD, dd = demand * 2, ID_label="Jello")), + 4) +}) + test_that("trigger errors", { + # User didn't provide a name for a level, and let's make sure that we also + # didn't interpret the unnamed level as any of the special arguments contextually expect_error(fabricate( - regions = level(), - cities = level(N = sample(1:5), subways = rnorm(N, mean = 5)) - )) + data = NULL, + N = NULL, + ID_label = NULL, + countries = add_level(N = 10), + add_level(N = 5, population = rnorm(N)))) expect_error(fabricate( - regions = level(N = c(1, 2)), - cities = level(N = sample(1:5), subways = rnorm(N, mean = 5)) + regions = add_level(), + cities = add_level(N = sample(1:5), subways = rnorm(N, mean = 5)) )) expect_error(fabricate( - regions = level(N = 2), - cities = level(N = c(5, 5, 5), subways = rnorm(N, mean = 5)) + regions = add_level(N = c(1, 2)), + cities = add_level(N = sample(1:5), subways = rnorm(N, mean = 5)) )) expect_error(fabricate( - regions = level(N = 2), - cities = level(N = "N that is a character vector", subways = rnorm(N, mean = 5)) + regions = add_level(N = 2), + cities = add_level(N = c(5, 5, 5), subways = rnorm(N, mean = 5)) )) - region_data <- data.frame(capital = c(1, 0, 0, 0, 0)) - expect_error(fabricatr:::fabricate_data_single_level(data = region_data, N = 5, gdp = runif(N))) - expect_error(fabricate( - regions = level(N = rep(5, 2)), - cities = level(N = c(5, 5, 5), subways = rnorm(N, mean = 5)) + regions = add_level(N = 2), + cities = add_level(N = "N that is a character vector", subways = rnorm(N, mean = 5)) )) - expect_error(fabricatr:::fabricate_data_single_level( - N = c(5, 2), - gdp = runif(N), - ID_label = "my-level" + expect_error(fabricate( + regions = add_level(N = rep(5, 2)), + cities = add_level(N = c(5, 5, 5), subways = rnorm(N, mean = 5)) )) # you must provide name for levels expect_error(fabricate(level(N = 5, gdp = rnorm(N)), - level( + add_level( N = sample(1:5), subways = rnorm(N, mean = gdp) ))) - # same for a single level - expect_error(fabricate(level(N = 5, - gdp = rnorm(N)))) - # No N, no data expect_error(fabricate(test1 = runif(10), test2 = test1 * 3 * runif(10, 1, 2))) @@ -120,9 +132,8 @@ test_that("trigger errors", { # Negative N expect_error(fabricate(N = -1, test1=runif(10))) - # must send a data frame to data - expect_error(user_data <- fabricate(data = c(5))) - + # Scalar as data + expect_error(fabricate(data = c(5))) # Vector as ID_label expect_error(fabricate(N=10, test1=rnorm(10), test2=rpois(10, lambda=2), ID_label=c("invalid", "id"))) # Matrix as ID_label @@ -133,9 +144,50 @@ test_that("trigger errors", { fabricate(N=10, test1=rnorm(10), test2=rpois(10, lambda=2), ID_label="hello") fabricate(N=10, test1=rnorm(10), test2=rpois(10, lambda=2), ID_label=c("hello")) # Symbol as ID_label - fabricate(N=10, test1=rnorm(10), test2=rpois(10, lambda=2), ID_label=test1) - fabricate(N=10, test1=rnorm(10), test2=rpois(10, lambda=2), ID_label=test3) + expect_error(fabricate(N=10, test1=rnorm(10), test2=rpois(10, lambda=2), ID_label=test1)) + expect_error(fabricate(N=10, test1=rnorm(10), test2=rpois(10, lambda=2), ID_label=test3)) # Unusual test with implicit data argument expect_error(fabricate(N=10, 1:N)) + +}) + +test_that("unusual pass of add_level call to single level generation as data matrix", { + expect_error(fabricate(add_level(N = 5, + gdp = rnorm(N)))) +}) + +test_that("modify_level call when you probably meant add_level", { + expect_error(fabricate(countries = modify_level(N = 10, new_var = rnorm(N)))) +}) + +test_that("modify_level call where you don't specify which level", { + expect_error(fabricate(countries = add_level(N=20), + modify_level(ID_new = as.numeric(ID) * 2))) +}) + +test_that("nest_level call when there was no data to nest", { + # No import data, nest level + expect_error(fabricate(countries = nest_level(N = 10, new_var = rnorm(N)))) + + # Import data, should be able to nest level + fabricate(datasets::BOD, units = nest_level(N = 2, dd = demand * 2)) +}) + + +test_that("multiple non-nested data frames, again and again", { + fabricate( + l1 = add_level(N = 100), + l2 = add_level(N = 200, nest=FALSE), + l3 = add_level(N = 100, nest=FALSE), + l4 = add_level(N = 300, nest=FALSE) + ) +}) + +test_that("importing data and then specifying a level ID variable that is in data.", { + df = fabricate(N = 100, d1 = rnorm(N), ID_label = "hello") + df2 = fabricate(df, ID_label = "hello", new_var1 = d1 * 2) + expect_equal(length(colnames(df2)), 3) + df3 = fabricate(df, new_var1 = d1 * 2) + expect_equal(length(colnames(df3)), 4) }) diff --git a/tests/testthat/test-helper-functions.R b/tests/testthat/test-helper-functions.R new file mode 100644 index 0000000..56f5c74 --- /dev/null +++ b/tests/testthat/test-helper-functions.R @@ -0,0 +1,80 @@ +context("Helper functions") + +test_that("Error handlers: handle_data", { + # User passed in data that isn't a data frame (no dimensionality) + expect_error(handle_data(1:10)) + + # It's not a data frame and the user didn't specify the argument name + # this is a weird case involving arguments being interpreted by position + # from fabricate and weird stuff flipping through to handle_data + expect_error(handle_data(matrix(1:9, nrow=3, ncol=3))) + + # Test coercion of dimensional but non-df objects -- a matrix is most common + # example -- this is also a working use case. + handle_data(data = matrix(1:9, nrow=3, ncol=3)) + + # Really stupid test -- object has dimensionality but won't coerce to df + # this should almost never happen except for very poorly behaved objects + X = 1:10 + Y = X*2 + rnorm(10) + df = data.frame(X = X, Y=Y) + model = lm(Y ~ X, df) + dim(model) = c(3, 4) # This will break the model object, but fine for test + expect_error(handle_data(data = model)) +}) + +test_that("Error handlers: handle_id", { + # Cartoon scenario where we're asked to generate an ID + # but our 6 default variables are all taken + data = data.frame(ID = 1:10, + fab_ID_1 = 11:20, + fab_ID_2 = 21:30, + fab_ID_3 = 31:40, + fab_ID_4 = 41:50, + fab_ID_5 = 51:60) + expect_error(handle_id(NULL, data)) + + # And verify that the waterfall works as expected + ID = handle_id(NULL, data[, 1:2]) + expect_equal(ID, "fab_ID_2") +}) + +test_that("Error handlers: handle_n", { + # Passed closure as N, didn't evaluate it + expect_error(handle_n(N = function(x) { x*2 })) + + # Passed closure as N, did evaluate it + func = function(x) { x*2 } + handle_n(N = func(4)) + + # Non-numeric type where coercion gives a warning + expect_error(handle_n(N = "hello")) + + # Non-numeric type where coercion gives an explicit error + expect_error(handle_n(N = list(Z = Y ~ X))) +}) + +test_that("Error handlers: check_rectangular", { + # Everything is either length N or length 1 + test = list(X = 1:10, + Y = 11:20, + Z = 4) + N = 10 + check_rectangular(test, N) + + test[["K"]] = 5:7 + expect_error(check_rectangular(test, N)) +}) + +test_that("get_unique_variables_by_level", { + df = datasets::ChickWeight + expect_equal(length(get_unique_variables_by_level(df, "Diet")), 0) + + df$DietVar = as.numeric(df$Diet) * 3 + expect_equal(length(get_unique_variables_by_level(df, "Diet")), 1) +}) + +test_that("Advance lookahead symbol evaluator", { + my_quos = rlang::quos(J = KK * LMNOP * max(F, G, H, 20, (((K))))) + expect_equal(length(get_symbols_from_quosure(my_quos)[[1]]), 6) +}) diff --git a/tests/testthat/test-hierarchical.R b/tests/testthat/test-hierarchical.R index efbdf14..fa13265 100644 --- a/tests/testthat/test-hierarchical.R +++ b/tests/testthat/test-hierarchical.R @@ -3,13 +3,13 @@ context("hierarchical") test_that("hierarchical data is created correctly when you have a vector variable that is of length N per level",{ hierarchy <- fabricate( - regions = level(N = 3, gdp = rnorm(N)), - districts = level( + regions = add_level(N = 3, gdp = rnorm(N)), + districts = add_level( N = 2, var1 = c("recent", "ancient"), var2 = ifelse(var1 == "recent", gdp, 5) ), - cities = level(N = 2, subways = rnorm(N, mean = gdp)) + cities = add_level(N = 2, subways = rnorm(N, mean = gdp)) ) df_2 <- unique(hierarchy[,c("regions", "var2")]) @@ -25,17 +25,17 @@ df_2 <- unique(hierarchy[,c("regions", "var2")]) test_that("creating variables", { population <- fabricate( - block = level( + block = add_level( N = 5, block_effect = rnorm(N) ), - individuals = level(N = 2, noise = rnorm(N)) + individuals = add_level(N = 2, noise = rnorm(N)) ) expect_error(fabricate( N = 10, noise = rnorm(N), - block = level( + block = add_level( N = 5, block_effect = rnorm(N) ) diff --git a/tests/testthat/test-resampling.R b/tests/testthat/test-resampling.R index 8c79340..0b69fba 100644 --- a/tests/testthat/test-resampling.R +++ b/tests/testthat/test-resampling.R @@ -2,8 +2,8 @@ context("Resampling") test_that("Resampling", { two_levels <- fabricate( - regions = level(N = 5, gdp = rnorm(N)), - cities = level(N = 5, subways = rnorm(N, mean = gdp)) + regions = add_level(N = 5, gdp = rnorm(N)), + cities = add_level(N = 5, subways = rnorm(N, mean = gdp)) ) # Example with data.table codepath @@ -24,8 +24,8 @@ test_that("Resampling", { test_that("Error handling of Resampling", { two_levels <- fabricate( - regions = level(N = 5, gdp = rnorm(N)), - cities = level(N = sample(1:5), subways = rnorm(N, mean = gdp)) + regions = add_level(N = 5, gdp = rnorm(N)), + cities = add_level(N = sample(1:5), subways = rnorm(N, mean = gdp)) ) resampled_two_levels <- resample_data(two_levels) # Missing N @@ -36,14 +36,20 @@ test_that("Error handling of Resampling", { expect_error(resample_data(two_levels, c(100, 10), ID_labels = c("regions"))) # Negative N expect_error(resample_data(two_levels, c(-1), ID_labels = c("regions"))) - # Non-numeric + # Non-numeric N expect_error(resample_data(two_levels, c("hello world"), ID_labels = c("regions"))) + # Non-numeric N in direct call of resample_single_level. This is unlikely to + # arise normally since we don't export it and the code paths that call it have + # separate error handling + expect_error(resample_single_level(two_levels, N=c(1, 2), ID_label = "regions")) + expect_error(resample_single_level(two_levels, N=1.5, ID_label = "regions")) + expect_error(resample_single_level(two_levels, N="hello", ID_label = "regions")) }) test_that("Direct resample_single_level", { two_levels <- fabricate( - regions = level(N = 5, gdp = rnorm(N)), - cities = level(N = sample(1:5), subways = rnorm(N, mean = gdp)) + regions = add_level(N = 5, gdp = rnorm(N)), + cities = add_level(N = sample(1:5), subways = rnorm(N, mean = gdp)) ) null_data = two_levels[two_levels$gdp > 100, ] @@ -55,15 +61,45 @@ test_that("Direct resample_single_level", { expect_error(resample_single_level(two_levels, ID_label="invalid-id", N=10)) }) +test_that("Extremely deep resampling", { + rect_data = fabricate( + N = 10, + xA = 1:10, + xB = 11:20, + xC = 21:30, + xD = 31:40, + xE = 41:50, + xF = 51:60, + xG = 61:70, + xH = 71:80, + xI = 81:90, + xJ = 91:100, + xK = 101:110 + ) + + expect_error(resample_data(rect_data, + N = c(xA = 5, + xB = 3, + xC = 6, + xD = 7, + xE = 3, + xF = 1, + xG = 2, + xH = ALL, + xI = 2, + xJ = 4, + xK = 9))) +}) + test_that("Extremely high volume data creation.", { skip("Slows build substantially.") deep_dive_data = fabricate( - countries = level(N = 100, gdp = rlnorm(N)), - states = level(N = 50, population = rlnorm(N)), - cities = level(N = 50, holiday = runif(N, 1, 365)), - neighborhoods = level(N = 5, stoplights = draw_binary(x=0.5, N)), - houses = level(N = 5, population = runif(N, 1, 5)), - people = level(N = population, sex = ifelse(draw_binary(x=0.5, N), "M", "F")) + countries = add_level(N = 100, gdp = rlnorm(N)), + states = add_level(N = 50, population = rlnorm(N)), + cities = add_level(N = 50, holiday = runif(N, 1, 365)), + neighborhoods = add_level(N = 5, stoplights = draw_binary(x=0.5, N)), + houses = add_level(N = 5, population = runif(N, 1, 5)), + people = add_level(N = population, sex = ifelse(draw_binary(x=0.5, N), "M", "F")) ) test_resample = resample_data(deep_dive_data, @@ -73,8 +109,8 @@ test_that("Extremely high volume data creation.", { test_that("Providing ID_labels through names of N.", { two_levels <- fabricate( - regions = level(N = 5, gdp = rnorm(N)), - cities = level(N = sample(1:5), subways = rnorm(N, mean = gdp)) + regions = add_level(N = 5, gdp = rnorm(N)), + cities = add_level(N = sample(1:5), subways = rnorm(N, mean = gdp)) ) resample_data(two_levels, N=c(regions=3, cities=5)) @@ -85,18 +121,23 @@ test_that("Providing ID_labels through names of N.", { expect_error(resample_data(two_levels, N=c(invalidid=3, cities=5))) + expect_error(resample_data(two_levels, + N=c(3, cities=5))) + expect_error(resample_data(two_levels, N=c(3, 5))) }) test_that("Passthrough resampling.", { two_levels <- fabricate( - regions = level(N = 5, gdp = rnorm(N)), - cities = level(N = sample(1:5), subways = rnorm(N, mean = gdp)) + regions = add_level(N = 5, gdp = rnorm(N)), + cities = add_level(N = sample(1:5), subways = rnorm(N, mean = gdp)) ) resample_data(two_levels, N=c(regions=ALL, cities=3)) # Warning when final level resampled has passthrough -- this is superfluous expect_warning(resample_data(two_levels, N=c(regions=ALL, cities=ALL))) + + }) diff --git a/tests/testthat/test-start-with-existing-data.R b/tests/testthat/test-start-with-existing-data.R index 858e9fa..32f348f 100644 --- a/tests/testthat/test-start-with-existing-data.R +++ b/tests/testthat/test-start-with-existing-data.R @@ -1,17 +1,22 @@ context("Start with existing multi-level data and add variables") +test_that("Import data plus single-level add variables.", { + result = fabricate(mtcars, zort = drat * 2) + expect_equal(dim(result), c(32, 13)) +}) + test_that("Start with existing multi-level data and add variables",{ user_data <- fabricate( - regions = level(N = 5, gdp = rnorm(N))) + regions = add_level(N = 5, gdp = rnorm(N))) expect_equal(dim(user_data), c(5, 2)) user_data <- fabricate( - regions = level(N = 5, gdp = rnorm(N)), - cities = level(N = sample(1:5), subways = rnorm(N, mean = gdp))) + regions = add_level(N = 5, gdp = rnorm(N)), + cities = add_level(N = sample(1:5), subways = rnorm(N, mean = gdp))) expect_equal(dim(user_data), c(15, 4)) @@ -19,31 +24,38 @@ test_that("Start with existing multi-level data and add variables",{ user_data_2 <- fabricate(data = user_data, - regions = level(rob = paste0(regions, "r"))) + regions = modify_level(rob = paste0(regions, "r"))) expect_equal(dim(user_data_2), c(15, 5)) ## add a variable at the cities level user_data_3 <- fabricate(data = user_data, - cities = level(rob = paste0(cities, "c"))) + cities = modify_level(rob = paste0(cities, "c"))) expect_equal(dim(user_data_3), c(15, 5)) user_data_4 <- fabricate(data = user_data, - regions = level(rob = paste0(regions, "r")), - cities = level(bob = paste0(cities, "c"))) + regions = modify_level(rob = paste0(regions, "r")), + cities = modify_level(bob = paste0(cities, "c"))) expect_equal(dim(user_data_4), c(15, 6)) user_data_5 <- fabricate(data = user_data, - regions = level(rob = paste0(regions, "r")), - cities = level(bob = paste0(cities, "c")), - neighborhoods = level(N = 10, tmp = rnorm(N))) + regions = modify_level(rob = paste0(regions, "r")), + cities = modify_level(bob = paste0(cities, "c")), + neighborhoods = add_level(N = 10, tmp = rnorm(N))) expect_equal(dim(user_data_5), c(150, 8)) }) + +test_that("Modify var at wrong level", { + df = fabricate(country = add_level(N = 50, population = runif(N, 10000, 20000)), + state = add_level(N = 10, latitude = runif(N, 40, 50)), + town = add_level(N = 5, stop_lights = draw_binary(x = 0.7, N=N))) + expect_error(fabricate(df, state = modify_level(crime = 0.5 + stop_lights + latitude))) +}) diff --git a/tests/testthat/test-variables.R b/tests/testthat/test-variables.R index 496deb8..f31d435 100644 --- a/tests/testthat/test-variables.R +++ b/tests/testthat/test-variables.R @@ -2,14 +2,14 @@ context("Variable functions") test_that("Variable functions", { # Single-level data, logit link, inherit or implicit N - fabricate(my_level = level( + fabricate(my_level = add_level( N = 10, Y1 = rnorm(N), Y2 = draw_binary(Y1, link = "logit") )) # Single level, count, inherit or implicit N - fabricate(my_level = level( + fabricate(my_level = add_level( N = 10, Y1 = rnorm(N, 5), Y2 = draw_discrete(Y1, type = "count", k = 3) @@ -143,14 +143,8 @@ test_that("Ordered data invalid tests", { breaks=NA, break_labels=NA)) # Need to specify breaks expect_error(draw_discrete(x=rnorm(5), type="ordered", breaks=c("invalid", "break", "test"), break_labels=NA)) # Non-numeric breaks - expect_error(draw_discrete(x=rnorm(5), type="ordered", - breaks=c(1, 2), break_labels=NA)) # Insufficient number of breaks expect_error(draw_discrete(x=rnorm(5), type="ordered", breaks=c(1, 3, 2), break_labels=NA)) # Breaks out of order - expect_error(draw_discrete(x=rnorm(5), type="ordered", - breaks=c(10, 20, 30), break_labels=NA)) # Break endpoints above data - expect_error(draw_discrete(x=rnorm(5), type="ordered", - breaks=c(-50, -40, -30), break_labels=NA)) # Break endpoints below data expect_error(draw_discrete(x=rnorm(5), type="ordered", breaks=matrix(rep(c(0, 1, 2), 3), byrow=TRUE, ncol=3, nrow=3))) # Non-vector breaks expect_error(draw_discrete(x=rnorm(5), type="ordered", @@ -170,3 +164,103 @@ test_that("Ordered data valid tests", { break_labels = c("A", "B"), link="probit") }) + +test_that("Binary ICCs", { + clusters = rep(1:5, 10) + # Single probability + draw_binary_icc(clusters = clusters) + # Probability = length(cluster ids) + draw_binary_icc(x = c(0.3, 0.5, 0.7, 0.8, 0.9), clusters = clusters) + + # Invalid cluster IDs + expect_error(draw_binary_icc(clusters = data.frame(X=1:10, Y=1:10))) + # X doesn't match cluster IDs + expect_error(draw_binary_icc(x = c(0.5, 0.8), clusters = clusters)) + # X isn't a vector + expect_error(draw_binary_icc(x = data.frame(j = c(0.1, 0.2), + k = c(0.2, 0.4), + m = c(0.3, 0.6), + o = c(0.4, 0.8), + p = c(0.5, 1.0)), + clusters = clusters)) + # X isn't numeric + expect_error(draw_binary_icc(x = "hello", clusters = clusters)) + # X isn't a probability + expect_error(draw_binary_icc(x = -0.5, clusters = clusters)) + # rho isn't a single number + expect_error(draw_binary_icc(clusters = clusters, rho = c(0.5, 0.8))) + # rho isn't a probability + expect_error(draw_binary_icc(clusters = clusters, rho = 2)) + # rho isn't a number + expect_error(draw_binary_icc(clusters = clusters, rho = "hello")) + # Non-numeric N + expect_error(draw_binary_icc(clusters = clusters, N = "hello")) + # N provided but doesn't match + expect_error(draw_binary_icc(clusters = clusters, N = 20)) +}) + +test_that("Likert data example", { + set.seed(19861108) + latent = rnorm(n=100, mean=3, sd=10) + cutpoints = c(-15, -7, -3, 3, 7, 15) + likert = draw_discrete(x=latent, + type="ordered", + breaks = cutpoints) + expect_equal(length(unique(likert)), 7) + expect_equal(max(likert), 7) + expect_equal(min(likert), 1) + + draw_discrete(x=latent, + type="ordered", + breaks = cutpoints, + break_labels = c("Strongly Disagree", + "Disagree", + "Lean Disagree", + "No Opinion", + "Lean Agree", + "Agree", + "Strongly Agree")) +}) + +test_that("Normal ICC", { + clusters = rep(1:5, 10) + # Single mean + draw_normal_icc(clusters = clusters) + # Means = length(cluster ids) + draw_normal_icc(x = c(-1, -0.5, 0, 0.5, 1), clusters = clusters) + + # Invalid cluster IDs + expect_error(draw_normal_icc(clusters = data.frame(X=1:10, Y=1:10))) + # X doesn't match cluster IDs + expect_error(draw_normal_icc(x = c(0.5, 0.8), clusters = clusters)) + # X isn't a vector + expect_error(draw_normal_icc(x = data.frame(j = c(0.1, 0.2), + k = c(0.2, 0.4), + m = c(0.3, 0.6), + o = c(0.4, 0.8), + p = c(0.5, 1.0)), + clusters = clusters)) + # X isn't numeric + expect_error(draw_normal_icc(x = "hello", clusters = clusters)) + # rho isn't a single number + expect_error(draw_normal_icc(clusters = clusters, rho = c(0.5, 0.8))) + # rho isn't a probability + expect_error(draw_normal_icc(clusters = clusters, rho = 2)) + # rho isn't a number + expect_error(draw_normal_icc(clusters = clusters, rho = "hello")) + # Non-numeric N + expect_error(draw_normal_icc(clusters = clusters, N = "hello")) + # N provided but doesn't match + expect_error(draw_normal_icc(clusters = clusters, N = 20)) + # SD is wrong length + expect_error(draw_normal_icc(clusters = clusters, sd = c(1, 2))) + # SD is non-numeric + expect_error(draw_normal_icc(clusters = clusters, sd = "hello")) + # SD is not a vector + expect_error(draw_normal_icc(sd = data.frame(j = c(0.1, 0.2), + k = c(0.2, 0.4), + m = c(0.3, 0.6), + o = c(0.4, 0.8), + p = c(0.5, 1.0)), + clusters = clusters)) +}) diff --git a/vignettes/advanced_features.Rmd b/vignettes/advanced_features.Rmd index c6627e8..d3163af 100644 --- a/vignettes/advanced_features.Rmd +++ b/vignettes/advanced_features.Rmd @@ -16,13 +16,13 @@ library(fabricatr) # More complicated level creation with variable numbers of observations -[`level()`](../reference/level.html) can be used to create more complicated patterns of nesting. For example, when creating lower level data, it is possible to use a different `N` for each of the values of the higher level data: +[`add_level()`](../reference/add_level.html) can be used to create more complicated patterns of nesting. For example, when creating lower level data, it is possible to use a different `N` for each of the values of the higher level data: ```{r echo=TRUE, results="hide"} variable_data <- fabricate( - cities = level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), - citizens = level(N = c(2, 4), age = runif(N, 18, 70)) + cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), + citizens = add_level(N = c(2, 4), age = runif(N, 18, 70)) ) variable_data ``` @@ -30,16 +30,16 @@ variable_data knitr::kable(variable_data) ``` -Here, each city has a different number of citizens. And the value of `N` used to create the age variable automatically updates as needed. The result is a dataset with 6 citizens, 2 in the first city and 4 in the second. As long as N is either a number, or a vector of the same length of the current lowest level of the data, [`level()`](../reference/level.html) will know what to do. +Here, each city has a different number of citizens. And the value of `N` used to create the age variable automatically updates as needed. The result is a dataset with 6 citizens, 2 in the first city and 4 in the second. As long as N is either a number, or a vector of the same length of the current lowest level of the data, [`add_level()`](../reference/add_level.html) will know what to do. It is also possible to provide a function to N, enabling a *random* number of citizens per city: ```{r echo=TRUE, results="hide"} my_data <- fabricate( - cities = level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), - citizens = level(N = sample(1:6, size = 2, replace = TRUE), - age = runif(N, 18, 70)) + cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), + citizens = add_level(N = sample(1:6, size = 2, replace = TRUE), + age = runif(N, 18, 70)) ) my_data ``` @@ -53,8 +53,8 @@ Finally, it is possible to define `N` on the basis of higher level variables the ```{r echo=TRUE, results="hide"} variable_n_function = fabricate( - cities = level(N = 5, population = runif(N, 10, 200)), - citizens = level(N = round(population * 0.3)) + cities = add_level(N = 5, population = runif(N, 10, 200)), + citizens = add_level(N = round(population * 0.3)) ) head(variable_n_function) ``` @@ -70,10 +70,10 @@ You may want to include the mean value of a variable within a group defined by a ```{r echo=TRUE, results="hide"} ave_example = fabricate( - cities = level(N = 2), - citizens = level(N = 1:2, - income = rnorm(N), - income_mean_city = ave(income, cities)) + cities = add_level(N = 2), + citizens = add_level(N = 1:2, + income = rnorm(N), + income_mean_city = ave(income, cities)) ) ave_example ``` @@ -92,8 +92,8 @@ library(dplyr) my_data <- fabricate( - cities = level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), - citizens = level(N = c(2, 3), age = runif(N, 18, 70)) + cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), + citizens = add_level(N = c(2, 3), age = runif(N, 18, 70)) ) %>% group_by(cities) %>% mutate(pop = n()) @@ -106,8 +106,8 @@ knitr::kable(my_data) ```{r echo=TRUE, results="hide"} my_data <- -data_frame(Y = sample(1:10, 2)) %>% - fabricate(lower_level = level(N = 3, Y2 = Y + rnorm(N))) +data_frame(Y = sample(1:10, 2)) %>% + fabricate(lower_level = add_level(N = 3, Y2 = Y + rnorm(N))) my_data ``` ```{r echo=FALSE} diff --git a/vignettes/getting_started.Rmd b/vignettes/getting_started.Rmd index 65ff1a6..a285685 100644 --- a/vignettes/getting_started.Rmd +++ b/vignettes/getting_started.Rmd @@ -66,10 +66,10 @@ The most powerful use of **fabricatr** is to create hierarchical ("nested") data ```{r echo=TRUE, results="hide"} country_data <- fabricate( - countries = level(N = 5, + countries = add_level(N = 5, gdp_per_capita = runif(N, min=10000, max=50000), life_expectancy = 50 + runif(N, 10, 20) + ((gdp_per_capita > 30000) * 10)), - provinces = level(N = 10, + provinces = add_level(N = 10, has_nat_resources = draw_discrete(x=0.3, N=N, type="bernoulli"), has_manufacturing = draw_discrete(x=0.7, N=N, type="bernoulli")) ) @@ -80,7 +80,7 @@ knitr::kable(head(country_data), format.args=list(big.mark = ",")) ``` -Several things can be observed in this example. First, fabricate knows that your second [`level()`](../reference/level.html) command will be nested under the first level of data. Each level gets its own ID variable, in addition to the variables you create. Second, the meaning of the variable "N" changes. During the [`level()`](../reference/level.html) call for countries, N is 5. During the [`level()`](../reference/level.html) call for provinces, N is 10. And the resulting data, of course, has 50 observations. +Several things can be observed in this example. First, fabricate knows that your second [`add_level()`](../reference/add_level.html) command will be nested under the first level of data. Each level gets its own ID variable, in addition to the variables you create. Second, the meaning of the variable "N" changes. During the [`add_level()`](../reference/add_level.html) call for countries, N is 5. During the [`add_level()`](../reference/level.html) call for provinces, N is 10. And the resulting data, of course, has 50 observations. Finally, the province-level variables are created using the [`draw_discrete()`](../reference/draw_discrete.html) function. This is a function provided by **fabricatr** to make simulating discrete random variables simple. When you simulate your own data, you can use **fabricatr**'s functions, R's built-ins, or any custom functions you wish. [`draw_discrete()`](../reference/draw_discrete.html) is explained in [our tutorial on variable generation using **fabricatr**](variable_generation.html) @@ -94,11 +94,11 @@ Imagine importing the country-province data simulated in the previous example. B citizen_data <- fabricate( data = country_data, - citizens = level(N=10, - salary = rnorm(N, - mean = gdp_per_capita + - has_nat_resources * 5000 + - has_manufacturing * 5000, + citizens = add_level(N=10, + salary = rnorm(N, + mean = gdp_per_capita + + has_nat_resources * 5000 + + has_manufacturing * 5000, sd = 10000))) head(citizen_data) ``` @@ -106,14 +106,13 @@ head(citizen_data) knitr::kable(head(citizen_data), format.args=list(big.mark = ",")) ``` - In this example, we add a third level of data; for each of our 50 country-province observations, we now have 10 citizen-level observations. Citizen-level covariates like salary can draw from both the country-level covariate and the province-level covariate. Notice that the syntax for adding a new nested level to existing data is different than the syntax for adding new variables to the original dataset. # Modifying existing levels -Suppose you have hierarchical data, and wish to simulate variables at a higher level of aggregation. For example, imagine you import a dataset containing citizens within countries, but you wish to simulate additional country-level variables. In **fabricatr**, you can do this using the [`level()`](../reference/level.html) command. +Suppose you have hierarchical data, and wish to simulate variables at a higher level of aggregation. For example, imagine you import a dataset containing citizens within countries, but you wish to simulate additional country-level variables. In **fabricatr**, you can do this using the [`modify_level()`](../reference/modify_level.html) command. Let's use our country-province data from earlier: @@ -121,7 +120,7 @@ Let's use our country-province data from earlier: new_country_data <- fabricate( data = country_data, - countries = level(avg_temp = runif(N, 30, 80)) + countries = modify_level(avg_temp = runif(N, 30, 80)) ) head(new_country_data) @@ -131,9 +130,7 @@ knitr::kable(head(new_country_data), format.args=list(big.mark = ",")) ``` -How does [`level()`](../reference/level.html) know whether to modify your data or add a new level? [`level()`](../reference/level.html) uses contextual information -- if the name you provide to your [`level()`](../reference/level.html) call is already a field that exists in your data set, [`level()`](../reference/level.html) will treat this as a request to modify this level of data. If, on the other hand, you provide a name not used in the data set, [`level()`](../reference/level.html) will assume you mean to add nested data under the existing data. - -We can observe that the new variable is created at the level of aggregation you chose -- countries. Also, although N is not specified anywhere, [`level()`](../reference/level.html) knows how large N should be based on the number of countries it finds in the dataset. It is important, then, to ensure that the [`level()`](../reference/level.html) command is correctly assigned to the level of interest. +We can observe that the new variable is created at the level of aggregation you chose -- countries. Also, although N is not specified anywhere, [`modify_level()`](../reference/level.html) knows how large N should be based on the number of countries it finds in the dataset. It is important, then, to ensure that the [`modify_level()`](../reference/level.html) command is correctly assigned to the level of interest. We can also modify more than one level. Recalling our country-province-citizen data from above, the following process is possible: @@ -141,22 +138,22 @@ We can also modify more than one level. Recalling our country-province-citizen d new_citizen_data <- fabricate( data = citizen_data, - countries = level(avg_temp = runif(N, 30, 80)), - provinces = level(conflict_zone = draw_discrete(N, - x=0.2 + has_nat_resources * 0.3, - type="binary"), - infant_mortality = runif(N, 0, 10) + - conflict_zone * 10 + - (avg_temp > 70) * 10), - citizens = level(college_degree = draw_discrete(N, - x=0.4 - (0.3 * conflict_zone), - type="binary")) + countries = modify_level(avg_temp = runif(N, 30, 80)), + provinces = modify_level(conflict_zone = draw_discrete(N, + x=0.2 + has_nat_resources * 0.3, + type="binary"), + infant_mortality = runif(N, 0, 10) + + conflict_zone * 10 + + (avg_temp > 70) * 10), + citizens = modify_level(college_degree = draw_discrete(N, + x=0.4 - (0.3 * conflict_zone), + type="binary")) ) ``` -Before assessing what this tells us about [`level()`](../reference/level.html), let's consider what the data simulated does. It creates a new variable at the country level, for a country level average temperature. Subsequently, it creates a province level binary indicator for whether the province is an active conflict site. Provinces that have natural resources are more likely to be in conflict in this simulation, drawing on conclusions from literature on "resource curses". The infant mortality rate for the province is able to depend both on province level data we have just generated, and country-level data: it is higher in high-temperature areas (reflecting literature on increased disease burden near the equator) and also higher in conflict zones. Citizens access to education is also random, but depends on whether they live in a conflict area. +Before assessing what this tells us about [`modify_level()`](../reference/modify_level.html), let's consider what the data simulated does. It creates a new variable at the country level, for a country level average temperature. Subsequently, it creates a province level binary indicator for whether the province is an active conflict site. Provinces that have natural resources are more likely to be in conflict in this simulation, drawing on conclusions from literature on "resource curses". The infant mortality rate for the province is able to depend both on province level data we have just generated, and country-level data: it is higher in high-temperature areas (reflecting literature on increased disease burden near the equator) and also higher in conflict zones. Citizens access to education is also random, but depends on whether they live in a conflict area. -There are a lot of things to learn from this example. First, it's possible to modify multiple levels. Any new variable created will automatically propagate to the lower level data according -- by setting an average temperature for a country, all provinces, and all citizens of those provinces, have the value for the country. Values created from one [`level()`](../reference/level.html) call can be used in subsequent variables of the same call, or subsequent calls. +There are a lot of things to learn from this example. First, it's possible to modify multiple levels. Any new variable created will automatically propagate to the lower level data according -- by setting an average temperature for a country, all provinces, and all citizens of those provinces, have the value for the country. Values created from one [`modify_level()`](../reference/level.html) call can be used in subsequent variables of the same call, or subsequent calls. Again, we see the use of [`draw_discrete()`](../reference/draw_discrete.html). Using this function is covered in our tutorial on [generating discrete random variables](variable_generation.html), linked below. diff --git a/vignettes/resampling.Rmd b/vignettes/resampling.Rmd index 5572404..8604e39 100644 --- a/vignettes/resampling.Rmd +++ b/vignettes/resampling.Rmd @@ -54,8 +54,8 @@ Consider this example, which takes a dataset containing 2 cities of 3 citizens, ```{r echo=TRUE, results="hide"} my_data <- fabricate( - cities = level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), - citizens = level(N = 3, age = runif(N, 18, 70)) + cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), + citizens = add_level(N = 3, age = runif(N, 18, 70)) ) my_data_2 <- resample_data(my_data, @@ -68,3 +68,20 @@ knitr::kable(my_data_2) ``` [`resample_data()`](../reference/resample_data.html) will first select the cities to be resampled. Then, for each city, it will continue by selecting the citizens to be resampled. If a higher level unit is used more than once (for example, the same city being chosen twice), and a lower level is subsequently resampled, the choices of which units to keep for the lower level will differ for each copy of the higher level. In this example, if city 1 is chosen twice, then the sets of five citizens chosen for each copy of the city 1 will differ. + +You can also specify the levels you wish to resample from using the name arguents to the `N` parameter, like in this example which does exactly the same thing as the previous example, but specifies the level names in a different way: + +```{r echo=TRUE, results="hide"} +my_data <- + fabricate( + cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), + citizens = add_level(N = 3, age = runif(N, 18, 70)) + ) + +my_data_2 <- resample_data(my_data, + N = c(cities=3, citizens=5)) +my_data_2 +``` +```{r echo=FALSE} +knitr::kable(my_data_2) +``` diff --git a/vignettes/variable_generation.Rmd b/vignettes/variable_generation.Rmd index 44f622e..5dcd66e 100644 --- a/vignettes/variable_generation.Rmd +++ b/vignettes/variable_generation.Rmd @@ -18,7 +18,7 @@ library(fabricatr) **fabricatr** provides a convenient helper function, [`draw_discrete()`](../reference/draw_discrete.html), which you can use to generate discrete random variables far more easily than using R's built-in data generation mechanisms. Below, we introduce you to the types of data you can generate using [`draw_discrete()`](../reference/draw_discrete.html). -# Binary and binomial outcomes +## Binary and binomial outcomes The simplest possible type of data, and [`draw_discrete()`](../reference/draw_discrete.html)'s default, is a binary random variable (also called a bernoulli random variable). Generating a binary random variable requires only one parameter `x` which specifies the probability that outcomes drawn from this variable are equal to 1. By default, [`draw_discrete()`](../reference/draw_discrete.html) will generate `N = length(x)` draws. `N` can also be specified explicitly. Consider these examples: @@ -59,7 +59,7 @@ bernoulli_probit = fabricate(N = 3, x = 10*rnorm(N), link = "probit")) ``` -# Ordered outcomes +## Ordered outcomes Some researchers may be interested in generating ordered outcomes -- for example, Likert scale outcomes. This can be done using the "ordered" type. Ordered variables require a vector of breakpoints, supplied as the argument `breaks` -- points at which the underlying latent variable switches from category to category. The first break should always be below the lower bound of the data, while the final break should always be above the upper bound of the data. @@ -88,7 +88,7 @@ ordered_probit_example = fabricate(N = 3, ) ``` -# Count outcomes +## Count outcomes [`draw_discrete()`](../reference/draw_discrete.html) supports Poisson-distributed count outcomes. These require that the user specify the parameter `x`, equal to the Poisson distribution mean (often referred to as `lambda`). @@ -98,7 +98,7 @@ count_outcome_example = fabricate(N = 3, count = draw_discrete(x, type = "count")) ``` -# Categorical data +## Categorical data [`draw_discrete()`](../reference/draw_discrete.html) can also generate non-ordered, categorical data. Users must provide a vector of probabilities for each category (or a matrix, if each observation should have separate probabilities), as well as setting `type` to be "categorical". @@ -127,3 +127,118 @@ warn_draw_discrete_example = fabricate(N = 6, ``` "categorical" variables can also use link functions, for example to generate multinomial probit data. + +# Fabricating cluster-correlated random variables. + +We also provide helper functions to generate cluster-correlated random variables with fixed intra-cluster correlation (ICC) values. Our two functions `draw_binary_icc()` and `draw_normal_icc()` allow you to generate both discrete binary data with fixed ICCs and normal data with fixed ICCs. + +## Binary data with fixed ICCs + +`draw_binary_icc()` takes three required arguments: `x`, a probability or vector of probabilities which determine the chance a given observation will be a 1; `clusters`, a map of units to clusters (required to generate the correlation structure); and `rho`, the fixed intra-cluster correlation (from 0 to 1). Users may optionally specify `N`; if it is not specified, `draw_binary_icc()` will determine it based on the length of the `clusters` vector. + +Consider the following example, which models whether individuals smoke: + +```{r echo=FALSE} +set.seed(19861108) +``` +```{r echo=TRUE, results="hide"} +# 100 individual population, 10 each in each of 10 clusters +clusters = rep(1:10, 10) + +# Individuals have a 20% chance of smoking, but clusters are highly correlated +# in their tendency to smoke +smoker = draw_binary_icc(x = 0.2, + clusters = clusters, + rho = 0.7) + +# Observe distribution of smokers and non-smokers +table(smoker) +``` +```{r echo=FALSE} +knitr::kable(as.matrix(t(table(smoker)))) +``` + +We see that approximately 20% of the population smokes, in line with our specification, but what patterns of heterogeneity do we see by cluster? + +```{r echo=TRUE, results="hide"} +table(clusters, smoker) +``` +```{r echo=FALSE} +knitr::kable(table(clusters, smoker)) +``` + +We observe that 7 clusters have no smokers at all, two clusters are overwhelming smokers, and one cluster is overwhelmingly non-smokers. + +We can also specify separate means for each cluster; but it is worth noting that the higher the ICC, the more the cluster mean will depart from the nominal cluster mean. + +If you do not specify a vector of probabilities or a correlation coefficient, the default values are probability 0.5 for each cluster and ICC (rho) of 0.5. If you do not specify cluster IDs, the function will return an error. + +## Normal data with fixed ICCs + +`draw_normal_icc()` takes four required arguments: `x`, a mean or vector of means, one for each cluster; `clusters`, a map of units to clusters (required to generate the correlation structure); `rho`, the fixed intra-cluster correlation coefficient; and `sd`, a standard deviation or vector of standard deviations, one for each cluster. Users can optionally specify `N`, a number of units, but if it is not supplied `draw_normal_icc()` will determine it based on the length of the `clusters` vector. + +If `sd` is not supplied, each cluster will be assumed to have a within-cluster standard deviation of 1. If `x` is not supplied, each cluster will be assumed to be mean zero. If `rho` is not supplied, it will be set to 0.5. + +Here, we model student academic performance by cluster: +```{r echo=FALSE} +set.seed(19861108) +``` +```{r echo=TRUE, results="hide"} +# 100 students, 10 each in 10 clusters +clusters = rep(1:10, 10) + +numeric_grade = draw_normal_icc(x = 80, + clusters = clusters, + rho = 0.5, + sd = 15) + +letter_grade = draw_discrete(x = numeric_grade, + type = "ordered", + breaks = c(-Inf, 60, 70, 80, 90, Inf), + break_labels = c("F", "D", "C", "B", "A")) + +mean(numeric_grade) +``` +`r mean(numeric_grade)` + +The mean grade matches the population mean. Now let's look at the relationship between cluster and letter grade to observe the cluster pattern: + +```{r echo=TRUE, results="hide"} +table(letter_grade, clusters) +``` +```{r echo=FALSE} +knitr::kable(table(clusters, letter_grade)) +``` + +It is obvious upon inspection that some clusters are higher performing than others despite having identical cluster means in expectation. + +## Technical Appendix + +When generating binary data with a fixed ICC, we follow this formula, where $i$ is a cluster and $j$ is a unit in a cluster: + +$$ +\begin{aligned} + z_i &\sim \text{Bern}(p_i) \\ + u_{ij} &\sim \text{Bern}(\sqrt{\rho}) \\ + x_{ij} &= + \begin{cases} + x_{ij} \sim \text{Bern}(p_i) & \quad \text{if } u_{ij} = 1 \\ + z_i & \quad \text{if } u_{ij} = 0 + \end{cases} +\end{aligned} +$$ + +In expectation, this guarantees an intra-cluster correlation of $\rho$ and a cluster proportion of $p_i$. This approach derives from Hossain, Akhtar and Chakraborti, Hrishikesh. "ICCBin: Facilitates Clustered Binary Data Generation, and Estimation of Intracluster Correlation Coefficient (ICC) for Binary Data", available on [https://cran.r-project.org/web/packages/ICCbin/index.html](CRAN) or [https://github.com/akhtarh/ICCbin](GitHub) + +When generating normal data with a fixed ICC, we follow this formula, again with $i$ as a cluster and $j$ as a unit in the cluster: + +$$ +\begin{aligned} + \sigma^2_{\alpha i} &= \frac{(\rho * \sigma^2_{\epsilon i})}{(1 - \rho)} \\ + \alpha_i &\sim \mathcal{N}(0, \sigma^2_{\alpha i}) \\ + \mu_{ij} &\sim \mathcal{N}(\mu_i, \sigma^2_{\epsilon i}) \\ + x_{ij} &= \mu_{ij} + \alpha_i +\end{aligned} +$$ + +In expectation, this approach guarantees an intra-cluster correlation of $\rho$, a cluster mean of $\mu_{i}$, and a cluster-level variance in error terms of $\sigma^2_{\epsilon i}$. This approach is specified on [https://stats.stackexchange.com/questions/263451/create-synthetic-data-with-a-given-intraclass-correlation-coefficient-icc](StatsExchange).