CRAN Submission PR (#59)

DeclareDesign · Jan 23, 2018 · 775eb4b · 775eb4b
1 parent c0ffea1
commit 775eb4b
Show file tree

Hide file tree

Showing 40 changed files with 769 additions and 869 deletions.
diff --git a/R/cross_level.R b/R/cross_level.R
@@ -1,12 +1,20 @@
 
-#' Creates cross-classified (partially non-nested, joined data) with a fixed
-#' correlation structure.
+#' Creates panel or cross-classified data
+#'
+#' This function allows the user to create data structures that are paneled or
+#' cross-classified: where one level of observation draws simultaneously from
+#' two or many source levels. Common examples of panels include country-year
+#' data which have country-level and year-level characteristics.
+#'
+#' By specifying the appropriate arguments in \code{join()} within the
+#' function call, it is possible to induce correlation in cross-classified data.
 #'
 #' @param N The number of observations in the resulting data frame.
-#' If N is NULL or not provided, the join will be an "outer join" -- creating a
-#' full panel of each of the rows from each data frame provided.
-#' @param by The result of a call to \code{join()} which specifies how the
-#' cross-classified data will be created
+#' If \code{N} is NULL or not provided, the join will be an "outer product" --
+#' merging each row of each provided data frame with every row of the other
+#' data frames to make a full panel.
+#' @param by The result of a call to \code{join()} which specifies how
+#' the cross-classified data will be created
 #' @param ... A variable or series of variables to add to the resulting data
 #' frame after the cross-classified data is created.
 #'
@@ -15,22 +23,23 @@
 #' @examples
 #'
 #' # Generate full panel data
-#'
 #' panel <- fabricate(
 #'  countries = add_level(N = 20, country_shock = runif(N, 1, 10)),
 #'  years = add_level(N = 20, year_shock = runif(N, 1, 10), nest=FALSE),
 #'  obs = cross_level(by=join(countries, years), GDP_it = country_shock + year_shock)
 #' )
 #'
-#' # Generate cross-classified data and merge, no correlation
+#' # Include an "N" argument to allow for cross-classified
+#' # data.
 #' students <- fabricate(
 #'  primary_school = add_level(N = 20, ps_quality = runif(N, 1, 10)),
 #'  secondary_school = add_level(N = 15, ss_quality = runif(N, 1, 10), nest=FALSE),
 #'  students = cross_level(N = 500, by = join(primary_school, secondary_school))
 #' )
 #' head(students)
 #'
-#' # Cross-classified data with a correlation structure
+#' # Induce a correlation structure in cross-classified data by providing
+#' # rho.
 #' students <- fabricate(
 #'  primary_school = add_level(N = 20, ps_quality = runif(N, 1, 10)),
 #'  secondary_school = add_level(N = 15, ss_quality = runif(N, 1, 10), nest=FALSE),
@@ -198,10 +207,11 @@ cross_level_internal <- function(N = NULL,
 #' variables being joined on: note that if it is not possible to make a
 #' correlation matrix from this coefficient (e.g. if you are joining on three
 #' or more variables and rho is negative) then the \code{cross_level()} call
-#' will fail.
+#' will fail. Do not provide \code{rho} if making panel data.
 #' @param sigma A matrix with dimensions equal to the number of variables you
 #' are joining on, specifying the correlation for the resulting joined data.
-#' Only one of rho and sigma should be provided.
+#' Only one of rho and sigma should be provided. Do not provide \code{sigma} if
+#' making panel data.
 #' @export
 join <- function(..., rho=0, sigma=NULL) {
   data_arguments <- quos(...)

diff --git a/R/draw_binary_icc.R b/R/draw_binary_icc.R
@@ -12,19 +12,32 @@
 #' generated. Must be equal to length(clusters) if provided.
 #' @param clusters A vector of factors or items that can be coerced to
 #' clusters; the length will determine the length of the generated data.
-#' @param ICC A number indicating the desired ICC, if none is provided will
-#' default to 0.
+#' @param ICC A number indicating the desired \code{ICC}; if none is provided,
+#' the default ICC will be 0.
 #' @return A vector of binary numbers corresponding to the observations from
 #' the supplied cluster IDs.
 #' @examples
+#' # Divide units into clusters
 #' clusters = rep(1:5, 10)
+#'
+#' # Default probability 0.5, default ICC 0
 #' draw_binary_icc(clusters = clusters)
-#' draw_binary_icc(prob = 0.5, clusters = clusters, ICC = 0.5)
+#'
+#' # Specify probability or ICC
+#' corr_draw = draw_binary_icc(prob = 0.5, clusters = clusters, ICC = 0.5)
+#'
+#' # Verify ICC of data.
+#' summary(lm(corr_draw ~ as.factor(clusters)))$r.squared
 #'
 #' @importFrom stats rbinom
 #'
 #' @export
 draw_binary_icc <- function(prob = 0.5, N = NULL, clusters, ICC = 0) {
+
+    if(is.null(clusters)) {
+    stop("You must provide clusters to `draw_binary_icc`")
+  }
+
   # Let's not worry about how clusters are provided
   tryCatch({
     clusters <- as.numeric(as.factor(clusters))

diff --git a/R/draw_normal_icc.R b/R/draw_normal_icc.R
@@ -32,9 +32,23 @@
 #' @return A vector of numbers corresponding to the observations from
 #' the supplied cluster IDs.
 #' @examples
+#'
+#' # Divide observations into clusters
 #' clusters = rep(1:5, 10)
+#'
+#' # Default: unit variance within each cluster
 #' draw_normal_icc(clusters = clusters, ICC = 0.5)
 #'
+#' # Alternatively, you can specify characteristics:
+#' draw_normal_icc(mean = 10, clusters = clusters, sd = 3, ICC = 0.3)
+#'
+#' # Can specify between-cluster standard deviation instead:
+#' draw_normal_icc(clusters = clusters, sd_between = 4, ICC = 0.2)
+#'
+#' # Verify that ICC generated is accurate
+#' corr_draw = draw_normal_icc(clusters = clusters, ICC = 0.4)
+#' summary(lm(corr_draw ~ as.factor(clusters)))$r.squared
+#'
 #' @importFrom stats rnorm
 #'
 #' @export

diff --git a/R/fabricate.R b/R/fabricate.R
@@ -6,10 +6,13 @@
 #' \code{N}. Create hierarchical data with multiple levels of data such as
 #' citizens within cities within states using \code{add_level()} or modify
 #' existing hierarchical data using \code{modify_level()}. You can use any R
-#' function to create each variable. We provide several built-in options to
-#' easily draw from binary and count outcomes, including
-#' \code{\link{draw_binary}}, \code{\link{draw_count}},
-#' \code{\link{draw_binary_icc}}, and \code{\link{draw_normal_icc}}.
+#' function to create each variable. Use \code{cross_level()} to make more
+#' complex designs such as panel or cross-classified data.
+#'
+#' We also provide several built-in options to easily create variables, including
+#' \code{\link{draw_binary}}, \code{\link{draw_count}}, \code{\link{draw_likert}},
+#' and intra-cluster correlated variables \code{\link{draw_binary_icc}} and
+#' \code{\link{draw_normal_icc}}.
 #'
 #' @param data (optional) user-provided data that forms the basis of the
 #' fabrication, e.g. you can add variables to existing data. Provide either
@@ -34,9 +37,6 @@
 #'
 #' @examples
 #'
-#' # Draw a single-level dataset with no covariates
-#' df <- fabricate(N = 100)
-#' head(df)
 #'
 #' # Draw a single-level dataset with a covariate
 #' building_df <- fabricate(
@@ -45,7 +45,7 @@
 #' )
 #' head(building_df)
 #'
-#' # Start with existing data
+#' # Start with existing data instead
 #' building_modified <- fabricate(
 #'   data = building_df,
 #'   rent = rnorm(N, mean = height_ft * 100, sd = height_ft * 30)
@@ -73,9 +73,9 @@
 #'   cities = modify_level(runoff = rnorm(N))
 #' )
 #'
-#' # fabricatr can also make cross-classified data. For more information about
-#' # syntax for this functionality please read our vignette or check
-#' # documentation for \code{cross_level}:
+#' # fabricatr can also make panel or cross-classified data. For more
+#' # information about syntax for this functionality please read our vignette
+#' # or check documentation for \code{cross_level}:
 #' cross_classified <- fabricate(
 #'   primary_schools = add_level(N = 50, ps_quality = runif(N, 0, 10)),
 #'   secondary_schools = add_level(N = 100, ss_quality = runif(N, 0, 10), nest=FALSE),

diff --git a/R/resample_data.R b/R/resample_data.R
@@ -1,55 +1,63 @@
 #' Resample data, including hierarchical data
 #'
+#' This function allows you to resample any data frame. The default mode
+#' performs a single resample of size \code{N} with replacement. Users can
+#' also specify more complex resampling strategies to resample hierarchical
+#' data.
+#'
 #' @param data A data.frame, usually provided by the user.
-#' @param N The number of sample N to return. If N is a single scalar and no labels are provided, N will specify the number of unit observations to resample. If N is named, or if the ID_labels argument is specified (in which case, both N and ID_labels should be the same length), then the units resampled will be values of the levels resampled (this is useful for, e.g., cluster resampling). If N is the constant ALL for any level, all units of this level will be transparently passed through to the next level of resampling.
-#' @param ID_labels A character vector of the variables that indicate the data hierarchy, from highest to lowest (i.e., from cities to citizens).
+#' @param N The number of sample observations to return. If \code{N} is a single
+#' scalar and no labels are provided, \code{N} will specify the number of unit
+#' observations to resample. If \code{N} is named, or if the \code{ID_labels}
+#' argument is specified (in which case, both \code{N} and \code{ID_labels}
+#' should be the same length), then the units resampled will be values of the
+#' levels resampled (this is useful for, e.g., cluster resampling). If \code{N}
+#' is the constant \code{ALL} for any level, all units of this level will be
+#' transparently passed through to the next level of resampling.
+#' @param ID_labels A character vector of the variables that indicate the data
+#' hierarchy, from highest to lowest (e.g., from cities to citizens).
 #'
 #' @return A data.frame
 #'
 #' @examples
 #'
-#' # Bootstrap a dataset without any hierarchy. N specifies a number of observations to return
+#' # Resample a dataset of size N without any hierarchy
+#' baseline_survey <- fabricate(N = 50, Y_pre = rnorm(N))
+#' bootstrapped_data <- resample_data(baseline_survey)
 #'
-#' baseline_survey <- fabricate(N = 5, Y_pre = rnorm(N))
-#' bootsrapped_data <- resample_data(baseline_survey, N = 10)
-#' bootsrapped_data
+#' # Specify a fixed number of observations to return
+#' baseline_survey <- fabricate(N = 50, Y_pre = rnorm(N))
+#' bootstrapped_data <- resample_data(baseline_survey, N = 100)
 #'
-#' # Resample by a single level of a hierarchical dataset (e.g. resampling clusters of observations)
-#' # N specifies a number of clusters to return
+#' # Resample by a single level of a hierarchical dataset (e.g. resampling
+#' # clusters of observations): N specifies a number of clusters to return
 #'
 #' clustered_survey <- fabricate(
 #'   clusters = add_level(N=25),
-#'   cities = add_level(N=round(runif(25, 1, 5)), population=runif(n = N, min=50000, max=1000000))
+#'   cities = add_level(N=round(runif(25, 1, 5)),
+#'                      population=runif(n = N, min=50000, max=1000000))
 #' )
 #'
-#' # Specify the name of the cluster variable one of two ways
-#'
 #' cluster_resample <- resample_data(clustered_survey, N = 5, ID_labels = "clusters")
-#' cluster_resample
 #'
+#' # Alternatively, pass the level to resample as a name:
 #' cluster_resample_2 <- resample_data(clustered_survey, N=c(clusters = 5))
-#' cluster_resample_2
 #'
 #' # Resample a hierarchical dataset on multiple levels
-#'
 #' my_data <-
 #' fabricate(
-#'   cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
-#'   citizens = add_level(N = 3, income = round(elevation * rnorm(n = N, mean = 5)))
+#'   cities = add_level(N = 20, elevation = runif(n = N, min = 1000, max = 2000)),
+#'   citizens = add_level(N = 30, age = runif(n = N, min = 18, max = 85))
 #' )
 #'
-#' # Specify the levels you wish to resample one of two ways:
-#' my_data_2 <- resample_data(my_data, N = c(3, 5), ID_labels = c("cities", "citizens"))
-#' my_data_2
+#' # Specify the levels you wish to resample:
+#' my_data_2 <- resample_data(my_data, N = c(3, 5),
+#'                            ID_labels = c("cities", "citizens"))
 #'
-#' my_data_3 <- resample_data(my_data, N = c(cities=3, citizens=5))
-#' my_data_3
+#' # To resample every unit at a given level, use the ALL constant
+#' # This example will resample 10 citizens at each of the cities:
 #'
-#' # Transparently pass through all units at a given level
-#' # This example will resample 2 citizens at each of the cities:
-#'
-#' passthrough_resample_data <- resample_data(my_data, N = c(cities=ALL, citizens=2))
-#' passthrough_resample_data
+#' passthrough_resample_data <- resample_data(my_data, N = c(cities=ALL, citizens=10))
 #'
 #'
 #' @export
@@ -62,7 +70,9 @@ resample_data <- function(data, N, ID_labels=NULL) {
   return(df)
 }
 
-#' Magic number constant to allow users to specify "ALL" for passthrough resampling
+#' Magic number constant to allow users to specify \code{ALL} for passthrough
+#' resampling
+#'
 #' @keywords internal
 #' @export
 ALL <- -20171101L

diff --git a/R/variable_creation_functions.R b/R/variable_creation_functions.R
@@ -3,56 +3,82 @@
 #'
 #' Drawing discrete data based on probabilities or latent traits is a common
 #' task that can be cumbersome. Each function in our discrete drawing set creates
-#' a different type of discrete data: `draw_binary` creates binary 0/1 data,
-#' `draw_binomial` creates binomial data (repeated trial binary data),
-#' `draw_categorical` creates categorical data, `draw_ordered` transforms latent
-#' data into observed ordered categories, `draw_count` creates count data
-#' (poisson-distributed). `draw_liket` is an alias to `draw_ordered` that
-#' pre-specifies break labels and offers default breaks appropriate for a likert
-#' survey question.
+#' a different type of discrete data: \code{draw_binary} creates binary 0/1 data,
+#' \code{draw_binomial} creates binomial data (repeated trial binary data),
+#' \code{draw_categorical} creates categorical data, \code{draw_ordered}
+#' transforms latent data into observed ordered categories, \code{draw_count}
+#' creates count data (Poisson-distributed). \code{draw_likert} is an alias to
+#' \code{draw_ordered} that pre-specifies break labels and offers default breaks
+#' appropriate for a Likert survey question.
+#'
+#' For variables with intra-cluster correlations, see
+#' \code{\link{draw_binary_icc}} and \code{\link{draw_normal_icc}}.
 #'
 #' @param prob A number or vector of numbers representing the probability for
 #' binary or binomial outcomes; or a number, vector, or matrix of numbers
 #' representing probabilities for categorical outcomes. If you supply a link
 #' function, these underlying probabilities will be transformed.
 #' @param trials for `draw_binomial`, the number of trials for each observation
 #' @param mean for `draw_count`, the mean number of count units for each observation
-#' @param x for `draw_ordered`, the latent data for each observation.
-#' @param breaks vector of breaks to cut a latent outcome into ordered categories
+#' @param x for `draw_ordered` or `draw_likert`, the latent data for each
+#' observation.
+#' @param breaks vector of breaks to cut a latent outcome into ordered
+#' categories with `draw_ordered` or `draw_likert`
 #' @param break_labels vector of labels for the breaks to cut a latent outcome
-#' into ordered categories.
+#' into ordered categories with `draw_ordered`.
 #' @param type Type of Likert scale data for `draw_likert`. Valid options are 4,
-#' 5, and 7.
+#' 5, and 7. Type corresponds to the number of categories in the Likert scale.
 #' @param N number of units to draw. Defaults to the length of the vector of
-#' probabilities or latent data you provided
+#' probabilities or latent data you provided.
 #' @param link link function between the latent variable and the probability of
 #' a positive outcome, e.g. "logit", "probit", or "identity". For the "identity"
 #' link, the latent variable must be a probability.
 #'
 #' @examples
+#'
+#' # Drawing binary values (success or failure, treatment assignment)
 #' fabricate(N = 3,
 #'    p = c(0, .5, 1),
 #'    binary = draw_binary(prob = p))
 #'
-#'
+#' # Drawing binary values with probit link (transforming continuous data
+#' # into a probability range).
 #' fabricate(N = 3,
 #'    x = 10 * rnorm(N),
 #'    binary = draw_binary(prob = x, link = "probit"))
 #'
+#' # Repeated trials: `draw_binomial`
 #' fabricate(N = 3,
 #'    p = c(0, .5, 1),
 #'    binomial = draw_binomial(prob = p, trials = 10))
 #'
+#' # Ordered data: transforming latent data into observed, ordinal data.
+#' # Useful for survey responses.
 #' fabricate(N = 3,
 #'    x = 5 * rnorm(N),
-#'    ordered = draw_ordered(x=x,
+#'    ordered = draw_ordered(x = x,
 #'                           breaks = c(-Inf, -1, 1, Inf)))
 #'
+#' # Providing break labels for latent data.
 #' fabricate(N = 3,
-#'    x = c(0,5,100),
-#'    count = draw_count(mean=x))
+#'    x = 5 * rnorm(N),
+#'    ordered = draw_ordered(x = x,
+#'                           breaks = c(-Inf, -1, 1, Inf),
+#'                           break_labels = c("Not at all concerned",
+#'                                            "Somewhat concerned",
+#'                                            "Very concerned")))
+#'
+#' # Likert data: often used for survey data
+#' fabricate(N = 10,
+#'           support_free_college = draw_likert(x = rnorm(N),
+#'                                              type = 5))
+#'
+#' # Count data: useful for rates of occurrences over time.
+#' fabricate(N = 5,
+#'    x = c(0, 5, 25, 50, 100),
+#'    theft_rate = draw_count(mean=x))
 #'
-#' # Categorical
+#' # Categorical data: useful for demographic data.
 #' fabricate(N = 6, p1 = runif(N), p2 = runif(N), p3 = runif(N),
 #'           cat = draw_categorical(cbind(p1, p2, p3)))
 #'