Skip to content

Commit

Permalink
CRAN Submission PR (#59)
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronrudkin committed Jan 23, 2018
1 parent c0ffea1 commit 775eb4b
Show file tree
Hide file tree
Showing 40 changed files with 769 additions and 869 deletions.
32 changes: 21 additions & 11 deletions R/cross_level.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@

#' Creates cross-classified (partially non-nested, joined data) with a fixed
#' correlation structure.
#' Creates panel or cross-classified data
#'
#' This function allows the user to create data structures that are paneled or
#' cross-classified: where one level of observation draws simultaneously from
#' two or many source levels. Common examples of panels include country-year
#' data which have country-level and year-level characteristics.
#'
#' By specifying the appropriate arguments in \code{join()} within the
#' function call, it is possible to induce correlation in cross-classified data.
#'
#' @param N The number of observations in the resulting data frame.
#' If N is NULL or not provided, the join will be an "outer join" -- creating a
#' full panel of each of the rows from each data frame provided.
#' @param by The result of a call to \code{join()} which specifies how the
#' cross-classified data will be created
#' If \code{N} is NULL or not provided, the join will be an "outer product" --
#' merging each row of each provided data frame with each other data frame to
#' make a full panel.
#' @param by The result of a call to \code{join()} which specifies how
#' the cross-classified data will be created
#' @param ... A variable or series of variables to add to the resulting data
#' frame after the cross-classified data is created.
#'
Expand All @@ -15,22 +23,23 @@
#' @examples
#'
#' # Generate full panel data
#'
#' panel <- fabricate(
#' countries = add_level(N = 20, country_shock = runif(N, 1, 10)),
#' years = add_level(N = 20, year_shock = runif(N, 1, 10), nest=FALSE),
#' obs = cross_level(by=join(countries, years), GDP_it = country_shock + year_shock)
#' )
#'
#' # Generate cross-classified data and merge, no correlation
#' # Include an "N" argument to allow for cross-classified
#' # data.
#' students <- fabricate(
#' primary_school = add_level(N = 20, ps_quality = runif(N, 1, 10)),
#' secondary_school = add_level(N = 15, ss_quality = runif(N, 1, 10), nest=FALSE),
#' students = cross_level(N = 500, by = join(primary_school, secondary_school))
#' )
#' head(students)
#'
#' # Cross-classified data with a correlation structure
#' # Induce a correlation structure in cross-classified data by providing
#' # rho.
#' students <- fabricate(
#' primary_school = add_level(N = 20, ps_quality = runif(N, 1, 10)),
#' secondary_school = add_level(N = 15, ss_quality = runif(N, 1, 10), nest=FALSE),
Expand Down Expand Up @@ -198,10 +207,11 @@ cross_level_internal <- function(N = NULL,
#' variables being joined on: note that if it is not possible to make a
#' correlation matrix from this coefficient (e.g. if you are joining on three
#' or more variables and rho is negative) then the \code{cross_level()} call
#' will fail.
#' will fail. Do not provide \code{rho} if making panel data.
#' @param sigma A matrix with dimensions equal to the number of variables you
#' are joining on, specifying the correlation for the resulting joined data.
#' Only one of rho and sigma should be provided.
#' Only one of rho and sigma should be provided. Do not provide \code{sigma} if
#' making panel data.
#' @export
join <- function(..., rho=0, sigma=NULL) {
data_arguments <- quos(...)
Expand Down
19 changes: 16 additions & 3 deletions R/draw_binary_icc.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,32 @@
#' generated. Must be equal to length(clusters) if provided.
#' @param clusters A vector of factors or items that can be coerced to
#' clusters; the length will determine the length of the generated data.
#' @param ICC A number indicating the desired ICC, if none is provided will
#' default to 0.
#' @param ICC A number indicating the desired \code{ICC}; if none is provided,
#' the default ICC will be 0.
#' @return A vector of binary numbers corresponding to the observations from
#' the supplied cluster IDs.
#' @examples
#' # Divide units into clusters
#' clusters = rep(1:5, 10)
#'
#' # Default probability 0.5, default ICC 0
#' draw_binary_icc(clusters = clusters)
#' draw_binary_icc(prob = 0.5, clusters = clusters, ICC = 0.5)
#'
#' # Specify probability or ICC
#' corr_draw = draw_binary_icc(prob = 0.5, clusters = clusters, ICC = 0.5)
#'
#' # Verify ICC of data.
#' summary(lm(corr_draw ~ as.factor(clusters)))$r.squared
#'
#' @importFrom stats rbinom
#'
#' @export
draw_binary_icc <- function(prob = 0.5, N = NULL, clusters, ICC = 0) {

if(is.null(clusters)) {
stop("You must provide clusters to `draw_binary_icc`")
}

# Let's not worry about how clusters are provided
tryCatch({
clusters <- as.numeric(as.factor(clusters))
Expand Down
14 changes: 14 additions & 0 deletions R/draw_normal_icc.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,23 @@
#' @return A vector of numbers corresponding to the observations from
#' the supplied cluster IDs.
#' @examples
#'
#' # Divide observations into clusters
#' clusters = rep(1:5, 10)
#'
#' # Default: unit variance within each cluster
#' draw_normal_icc(clusters = clusters, ICC = 0.5)
#'
#' # Alternatively, you can specify characteristics:
#' draw_normal_icc(mean = 10, clusters = clusters, sd = 3, ICC = 0.3)
#'
#' # Can specify between-cluster standard deviation instead:
#' draw_normal_icc(clusters = clusters, sd_between = 4, ICC = 0.2)
#'
#' # Verify that ICC generated is accurate
#' corr_draw = draw_normal_icc(clusters = clusters, ICC = 0.4)
#' summary(lm(corr_draw ~ as.factor(clusters)))$r.squared
#'
#' @importFrom stats rnorm
#'
#' @export
Expand Down
22 changes: 11 additions & 11 deletions R/fabricate.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@
#' \code{N}. Create hierarchical data with multiple levels of data such as
#' citizens within cities within states using \code{add_level()} or modify
#' existing hierarchical data using \code{modify_level()}. You can use any R
#' function to create each variable. We provide several built-in options to
#' easily draw from binary and count outcomes, including
#' \code{\link{draw_binary}}, \code{\link{draw_count}},
#' \code{\link{draw_binary_icc}}, and \code{\link{draw_normal_icc}}.
#' function to create each variable. Use \code{cross_level()} to make more
#' complex designs such as panel or cross-classified data.
#'
#' We also provide several built-in options to easily create variables, including
#' \code{\link{draw_binary}}, \code{\link{draw_count}}, \code{\link{draw_likert}},
#' and intra-cluster correlated variables \code{\link{draw_binary_icc}} and
#' \code{\link{draw_normal_icc}}.
#'
#' @param data (optional) user-provided data that forms the basis of the
#' fabrication, e.g. you can add variables to existing data. Provide either
Expand All @@ -34,9 +37,6 @@
#'
#' @examples
#'
#' # Draw a single-level dataset with no covariates
#' df <- fabricate(N = 100)
#' head(df)
#'
#' # Draw a single-level dataset with a covariate
#' building_df <- fabricate(
Expand All @@ -45,7 +45,7 @@
#' )
#' head(building_df)
#'
#' # Start with existing data
#' # Start with existing data instead
#' building_modified <- fabricate(
#' data = building_df,
#' rent = rnorm(N, mean = height_ft * 100, sd = height_ft * 30)
Expand Down Expand Up @@ -73,9 +73,9 @@
#' cities = modify_level(runoff = rnorm(N))
#' )
#'
#' # fabricatr can also make cross-classified data. For more information about
#' # syntax for this functionality please read our vignette or check
#' # documentation for \code{cross_level}:
#' # fabricatr can also make panel or cross-classified data. For more
#' # information about syntax for this functionality please read our vignette
#' # or check documentation for \code{cross_level}:
#' cross_classified <- fabricate(
#' primary_schools = add_level(N = 50, ps_quality = runif(N, 0, 10)),
#' secondary_schools = add_level(N = 100, ss_quality = runif(N, 0, 10), nest=FALSE),
Expand Down
64 changes: 37 additions & 27 deletions R/resample_data.R
Original file line number Diff line number Diff line change
@@ -1,55 +1,63 @@
#' Resample data, including hierarchical data
#'
#' This function allows you to resample any data frame. The default mode
#' performs a single resample of size \code{N} with replacement. Users can
#' also specify more complex resampling strategies to resample hierarchical
#' data.
#'
#' @param data A data.frame, usually provided by the user.
#' @param N The number of sample N to return. If N is a single scalar and no labels are provided, N will specify the number of unit observations to resample. If N is named, or if the ID_labels argument is specified (in which case, both N and ID_labels should be the same length), then the units resampled will be values of the levels resampled (this is useful for, e.g., cluster resampling). If N is the constant ALL for any level, all units of this level will be transparently passed through to the next level of resampling.
#' @param ID_labels A character vector of the variables that indicate the data hierarchy, from highest to lowest (i.e., from cities to citizens).
#' @param N The number of sample observations to return. If \code{N} is a single
#' scalar and no labels are provided, \code{N} will specify the number of unit
#' observations to resample. If \code{N} is named, or if the \code{ID_labels}
#' argument is specified (in which case, both \code{N} and \code{ID_labels}
#' should be the same length), then the units resampled will be values of the
#' levels resampled (this is useful for, e.g., cluster resampling). If \code{N}
#' is the constant \code{ALL} for any level, all units of this level will be
#' transparently passed through to the next level of resampling.
#' @param ID_labels A character vector of the variables that indicate the data
#' hierarchy, from highest to lowest (e.g., from cities to citizens).
#'
#' @return A data.frame
#'
#' @examples
#'
#' # Bootstrap a dataset without any hierarchy. N specifies a number of observations to return
#' # Resample a dataset of size N without any hierarchy
#' baseline_survey <- fabricate(N = 50, Y_pre = rnorm(N))
#' bootstrapped_data <- resample_data(baseline_survey)
#'
#' baseline_survey <- fabricate(N = 5, Y_pre = rnorm(N))
#' bootsrapped_data <- resample_data(baseline_survey, N = 10)
#' bootsrapped_data
#' # Specify a fixed number of observations to return
#' baseline_survey <- fabricate(N = 50, Y_pre = rnorm(N))
#' bootstrapped_data <- resample_data(baseline_survey, N = 100)
#'
#' # Resample by a single level of a hierarchical dataset (e.g. resampling clusters of observations)
#' # N specifies a number of clusters to return
#' # Resample by a single level of a hierarchical dataset (e.g. resampling
#' # clusters of observations): N specifies a number of clusters to return
#'
#' clustered_survey <- fabricate(
#' clusters = add_level(N=25),
#' cities = add_level(N=round(runif(25, 1, 5)), population=runif(n = N, min=50000, max=1000000))
#' cities = add_level(N=round(runif(25, 1, 5)),
#' population=runif(n = N, min=50000, max=1000000))
#' )
#'
#' # Specify the name of the cluster variable one of two ways
#'
#' cluster_resample <- resample_data(clustered_survey, N = 5, ID_labels = "clusters")
#' cluster_resample
#'
#' # Alternatively, pass the level to resample as a name:
#' cluster_resample_2 <- resample_data(clustered_survey, N=c(clusters = 5))
#' cluster_resample_2
#'
#' # Resample a hierarchical dataset on multiple levels
#'
#' my_data <-
#' fabricate(
#' cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
#' citizens = add_level(N = 3, income = round(elevation * rnorm(n = N, mean = 5)))
#' cities = add_level(N = 20, elevation = runif(n = N, min = 1000, max = 2000)),
#' citizens = add_level(N = 30, age = runif(n = N, min = 18, max = 85))
#' )
#'
#' # Specify the levels you wish to resample one of two ways:
#' my_data_2 <- resample_data(my_data, N = c(3, 5), ID_labels = c("cities", "citizens"))
#' my_data_2
#' # Specify the levels you wish to resample:
#' my_data_2 <- resample_data(my_data, N = c(3, 5),
#' ID_labels = c("cities", "citizens"))
#'
#' my_data_3 <- resample_data(my_data, N = c(cities=3, citizens=5))
#' my_data_3
#' # To resample every unit at a given level, use the ALL constant
#' # This example will resample 10 citizens at each of the cities:
#'
#' # Transparently pass through all units at a given level
#' # This example will resample 2 citizens at each of the cities:
#'
#' passthrough_resample_data <- resample_data(my_data, N = c(cities=ALL, citizens=2))
#' passthrough_resample_data
#' passthrough_resample_data <- resample_data(my_data, N = c(cities=ALL, citizens=10))
#'
#'
#' @export
Expand All @@ -62,7 +70,9 @@ resample_data <- function(data, N, ID_labels=NULL) {
return(df)
}

#' Magic number constant to allow users to specify "ALL" for passthrough resampling
#' Magic number constant to allow users to specify \code{ALL} for passthrough
#' resampling
#'
#' @keywords internal
#' @export
ALL <- -20171101L
Expand Down
60 changes: 43 additions & 17 deletions R/variable_creation_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,56 +3,82 @@
#'
#' Drawing discrete data based on probabilities or latent traits is a common
#' task that can be cumbersome. Each function in our discrete drawing set creates
#' a different type of discrete data: `draw_binary` creates binary 0/1 data,
#' `draw_binomial` creates binomial data (repeated trial binary data),
#' `draw_categorical` creates categorical data, `draw_ordered` transforms latent
#' data into observed ordered categories, `draw_count` creates count data
#' (poisson-distributed). `draw_liket` is an alias to `draw_ordered` that
#' pre-specifies break labels and offers default breaks appropriate for a likert
#' survey question.
#' a different type of discrete data: \code{draw_binary} creates binary 0/1 data,
#' \code{draw_binomial} creates binomial data (repeated trial binary data),
#' \code{draw_categorical} creates categorical data, \code{draw_ordered}
#' transforms latent data into observed ordered categories, \code{draw_count}
#' creates count data (poisson-distributed). \code{draw_likert} is an alias to
#' \code{draw_ordered} that pre-specifies break labels and offers default breaks
#' appropriate for a likert survey question.
#'
#' For variables with intra-cluster correlations, see
#' \code{\link{draw_binary_icc}} and \code{\link{draw_normal_icc}}.
#'
#' @param prob A number or vector of numbers representing the probability for
#' binary or binomial outcomes; or a number, vector, or matrix of numbers
#' representing probabilities for categorical outcomes. If you supply a link
#' function, these underlying probabilities will be transformed.
#' @param trials for `draw_binomial`, the number of trials for each observation
#' @param mean for `draw_count`, the mean number of count units for each observation
#' @param x for `draw_ordered`, the latent data for each observation.
#' @param breaks vector of breaks to cut a latent outcome into ordered categories
#' @param x for `draw_ordered` or `draw_likert`, the latent data for each
#' observation.
#' @param breaks vector of breaks to cut a latent outcome into ordered
#' categories with `draw_ordered` or `draw_likert`
#' @param break_labels vector of labels for the breaks to cut a latent outcome
#' into ordered categories.
#' into ordered categories with `draw_ordered`.
#' @param type Type of Likert scale data for `draw_likert`. Valid options are 4,
#' 5, and 7.
#' 5, and 7. Type corresponds to the number of categories in the Likert scale.
#' @param N number of units to draw. Defaults to the length of the vector of
#' probabilities or latent data you provided
#' probabilities or latent data you provided.
#' @param link link function between the latent variable and the probability of
#' a positive outcome, e.g. "logit", "probit", or "identity". For the "identity"
#' link, the latent variable must be a probability.
#'
#' @examples
#'
#' # Drawing binary values (success or failure, treatment assignment)
#' fabricate(N = 3,
#' p = c(0, .5, 1),
#' binary = draw_binary(prob = p))
#'
#'
#' # Drawing binary values with probit link (transforming continuous data
#' # into a probability range).
#' fabricate(N = 3,
#' x = 10 * rnorm(N),
#' binary = draw_binary(prob = x, link = "probit"))
#'
#' # Repeated trials: `draw_binomial`
#' fabricate(N = 3,
#' p = c(0, .5, 1),
#' binomial = draw_binomial(prob = p, trials = 10))
#'
#' # Ordered data: transforming latent data into observed, ordinal data.
#' # Useful for survey responses.
#' fabricate(N = 3,
#' x = 5 * rnorm(N),
#' ordered = draw_ordered(x=x,
#' ordered = draw_ordered(x = x,
#' breaks = c(-Inf, -1, 1, Inf)))
#'
#' # Providing break labels for latent data.
#' fabricate(N = 3,
#' x = c(0,5,100),
#' count = draw_count(mean=x))
#' x = 5 * rnorm(N),
#' ordered = draw_ordered(x = x,
#' breaks = c(-Inf, -1, 1, Inf),
#' break_labels = c("Not at all concerned",
#' "Somewhat concerned",
#' "Very concerned")))
#'
#' # Likert data: often used for survey data
#' fabricate(N = 10,
#' support_free_college = draw_likert(x = rnorm(N),
#' type = 5))
#'
#' # Count data: useful for rates of occurrences over time.
#' fabricate(N = 5,
#' x = c(0, 5, 25, 50, 100),
#' theft_rate = draw_count(mean=x))
#'
#' # Categorical
#' # Categorical data: useful for demographic data.
#' fabricate(N = 6, p1 = runif(N), p2 = runif(N), p3 = runif(N),
#' cat = draw_categorical(cbind(p1, p2, p3)))
#'
Expand Down
Loading

0 comments on commit 775eb4b

Please sign in to comment.