From 775eb4b32ab4b0cbbb41f00aaa6ddf2993dc5e0f Mon Sep 17 00:00:00 2001 From: Aaron Rudkin Date: Tue, 23 Jan 2018 00:11:44 -0800 Subject: [PATCH] CRAN Submission PR (#59) --- R/cross_level.R | 32 +++-- R/draw_binary_icc.R | 19 ++- R/draw_normal_icc.R | 14 ++ R/fabricate.R | 22 +-- R/resample_data.R | 64 +++++---- R/variable_creation_functions.R | 60 +++++--- README.Rmd | 16 +-- README.md | 28 ++-- docs/articles/advanced_features.html | 20 ++- docs/articles/building_importing.html | 28 ++-- docs/articles/cross_classified.html | 63 ++++----- docs/articles/getting_started.html | 33 ++--- docs/articles/index.html | 2 +- docs/articles/resampling.html | 185 ++++++------------------- docs/articles/variable_generation.html | 159 ++++++--------------- docs/index.html | 20 +-- docs/news/index.html | 8 +- docs/reference/ALL.html | 9 +- docs/reference/cross_level.html | 35 +++-- docs/reference/draw_binary_icc.html | 17 ++- docs/reference/draw_binomial.html | 101 ++++++++++---- docs/reference/draw_normal_icc.html | 30 +++- docs/reference/fabricate.html | 66 ++++----- docs/reference/index.html | 3 +- docs/reference/join.html | 5 +- docs/reference/resample_data.html | 141 +++++-------------- man/ALL.Rd | 6 +- man/cross_level.Rd | 29 ++-- man/draw_binary_icc.Rd | 14 +- man/draw_binomial.Rd | 63 ++++++--- man/draw_normal_icc.Rd | 14 ++ man/fabricate.Rd | 23 +-- man/join.Rd | 5 +- man/resample_data.Rd | 62 +++++---- vignettes/advanced_features.Rmd | 21 ++- vignettes/building_importing.Rmd | 28 ++-- vignettes/cross_classified.Rmd | 64 ++++----- vignettes/getting_started.Rmd | 21 ++- vignettes/resampling.Rmd | 54 +++++--- vignettes/variable_generation.Rmd | 54 +++----- 40 files changed, 769 insertions(+), 869 deletions(-) diff --git a/R/cross_level.R b/R/cross_level.R index ca5605b..bb8ac2e 100644 --- a/R/cross_level.R +++ b/R/cross_level.R @@ -1,12 +1,20 @@ -#' Creates cross-classified (partially non-nested, joined data) with a fixed -#' correlation structure. 
+#' Creates panel or cross-classified data +#' +#' This function allows the user to create data structures that are paneled or +#' cross-classified: where one level of observation draws simultaneously from +#' two or many source levels. Common examples of panels include country-year +#' data which have country-level and year-level characteristics. +#' +#' By specifying the appropriate arguments in \code{join()} within the +#' function call, it is possible to induce correlation in cross-classified data. #' #' @param N The number of observations in the resulting data frame. -#' If N is NULL or not provided, the join will be an "outer join" -- creating a -#' full panel of each of the rows from each data frame provided. -#' @param by The result of a call to \code{join()} which specifies how the -#' cross-classified data will be created +#' If \code{N} is NULL or not provided, the join will be an "outer product" -- +#' merging each row of each provided data frame with each other data frame to +#' make a full panel. +#' @param by The result of a call to \code{join()} which specifies how +#' the cross-classified data will be created #' @param ... A variable or series of variables to add to the resulting data #' frame after the cross-classified data is created. #' @@ -15,14 +23,14 @@ #' @examples #' #' # Generate full panel data -#' #' panel <- fabricate( #' countries = add_level(N = 20, country_shock = runif(N, 1, 10)), #' years = add_level(N = 20, year_shock = runif(N, 1, 10), nest=FALSE), #' obs = cross_level(by=join(countries, years), GDP_it = country_shock + year_shock) #' ) #' -#' # Generate cross-classified data and merge, no correlation +#' # Include an "N" argument to allow for cross-classified +#' # data. 
#' students <- fabricate( #' primary_school = add_level(N = 20, ps_quality = runif(N, 1, 10)), #' secondary_school = add_level(N = 15, ss_quality = runif(N, 1, 10), nest=FALSE), @@ -30,7 +38,8 @@ #' ) #' head(students) #' -#' # Cross-classified data with a correlation structure +#' # Induce a correlation structure in cross-classified data by providing +#' # rho. #' students <- fabricate( #' primary_school = add_level(N = 20, ps_quality = runif(N, 1, 10)), #' secondary_school = add_level(N = 15, ss_quality = runif(N, 1, 10), nest=FALSE), @@ -198,10 +207,11 @@ cross_level_internal <- function(N = NULL, #' variables being joined on: note that if it is not possible to make a #' correlation matrix from this coefficient (e.g. if you are joining on three #' or more variables and rho is negative) then the \code{cross_level()} call -#' will fail. +#' will fail. Do not provide \code{rho} if making panel data. #' @param sigma A matrix with dimensions equal to the number of variables you #' are joining on, specifying the correlation for the resulting joined data. -#' Only one of rho and sigma should be provided. +#' Only one of rho and sigma should be provided. Do not provide \code{sigma} if +#' making panel data. #' @export join <- function(..., rho=0, sigma=NULL) { data_arguments <- quos(...) diff --git a/R/draw_binary_icc.R b/R/draw_binary_icc.R index fe59694..705cd53 100644 --- a/R/draw_binary_icc.R +++ b/R/draw_binary_icc.R @@ -12,19 +12,32 @@ #' generated. Must be equal to length(clusters) if provided. #' @param clusters A vector of factors or items that can be coerced to #' clusters; the length will determine the length of the generated data. -#' @param ICC A number indicating the desired ICC, if none is provided will -#' default to 0. +#' @param ICC A number indicating the desired \code{ICC}, if none is provided +#' the default ICC will be 0. #' @return A vector of binary numbers corresponding to the observations from #' the supplied cluster IDs. 
#' @examples +#' # Divide units into clusters #' clusters = rep(1:5, 10) +#' +#' # Default probability 0.5, default ICC 0 #' draw_binary_icc(clusters = clusters) -#' draw_binary_icc(prob = 0.5, clusters = clusters, ICC = 0.5) +#' +#' # Specify probability or ICC +#' corr_draw = draw_binary_icc(prob = 0.5, clusters = clusters, ICC = 0.5) +#' +#' # Verify ICC of data. +#' summary(lm(corr_draw ~ as.factor(clusters)))$r.squared #' #' @importFrom stats rbinom #' #' @export draw_binary_icc <- function(prob = 0.5, N = NULL, clusters, ICC = 0) { + + if(is.null(clusters)) { + stop("You must provide clusters to `draw_binary_icc`") + } + # Let's not worry about how clusters are provided tryCatch({ clusters <- as.numeric(as.factor(clusters)) diff --git a/R/draw_normal_icc.R b/R/draw_normal_icc.R index 269694d..ca580cb 100644 --- a/R/draw_normal_icc.R +++ b/R/draw_normal_icc.R @@ -32,9 +32,23 @@ #' @return A vector of numbers corresponding to the observations from #' the supplied cluster IDs. #' @examples +#' +#' # Divide observations into clusters #' clusters = rep(1:5, 10) +#' +#' # Default: unit variance within each cluster #' draw_normal_icc(clusters = clusters, ICC = 0.5) #' +#' # Alternatively, you can specify characteristics: +#' draw_normal_icc(mean = 10, clusters = clusters, sd = 3, ICC = 0.3) +#' +#' # Can specify between-cluster standard deviation instead: +#' draw_normal_icc(clusters = clusters, sd_between = 4, ICC = 0.2) +#' +#' # Verify that ICC generated is accurate +#' corr_draw = draw_normal_icc(clusters = clusters, ICC = 0.4) +#' summary(lm(corr_draw ~ as.factor(clusters)))$r.squared +#' #' @importFrom stats rnorm #' #' @export diff --git a/R/fabricate.R b/R/fabricate.R index 68e9638..8dfed29 100644 --- a/R/fabricate.R +++ b/R/fabricate.R @@ -6,10 +6,13 @@ #' \code{N}. 
Create hierarchical data with multiple levels of data such as #' citizens within cities within states using \code{add_level()} or modify #' existing hierarchical data using \code{modify_level()}. You can use any R -#' function to create each variable. We provide several built-in options to -#' easily draw from binary and count outcomes, including -#' \code{\link{draw_binary}}, \code{\link{draw_count}}, -#' \code{\link{draw_binary_icc}}, and \code{\link{draw_normal_icc}}. +#' function to create each variable. Use \code{cross_level()} to make more +#' complex designs such as panel or cross-classified data. +#' +#' We also provide several built-in options to easily create variables, including +#' \code{\link{draw_binary}}, \code{\link{draw_count}}, \code{\link{draw_likert}}, +#' and intra-cluster correlated variables \code{\link{draw_binary_icc}} and +#' \code{\link{draw_normal_icc}} #' #' @param data (optional) user-provided data that forms the basis of the #' fabrication, e.g. you can add variables to existing data. Provide either @@ -34,9 +37,6 @@ #' #' @examples #' -#' # Draw a single-level dataset with no covariates -#' df <- fabricate(N = 100) -#' head(df) #' #' # Draw a single-level dataset with a covariate #' building_df <- fabricate( @@ -45,7 +45,7 @@ #' ) #' head(building_df) #' -#' # Start with existing data +#' # Start with existing data instead #' building_modified <- fabricate( #' data = building_df, #' rent = rnorm(N, mean = height_ft * 100, sd = height_ft * 30) @@ -73,9 +73,9 @@ #' cities = modify_level(runoff = rnorm(N)) #' ) #' -#' # fabricatr can also make cross-classified data. For more information about -#' # syntax for this functionality please read our vignette or check -#' # documentation for \code{cross_level}: +#' # fabricatr can also make panel or cross-classified data. 
For more +#' # information about syntax for this functionality please read our vignette +#' # or check documentation for \code{cross_level}: #' cross_classified <- fabricate( #' primary_schools = add_level(N = 50, ps_quality = runif(N, 0, 10)), #' secondary_schools = add_level(N = 100, ss_quality = runif(N, 0, 10), nest=FALSE), diff --git a/R/resample_data.R b/R/resample_data.R index 0b5ce51..7680929 100644 --- a/R/resample_data.R +++ b/R/resample_data.R @@ -1,55 +1,63 @@ #' Resample data, including hierarchical data #' +#' This function allows you to resample any data frame. The default mode +#' performs a single resample of size \code{N} without replacement. Users can +#' also specify more complex resampling strategies to resample hierarchical +#' data. +#' #' @param data A data.frame, usually provided by the user. -#' @param N The number of sample N to return. If N is a single scalar and no labels are provided, N will specify the number of unit observations to resample. If N is named, or if the ID_labels argument is specified (in which case, both N and ID_labels should be the same length), then the units resampled will be values of the levels resampled (this is useful for, e.g., cluster resampling). If N is the constant ALL for any level, all units of this level will be transparently passed through to the next level of resampling. -#' @param ID_labels A character vector of the variables that indicate the data hierarchy, from highest to lowest (i.e., from cities to citizens). +#' @param N The number of sample observations to return. If \code{N} is a single +#' scalar and no labels are provided, \code{N} will specify the number of unit +#' observations to resample. If \code{N} is named, or if the \code{ID_labels} +#' argument is specified (in which case, both \code{N} and \code{ID_labels} +#' should be the same length), then the units resampled will be values of the +#' levels resampled (this is useful for, e.g., cluster resampling). 
If \code{N} +#' is the constant \code{ALL} for any level, all units of this level will be +#' transparently passed through to the next level of resampling. +#' @param ID_labels A character vector of the variables that indicate the data +#' hierarchy, from highest to lowest (i.e., from cities to citizens). #' #' @return A data.frame #' #' @examples #' -#' # Bootstrap a dataset without any hierarchy. N specifies a number of observations to return +#' # Resample a dataset of size N without any hierarchy +#' baseline_survey <- fabricate(N = 50, Y_pre = rnorm(N)) +#' bootstrapped_data <- resample_data(baseline_survey) #' -#' baseline_survey <- fabricate(N = 5, Y_pre = rnorm(N)) -#' bootsrapped_data <- resample_data(baseline_survey, N = 10) -#' bootsrapped_data +#' # Specify a fixed number of observations to return +#' baseline_survey <- fabricate(N = 50, Y_pre = rnorm(N)) +#' bootstrapped_data <- resample_data(baseline_survey, N = 100) #' -#' # Resample by a single level of a hierarchical dataset (e.g. resampling clusters of observations) -#' # N specifies a number of clusters to return +#' # Resample by a single level of a hierarchical dataset (e.g. 
resampling +#' # clusters of observations): N specifies a number of clusters to return #' #' clustered_survey <- fabricate( #' clusters = add_level(N=25), -#' cities = add_level(N=round(runif(25, 1, 5)), population=runif(n = N, min=50000, max=1000000)) +#' cities = add_level(N=round(runif(25, 1, 5)), +#' population=runif(n = N, min=50000, max=1000000)) #' ) #' -#' # Specify the name of the cluster variable one of two ways -#' #' cluster_resample <- resample_data(clustered_survey, N = 5, ID_labels = "clusters") -#' cluster_resample #' +#' # Alternatively, pass the level to resample as a name: #' cluster_resample_2 <- resample_data(clustered_survey, N=c(clusters = 5)) -#' cluster_resample_2 #' #' # Resample a hierarchical dataset on multiple levels -#' #' my_data <- #' fabricate( -#' cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)), -#' citizens = add_level(N = 3, income = round(elevation * rnorm(n = N, mean = 5))) +#' cities = add_level(N = 20, elevation = runif(n = N, min = 1000, max = 2000)), +#' citizens = add_level(N = 30, age = runif(n = N, min = 18, max = 85)) #' ) #' -#' # Specify the levels you wish to resample one of two ways: -#' my_data_2 <- resample_data(my_data, N = c(3, 5), ID_labels = c("cities", "citizens")) -#' my_data_2 +#' # Specify the levels you wish to resample: +#' my_data_2 <- resample_data(my_data, N = c(3, 5), +#' ID_labels = c("cities", "citizens")) #' -#' my_data_3 <- resample_data(my_data, N = c(cities=3, citizens=5)) -#' my_data_3 +#' # To resample every unit at a given level, use the ALL constant +#' # This example will resample 10 citizens at each of the cities: #' -#' # Transparently pass through all units at a given level -#' # This example will resample 2 citizens at each of the cities: -#' -#' passthrough_resample_data <- resample_data(my_data, N = c(cities=ALL, citizens=2)) -#' passthrough_resample_data +#' passthrough_resample_data <- resample_data(my_data, N = c(cities=ALL, citizens=10)) #' #' #' 
@export @@ -62,7 +70,9 @@ resample_data <- function(data, N, ID_labels=NULL) { return(df) } -#' Magic number constant to allow users to specify "ALL" for passthrough resampling +#' Magic number constant to allow users to specify \code{ALL} for passthrough +#' resampling +#' #' @keywords internal #' @export ALL <- -20171101L diff --git a/R/variable_creation_functions.R b/R/variable_creation_functions.R index d804201..afe5d96 100644 --- a/R/variable_creation_functions.R +++ b/R/variable_creation_functions.R @@ -3,13 +3,16 @@ #' #' Drawing discrete data based on probabilities or latent traits is a common #' task that can be cumbersome. Each function in our discrete drawing set creates -#' a different type of discrete data: `draw_binary` creates binary 0/1 data, -#' `draw_binomial` creates binomial data (repeated trial binary data), -#' `draw_categorical` creates categorical data, `draw_ordered` transforms latent -#' data into observed ordered categories, `draw_count` creates count data -#' (poisson-distributed). `draw_liket` is an alias to `draw_ordered` that -#' pre-specifies break labels and offers default breaks appropriate for a likert -#' survey question. +#' a different type of discrete data: \code{draw_binary} creates binary 0/1 data, +#' \code{draw_binomial} creates binomial data (repeated trial binary data), +#' \code{draw_categorical} creates categorical data, \code{draw_ordered} +#' transforms latent data into observed ordered categories, \code{draw_count} +#' creates count data (poisson-distributed). \code{draw_likert} is an alias to +#' \code{draw_ordered} that pre-specifies break labels and offers default breaks +#' appropriate for a likert survey question. 
+#' +#' For variables with intra-cluster correlations, see +#' \code{\link{draw_binary_icc}} and \code{\link{draw_normal_icc}} #' #' @param prob A number or vector of numbers representing the probability for #' binary or binomial outcomes; or a number, vector, or matrix of numbers @@ -17,42 +20,65 @@ #' function, these underlying probabilities will be transformed. #' @param trials for `draw_binomial`, the number of trials for each observation #' @param mean for `draw_count`, the mean number of count units for each observation -#' @param x for `draw_ordered`, the latent data for each observation. -#' @param breaks vector of breaks to cut a latent outcome into ordered categories +#' @param x for `draw_ordered` or `draw_likert`, the latent data for each +#' observation. +#' @param breaks vector of breaks to cut a latent outcome into ordered +#' categories with `draw_ordered` or `draw_likert` #' @param break_labels vector of labels for the breaks to cut a latent outcome -#' into ordered categories. +#' into ordered categories with `draw_ordered`. #' @param type Type of Likert scale data for `draw_likert`. Valid options are 4, -#' 5, and 7. +#' 5, and 7. Type corresponds to the number of categories in the Likert scale. #' @param N number of units to draw. Defaults to the length of the vector of -#' probabilities or latent data you provided +#' probabilities or latent data you provided. #' @param link link function between the latent variable and the probability of #' a postiive outcome, e.g. "logit", "probit", or "identity". For the "identity" #' link, the latent variable must be a probability. #' #' @examples +#' +#' # Drawing binary values (success or failure, treatment assignment) #' fabricate(N = 3, #' p = c(0, .5, 1), #' binary = draw_binary(prob = p)) #' -#' +#' # Drawing binary values with probit link (transforming continuous data +#' # into a probability range). 
#' fabricate(N = 3, #' x = 10 * rnorm(N), #' binary = draw_binary(prob = x, link = "probit")) #' +#' # Repeated trials: `draw_binomial` #' fabricate(N = 3, #' p = c(0, .5, 1), #' binomial = draw_binomial(prob = p, trials = 10)) #' +#' # Ordered data: transforming latent data into observed, ordinal data. +#' # useful for survey responses. #' fabricate(N = 3, #' x = 5 * rnorm(N), -#' ordered = draw_ordered(x=x, +#' ordered = draw_ordered(x = x, #' breaks = c(-Inf, -1, 1, Inf))) #' +#' # Providing break labels for latent data. #' fabricate(N = 3, -#' x = c(0,5,100), -#' count = draw_count(mean=x)) +#' x = 5 * rnorm(N), +#' ordered = draw_ordered(x = x, +#' breaks = c(-Inf, -1, 1, Inf), +#' break_labels = c("Not at all concerned", +#' "Somewhat concerned", +#' "Very concerned"))) +#' +#' # Likert data: often used for survey data +#' fabricate(N = 10, +#' support_free_college = draw_likert(x = rnorm(N), +#' type = 5)) +#' +#' # Count data: useful for rates of occurrences over time. +#' fabricate(N = 5, +#' x = c(0, 5, 25, 50, 100), +#' theft_rate = draw_count(mean=x)) #' -#' # Categorical +#' # Categorical data: useful for demographic data. 
#' fabricate(N = 6, p1 = runif(N), p2 = runif(N), p3 = runif(N), #' cat = draw_categorical(cbind(p1, p2, p3))) #' diff --git a/README.Rmd b/README.Rmd index 976d55f..e7b121f 100644 --- a/README.Rmd +++ b/README.Rmd @@ -39,24 +39,20 @@ Once you have installed **fabricatr**, you can easily import your own data or ge ```{r} library(fabricatr) -house_members = fabricate( +house_members <- fabricate( party_id = add_level( - N = 2, - party_names = c("Republican", "Democrat"), - party_ideology = c(0.5, -0.5), - in_power = c(1, 0), - party_incumbents = c(241, 194)), + N = 2, party_names = c("Republican", "Democrat"), party_ideology = c(0.5, -0.5), + in_power = c(1, 0), party_incumbents = c(241, 194)), rep_id = add_level( - N = party_incumbents, - member_ideology = rnorm(N, party_ideology, sd=0.5), - terms_served = draw_count(N = N, mean = 4), + N = party_incumbents, member_ideology = rnorm(N, party_ideology, sd = 0.5), + terms_served = draw_count(N = N, mean = 4), female = draw_binary(N = N, prob = 0.198)) ) ``` ```{r echo=FALSE} set.seed(19861108) -knitr::kable(house_members[sample.int(nrow(house_members), 5, replace=FALSE), c(2, 3, 4, 7, 8, 9)]) +knitr::kable(house_members[sample.int(nrow(house_members), 5, replace=FALSE), c(2, 3, 4, 7, 8, 9)], row.names = FALSE) ``` ### Next Steps diff --git a/README.md b/README.md index 5f5e513..7d6adc0 100644 --- a/README.md +++ b/README.md @@ -28,28 +28,24 @@ Once you have installed **fabricatr**, you can easily import your own data or ge ``` r library(fabricatr) -house_members = fabricate( +house_members <- fabricate( party_id = add_level( - N = 2, - party_names = c("Republican", "Democrat"), - party_ideology = c(0.5, -0.5), - in_power = c(1, 0), - party_incumbents = c(241, 194)), + N = 2, party_names = c("Republican", "Democrat"), party_ideology = c(0.5, -0.5), + in_power = c(1, 0), party_incumbents = c(241, 194)), rep_id = add_level( - N = party_incumbents, - member_ideology = rnorm(N, party_ideology, sd=0.5), - terms_served = 
draw_count(N = N, mean = 4), + N = party_incumbents, member_ideology = rnorm(N, party_ideology, sd = 0.5), + terms_served = draw_count(N = N, mean = 4), female = draw_binary(N = N, prob = 0.198)) ) ``` -| | party\_names | party\_ideology| in\_power| member\_ideology| terms\_served| female| -|-----|:-------------|----------------:|----------:|-----------------:|--------------:|-------:| -| 339 | Democrat | -0.5| 0| 0.11| 3| 0| -| 217 | Republican | 0.5| 1| -0.37| 1| 0| -| 233 | Republican | 0.5| 1| 0.71| 2| 1| -| 263 | Democrat | -0.5| 0| -1.05| 3| 0| -| 140 | Republican | 0.5| 1| 0.24| 4| 0| +| party\_names | party\_ideology| in\_power| member\_ideology| terms\_served| female| +|:-------------|----------------:|----------:|-----------------:|--------------:|-------:| +| Democrat | -0.5| 0| 0.11| 3| 0| +| Republican | 0.5| 1| -0.37| 1| 0| +| Republican | 0.5| 1| 0.71| 2| 1| +| Democrat | -0.5| 0| -1.05| 3| 0| +| Republican | 0.5| 1| 0.24| 4| 0| ### Next Steps diff --git a/docs/articles/advanced_features.html b/docs/articles/advanced_features.html index 46d0c6d..a3cc4d3 100644 --- a/docs/articles/advanced_features.html +++ b/docs/articles/advanced_features.html @@ -113,7 +113,7 @@

Aaron Rudkin

More complicated level creation with variable numbers of observations

-

add_level() can be used to create more complicated patterns of nesting. For example, when creating lower level data, it is possible to use a different N for each of the values of the higher level data:

+

add_level() can be used to create more complicated patterns of nesting. For example, when creating lower level data, it is possible to use a different N for each of the values of the higher level data:

variable_data <-
   fabricate(
     cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
@@ -166,13 +166,12 @@ 

-

Here, each city has a different number of citizens. And the value of N used to create the age variable automatically updates as needed. The result is a dataset with 6 citizens, 2 in the first city and 4 in the second. As long as N is either a number, or a vector of the same length of the current lowest level of the data, add_level() will know what to do.

+

Here, each city has a different number of citizens. And the value of N used to create the age variable automatically updates as needed. The result is a dataset with 6 citizens, 2 in the first city and 4 in the second. As long as N is either a number, or a vector of the same length as the current lowest level of the data, add_level() will know what to do.

It is also possible to provide a function to N, enabling a random number of citizens per city:

my_data <-
   fabricate(
     cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
-    citizens = add_level(N = sample(1:6, size = 2, replace = TRUE), 
-                         age = runif(N, 18, 70))
+    citizens = add_level(N = sample(1:6, size = 2, replace = TRUE), age = runif(N, 18, 70))
   )
 my_data
@@ -223,11 +222,10 @@

Here, each city is given a random number of citizens between 1 and 6. Since the sample() function returns a vector of length 2, this is like specifying 2 separate Ns as in the example above.

Finally, it is possible to define N on the basis of higher level variables themselves. Consider the following example:

-
variable_n_function = fabricate(
+
variable_n = fabricate(
   cities = add_level(N = 5, population = runif(N, 10, 200)),
   citizens = add_level(N = round(population * 0.3))
-)
-head(variable_n_function)
+)
@@ -275,8 +273,7 @@

You may want to include the mean value of a variable within a group defined by a higher level of the hierarchy, for example the average income of citizens within a city. You can do this with ave():

ave_example = fabricate(
     cities = add_level(N = 2),
-    citizens = add_level(N = 1:2, 
-                         income = rnorm(N), 
+    citizens = add_level(N = 1:2, income = rnorm(N), 
                          income_mean_city = ave(income, cities))
     ) 
 ave_example
@@ -312,11 +309,9 @@

Tidyverse integration

-

Because the functions in fabricatr take data and return data, they are cross-compatible with a tidyverse workflow:

+

Because the functions in fabricatr take data and return data, they are cross-compatible with a tidyverse workflow. Here is an example of using magrittr’s pipe operator (%>%) and dplyr’s group_by and mutate verbs to add new data.

library(dplyr)
 
-# letting higher levels depend on lower levels
-
 my_data <- 
 fabricate(
     cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
@@ -372,6 +367,7 @@ 

cities
+

It is also possible to use the pipe operator (%>%) to direct the flow of data between fabricate() calls. Remember that every fabricate() call can import existing data frames, and every call returns a single data frame.

my_data <- 
 data_frame(Y = sample(1:10, 2)) %>% 
   fabricate(lower_level = add_level(N = 3, Y2 = Y + rnorm(N)))
diff --git a/docs/articles/building_importing.html b/docs/articles/building_importing.html
index eca4b70..e727c60 100644
--- a/docs/articles/building_importing.html
+++ b/docs/articles/building_importing.html
@@ -126,7 +126,7 @@ 

Single-level datasets from scratch

Making a single-level dataset begins with providing the argument N, a number representing the number of observations you wish to create, followed by a series of variable definitions. Variables can be defined using any function you have access to in R. fabricatr provides several simple functions for generating common types of data. These are covered below. Functions that create subsequent variables can rely on previously created variables, which ensures that variables can be related to one another:

library(fabricatr)
-my_data <- fabricate(N = 5, Y = runif(N), Y2 = Y*5)
+my_data <- fabricate(N = 5, Y = runif(N), Y2 = Y * 5)
 my_data
@@ -261,8 +261,8 @@

gdp_per_capita = runif(N, min=10000, max=50000), life_expectancy = 50 + runif(N, 10, 20) + ((gdp_per_capita > 30000) * 10)), provinces = add_level(N = 10, - has_nat_resources = draw_binary(prob=0.3, N=N), - has_manufacturing = draw_binary(prob=0.7, N=N)) + has_nat_resources = draw_binary(prob = 0.3, N = N), + has_manufacturing = draw_binary(prob = 0.7, N = N)) ) head(country_data)

@@ -336,12 +336,11 @@

citizen_data <- 
   fabricate(
     data = country_data,
-    citizens = add_level(N=10,
-                         salary = rnorm(N, 
-                                        mean = gdp_per_capita +
-                                          has_nat_resources * 5000 + 
-                                          has_manufacturing * 5000,
-                                    sd = 10000)))
+    citizens = add_level(N=10, 
+                         salary = rnorm(N, mean = gdp_per_capita + 
+                                          has_nat_resources * 5000 + has_manufacturing * 5000,
+                                        sd = 10000))
+    )
 head(citizen_data)

@@ -505,15 +504,10 @@

fabricate( data = citizen_data, countries = modify_level(avg_temp = runif(N, 30, 80)), - provinces = modify_level(conflict_zone = draw_binary(N, - prob=0.2 + has_nat_resources * 0.3), - infant_mortality = runif(N, 0, 10) + - conflict_zone * 10 + + provinces = modify_level(conflict_zone = draw_binary(N, prob = 0.2 + has_nat_resources * 0.3), + infant_mortality = runif(N, 0, 10) + conflict_zone * 10 + (avg_temp > 70) * 10), - citizens = modify_level(college_degree = draw_binary(N, - prob=0.4 - (0.3 * conflict_zone) - ) - ) + citizens = modify_level(college_degree = draw_binary(N, prob = 0.4 - (0.3 * conflict_zone))) )

Before assessing what this tells us about modify_level(), let’s consider what the simulated data does. It creates a new variable at the country level, for a country level average temperature. Subsequently, it creates a province level binary indicator for whether the province is an active conflict site. Provinces that have natural resources are more likely to be in conflict in this simulation, drawing on conclusions from literature on “resource curses”. The infant mortality rate for the province is able to depend both on province level data we have just generated, and country-level data: it is higher in high-temperature areas (reflecting literature on increased disease burden near the equator) and also higher in conflict zones. Citizens’ access to education is also random, but depends on whether they live in a conflict area.

There are a lot of things to learn from this example. First, it’s possible to modify multiple levels. Any new variable created will automatically propagate to the lower level data accordingly – once an average temperature is set for a country, all provinces in that country, and all citizens of those provinces, have the value for the country. Values created from one modify_level() call can be used in subsequent variables of the same call, or subsequent calls.

diff --git a/docs/articles/cross_classified.html b/docs/articles/cross_classified.html index 41332aa..b4de131 100644 --- a/docs/articles/cross_classified.html +++ b/docs/articles/cross_classified.html @@ -5,7 +5,7 @@ -fabricatr - Panel and Cross-classifieddata +fabricatr - Panel and Cross-classified data @@ -102,7 +102,7 @@
@@ -120,7 +120,7 @@

The steps for generating a panel in fabricatr are as follows:

  1. Generate multiple non-nested data frames (Countries and Years)
  2. -
  3. Use the [cross_level()] function to join the non-nested data frames to make a panel.
  4. +
  5. Use the cross_level() function to join the non-nested data frames to make a panel.
  6. Optionally, add new variables or further levels in the resulting cross-classified data. (Observation-level variables)
@@ -138,25 +138,25 @@

Importing non-nested data frames

-

It is also possible to import multiple non-nested data frames; this will allow you to assemble pre-existing data sources however you would like. Recall that the first argument to a [fabricate()] call is the data you wish to import. We have previously seen that it is possible to import a single data frame this way, but it is also possible to import a list of data frames, staging them all for use for cross-classifying data. Data imported in this manner looks like this:

+

It is also possible to import multiple non-nested data frames; this will allow you to assemble pre-existing data sources however you would like. Recall that the first argument to a fabricate() call is the data you wish to import. We have previously seen that it is possible to import a single data frame this way, but it is also possible to import a list of data frames, staging them all for use for cross-classifying data. Data imported in this manner looks like this:

example_data <- fabricate(
   list(data_frame_1, data_frame_2),
   ...
 )
-

Again, the [fabricate()] call is incomplete – we have imported the data we wish to cross-classify on, but not yet learned how to merge the data. If you do not specify how to merge the data, [fabricate()] will simply return the most recent data frame imported or generated, unmodified.

+

Again, the fabricate() call is incomplete – we have imported the data we wish to cross-classify on, but not yet learned how to merge the data. If you do not specify how to merge the data, fabricate() will simply return the most recent data frame imported or generated, unmodified.

Specifying a merge function

-

Specifying a merge function to create a panel is simple. You need only to tell fabricatr which levels you wish to merge, and then you will have an assembled panel and can generate new variables at the observation-level. We do this using a call to [cross_level()]:

+

Specifying a merge function to create a panel is simple. You need only to tell fabricatr which levels you wish to merge, and then you will have an assembled panel and can generate new variables at the observation-level. We do this using a call to cross_level():

panels <- fabricate(
   countries = add_level(N = 150, country_fe = runif(N, 1, 10)),
   years = add_level(N = 25, year_shock = runif(N, 1, 10), nest = FALSE),
   obs = cross_level(by = join(countries, years), 
                     new_variable = country_fe + year_shock + rnorm(N, 0, 2))
 )
-

Note that [cross_level()] takes a single required argument, which is of the form by = join(...). This join command tells fabricatr how to assemble your data. In this case, we are telling it to join the countries data frame to the years data frame, resulting in country-year observations.

-

Just like with regular [add_level()] commands, you can add new variables which have full access to the existing columns.

+

Note that cross_level() takes a single required argument, which is of the form by = join(...). This join command tells fabricatr how to assemble your data. In this case, we are telling it to join the countries data frame to the years data frame, resulting in country-year observations.

+

Just like with regular add_level() commands, you can add new variables which have full access to the existing columns.

@@ -168,20 +168,20 @@

The main steps involved in generating cross-classified data are as follows:

  1. Generate multiple non-nested data frames (Primary and Secondary Schools)
  2. -
  3. Use the [cross_level()] function to join the non-nested data frames on particular variables, optionally specifying a desired correlation outcome.
  4. +
  5. Use the cross_level() function to join the non-nested data frames on particular variables, optionally specifying a desired correlation outcome.
  6. Optionally, add new variables or further levels in the resulting cross-classified data. (Student-level characteristics)

We have already learned above how to generate multiple non-nested data frames; now we examine how to specify a merge function for cross-classified data.

Specifying a merge function for cross-classified data.

-

One difference between panel data and cross-classified data is that we need to specify the number of observations we will create by combining the existing levels of data. For example, there may be 20 primary schools and 15 secondary schools in a mid-sized city, but several thousand students. Specifying this is as easy as providing an N argument to the [cross_level()] call. At this juncture, we will assume there is no relationship between the primary school a student attends and the secondary school a student attends:

+

One difference between panel data and cross-classified data is that we need to specify the number of observations we will create by combining the existing levels of data. For example, there may be 20 primary schools and 15 secondary schools in a mid-sized city, but several thousand students. Specifying this is as easy as providing an N argument to the cross_level() call. At this juncture, we will assume there is no relationship between the primary school a student attends and the secondary school a student attends:

schools_data <- fabricate(
   primary_schools = add_level(N = 20, ps_quality = runif(N, 1, 10)),
   secondary_schools = add_level(N = 15, ss_quality = runif(N, 1, 10), nest = FALSE),
   students = cross_level(N = 1500, by = join(primary_schools, secondary_schools))
 )
-

We see that the only change here is providing an N argument to [cross_level()] call. The result is, predictably, a data frame containing 1500 observations, each of five columns: primary_schools (ID), ps_quality, secondary_schools (ID), ss_quality, and students (ID).

+

We see that the only change here is providing an N argument to the cross_level() call. The result is, predictably, a data frame containing 1500 observations, each with five columns: primary_schools (ID), ps_quality, secondary_schools (ID), ss_quality, and students (ID).

@@ -191,9 +191,7 @@

primary_schools = add_level(N = 20, ps_quality = runif(N, 1, 10)), secondary_schools = add_level(N = 15, ss_quality = runif(N, 1, 10), nest = FALSE), students = cross_level(N = 1500, by = join(primary_schools, secondary_schools), - SAT_score = 800 + - 13 * ps_quality + - 26 * ss_quality + + SAT_score = 800 + 13 * ps_quality + 26 * ss_quality + rnorm(N, 0, 50)) )

Here, each student is assigned a standardized testing score, equal to a baseline, plus an additive effect from the quality of their primary school, plus a large additive effect from the quality of their secondary school, plus a stochastic component.

@@ -241,13 +239,12 @@

primary_schools = add_level(N = 20, ps_quality = runif(N, 1, 10)), secondary_schools = add_level(N = 15, ss_quality = runif(N, 1, 10), nest = FALSE), students = cross_level(N = 1500, by = join(ps_quality, ss_quality, rho = 0.5), - SAT_score = 800 + - 13 * ps_quality + - 26 * ss_quality + + SAT_score = 800 + 13 * ps_quality + 26 * ss_quality + rnorm(N, 0, 50)) )

-

Here, we have changed the structure of our [join()] function call. First, the variables we are joining on are ps_quality and ss_quality. fabricatr locates these variables within the data frames they come from. Second we specify a Spearman’s (rank) correlation coefficient, rho, which will induce a correlation in the resulting data based on the join function. In this case, we want a correlation between ps_quality and ss_quality of 0.5.

-

Technical details of the implementation of this function are contained below, but in the mean time the important thing to note is that rho can be any value from -1 to 1, and that the resulting correlation will be approximately equal to rho. Note: Because of the technical details of our implementation, the true correlation in the resulting data will be slightly attenuated (smaller in magnitude) from the specified rho. There is no general purpose correction to compensate for this attenuation.

+

Here, we have changed the structure of our join() function call. First, the variables we are joining on are ps_quality and ss_quality. fabricatr locates these variables within the data frames they come from. Second we specify a Spearman’s (rank) correlation coefficient, rho, which will induce a correlation in the resulting data based on the join function. In this case, we want a correlation between ps_quality and ss_quality of 0.5.

+

Technical details of the implementation of this function are contained below, but in the meantime the important thing to note is that rho can be any value from -1 to 1, and that the resulting correlation will be approximately equal to rho.

+

Note: Because of the technical details of our implementation, the true correlation in the resulting data will be slightly attenuated (smaller in magnitude) from the specified rho. There is no general purpose correction to compensate for this attenuation.

We can check the resulting correlation here:

cor(corr_data$ps_quality, corr_data$ss_quality)

0.47

@@ -263,17 +260,14 @@

primary_schools = add_level(N = 20, ps_quality = runif(N, 1, 10)), secondary_schools = add_level(N = 15, ss_quality = runif(N, 1, 10), nest = FALSE), colleges = add_level(N = 50, c_quality = runif(N, 1, 10), nest = FALSE), - students = cross_level(N = 1500, by = join(ps_quality, - ss_quality, - c_quality, - rho = 0.2), - earning_potential = 20000 + - 2000 * ps_quality + - 6000 * ss_quality + - 10000 * c_quality + + students = cross_level(N = 1500, + by = join(ps_quality, ss_quality, c_quality, + rho = 0.2), + earning_potential = 20000 + (2000 * ps_quality) + + (6000 * ss_quality) + (10000 * c_quality) + rnorm(N, 0, 5000)) )

-

One potential source for failure is specifying an invalid rho. If you specify a rho that makes the correlation between the three variables impossible to obtain, the [fabricate()] call will fail. A common case of this occurring is specifying a negative rho with three or more levels – it is clear that if A is negatively correlated with B, and B is negatively correlated with C, then A and C cannot be negatively correlated.

+

One potential source for failure is specifying an invalid rho. If you specify a rho that makes the correlation between the three variables impossible to obtain, the fabricate() call will fail. A common case of this occurring is specifying a strongly negative rho with three or more levels – if A is strongly negatively correlated with B, and B is strongly negatively correlated with C, then A and C must be positively correlated, so all three pairs cannot share the same strongly negative correlation.

Instead of specifying a rho correlation coefficient, users can specify a sigma correlation matrix to make the resulting correlations more sophisticated. Consider the following setup:

sigma = matrix(c(1, 0.4, 0.2,
                  0.4, 1, 0.8,
@@ -284,14 +278,11 @@ 

primary_schools = add_level(N = 20, ps_quality = runif(N, 1, 10)), secondary_schools = add_level(N = 15, ss_quality = runif(N, 1, 10), nest = FALSE), colleges = add_level(N = 50, c_quality = runif(N, 1, 10), nest = FALSE), - students = cross_level(N = 1500, by = join(ps_quality, - ss_quality, - c_quality, - sigma = sigma), - earning_potential = 20000 + - 2000 * ps_quality + - 6000 * ss_quality + - 10000 * c_quality + + students = cross_level(N = 1500, + by = join(ps_quality, ss_quality, c_quality, + sigma = sigma), + earning_potential = 20000 + (2000 * ps_quality) + + (6000 * ss_quality) + (10000 * c_quality) + rnorm(N, 0, 5000)) )

sigma must be specified as a symmetric square matrix with a diagonal of all 1s and a feasible correlation structure.

diff --git a/docs/articles/getting_started.html b/docs/articles/getting_started.html index b1e3b5b..46fbc6c 100644 --- a/docs/articles/getting_started.html +++ b/docs/articles/getting_started.html @@ -116,19 +116,17 @@

Aaron Rudkin

1. Creating common variable types

-

fabricatr allows you to quickly create variables that mimic those you plan to collect when conducting your final experiment. The current version supports common experimental variables including assignment to treatment, count data, ordinal data (including “Likert scale” data, popular in surveys and survey experiments), categorical data (popular for modeling demographic characteristics). In addition, we support the creation of data with fixed intra-cluster correlations, so individual observations can be modelled as being part of groups or regions.

+

fabricatr allows you to quickly create variables that mimic those you plan to collect during the course of observational or experimental work. The current version supports common variable types including assignment to treatment, count data, ordinal data (including “Likert scale” data, popular in surveys and survey experiments), categorical data (popular for modeling demographic characteristics). In addition, we support the creation of data with fixed intra-cluster correlations, so individual observations can be modelled as being part of groups or regions.

Imagine a survey experiment of voters from across social groups. With fabricatr, we can model voters as part of social groups, each of whom has characteristics like ideology, income, and opinions about political issues. We can assign these voters to a treatment encouraging them to vote for a proposition, and model the results of the experiment:

library(fabricatr)
 
 voters = fabricate(
   N = 1000,
-  demographic_group = rep(1:10, 100),
-  ideology = draw_normal_icc(mean = 0, N = N, clusters = demographic_group, ICC = 0.7),
+  group_id = rep(1:10, 100),
+  ideology = draw_normal_icc(mean = 0, N = N, clusters = group_id, ICC = 0.7),
   ideological_label = draw_ordered(x = ideology,
-                                   break_labels = c("Very Conservative",
-                                                    "Conservative",
-                                                    "Liberal",
-                                                    "Very Liberal")),
+                                   break_labels = c("Very Conservative", "Conservative",
+                                                    "Liberal", "Very Liberal")),
   income = exp(rlnorm(n = N, meanlog = 2.4 - (ideology * 0.1), sdlog = 0.12)),
   Q1_immigration = draw_likert(x = ideology, type = 7),
   Q2_defence = draw_likert(x = ideology + 0.5, type = 7),
@@ -138,57 +136,57 @@ 

Let’s look at a small fraction of the data generated this way:

- - + + - + - + - + - + - + @@ -205,11 +203,8 @@

library(fabricatr)
 
 panel = fabricate(
-  countries = add_level(N = 150,
-                        country_fe = runif(N, 1, 10)),
-  years = add_level(N = 25,
-                    year_shock = runif(N, 1, 10),
-                    nest = FALSE),
+  countries = add_level(N = 150, country_fe = runif(N, 1, 10)),
+  years = add_level(N = 25, year_shock = runif(N, 1, 10), nest = FALSE),
   observations = cross_level(by = join(countries, years),
                              outcome_it = country_fe + year_shock + rnorm(N, 0, 2))
 )
diff --git a/docs/articles/index.html b/docs/articles/index.html index 14b7604..6089d82 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -136,7 +136,7 @@

All vignettes

  • Advanced features
  • Building and Importing Data
  • Common Social Sciences variables
  • -
  • Panel and Cross-classifieddata
  • +
  • Panel and Cross-classified data
  • Getting started with **fabricatr**
  • Using other data generating packages with **fabricatr**
  • Resampling data with fabricatr
  • diff --git a/docs/articles/resampling.html b/docs/articles/resampling.html index 0aaaeeb..3441b28 100644 --- a/docs/articles/resampling.html +++ b/docs/articles/resampling.html @@ -110,22 +110,15 @@

    Aaron Rudkin

    -

    fabricatr is a package designed to help you imagine your data before you collect it. While many solutions exist for creating simulated datasets, fabricatr is specifically designed to make the creation of realistic social science datasets easy. In particular, we need to be able to imagine correlated data and hierarchical data. fabricatr is designed to integrate into a tidyverse workflow, and to allow users to imagine data from scratch or by modifying existing data.

    -

    fabricatr is a member of the DeclareDesign software suite that also includes the r packages randomizr, estimatr, and DeclareDesign.

    -
    -

    -Simulating “resampling” from existing data.

    One way to imagine new data is to take data you already have and resample it, ensuring that existing inter-correlations between variables are preserved, while generating new data or expanding the size of the dataset. fabricatr offers several options to simulate resampling.

    -

    Bootstrapping

    -

    The simplest option in fabricatr is to “bootstrap” data. Taking data with N observations, the “bootstrap” resamples these observations with replacement and generates N new observations. Existing observations may be used zero times, once, or more than once. Bootstrapping is very simple with the resample_data() function:

    -
    survey_data = fabricate(N=10, 
    -                        voted_republican = draw_binary(prob=0.5, N=N))
    +

    The simplest option in fabricatr is to “bootstrap” data. Taking data with N observations, the “bootstrap” resamples these observations with replacement and generates N new observations. Existing observations may be used zero times, once, or more than once. Bootstrapping is very simple with the resample_data() function:

    +
    survey_data = fabricate(N = 10, voted_republican = draw_binary(prob = 0.5, N = N))
     
     survey_data_new = resample_data(survey_data)
    -survey_data
    +head(survey_data_new)
    demographic_groupgroup_id ideology ideological_label Q1_immigrationQ2_defence treatment proposition_vote
    680 10 -1.39 Very Conservative Lean DisagreeLean Disagree 1 0
    607 7 0.10 Liberal Don’t Know / NeutralLean Agree 0 1
    563 3 -2.69 Very Conservative Strongly DisagreeDisagree 0 0
    356 6 -0.28 Conservative Don’t Know / NeutralDon’t Know / Neutral 1 1
    743 3 0.67 Liberal Lean AgreeLean Agree 1 1
    @@ -133,29 +126,13 @@

    - - - - - - - - - - - - - + - - - - @@ -165,17 +142,17 @@

    - + - +
    ID
    011
    02 0
    031
    041
    0508 0
    061
    07 00
    0906 1
    1005 0
    -

    It is also possible to resample fewer or greater number of observations from your existing data. We can do this by specifying the argument N to resample_data(). Consider expanding a small dataset to allow for better imagination of larger data to be collected later.

    -
    large_survey_data = resample_data(survey_data, N=100)
    +

    It is also possible to resample a smaller or greater number of observations from your existing data. We can do this by specifying the argument N to resample_data(). Consider expanding a small dataset to allow for better imagination of larger data to be collected later.

    +
    large_survey_data = resample_data(survey_data, N = 100)
     nrow(large_survey_data)

    100

    @@ -190,10 +167,8 @@

    citizens = add_level(N = 3, age = runif(N, 18, 70)) ) -my_data_2 <- resample_data(my_data, - N = c(3, 5), - ID_labels = c("cities", "citizens")) -my_data_2

    +my_data_2 <- resample_data(my_data, N = c(3, 5), ID_labels = c("cities", "citizens")) +head(my_data_2)

    @@ -238,63 +213,9 @@

    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    cities3 26
    11493235
    11493235
    11493133
    11493326
    21971444
    21971519
    21971519
    21971645
    21971444
    -

    resample_data() will first select the cities to be resampled. Then, for each city, it will continue by selecting the citizens to be resampled. If a higher level unit is used more than once (for example, the same city being chosen twice), and a lower level is subsequently resampled, the choices of which units to keep for the lower level will differ for each copy of the higher level. In this example, if city 1 is chosen twice, then the sets of five citizens chosen for each copy of the city 1 will differ.

    +

    resample_data() will first select the cities to be resampled. Then, for each city, it will continue by selecting the citizens to be resampled. If a higher level unit is used more than once (for example, the same city being chosen twice), and a lower level is subsequently resampled, the choices of which units to keep for the lower level will differ for each copy of the higher level. In this example, if city 1 is chosen twice, then the sets of five citizens chosen for each copy of the city 1 will differ.

    You can also specify the levels you wish to resample from using named arguments to the N parameter, as in this example, which does exactly the same thing as the previous example, but specifies the level names in a different way:

    my_data <-
       fabricate(
    @@ -302,9 +223,8 @@ 

    citizens = add_level(N = 3, age = runif(N, 18, 70)) ) -my_data_2 <- resample_data(my_data, - N = c(cities=3, citizens=5)) -my_data_2

    +my_data_2 <- resample_data(my_data, N = c(cities=3, citizens=5)) +head(my_data_2)
    @@ -349,59 +269,40 @@

    - - - - - - - - - - - - - - - - - - - - - - - - + +
    cities4 41
    21014639
    21014551
    21014441
    21014551
    + +
    +

    +“Passthrough” Resampling

    +

    In some cases it may make sense to resample each unit at a given level. For example, there may be value in resampling 1 citizen in each and every city represented in the data set. fabricatr allows the user to specify ALL for the N argument to a given level to accomplish this:

    +
    my_data <-
    +  fabricate(
    +    cities = add_level(N = 2, elevation = runif(n = N, min = 1000, max = 2000)),
    +    citizens = add_level(N = 3, age = runif(N, 18, 70))
    +  )
    +
    +my_data_3 <- resample_data(my_data, N = c(ALL, 1), ID_labels = c("cities", "citizens"))
    +head(my_data_3)
    + + + + + + + + - + - - - - - - - - - - - - - + - - - - - - - - - + + +
    citieselevationcitizensage
    114211122 164
    11421266
    1142134339
    11421343
    11421 2661438623
    @@ -414,9 +315,9 @@

    Contents

    diff --git a/docs/articles/variable_generation.html b/docs/articles/variable_generation.html index 722099a..775148f 100644 --- a/docs/articles/variable_generation.html +++ b/docs/articles/variable_generation.html @@ -123,14 +123,10 @@

    binary_2 = draw_binary(N = 3, prob = 0.5))

    In addition to binary variables, you can make data from repeated Bernoulli trials (“binomial” data). This requires using the draw_binomial() function and specifying an argument trials, equal to the number of trials.

    binomial_ex = fabricate(N = 3, 
    -                        freethrows = draw_binomial(N = N, 
    -                                                   prob = 0.5, 
    -                                                   trials = 10)
    -                        )
    + freethrows = draw_binomial(N = N, prob = 0.5, trials = 10))

    Some researchers may be interested in specifying probabilities through a “link function”. This can be done in any of your data generating functions through the link argument. The default link function is “identity”, but we also support “logit”, and “probit”. These link functions transform continuous and unbounded latent data into probabilities of a positive outcome.

    -
    bernoulli_probit = fabricate(N = 3, x = 10*rnorm(N), 
    -                             binary = draw_binary(prob = x,
    -                                                  link = "probit"))
    +
    bernoulli_probit = fabricate(N = 3, x = 10 * rnorm(N), 
    +                             binary = draw_binary(prob = x, link = "probit"))

    @@ -139,18 +135,12 @@

    In the following example, each of three observations has a latent variable x which is continuous and unbounded. The variable ordered transforms x into three numeric categories: 1, 2, and 3. All values of x below -1 result in ordered 1; all values of x between -1 and 1 result in ordered 2; all values of x above 1 result in ordered 3:

    ordered_example = fabricate(N = 3, 
                                 x = 5 * rnorm(N), 
    -                            ordered = draw_ordered(x,
    -                                                   breaks = c(-Inf, -1, 1, Inf)
    -                                                  )
    -                            )
    + ordered = draw_ordered(x, breaks = c(-Inf, -1, 1, Inf)))

    Ordered data also supports link functions including “logit” or “probit”:

    ordered_probit_example = fabricate(N = 3, 
                                        x = 5 * rnorm(N), 
    -                                   ordered = draw_ordered(x,
    -                                                          breaks = c(-Inf, -1, 1, Inf), 
    -                                                          link = "probit"
    -                                                         )
    -                                   )
    + ordered = draw_ordered(x, breaks = c(-Inf, -1, 1, Inf), + link = "probit"))

    @@ -168,8 +158,7 @@

    Q1 = draw_likert(x = rnorm(N), type = 7), Q2 = draw_likert(x = rnorm(N), type = 5), Q3 = draw_likert(x = rnorm(N), type = 4), - Q4 = draw_likert(x = rnorm(N), - breaks = c(-Inf, -0.8, 0, 1, 2, Inf)) + Q4 = draw_likert(x = rnorm(N), breaks = c(-Inf, -0.8, 0, 1, 2, Inf)) ) table(survey_data$Q2)

    @@ -209,14 +198,10 @@

    p1 = runif(N, 0, 1), p2 = runif(N, 0, 1), p3 = runif(N, 0, 1), - cat = draw_categorical(N = N, - prob = cbind(p1, p2, p3)) - ) + cat = draw_categorical(N = N, prob = cbind(p1, p2, p3)))

    In the second example, each unit has the same probability of getting a given category. draw_categorical() will issue a warning to remind you that it is interpreting the vector in this way.

    warn_draw_cat_example = fabricate(N = 6,
    -                                  cat = draw_categorical(N = N,
    -                                                         prob = c(0.2, 0.4, 0.4))
    -                                  )
    + cat = draw_categorical(N = N, prob = c(0.2, 0.4, 0.4)))
    ## Warning in draw_categorical(N = N, prob = c(0.2, 0.4, 0.4)): For a
     ## categorical (multinomial) distribution, a matrix of probabilities should
     ## be provided. The data below is generated by interpreting the vector of
    @@ -233,14 +218,12 @@ 

    Binary data with fixed ICCs

    draw_binary_icc() takes three required arguments: prob, a probability or vector of probabilities which determine the chance a given observation will be a 1; clusters, a map of units to clusters (required to generate the correlation structure); and ICC, the fixed intra-cluster correlation (from 0 to 1). Users may optionally specify N; if it is not specified, draw_binary_icc() will determine it based on the length of the clusters vector.

    Consider the following example, which models whether individuals smoke:

    -
    # 100 individual population, 10 each in each of 10 clusters
    -clusters = rep(1:10, 10)
    +
    # 100 individual population, 20 each in each of 5 clusters
    +clusters = rep(1:5, 20)
     
     # Individuals have a 20% chance of smoking, but clusters are highly correlated
     # in their tendency to smoke
    -smoker = draw_binary_icc(prob = 0.2,
    -                         clusters = clusters,
    -                         ICC = 0.7)
    +smoker = draw_binary_icc(prob = 0.2, clusters = clusters, ICC = 0.5)
     
     # Observe distribution of smokers and non-smokers
     table(smoker)
    @@ -250,8 +233,8 @@

    1 -83 -17 +76 +24

    We see that approximately 20% of the population smokes, in line with our specification, but what patterns of heterogeneity do we see by cluster?

    @@ -263,48 +246,28 @@

    -10 -0 - - -10 -0 - - -10 -0 - - -10 -0 - - -10 -0 +16 +4 +19 1 -9 -10 -0 +18 +2 -9 -1 +4 +16 -3 -7 - - -10 -0 +19 +1 -

    We observe that 7 clusters have no smokers at all, two clusters are overwhelming smokers, and one cluster is overwhelmingly non-smokers.

    +

    Here we learn that of our 5 clusters, 4 are overwhelmingly non-smokers, while a fifth is composed of 80% smokers.

    We can also specify a separate mean for each cluster, but it is worth noting that the higher the ICC, the more the cluster mean will depart from the nominal cluster mean.

    If you do not specify a vector of probabilities or a correlation coefficient, the default values are probability 0.5 for each cluster and ICC of 0.5. If you do not specify cluster IDs, the function will return an error.

    @@ -315,19 +278,16 @@

    If sd is not supplied, each cluster will be assumed to have a within-cluster standard deviation of 1. If mean is not supplied, each cluster will be assumed to be mean zero. If ICC is not supplied, it will be set to 0.5.

    Here, we model student academic performance by cluster:

    # 100 students, 20 each in 5 clusters
    -clusters = rep(1:10, 10)
    +clusters = rep(1:5, 20)
     
    -numeric_grade = draw_normal_icc(mean = 80,
    -                               clusters = clusters,
    -                               ICC = 0.5,
    -                               sd = 15)
    +numeric_grade = draw_normal_icc(mean = 80, clusters = clusters, ICC = 0.5, sd = 15)
     
     letter_grade = draw_ordered(x = numeric_grade,
                                 breaks = c(-Inf, 60, 70, 80, 90, Inf),
                                 break_labels = c("F", "D", "C", "B", "A"))
     
     mean(numeric_grade)
    -

    82.77

    +

    84.12

    The mean grade matches the population mean. Now let’s look at the relationship between cluster and letter grade to observe the cluster pattern:

    table(letter_grade, clusters)
    @@ -342,76 +302,41 @@

    + - - - - - - - - - - - - - - - - + - - - - - - - - - + - - - - - - - + - - - + + + + - - - + - - - - - - - + - + + + - - - +
    0 03 226
    02422
    4040215
    25 0 30
    00015 9
    144013
    0115382 26
    10 47 23
    1211 52
    00 0143 218
    -

    It is obvious upon inspection that some clusters are higher performing than others despite having identical cluster means in expectation.

    +

    It is obvious upon inspection that two of the clusters contain academic high-performers, while two of the clusters have a substantial failure rate. Although each cluster has the same mean in expectation, the induced intra-cluster correlation forces some clusters higher and others lower.

    diff --git a/docs/index.html b/docs/index.html index b6f8571..b421205 100644 --- a/docs/index.html +++ b/docs/index.html @@ -130,22 +130,17 @@

    fabricatr is easy to learn and easy to read. Consider this example which generates data modeling the United States House of Representatives:

    library(fabricatr)
     
    -house_members = fabricate(
    +house_members <- fabricate(
       party_id = add_level(
    -    N = 2, 
    -    party_names = c("Republican", "Democrat"),
    -    party_ideology = c(0.5, -0.5), 
    -    in_power = c(1, 0), 
    -    party_incumbents = c(241, 194)),
    +    N = 2, party_names = c("Republican", "Democrat"), party_ideology = c(0.5, -0.5), 
    +    in_power = c(1, 0), party_incumbents = c(241, 194)),
       rep_id = add_level(
    -    N = party_incumbents, 
    -    member_ideology = rnorm(N, party_ideology, sd=0.5),
    -    terms_served = draw_count(N = N, mean = 4),
    +    N = party_incumbents, member_ideology = rnorm(N, party_ideology, sd = 0.5),
    +    terms_served = draw_count(N = N, mean = 4), 
         female = draw_binary(N = N, prob = 0.198))
       )
    - @@ -155,7 +150,6 @@

    - @@ -164,7 +158,6 @@

    - @@ -173,7 +166,6 @@

    - @@ -182,7 +174,6 @@

    - @@ -191,7 +182,6 @@

    - diff --git a/docs/news/index.html b/docs/news/index.html index 95acbef..3f9366d 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -130,10 +130,10 @@

    Change log All releases

    -
    +

    -fabricatr 1.0.0

    -

    first CRAN release

    +fabricatr 1.0.1 +

    First CRAN submission for fabricatr

    @@ -142,7 +142,7 @@

    Contents

    diff --git a/docs/reference/ALL.html b/docs/reference/ALL.html index 856ba1a..371d5b0 100644 --- a/docs/reference/ALL.html +++ b/docs/reference/ALL.html @@ -6,7 +6,8 @@ -fabricatr - Magic number constant to allow users to specify "ALL" for passthrough resampling — ALL +fabricatr - Magic number constant to allow users to specify <code>ALL</code> for passthrough +resampling — ALL @@ -125,11 +126,13 @@
    -

    Magic number constant to allow users to specify "ALL" for passthrough resampling

    +

    Magic number constant to allow users to specify ALL for passthrough resampling

    ALL
    diff --git a/docs/reference/cross_level.html b/docs/reference/cross_level.html index 1d58a78..279acce 100644 --- a/docs/reference/cross_level.html +++ b/docs/reference/cross_level.html @@ -6,8 +6,7 @@ -fabricatr - Creates cross-classified (partially non-nested, joined data) with a fixed -correlation structure. — cross_level +fabricatr - Creates panel or cross-classified data — cross_level @@ -126,13 +125,14 @@
    -

    Creates cross-classified (partially non-nested, joined data) with a fixed -correlation structure.

    +

    This function allows the user to create data structures that are paneled or cross-classified: where one level of observation draws simultaneously from two or many source levels. Common examples of panels include country-year data which have country-level and year-level characteristics.

    cross_level(N = NULL, by = NULL, ...)
    @@ -143,13 +143,14 @@

    Ar

    +If N is NULL or not provided, the join will be an "outer product" -- merging each row of each provided data frame with each other data frame to make a full panel.

    - + @@ -162,18 +163,23 @@

    Value

    data.frame

    +

    Details

    + +

    By specifying the appropriate arguments in join() within the function call, it is possible to induce correlation in cross-classified data.

    +

    Examples

    # Generate full panel data - panel <- fabricate( countries = add_level(N = 20, country_shock = runif(N, 1, 10)), years = add_level(N = 20, year_shock = runif(N, 1, 10), nest=FALSE), obs = cross_level(by=join(countries, years), GDP_it = country_shock + year_shock) ) -# Generate cross-classified data and merge, no correlation +# Include an "N" argument to allow for cross-classified +# data. students <- fabricate( primary_school = add_level(N = 20, ps_quality = runif(N, 1, 10)), secondary_school = add_level(N = 15, ss_quality = runif(N, 1, 10), nest=FALSE), @@ -186,7 +192,8 @@

    Examp #> 4 18 7.4 06 2.7 004 #> 5 08 9.9 01 8.4 005 #> 6 11 5.2 07 9.5 006

    -# Cross-classified data with a correlation structure +# Induce a correlation structure in cross-classified data by providing +# rho. students <- fabricate( primary_school = add_level(N = 20, ps_quality = runif(N, 1, 10)), secondary_school = add_level(N = 15, ss_quality = runif(N, 1, 10), nest=FALSE), @@ -201,6 +208,8 @@

    Contents

  • Arguments
  • Value
  • + +
  • Details
  • Examples
  • diff --git a/docs/reference/draw_binary_icc.html b/docs/reference/draw_binary_icc.html index 3e4fc31..ec8f776 100644 --- a/docs/reference/draw_binary_icc.html +++ b/docs/reference/draw_binary_icc.html @@ -158,8 +158,8 @@

    Ar

    - +
    party_names party_ideology in_power
    339 Democrat -0.5 00
    217 Republican 0.5 10
    233 Republican 0.5 11
    263 Democrat -0.5 00
    140 Republican 0.5 1
    N

    The number of observations in the resulting data frame. -If N is NULL or not provided, the join will be an "outer join" -- creating a -full panel of each of the rows from each data frame provided.

    by

    The result of a call to join() which specifies how the -cross-classified data will be created

    The result of a call to join() which specifies how +the cross-classified data will be created

    ...
    ICC

    A number indicating the desired ICC, if none is provided will -default to 0.

    A number indicating the desired ICC, if none is provided +the default ICC will be 0.

    @@ -170,10 +170,17 @@

    Value

    Examples

    -
    clusters = rep(1:5, 10) +
    # Divide units into clusters +clusters = rep(1:5, 10) + +# Default probability 0.5, default ICC 0 draw_binary_icc(clusters = clusters)
    #> [1] 0 0 0 1 1 0 1 0 1 1 1 0 1 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 1 1 0 1 0 0 -#> [39] 1 0 0 0 1 0 1 0 1 0 0 1
    draw_binary_icc(prob = 0.5, clusters = clusters, ICC = 0.5)
    #> [1] 1 1 0 1 0 1 1 0 1 0 0 1 0 1 0 1 1 0 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 0 -#> [39] 1 0 1 1 0 1 0 1 1 1 1 0
    +#> [39] 1 0 0 0 1 0 1 0 1 0 0 1
    +# Specify probability or ICC +corr_draw = draw_binary_icc(prob = 0.5, clusters = clusters, ICC = 0.5) + +# Verify ICC of data. +summary(lm(corr_draw ~ as.factor(clusters)))$r.squared
    #> [1] 0.34