Skip to content

Commit

Permalink
Merge ad6b54c into fd88f3c
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronrudkin authored Dec 20, 2017
2 parents fd88f3c + ad6b54c commit 86dd785
Show file tree
Hide file tree
Showing 45 changed files with 3,224 additions and 712 deletions.
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: fabricatr
Type: Package
Title: Imagine your data before you collect it
Version: 1.0.0
Version: 1.0.1
Authors@R: c(person("Graeme", "Blair", email = "graeme.blair@ucla.edu", role = c("aut", "cre")),
person("Jasper", "Cooper", email = "jjc2247@columbia.edu", role = c("aut")),
person("Alexander", "Coppock", email = "alex.coppock@yale.edu", role = c("aut")),
Expand All @@ -22,6 +22,7 @@ Suggests:
dplyr,
knitr,
rmarkdown,
data.table
FasterWith: data.table
data.table,
mvnfast
FasterWith: data.table, mvnfast
VignetteBuilder: knitr
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
# Generated by roxygen2: do not edit by hand

export(ALL)
export(add_level)
export(cross_level)
export(draw_binary)
export(draw_binary_icc)
export(draw_discrete)
export(draw_normal_icc)
export(fabricate)
export(join)
export(level)
export(modify_level)
export(resample_data)
importFrom(rlang,eval_tidy)
importFrom(rlang,get_expr)
Expand All @@ -16,6 +20,7 @@ importFrom(rlang,lang_modify)
importFrom(rlang,lang_name)
importFrom(rlang,quo)
importFrom(rlang,quo_name)
importFrom(rlang,quo_text)
importFrom(rlang,quos)
importFrom(stats,median)
importFrom(stats,pnorm)
Expand Down
135 changes: 135 additions & 0 deletions R/cross_classify_helpers.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
join_dfs = function(dfs, variables, N, sigma=NULL, rho=0) {
# Error handling
if(is.data.frame(dfs)) {
stop("You need at least two data frames.")
}
if(length(dfs) != length(variables)) {
stop("You must define which variables to join on.")
}
if(length(variables) < 2) {
stop("You must define at least two variables to join on.")
}

# Create the data list -- the subset from the dfs of the variables we're
# joining on -- for each df in dfs, map it to a variable. Subset the df to
# that variable. Unlist and unname, creating a vector. Plonk that in a
# data_list
data_list = Map(function(x, y) {
unname(unlist(x[y]))
}, dfs, variables)

# Do the joint draw
result = joint_draw_ecdf(data_list=data_list,
N=N,
sigma=sigma,
rho=rho)

# result now contains a matrix of indices. Each column of this matrix is
# the indices for each df of dfs. Subset by row the df. This will return
# a list of new dfs. We need to cbind these dfs to make the merged data.
merged_data = do.call("cbind",
Map(function(df, indices) { df[indices, ] },
dfs,
result))

# Cleanup: remove row names
rownames(merged_data) = NULL
# Re-write the column names to be the original column names from the original
# dfs.
colnames(merged_data) = unname(unlist(lapply(dfs, colnames)))

merged_data
}

joint_draw_ecdf = function (data_list, N, ndim=length(data_list),
sigma=NULL, rho=0, use_f = TRUE) {

# We don't modify data_list, but this is useful to ensure the
# argument is evaluated anyway
force(ndim)

# Error handling for N
if(is.null(N) || is.na(N) || !is.atomic(N) || length(N) > 1 || N <= 0) {
stop("N must be a single integer that is positive.")
}

# Error handling for rho, if specified
if(is.null(sigma)) {
if(is.atomic(rho) & length(rho)==1) {
if(ndim>2 & rho<0) {
stop("The correlation matrix must be positive semi-definite. Specifically, ",
"if the number of variables being drawn from jointly is 3 or more, ",
"then the correlation coefficient rho must be non-negative.")
}

if(rho == 0) {
# Uncorrelated draw would be way faster; just sample each column
return(lapply(seq_along(data_list),
function(vn) {
sample.int(length(data_list[[vn]]), N, replace=TRUE)
}))
}
sigma = matrix(rho, ncol=ndim, nrow=ndim)
diag(sigma) = 1
} else {
stop("If specified, rho should be a single number")
}
}

# Error handling for sigma
if(ncol(sigma) != ndim | nrow(sigma) != ndim | any(diag(sigma) != 1)) {
stop("The correlation matrix must be square, with the number of dimensions ",
"equal to the number of dimensions you are drawing from. In addition, ",
"the diagonal of the matrix must be equal to 1.")
}

# Can we use the fast package or are we stuck with the slow one?
use_f = use_f && requireNamespace("mvnfast", quietly = TRUE)

# Standard normal = all dimensions mean 0.
mu = rep(0, ndim)

# Possible options for the joint normal draw
if(!use_f) {
# Below code is a reimplementation of the operative parts of rmvnorm from
# the mvtnorm package so that we don't induce a dependency

# Right cholesky decomposition (i.e. LR = sigma s.t. L is lower triang, R
# is upper triang.)
right_chol = chol(sigma, pivot=TRUE)
# Order columns by the pivot attribute -- induces numerical stability?
right_chol = right_chol[, order(attr(right_chol, "pivot"))]
# Generate standard normal data and right-multiply by decomposed matrix
# with right_chol to make it correlated.
correlated_sn <- matrix(rnorm(N * ndim),
nrow = N) %*% right_chol

} else {
# Using mvnfast
correlated_sn = mvnfast::rmvn(N, ncores = getOption("mc.cores", 2L), mu, sigma)
}

# Z-scores to quantiles
quantiles = pnorm(correlated_sn)
colnames(quantiles) = names(data_list)

# Quantiles to inverse eCDF.
result = lapply(
seq_along(data_list),
function(vn) {
# What would the indices of the quantiles be if our data was ordered --
# if the answer is below 0, set it to 1. round will ensure the tie-
# breaking behaviour is random with respect to outcomes
ordered_indices = pmax(1,
round(quantiles[, vn] * length(data_list[[vn]]))
)

# Now get the order permutation vector and map that to the ordered indices
# to get the indices in the original space
indices = order(data_list[[vn]])[ordered_indices]
})


# Set up response
result
}
8 changes: 5 additions & 3 deletions R/draw_binary_icc.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@
#' Generation, and Estimation of Intracluster Correlation Coefficient (ICC)
#' for Binary Data".
#'
#' @param x A number or vector of numbers, one probability per cluster.
#' @param x A number or vector of numbers, one probability per cluster. If none
#' is provided, will default to 0.5.
#' @param N (Optional) A number indicating the number of observations to be
#' generated. Must be equal to length(clusters) if provided.
#' @param clusters A vector of factors or items that can be coerced to
#' clusters; the length will determine the length of the generated data.
#' @param rho A number indicating the desired RCC.
#' @param rho A number indicating the desired ICC, if none is provided will
#' default to 0.
#' @return A vector of binary numbers corresponding to the observations from
#' the supplied cluster IDs.
#' @examples
Expand All @@ -22,7 +24,7 @@
#' @importFrom stats rbinom
#'
#' @export
draw_binary_icc = function(x = 0.5, N = NULL, clusters, rho = 0.5) {
draw_binary_icc = function(x = 0.5, N = NULL, clusters, rho = 0) {
# Let's not worry about how clusters are provided
tryCatch({
clusters = as.numeric(as.factor(clusters))
Expand Down
9 changes: 5 additions & 4 deletions R/draw_normal_icc.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@
#' used in this function is specified at the following URL:
#' \url{https://stats.stackexchange.com/questions/263451/create-synthetic-data-with-a-given-intraclass-correlation-coefficient-icc}
#'
#' @param x A number or vector of numbers, one mean per cluster.
#' @param x A number or vector of numbers, one mean per cluster. If none is
#' provided, will default to 0.
#' @param N (Optional) A number indicating the number of observations to be
#' generated. Must be equal to length(clusters) if provided.
#' @param clusters A vector of factors or items that can be coerced to
#' clusters; the length will determine the length of the generated data.
#' @param sd A number or vector of numbers, indicating the standard deviation of
#' each cluster's error terms
#' @param rho A number indicating the desired RCC.
#' @param rho A number indicating the desired ICC. If none is provided,
#' will default to 0.
#' @return A vector of numbers corresponding to the observations from
#' the supplied cluster IDs.
#' @examples
Expand All @@ -26,8 +28,7 @@ draw_normal_icc = function(x = 0,
N = NULL,
clusters,
sd = 1,
rho = 0.5) {

rho = 0) {
# Let's not worry about how clusters are provided
tryCatch({
clusters = as.numeric(as.factor(clusters))
Expand Down
Loading

0 comments on commit 86dd785

Please sign in to comment.