Skip to content

Commit

Permalink
Merge pull request #20 from DeclareDesign/variable_creation
Browse files Browse the repository at this point in the history
Tests, error handling, and resample rewrite
  • Loading branch information
graemeblair committed Oct 25, 2017
2 parents 18dd32d + 937b1a4 commit 38a2f02
Show file tree
Hide file tree
Showing 5 changed files with 388 additions and 72 deletions.
78 changes: 66 additions & 12 deletions R/fabricate.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,23 @@ fabricate <-
all_levels <- check_all_levels(options)

if (!missing(data) && !"data.frame" %in% class(data)) {
stop(
"Please provide a data object to the data argument, e.g. a data.frame, tibble, or sf object."
)
if(is.null(dim(data))) {
stop(
"User provided data must be a data frame. Provided data was low dimensional."
)
}
if(!"data" %in% names(sys.call())) {
stop(
"The data argument must be a data object. The argument call, ", deparse(substitute(data)), ", was not a data object (e.g. a data.frame, tibble, sf object, or convertible matrix)."
)
}
tryCatch({
data = as.data.frame(data)
}, error=function(e) {
stop(
"User provided data could not convert to a data frame."
)
})
}


Expand Down Expand Up @@ -92,9 +106,26 @@ fabricate <-
if(missing(N)) N <- NULL
if(missing(ID_label)) ID_label <- NULL

ID_label <- substitute(ID_label)
if (!is.null(ID_label)) {
ID_label <- as.character(ID_label)
if(is.symbol(substitute(ID_label))) {
ID_label <- substitute(ID_label)
if (!is.null(ID_label)) {
ID_label <- as.character(ID_label)
}
} else if(!is.null(ID_label)) {
if(is.vector(ID_label) & length(ID_label) > 1) {
# Vector of length n>1, error
stop("Provided ID_label must be a character vector of length 1 or variable name.")
} else if(is.vector(ID_label) & is.numeric(ID_label[1])) {
# Numeric ID_label
warning("Provided ID_label is numeric and will be prefixed with the character \"X\"")
ID_label <- as.character(ID_label)
} else if(is.vector(ID_label) & is.character(ID_label[1])) {
# Valid ID_label
ID_label <- as.character(ID_label)
} else if(!is.null(dim(ID_label))) {
# Higher dimensional ID_label
stop("Provided ID_label must be a character vector or variable name, not a data frame or matrix.")
}
}

fabricate_data_single_level(
Expand All @@ -121,12 +152,36 @@ fabricate_data_single_level <- function(data = NULL,

if (!is.null(N)) {
if (length(N) != 1) {
stop(
"At the top level, ",
ID_label,
", you must provide a single number to N."
)
if(is.null(ID_label)) {
stop(
"At the top level, you must provide a single number to N."
)
} else {
stop(
"At the top level, ",
ID_label,
", you must provide a single number to N."
)
}
}

if(is.numeric(N) & any(!N%%1 == 0)) {
stop(paste0(
"The provided N must be an integer number. Provided N was of type ",
typeof(N)
))
}

if(!is.numeric(N)) {
tryCatch({
N = as.numeric(N)
}, error=function(e) {
stop(
"The provided value for N must be an integer number."
)
})
}

data <- data.frame()
existing_ID <- FALSE
} else if(!is.null(data)){
Expand Down Expand Up @@ -181,7 +236,6 @@ check_all_levels <- function(options){

if (length(options) == 0) return(FALSE)


is_function <- sapply(options, function(i) {
is_lang(get_expr(i))
})
Expand Down
94 changes: 59 additions & 35 deletions R/resample_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,59 +28,83 @@
#'
#'
#' @export
resample_data <- function(data, N, ID_labels = NULL) {
# setup
resample_data = function(data, N, ID_labels=NULL) {
# User didn't provide an N or an ID label, it's clear they just want a regular bootstrap
if (missing(N) & is.null(ID_labels)) {
N <- nrow(data)
return(bootstrap_single_level(data, nrow(data), ID_label=NULL))
}
k <- length(N) ## number of levels

# checks
if (!is.null(ID_labels) & (k != length(ID_labels))) {
# Error handling
if (!is.null(ID_labels) & (length(N) != length(ID_labels))) {
stop(
"If you provide more than one ID_labels to resample data for multilevel data, please provide a vector for N of the same length representing the number to resample at each level."
)
}

# Case 1: Single Level
if (k == 1) {
data <- bootstrap_single_level(data = data, N = N)
} else {
# Case 2: Multi Level

data <- bootstrap_single_level(data, ID_label = ID_labels[1], N = N[1])
if (any(!ID_labels %in% names(data))) {
stop(
"One or more of the ID labels you provided are not columns in the data frame provided."
)
}

for (i in 2:k) {
group_by_set <- ID_labels[1:(i - 1)]
group_by_list <- as.list(data[, group_by_set, drop = FALSE])
new_data_list <- split(data, group_by_list)
new_data_list <-
lapply(new_data_list,
bootstrap_single_level,
ID_label = ID_labels[i],
N = N[i])
data <- do.call(rbind, new_data_list)
}
if(length(N) > 10) {
stop(
"Multi-level bootstrap with more than 10 levels is not advised."
)
}
rownames(data) <- NULL
return(data)
}

# Single level bootstrap with explicit bootstrapping on a particular cluster variable
if(length(N)==1)
{
return(bootstrap_single_level(data,
N[1],
ID_label=ID_labels[1]))
} else {
# Do the current bootstrap level
current_boot_values = unique(data[, ID_labels[1]])
sampled_boot_values = sample(1:length(current_boot_values), N[1], replace=TRUE)
app = 0

# Iterate over each thing chosen at the current level
results_all = lapply(sampled_boot_values, function(i) {
new_results = resample_data(
data[data[, ID_labels[1]] == i, ],
N=N[2:length(N)],
ID_labels=ID_labels[2:length(ID_labels)]
)
})
#res = rbindlist(results_all)
res = do.call(rbind, results_all)
rownames(res) = NULL
# Return to preceding level
return(res)
}
}

bootstrap_single_level <-
function(data, ID_label = NULL, N) {
bootstrap_single_level <- function(data, ID_label = NULL, N) {
if(dim(data)[1] == 0) {
stop("Data being bootstrapped has no rows.")
}
if (is.null(ID_label)) {
boot_indicies <- sample(1:nrow(data), N, replace = TRUE)
} else {
# Simple bootstrap
boot_indices <- sample(1:nrow(data), N, replace = TRUE)
} else if(!ID_label %in% colnames(data)) {
stop("ID label provided is not a column in the data being bootstrapped.")
} else {
# Bootstrapping unique values of ID_label (i.e. cluster selection when data
# are observations, not clusters
boot_ids <-
sample(unique(data[, ID_label]), size = N, replace = TRUE)
boot_indicies <- unlist(lapply(boot_ids, function(i) {
# Need to do the unlist-apply approach to ensure each row
# is appropriately duplicated. Faster than other ways to map
# cluster ids to row ids.
boot_indices <- unlist(lapply(boot_ids, function(i) {
which(data[, ID_label] == i)
}))
}
new_data <- data[boot_indicies, , drop = FALSE]
return(new_data)
}

# Grab the relevant rows
new_data <- data[boot_indices, , drop = FALSE]

return(new_data)
}
96 changes: 87 additions & 9 deletions R/variable_creation_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,18 @@ draw_discrete <-
"ordered",
"count")) {
stop(
"Please choose either 'binary' (or 'bernoulli'), 'binomial', 'categorical', 'ordered' or 'count' as a data type."
"Please choose either 'binary' (or 'bernoulli'), 'binomial', 'categorical', 'ordered', or 'count' as a data type."
)
}

if (N %% length(x) & type != "categorical") {
stop(
"N is not an even multiple of the length of the vector x."
)
}

if (mode(x) != "numeric") {
stop("Please provide a numeric vector to x.")
stop("\"x\" must be a number or vector of numbers.")
}

if (link == "logit") {
Expand All @@ -85,16 +91,49 @@ draw_discrete <-
}
if (link == "identity")
if (!all(0 <= x & x <= 1)) {
warning("The identity link requires values between 0 and 1, inclusive.")
stop("The identity link requires probability values between 0 and 1, inclusive.")
}

out <- rbinom(N, k, prob)

} else if (type == "binomial") {
if (link == "identity")
if (!all(0 <= x & x <= 1)) {
warning("The identity link requires values between 0 and 1, inclusive.")
stop("The identity link requires values between 0 and 1, inclusive.")
}

if(is.vector(k) & length(k)>1) {
if(N %% length(k)) {
stop(
"\"N\" is not an even multiple of the length of the number of trials, \"k\"."
)
}
if(!all(is.numeric(k) & (is.integer(k) | !k%%1))) {
stop(
"All numbers of trials should be integer numbers."
)
}
}
if(!is.null(dim(k))) {
stop(
"Number of trials must be an integer or vector, not higher-dimensional."
)
}
if(is.null(k) | is.na(k)) {
stop(
"Number of trials must be specified, not null or NA."
)
}
if(is.numeric(k) & !is.integer(k) & k%%1) {
stop(
"Number of trials must be an integer."
)
}
if(k <= 0) {
stop(
"Number of trials must be a positive integer."
)
}

out <- rbinom(N, k, prob)

Expand All @@ -103,21 +142,60 @@ draw_discrete <-
x <- x + rnorm(N)
}

# out <- cut(x, breaks, labels = break_labels) - 1
if (is.null(breaks) | any(is.na(breaks))) {
stop("You must specify numeric breaks for ordered data.")
}
if (any(!is.numeric(breaks))) {
stop("All breaks specified for ordered data must be numeric.")
}
if (is.matrix(breaks) | is.data.frame(breaks)) {
stop("Numeric breaks must be a vector.")
}
if (length(breaks) < 3) {
stop("Numeric breaks for ordered data must be of at least length 3.")
}
if (!all(sort(breaks) == breaks)) {
stop("Numeric breaks must be in ascending order.")
}
if(any(breaks[1] > x) | any(breaks[length(breaks)] < x)) {
stop("Numeric break endpoints should be outside min/max of x data range.")
}
if(is.vector(break_labels) & !is.logical(break_labels) & all(!is.na(break_labels)) & length(break_labels) != length(breaks)-1) {
stop("Break labels should be of length one less than breaks.")
}

out <- cut(x, breaks, labels = break_labels)

} else if (type == "count") {
if (link != "identity") {
stop("Count data does not accept link functions.")
}

out <- rpois(N, lambda = x)
if (any(x < 0)) {
stop(
"All provided count values must be non-negative."
)
}

## Categorical
out <- rpois(N, lambda = x)

} else if (type == "categorical") {
if (is.null(dim(x)))
stop("For a categorical distribution a matrix of probabilities should be provided")
if (link != "identity") {
stop("Categorical data does not accept link functions.")
}

if (is.null(dim(x))) {
if (is.vector(x) & all(is.numeric(x)) & length(x)>1) {
x <- matrix(rep(x, N), byrow=TRUE, ncol=length(x), nrow=N)
warning(
"For a categorical (multinomial) distribution, a matrix of probabilities should be provided. Data generated by interpreting vector of category probabilities, identical for each observation."
)
} else {
stop(
"For a categorical (multinomial) distribution, a matrix of probabilities should be provided"
)
}
}
if (!all(apply(x, 1, min) > 0)) {
stop(
"For a categorical (multinomial) distribution, the elements of x should be positive and sum to a positive number."
Expand Down
Loading

0 comments on commit 38a2f02

Please sign in to comment.