diff --git a/DESCRIPTION b/DESCRIPTION index c25344e..d830bd8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: flipData Type: Package Title: Functions for extracting and describing data -Version: 1.6.0 +Version: 1.6.1 Author: Displayr Maintainer: Displayr Description: Functions for extracting data from formulas and @@ -26,7 +26,7 @@ Imports: CVXR (>= 1.0.0), stringr, survey, verbs -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.0 Encoding: UTF-8 Suggests: foreign, gtools, diff --git a/R/combinevariablesetsasbinary.R b/R/combinevariablesetsasbinary.R index d0fbc1c..d7f0d9b 100644 --- a/R/combinevariablesetsasbinary.R +++ b/R/combinevariablesetsasbinary.R @@ -8,10 +8,16 @@ #' @param compute.for.incomplete A boolean value. If \code{FALSE}, cases with any missing data #' will have a missing vlaue. If \code{TRUE}, only cases whose data is entirely missing will #' be assigned a missing value. -#' @importFrom verbs Count AnyOf +#' @param unmatched.pick.any.are.missing Boolean value. When one of the input variable sets +#' is binary (Pick Any variable set) and additonal columns need to be added, the new column is fillend +#' entirely with missing values when a value of \code{TRUE} is supplied. If set to \code{FALSE}, +#' missing values will only be assigned for cases where all existing columns are missing. Note that for +#' mutually-exclusive input variables, new columns will be created such that only cases with entirely +#' missing values are assigned a missing value. +#' @importFrom verbs Count AnyOf SumEachRow #' @importFrom flipTransformations AsNumeric #' @export -CombineVariableSetsAsBinary <- function(..., compute.for.incomplete = TRUE) { +CombineVariableSetsAsBinary <- function(..., compute.for.incomplete = TRUE, unmatched.pick.any.are.missing = TRUE) { variable.set.list <- list(...) @@ -72,7 +78,10 @@ CombineVariableSetsAsBinary <- function(..., compute.for.incomplete = TRUE) { common.labels = Reduce(intersect, all.labels) if (!setequal(unique.labels, common.labels)) { - stop("Unable to match categories from the input data. The labels which do not appear in all inputs are: ", paste0(setdiff(unique.labels, common.labels), collapse = ",")) + binary.versions <- lapply(binary.versions, + FUN = fillInCategoriesWhenNotPresent, + expected.columns = unique.labels, + pick.any.all.missing = unmatched.pick.any.are.missing) } input.args = binary.versions @@ -130,6 +139,7 @@ questionToBinary <- function(x) { net.cols = vapply(cf, isDefaultNet, FUN.VALUE = logical(1), unique.codes = unique.codes) x <- x[, !net.cols] } + attr(x, "originalquestiontype") <- "Pick Any" return(x) } @@ -151,8 +161,45 @@ questionToBinary <- function(x) { binary.version <- AsNumeric(x, binary = TRUE, name = levels(x)) colnames(binary.version) <- levels(x) binary.version[is.na(x), ] <- NA + attr(binary.version, "originalquestiontype") <- "Pick One" return(binary.version) } stop("Unsupported data type: ", question.type) } + +# Function to expand the number of columns in the binary data +# when there are fewer columns than expected. expected.columns +# should be a vector of column names. +fillInCategoriesWhenNotPresent <- function(binary.data, expected.columns, pick.any.all.missing = TRUE) { + + current.colnames <- colnames(binary.data) + + if (all(expected.columns %in% current.colnames)) + return(binary.data) + + new.colnames <- expected.columns[! expected.columns %in% current.colnames] + new.data <- matrix(FALSE, nrow = nrow(binary.data), ncol = length(new.colnames)) + colnames(new.data) <- new.colnames + + + # Missing data rule + # For data which was originally mutually-exclusive, + # cases are assigned missing values in the new columns + # when the case has missing data in the existing columns. + # In this case the row will always be entirely missing + # or entirely non-missing. + # For data which was already binary, new columns should be + # entirely missing unless overridden by the argument. + n.missing.per.case <- SumEachRow(is.na(binary.data)) + missing.in.new.data = rep(TRUE, nrow(binary.data)) + if (attr(binary.data, "originalquestiontype") == "Pick One" || !pick.any.all.missing) { + missing.in.new.data <- n.missing.per.case == ncol(binary.data) + } + + new.data[missing.in.new.data, ] <- NA + + binary.data <- cbind(binary.data, new.data) + binary.data <- binary.data[, expected.columns] + binary.data +} \ No newline at end of file diff --git a/man/CombineVariableSetsAsBinary.Rd b/man/CombineVariableSetsAsBinary.Rd index 6449406..0e58c95 100644 --- a/man/CombineVariableSetsAsBinary.Rd +++ b/man/CombineVariableSetsAsBinary.Rd @@ -4,15 +4,26 @@ \alias{CombineVariableSetsAsBinary} \title{CombineVariableSetsAsBinary} \usage{ -CombineVariableSetsAsBinary(variable.set.list, compute.for.incomplete = TRUE) +CombineVariableSetsAsBinary( + ..., + compute.for.incomplete = TRUE, + unmatched.pick.any.are.missing = TRUE +) } \arguments{ -\item{variable.set.list}{A list containing one or more variable sets which should be -Nominal, Ordinal, Nominal/Ordinal - Multi, Binary - Multi, or Binary - Multi (Compact)} +\item{...}{One or more variable sets which should be Nominal, Ordinal, Nominal/Ordinal - Multi, +Binary - Multi, or Binary - Multi (Compact)} \item{compute.for.incomplete}{A boolean value. If \code{FALSE}, cases with any missing data will have a missing vlaue. If \code{TRUE}, only cases whose data is entirely missing will be assigned a missing value.} + +\item{unmatched.pick.any.are.missing}{Boolean value. When one of the input variable sets +is binary (Pick Any variable set) and additonal columns need to be added, the new column is fillend +entirely with missing values when a value of \code{TRUE} is supplied. If set to \code{FALSE}, +missing values will only be assigned for cases where all existing columns are missing. Note that for +mutually-exclusive input variables, new columns will be created such that only cases with entirely +missing values are assigned a missing value.} } \description{ Combines a list of variable sets to binary variables, matching categories between them. diff --git a/tests/testthat/test-combinevariablesetsasbinary.R b/tests/testthat/test-combinevariablesetsasbinary.R index efe8999..604b820 100644 --- a/tests/testthat/test-combinevariablesetsasbinary.R +++ b/tests/testthat/test-combinevariablesetsasbinary.R @@ -105,10 +105,36 @@ test_that("Missing data", { expect_true(all(is.na(CombineVariableSetsAsBinary(aided, unaided, compute.for.incomplete = FALSE)[n.missing > 0]))) }) +test_that("Filling in unmatched columns correctly", { + + aided.2 <- aided + attr(aided.2, "originalquestiontype") <- "Pick Any" + expect_equal(fillInCategoriesWhenNotPresent(aided.2, colnames(aided.2)), aided.2) + expect_true(all(is.na(fillInCategoriesWhenNotPresent(aided.2, c(colnames(aided.2), "Hello"))[, "Hello"]))) + + Q4.pepsi.light.2 <- Q4.pepsi.light + Q4.pepsi.light.2[c(1,2,3)] <- NA + Q4.pepsi.light.binary <- CombineVariableSetsAsBinary(Q4.pepsi.light.2) + attr(Q4.pepsi.light.binary, "originalquestiontype") <- "Pick One" + + expect_equal(which(is.na(fillInCategoriesWhenNotPresent(Q4.pepsi.light.binary, c(colnames(Q4.pepsi.light.binary), "Hello"))[, "Hello"])), c(1,2,3)) + +}) + +test_that("Unmatched columns included", { + aided.2 <- aided + colnames(aided.2)[11] <- "Hello" + combined <- CombineVariableSetsAsBinary(aided.2, unaided) + unique.cols <- unique(c(colnames(aided.2), colnames(unaided))) + expect_true(all(colnames(combined) %in% unique.cols)) + expect_true(all(unique.cols %in% colnames(combined))) + expect_equal(as.numeric(combined[, "Hello"]), aided.2[, "Hello"]) +}) + + + test_that("Error messages", { aided.2 <- aided colnames(aided.2)[11] <- "Telstra" expect_error(CombineVariableSetsAsBinary(aided.2, unaided), "duplicate") - colnames(aided.2)[11] <- "Phone company" - expect_error(CombineVariableSetsAsBinary(aided.2, unaided), "Unable to match") })