Skip to content

Commit

Permalink
DS-3784: Enable CombineVariableSetsAsBinary to handle unmatched (#29)
Browse files Browse the repository at this point in the history
* Enable CombineVariableSetsAsBinary to handle unmatched

Enable CombineVariableSetsAsBinary to handle unmatched categories. Previously the function would insist that categories match. Now, if a category is unobserved in one variable set but observed in another, the unobserved column is filled in and included in the final data.

* Updates from JW comments

* Update docs
  • Loading branch information
chrisfacer committed Jun 17, 2022
1 parent 00a8ea0 commit 5d2a182
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 10 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: flipData
Type: Package
Title: Functions for extracting and describing data
Version: 1.6.0
Version: 1.6.1
Author: Displayr <opensource@displayr.com>
Maintainer: Displayr <opensource@displayr.com>
Description: Functions for extracting data from formulas and
Expand All @@ -26,7 +26,7 @@ Imports: CVXR (>= 1.0.0),
stringr,
survey,
verbs
RoxygenNote: 7.1.2
RoxygenNote: 7.2.0
Encoding: UTF-8
Suggests: foreign,
gtools,
Expand Down
53 changes: 50 additions & 3 deletions R/combinevariablesetsasbinary.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,16 @@
#' @param compute.for.incomplete A boolean value. If \code{FALSE}, cases with any missing data
#' will have a missing vlaue. If \code{TRUE}, only cases whose data is entirely missing will
#' be assigned a missing value.
#' @importFrom verbs Count AnyOf
#' @param unmatched.pick.any.are.missing Boolean value. When one of the input variable sets
#' is binary (Pick Any variable set) and additonal columns need to be added, the new column is fillend
#' entirely with missing values when a value of \code{TRUE} is supplied. If set to \code{FALSE},
#' missing values will only be assigned for cases where all existing columns are missing. Note that for
#' mutually-exclusive input variables, new columns will be created such that only cases with entirely
#' missing values are assigned a missing value.
#' @importFrom verbs Count AnyOf SumEachRow
#' @importFrom flipTransformations AsNumeric
#' @export
CombineVariableSetsAsBinary <- function(..., compute.for.incomplete = TRUE) {
CombineVariableSetsAsBinary <- function(..., compute.for.incomplete = TRUE, unmatched.pick.any.are.missing = TRUE) {

variable.set.list <- list(...)

Expand Down Expand Up @@ -72,7 +78,10 @@ CombineVariableSetsAsBinary <- function(..., compute.for.incomplete = TRUE) {
common.labels = Reduce(intersect, all.labels)

if (!setequal(unique.labels, common.labels)) {
stop("Unable to match categories from the input data. The labels which do not appear in all inputs are: ", paste0(setdiff(unique.labels, common.labels), collapse = ","))
binary.versions <- lapply(binary.versions,
FUN = fillInCategoriesWhenNotPresent,
expected.columns = unique.labels,
pick.any.all.missing = unmatched.pick.any.are.missing)
}

input.args = binary.versions
Expand Down Expand Up @@ -130,6 +139,7 @@ questionToBinary <- function(x) {
net.cols = vapply(cf, isDefaultNet, FUN.VALUE = logical(1), unique.codes = unique.codes)
x <- x[, !net.cols]
}
attr(x, "originalquestiontype") <- "Pick Any"
return(x)
}

Expand All @@ -151,8 +161,45 @@ questionToBinary <- function(x) {
binary.version <- AsNumeric(x, binary = TRUE, name = levels(x))
colnames(binary.version) <- levels(x)
binary.version[is.na(x), ] <- NA
attr(binary.version, "originalquestiontype") <- "Pick One"
return(binary.version)
}

stop("Unsupported data type: ", question.type)
}

# Function to expand the number of columns in the binary data
# when there are fewer columns than expected. expected.columns
# should be a vector of column names.
fillInCategoriesWhenNotPresent <- function(binary.data, expected.columns, pick.any.all.missing = TRUE) {

current.colnames <- colnames(binary.data)

if (all(expected.columns %in% current.colnames))
return(binary.data)

new.colnames <- expected.columns[! expected.columns %in% current.colnames]
new.data <- matrix(FALSE, nrow = nrow(binary.data), ncol = length(new.colnames))
colnames(new.data) <- new.colnames


# Missing data rule
# For data which was originally mutually-exclusive,
# cases are assigned missing values in the new columns
# when the case has missing data in the existing columns.
# In this case the row will always be entirely missing
# or entirely non-missing.
# For data which was already binary, new columns should be
# entirely missing unless overridden by the argument.
n.missing.per.case <- SumEachRow(is.na(binary.data))
missing.in.new.data = rep(TRUE, nrow(binary.data))
if (attr(binary.data, "originalquestiontype") == "Pick One" || !pick.any.all.missing) {
missing.in.new.data <- n.missing.per.case == ncol(binary.data)
}

new.data[missing.in.new.data, ] <- NA

binary.data <- cbind(binary.data, new.data)
binary.data <- binary.data[, expected.columns]
binary.data
}
17 changes: 14 additions & 3 deletions man/CombineVariableSetsAsBinary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 28 additions & 2 deletions tests/testthat/test-combinevariablesetsasbinary.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,36 @@ test_that("Missing data", {
expect_true(all(is.na(CombineVariableSetsAsBinary(aided, unaided, compute.for.incomplete = FALSE)[n.missing > 0])))
})

test_that("Filling in unmatched columns correctly", {

aided.2 <- aided
attr(aided.2, "originalquestiontype") <- "Pick Any"
expect_equal(fillInCategoriesWhenNotPresent(aided.2, colnames(aided.2)), aided.2)
expect_true(all(is.na(fillInCategoriesWhenNotPresent(aided.2, c(colnames(aided.2), "Hello"))[, "Hello"])))

Q4.pepsi.light.2 <- Q4.pepsi.light
Q4.pepsi.light.2[c(1,2,3)] <- NA
Q4.pepsi.light.binary <- CombineVariableSetsAsBinary(Q4.pepsi.light.2)
attr(Q4.pepsi.light.binary, "originalquestiontype") <- "Pick One"

expect_equal(which(is.na(fillInCategoriesWhenNotPresent(Q4.pepsi.light.binary, c(colnames(Q4.pepsi.light.binary), "Hello"))[, "Hello"])), c(1,2,3))

})

test_that("Unmatched columns included", {
aided.2 <- aided
colnames(aided.2)[11] <- "Hello"
combined <- CombineVariableSetsAsBinary(aided.2, unaided)
unique.cols <- unique(c(colnames(aided.2), colnames(unaided)))
expect_true(all(colnames(combined) %in% unique.cols))
expect_true(all(unique.cols %in% colnames(combined)))
expect_equal(as.numeric(combined[, "Hello"]), aided.2[, "Hello"])
})



test_that("Error messages", {
aided.2 <- aided
colnames(aided.2)[11] <- "Telstra"
expect_error(CombineVariableSetsAsBinary(aided.2, unaided), "duplicate")
colnames(aided.2)[11] <- "Phone company"
expect_error(CombineVariableSetsAsBinary(aided.2, unaided), "Unable to match")
})

0 comments on commit 5d2a182

Please sign in to comment.