Merge branch 'master' into DS-3773

Displayr · Jul 4, 2022 · 4a5d3e7 · 4a5d3e7
2 parents e1bf67c + 8519d49
commit 4a5d3e7
Show file tree

Hide file tree

Showing 11 changed files with 422 additions and 6 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -4,14 +4,15 @@ r:
 dist: bionic
 cache: packages
 warnings_are_errors: false
+latex: false
 
 # install debian libraries to match R-servers
 # update pre-installed packages to latest versions
 before_install:
   - export R_REMOTES_NO_ERRORS_FROM_WARNINGS="true"
   - sudo add-apt-repository -y ppa:ubuntugis/ubuntugis-unstable
   - sudo apt-get -qq update
-  - sudo apt-get install -y libgdal-dev libproj-dev python-protobuf libprotoc-dev libprotobuf-dev libv8-dev librsvg2-dev libmpfr-dev libnlopt-dev
+  - sudo apt-get install -y libgdal-dev libproj-dev python-protobuf libprotoc-dev  libprotobuf-dev libv8-dev librsvg2-dev libmpfr-dev libnlopt-dev
   - sudo add-apt-repository -y ppa:cran/libgit2
   - sudo apt-get -qq update
   - sudo apt-get install libgit2-dev

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: flipData
 Type: Package
 Title: Functions for extracting and describing data
-Version: 1.5.8
+Version: 1.6.2
 Author: Displayr <opensource@displayr.com>
 Maintainer: Displayr <opensource@displayr.com>
 Description: Functions for extracting data from formulas and
@@ -26,7 +26,7 @@ Imports: CVXR (>= 1.0.0),
  stringr,
  survey,
  verbs
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.0
 Encoding: UTF-8
 Suggests: foreign,
  gtools,

diff --git a/NAMESPACE b/NAMESPACE
@@ -19,6 +19,7 @@ export(CheckPredictionVariables)
 export(CleanBackticks)
 export(CleanSubset)
 export(CleanWeights)
+export(CombineVariableSetsAsBinary)
 export(DataFormula)
 export(EffectiveSampleSize)
 export(ErrorIfInfinity)
@@ -100,6 +101,8 @@ importFrom(survey,calibrate)
 importFrom(survey,rake)
 importFrom(survey,svydesign)
 importFrom(utils,object.size)
+importFrom(verbs,AnyOf)
+importFrom(verbs,Count)
 importFrom(verbs,Sum)
 importFrom(verbs,SumEachColumn)
 importFrom(verbs,SumEachRow)
diff --git a/R/combinevariablesetsasbinary.R b/R/combinevariablesetsasbinary.R
@@ -0,0 +1,217 @@
+
+
+#' CombineVariableSetsAsBinary
+#'
+#' @description Combines a list of variable sets to binary variables, matching categories between them.
+#' @param ... One or more variable sets which should be Nominal, Ordinal, Nominal/Ordinal - Multi,
+#' Binary - Multi, or Binary - Multi (Compact)
+#' @param compute.for.incomplete A boolean value. If \code{FALSE}, cases with any missing data
+#' will have a missing vlaue. If \code{TRUE}, only cases whose data is entirely missing will
+#' be assigned a missing value.
+#' @param unmatched.pick.any.are.missing Boolean value. When one of the input variable sets
+#' is binary (Pick Any variable set) and additonal columns need to be added, the new column is fillend 
+#' entirely with missing values when a value of \code{TRUE} is supplied. If set to \code{FALSE}, 
+#' missing values will only be assigned for cases where all existing columns are missing. Note that for 
+#' mutually-exclusive input variables, new columns will be created such that only cases with entirely 
+#' missing values are assigned a missing value. 
+#' @importFrom verbs Count AnyOf SumEachRow
+#' @importFrom flipTransformations AsNumeric
+#' @export
+CombineVariableSetsAsBinary <- function(..., compute.for.incomplete = TRUE, unmatched.pick.any.are.missing = TRUE) {
+
+    variable.set.list <- list(...)
+
+    # Check for duplicated labels which make life difficult when matching
+    duplicated.labels = lapply(variable.set.list, function(x) {
+        question.type <- attr(x, "questiontype")
+        if (is.factor(x)) {
+            question.type <- "PickOne" 
+        }
+
+        # Consider generalizing in future
+        if (is.null(question.type)) {
+            stop("This function should only be applied to variable sets in Displayr.")
+        }
+
+
+        if (question.type == "PickOneMulti") {
+            x = x[[1]]
+        }
+
+        if (is.factor(x)) {
+            levs = levels(x)
+            return(levs[duplicated(levs)])
+        }
+
+        colnames(x)[duplicated(colnames(x))]
+    })
+
+    n.duplicates = vapply(duplicated.labels, FUN = length, FUN.VALUE = numeric(1))
+
+    if (any(n.duplicates > 0)) {
+        dup.qs = names(duplicated.labels)[n.duplicates > 0]
+        dup.labels = duplicated.labels[n.duplicates > 0]
+        stop("The input data contains duplicate labels and cannot be matched. Duplicated labels: " , dup.labels[[1]])
+    }
+
+    binary.versions <- lapply(variable.set.list, FUN = questionToBinary)
+
+    binary.versions <- flattenToSingleList(binary.versions)
+
+    n.cases <- vapply(binary.versions, FUN = NROW, FUN.VALUE = numeric(1))
+
+    if (!all(n.cases == n.cases[1])) {
+        stop("The number of cases is not the same in all of the input data.")
+    }
+
+    # If only one variable set then just return it
+    if (length(binary.versions) == 1) {
+        return(binary.versions[[1]] == 1)
+    }
+
+    # Check matching of column labels in binary data
+    all.labels = lapply(binary.versions, FUN = colnames)
+    unique.labels = unique(unlist(all.labels))
+    common.labels = Reduce(intersect, all.labels)
+
+    if (!setequal(unique.labels, common.labels)) {
+        binary.versions <- lapply(binary.versions,
+            FUN = fillInCategoriesWhenNotPresent,
+            expected.columns = unique.labels,
+            pick.any.all.missing = unmatched.pick.any.are.missing)
+    }
+
+    input.args = binary.versions
+    input.args[["match.elements"]] <- "Yes"
+    input.args[["elements.to.count"]] <- list(numeric = NA, categorical = NULL)
+    input.args[["ignore.missing"]] <- TRUE
+
+    # Count missing values for each case for each binary variable
+    n.missing <- do.call(Count, input.args)
+
+    # Combine the sets of binary variables using AnyOf
+    input.args[["elements.to.count"]] <- list(numeric = 1, categorical = NULL)
+    result <- do.call(AnyOf, input.args)
+
+    # Handle missing values
+    if (compute.for.incomplete) { # Only assign NA if all missing
+        result[n.missing == length(binary.versions)] <- NA
+    } else { # Assign NA if any missing
+        result[n.missing > 0] <- NA
+    }
+
+    # Replace blank level labels
+    c.names <- colnames(result)
+    c.names[c.names == "<BLANK>"] <- ""
+    colnames(result) <- c.names
+    result
+}
+
+
+# Function to identify the default NET column
+# in Pick Any data.
+isDefaultNet <- function(codes, unique.codes) {
+    all(unique.codes %in% codes) && length(codes) == length(unique.codes)
+}
+
+
+flattenToSingleList <- function(input.list)
+{
+    args <- lapply(input.list, function(x) if (is.list(x) && ! is.data.frame(x)) flattenToSingleList(x) else list(x))
+    do.call(c, args)
+}
+
+# Convert Displayr variable sets to binary.
+# Factors are split out with one column per level
+questionToBinary <- function(x) {
+    question.type = attr(x, "questiontype")
+
+    # Standalone factor variables can retain the "questiontype"
+    # value of "PickOneMulti" inherited from their parent
+    # question
+    if (is.factor(x)) {
+        question.type <- "PickOne"
+        levs <- levels(x)
+        levs[levs == ""] <- "<BLANK>" # Protecting against blank level labels. Will be put back later.
+        levels(x) <- levs
+    }
+
+    # Consider generalizing in future
+    if (is.null(question.type)) {
+        stop("This function should only be applied to variable sets in Displayr.")
+    }
+
+    if (question.type %in% c("PickAny", "PickAnyCompact")) {
+        # Identify and remove the NET column basedon the codeframe attribute
+        cf <- attr(x, "codeframe")
+        if (!is.null(cf)) {
+            unique.codes = unique(unlist(cf))
+            net.cols = vapply(cf, isDefaultNet, FUN.VALUE = logical(1), unique.codes = unique.codes)
+            x <- x[, !net.cols]
+        }
+        attr(x, "originalquestiontype") <- "Pick Any"
+        return(x)
+    }
+
+    # Each variable in a Pick One - Multi is split separately
+    if (question.type == "PickOneMulti") {
+        return(lapply(x, questionToBinary))
+    }
+
+    # Split factor into binary columns, retaining
+    # levels as column names and replacing rows
+    # with missing values where the original variable
+    # had a missing value
+
+    if (is.factor(x)) {
+        # Remove the 'ordered' class so that AsNumeric
+        # works as intended
+        if (is.ordered(x))
+            class(x) <- class(x)[class(x) != "ordered"]
+        binary.version <- AsNumeric(x, binary = TRUE, name = levels(x))
+        colnames(binary.version) <- levels(x)
+        binary.version[is.na(x), ] <- NA
+        attr(binary.version, "originalquestiontype") <- "Pick One"
+        return(binary.version)
+    }
+
+    stop("Unsupported data type: ", question.type)
+}
+
+# Function to expand the number of columns in the binary data
+# when there are fewer columns than expected. expected.columns
+
+# should be a vector of column names.
+fillInCategoriesWhenNotPresent <- function(binary.data, expected.columns, pick.any.all.missing = TRUE) {
+    current.colnames <- colnames(binary.data)
+
+    if (all(expected.columns %in% current.colnames))
+        return(binary.data)
+
+    new.colnames <- expected.columns[! expected.columns %in% current.colnames]
+    new.data <- matrix(FALSE, nrow = nrow(binary.data), ncol = length(new.colnames))
+    colnames(new.data) <- new.colnames
+
+
+    # Missing data rule
+    # For data which was originally mutually-exclusive,
+    # cases are assigned missing values in the new columns
+    # when the case has missing data in the existing columns.
+    # In this case the row will always be entirely missing
+    # or entirely non-missing.
+    # For data which was already binary, new columns should be
+    # entirely missing unless overridden by the argument.
+    n.missing.per.case <- SumEachRow(is.na(binary.data))
+    missing.in.new.data = rep(TRUE, nrow(binary.data))
+    if (attr(binary.data, "originalquestiontype") == "Pick One" || !pick.any.all.missing) {
+        missing.in.new.data <- n.missing.per.case == ncol(binary.data)
+    } 
+
+    new.data[missing.in.new.data, ] <- NA
+
+    binary.data <- cbind(binary.data, new.data)
+    binary.data <- binary.data[, expected.columns]
+
+    binary.data 
+}
+
diff --git a/R/mergedatasetsbycase.R b/R/mergedatasetsbycase.R
@@ -1859,7 +1859,7 @@ mergedDataSet <- function(data.sets, matched.names, merged.names,
                  "Consider omitting variables or only keeping combined variables that contain input variables from a few data sets.")
         merged.data.set[[i]] <- v
     }
-    merged.data.set <- data.frame(merged.data.set)
+    merged.data.set <- data.frame(merged.data.set, check.names = FALSE)
     names(merged.data.set) <- merged.names
 
     mergesrc.name <- uniqueName("mergesrc", names(merged.data.set), "_")

diff --git a/R/mergedatasetsbyvariable.R b/R/mergedatasetsbyvariable.R
@@ -585,7 +585,7 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix,
         }
     }
     names(merged.data.set.variables) <- merged.data.set.variable.names
-    data.frame(merged.data.set.variables)
+    data.frame(merged.data.set.variables, check.names = FALSE)
 }
 
 #' @param id.variable.names See data dictionary.

diff --git a/R/stacking.R b/R/stacking.R
@@ -1177,7 +1177,7 @@ stackedDataSet <- function(input.data.set, input.data.set.metadata,
                                      names(stacked.data.set))]] <- observation
     }
 
-    data.frame(stacked.data.set)
+    data.frame(stacked.data.set, check.names = FALSE)
 }
 
 stackedVariableName <- function(group.ind, input.variable.names, taken.names)

diff --git a/inst/testdata/cola19.sav b/inst/testdata/cola19.sav
diff --git a/man/CombineVariableSetsAsBinary.Rd b/man/CombineVariableSetsAsBinary.Rd