Skip to content

Commit

Permalink
Merge branch 'master' into DS-3773
Browse files Browse the repository at this point in the history
  • Loading branch information
JustinCCYap committed Jul 4, 2022
2 parents e1bf67c + 8519d49 commit 4a5d3e7
Show file tree
Hide file tree
Showing 11 changed files with 422 additions and 6 deletions.
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ r:
dist: bionic
cache: packages
warnings_are_errors: false
latex: false

# install debian libraries to match R-servers
# update pre-installed packages to latest versions
before_install:
- export R_REMOTES_NO_ERRORS_FROM_WARNINGS="true"
- sudo add-apt-repository -y ppa:ubuntugis/ubuntugis-unstable
- sudo apt-get -qq update
- sudo apt-get install -y libgdal-dev libproj-dev python-protobuf libprotoc-dev libprotobuf-dev libv8-dev librsvg2-dev libmpfr-dev libnlopt-dev
- sudo apt-get install -y libgdal-dev libproj-dev python-protobuf libprotoc-dev libprotobuf-dev libv8-dev librsvg2-dev libmpfr-dev libnlopt-dev
- sudo add-apt-repository -y ppa:cran/libgit2
- sudo apt-get -qq update
- sudo apt-get install libgit2-dev
Expand Down
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: flipData
Type: Package
Title: Functions for extracting and describing data
Version: 1.5.8
Version: 1.6.2
Author: Displayr <opensource@displayr.com>
Maintainer: Displayr <opensource@displayr.com>
Description: Functions for extracting data from formulas and
Expand All @@ -26,7 +26,7 @@ Imports: CVXR (>= 1.0.0),
stringr,
survey,
verbs
RoxygenNote: 7.1.2
RoxygenNote: 7.2.0
Encoding: UTF-8
Suggests: foreign,
gtools,
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ export(CheckPredictionVariables)
export(CleanBackticks)
export(CleanSubset)
export(CleanWeights)
export(CombineVariableSetsAsBinary)
export(DataFormula)
export(EffectiveSampleSize)
export(ErrorIfInfinity)
Expand Down Expand Up @@ -100,6 +101,8 @@ importFrom(survey,calibrate)
importFrom(survey,rake)
importFrom(survey,svydesign)
importFrom(utils,object.size)
importFrom(verbs,AnyOf)
importFrom(verbs,Count)
importFrom(verbs,Sum)
importFrom(verbs,SumEachColumn)
importFrom(verbs,SumEachRow)
217 changes: 217 additions & 0 deletions R/combinevariablesetsasbinary.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@


#' CombineVariableSetsAsBinary
#'
#' @description Combines a list of variable sets to binary variables, matching categories between them.
#' @param ... One or more variable sets which should be Nominal, Ordinal, Nominal/Ordinal - Multi,
#' Binary - Multi, or Binary - Multi (Compact)
#' @param compute.for.incomplete A boolean value. If \code{FALSE}, cases with any missing data
#' will have a missing vlaue. If \code{TRUE}, only cases whose data is entirely missing will
#' be assigned a missing value.
#' @param unmatched.pick.any.are.missing Boolean value. When one of the input variable sets
#' is binary (Pick Any variable set) and additonal columns need to be added, the new column is fillend
#' entirely with missing values when a value of \code{TRUE} is supplied. If set to \code{FALSE},
#' missing values will only be assigned for cases where all existing columns are missing. Note that for
#' mutually-exclusive input variables, new columns will be created such that only cases with entirely
#' missing values are assigned a missing value.
#' @importFrom verbs Count AnyOf SumEachRow
#' @importFrom flipTransformations AsNumeric
#' @export
CombineVariableSetsAsBinary <- function(..., compute.for.incomplete = TRUE, unmatched.pick.any.are.missing = TRUE) {

variable.set.list <- list(...)

# Check for duplicated labels which make life difficult when matching
duplicated.labels = lapply(variable.set.list, function(x) {
question.type <- attr(x, "questiontype")
if (is.factor(x)) {
question.type <- "PickOne"
}

# Consider generalizing in future
if (is.null(question.type)) {
stop("This function should only be applied to variable sets in Displayr.")
}


if (question.type == "PickOneMulti") {
x = x[[1]]
}

if (is.factor(x)) {
levs = levels(x)
return(levs[duplicated(levs)])
}

colnames(x)[duplicated(colnames(x))]
})

n.duplicates = vapply(duplicated.labels, FUN = length, FUN.VALUE = numeric(1))

if (any(n.duplicates > 0)) {
dup.qs = names(duplicated.labels)[n.duplicates > 0]
dup.labels = duplicated.labels[n.duplicates > 0]
stop("The input data contains duplicate labels and cannot be matched. Duplicated labels: " , dup.labels[[1]])
}

binary.versions <- lapply(variable.set.list, FUN = questionToBinary)

binary.versions <- flattenToSingleList(binary.versions)

n.cases <- vapply(binary.versions, FUN = NROW, FUN.VALUE = numeric(1))

if (!all(n.cases == n.cases[1])) {
stop("The number of cases is not the same in all of the input data.")
}

# If only one variable set then just return it
if (length(binary.versions) == 1) {
return(binary.versions[[1]] == 1)
}

# Check matching of column labels in binary data
all.labels = lapply(binary.versions, FUN = colnames)
unique.labels = unique(unlist(all.labels))
common.labels = Reduce(intersect, all.labels)

if (!setequal(unique.labels, common.labels)) {
binary.versions <- lapply(binary.versions,
FUN = fillInCategoriesWhenNotPresent,
expected.columns = unique.labels,
pick.any.all.missing = unmatched.pick.any.are.missing)
}

input.args = binary.versions
input.args[["match.elements"]] <- "Yes"
input.args[["elements.to.count"]] <- list(numeric = NA, categorical = NULL)
input.args[["ignore.missing"]] <- TRUE

# Count missing values for each case for each binary variable
n.missing <- do.call(Count, input.args)

# Combine the sets of binary variables using AnyOf
input.args[["elements.to.count"]] <- list(numeric = 1, categorical = NULL)
result <- do.call(AnyOf, input.args)

# Handle missing values
if (compute.for.incomplete) { # Only assign NA if all missing
result[n.missing == length(binary.versions)] <- NA
} else { # Assign NA if any missing
result[n.missing > 0] <- NA
}

# Replace blank level labels
c.names <- colnames(result)
c.names[c.names == "<BLANK>"] <- ""
colnames(result) <- c.names
result
}


# Function to identify the default NET column
# in Pick Any data.
isDefaultNet <- function(codes, unique.codes) {
all(unique.codes %in% codes) && length(codes) == length(unique.codes)
}


flattenToSingleList <- function(input.list)
{
args <- lapply(input.list, function(x) if (is.list(x) && ! is.data.frame(x)) flattenToSingleList(x) else list(x))
do.call(c, args)
}

# Convert Displayr variable sets to binary.
# Factors are split out with one column per level
questionToBinary <- function(x) {
question.type = attr(x, "questiontype")

# Standalone factor variables can retain the "questiontype"
# value of "PickOneMulti" inherited from their parent
# question
if (is.factor(x)) {
question.type <- "PickOne"
levs <- levels(x)
levs[levs == ""] <- "<BLANK>" # Protecting against blank level labels. Will be put back later.
levels(x) <- levs
}

# Consider generalizing in future
if (is.null(question.type)) {
stop("This function should only be applied to variable sets in Displayr.")
}

if (question.type %in% c("PickAny", "PickAnyCompact")) {
# Identify and remove the NET column basedon the codeframe attribute
cf <- attr(x, "codeframe")
if (!is.null(cf)) {
unique.codes = unique(unlist(cf))
net.cols = vapply(cf, isDefaultNet, FUN.VALUE = logical(1), unique.codes = unique.codes)
x <- x[, !net.cols]
}
attr(x, "originalquestiontype") <- "Pick Any"
return(x)
}

# Each variable in a Pick One - Multi is split separately
if (question.type == "PickOneMulti") {
return(lapply(x, questionToBinary))
}

# Split factor into binary columns, retaining
# levels as column names and replacing rows
# with missing values where the original variable
# had a missing value

if (is.factor(x)) {
# Remove the 'ordered' class so that AsNumeric
# works as intended
if (is.ordered(x))
class(x) <- class(x)[class(x) != "ordered"]
binary.version <- AsNumeric(x, binary = TRUE, name = levels(x))
colnames(binary.version) <- levels(x)
binary.version[is.na(x), ] <- NA
attr(binary.version, "originalquestiontype") <- "Pick One"
return(binary.version)
}

stop("Unsupported data type: ", question.type)
}

# Function to expand the number of columns in the binary data
# when there are fewer columns than expected. expected.columns

# should be a vector of column names.
fillInCategoriesWhenNotPresent <- function(binary.data, expected.columns, pick.any.all.missing = TRUE) {
current.colnames <- colnames(binary.data)

if (all(expected.columns %in% current.colnames))
return(binary.data)

new.colnames <- expected.columns[! expected.columns %in% current.colnames]
new.data <- matrix(FALSE, nrow = nrow(binary.data), ncol = length(new.colnames))
colnames(new.data) <- new.colnames


# Missing data rule
# For data which was originally mutually-exclusive,
# cases are assigned missing values in the new columns
# when the case has missing data in the existing columns.
# In this case the row will always be entirely missing
# or entirely non-missing.
# For data which was already binary, new columns should be
# entirely missing unless overridden by the argument.
n.missing.per.case <- SumEachRow(is.na(binary.data))
missing.in.new.data = rep(TRUE, nrow(binary.data))
if (attr(binary.data, "originalquestiontype") == "Pick One" || !pick.any.all.missing) {
missing.in.new.data <- n.missing.per.case == ncol(binary.data)
}

new.data[missing.in.new.data, ] <- NA

binary.data <- cbind(binary.data, new.data)
binary.data <- binary.data[, expected.columns]

binary.data
}

2 changes: 1 addition & 1 deletion R/mergedatasetsbycase.R
Original file line number Diff line number Diff line change
Expand Up @@ -1859,7 +1859,7 @@ mergedDataSet <- function(data.sets, matched.names, merged.names,
"Consider omitting variables or only keeping combined variables that contain input variables from a few data sets.")
merged.data.set[[i]] <- v
}
merged.data.set <- data.frame(merged.data.set)
merged.data.set <- data.frame(merged.data.set, check.names = FALSE)
names(merged.data.set) <- merged.names

mergesrc.name <- uniqueName("mergesrc", names(merged.data.set), "_")
Expand Down
2 changes: 1 addition & 1 deletion R/mergedatasetsbyvariable.R
Original file line number Diff line number Diff line change
Expand Up @@ -585,7 +585,7 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix,
}
}
names(merged.data.set.variables) <- merged.data.set.variable.names
data.frame(merged.data.set.variables)
data.frame(merged.data.set.variables, check.names = FALSE)
}

#' @param id.variable.names See data dictionary.
Expand Down
2 changes: 1 addition & 1 deletion R/stacking.R
Original file line number Diff line number Diff line change
Expand Up @@ -1177,7 +1177,7 @@ stackedDataSet <- function(input.data.set, input.data.set.metadata,
names(stacked.data.set))]] <- observation
}

data.frame(stacked.data.set)
data.frame(stacked.data.set, check.names = FALSE)
}

stackedVariableName <- function(group.ind, input.variable.names, taken.names)
Expand Down
Binary file added inst/testdata/cola19.sav
Binary file not shown.
30 changes: 30 additions & 0 deletions man/CombineVariableSetsAsBinary.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 4a5d3e7

Please sign in to comment.