From fc4e0dd6d1cd88b5215ffc4def15581cc7eaaedd Mon Sep 17 00:00:00 2001 From: Justin Yap Date: Thu, 11 Aug 2022 11:12:08 +1000 Subject: [PATCH] Handle variables with duplicated values and labels [revdep skip] --- DESCRIPTION | 2 +- R/mergedatasetsbycase.R | 103 ++++++++++++++---- tests/testthat/test-mergedatasetsbycase.R | 3 + tests/testthat/test-mergedatasetsbyvariable.R | 3 + .../test-mergingandstackingutilities.R | 3 - 5 files changed, 91 insertions(+), 23 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4716a06..2104b38 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: flipData Type: Package Title: Functions for extracting and describing data -Version: 1.6.4 +Version: 1.6.5 Author: Displayr Maintainer: Displayr Description: Functions for extracting data from formulas and diff --git a/R/mergedatasetsbycase.R b/R/mergedatasetsbycase.R index 5192386..4431749 100644 --- a/R/mergedatasetsbycase.R +++ b/R/mergedatasetsbycase.R @@ -134,6 +134,7 @@ MergeDataSetsByCase <- function(data.set.names, min.value.label.match.percentage = 90) { data.sets <- readDataSets(data.set.names, 2) + data.sets <- removeDuplicateValues(data.sets) input.data.sets.metadata <- metadataFromDataSets(data.sets) match.parameters <- list(auto.select.what.to.match.by = auto.select.what.to.match.by, @@ -2060,21 +2061,16 @@ combineAsCategoricalVariable <- function(var.list, data.sets, result } -#' @description Extract the value from val.attr given a label, also dealing -#' with the special case where label == "". If it wasn't for this special case -#' we could just call unname(val.attr[label]). +#' @description Extract the values from val.attr given a label. There could be +#' more than one value as multiple values could have the same label. #' @param val.attr A named numeric vector representing value attributes, where #' the value names are the labels. #' @param label A character scalar of a label for which a value is to be returned. -#' @return A numeric scalar of the value corresponding to the label. +#' @return A numeric vector of the values corresponding to the label. #' @noRd -labelValue <- function(val.attr, label) +labelValues <- function(val.attr, label) { - if (label != "") - unname(val.attr[label]) - else - # need to do this since val.attr[""] will return NA - unname(val.attr[names(val.attr) == ""]) + unname(val.attr[names(val.attr) == label]) } # Merge value attributes in value.attributes.list into one value attribute @@ -2086,6 +2082,7 @@ mergeValueAttributes <- function(value.attributes.list, merged.val.attr <- numeric(0) original.val.attr <- numeric(0) value.map.list <- vector("list", length = length(value.attributes.list)) + for (i in seq_along(value.attributes.list)) { # 2-column matrix representing a remapping of values where the @@ -2094,15 +2091,18 @@ mergeValueAttributes <- function(value.attributes.list, map <- matrix(nrow = 0, ncol = 2) val.attr <- value.attributes.list[[i]] - for (lbl in names(val.attr)) + for (j in seq_along(val.attr)) { - val <- labelValue(val.attr, lbl) + val <- unname(val.attr[j]) + lbl <- names(val.attr)[j] + is.label.duplicated <- isLabelDuplicated(lbl, value.attributes.list) merged.val.attr <- mergeValueAndLabelIntoValueAttributes(val, lbl, merged.val.attr, original.val.attr, map, when.multiple.labels.for.one.value, - match.parameters) + match.parameters, + is.label.duplicated) map <- attr(merged.val.attr, "map") } if (nrow(map) > 0) @@ -2152,28 +2152,44 @@ mergeValueAttributes <- function(value.attributes.list, #' different values when creating the merged variable. #' @param when.multiple.labels.for.one.value See documentation for this in MergeDataSetsByCase #' @param match.parameters Parameters used for fuzzy matching of names and labels. +#' @param is.label.duplicated Whether lbl appears more than once in any of the +#' value attributes that are to be merged. #' @return Returns a possibly augmented merged.val.attr, with the attribute "map" #' containing the matrix map. #' @noRd mergeValueAndLabelIntoValueAttributes <- function(val, lbl, merged.val.attr, original.val.attr, map, when.multiple.labels.for.one.value, - match.parameters) + match.parameters, + is.label.duplicated) { if (length(merged.val.attr) == 0) merged.val.attr[lbl] <- val else if (lbl %in% names(merged.val.attr)) { - merged.val <- labelValue(merged.val.attr, lbl) - if (val != merged.val) # same label with different values + vals.matching.lbl <- labelValues(merged.val.attr, lbl) + if (!(val %in% vals.matching.lbl)) # label exists in merged.val.attr but value doesn't match { - map <- rbind(map, c(val, merged.val), deparse.level = 0) # use the value in merged.val.attr + if (is.label.duplicated && !(val %in% merged.val.attr)) + { + # Add the new value and duplicate label. + # We do this so that duplicate labels are retained in the merged value attributes + named.val <- val + names(named.val) <- lbl + merged.val.attr <- c(merged.val.attr, named.val) + } + else + { + # use the (first) value in merged.val.attr that matches the label + map <- rbind(map, c(val, vals.matching.lbl[1]), deparse.level = 0) + } } - # else: same label, same value, no action required as it is already in merged.val.attr + # else: value/label pair exists in merged.val.attr, no action required } else { - if (length(original.val.attr) > 0) + # We don't fuzzy match duplicated labels because duplication might be lost + if (length(original.val.attr) > 0 && !is.label.duplicated) { lbls.to.compare.against <- names(original.val.attr) match.percentages <- matchPercentagesForValueLabels(lbl = lbl, @@ -2559,6 +2575,55 @@ omittedVariables <- function(input.data.sets.metadata, matched.names) }) } +# Remove duplicate values from value attributes in variables in data sets +removeDuplicateValues <- function(data.sets) +{ + data.set.names <- names(data.sets) + data.sets <- lapply(data.set.names, function(data.set.name) { + data.set <- data.sets[[data.set.name]] + var.names <- names(data.set) + mod.var.names <- character(0) + for (i in seq_along(data.set)) + { + val.attrs <- attr(data.set[[i]], "labels", exact = TRUE) + if (!is.null(val.attrs)) + { + dup <- duplicated(val.attrs) + if (any(dup)) + { + val.attrs <- val.attrs[!dup] + attr(data.set[[i]], "labels") <- val.attrs + mod.var.names <- c(mod.var.names, var.names[i]) + } + } + if (length(mod.var.names) > 0) + { + if (length(mod.var.names) == 1) + warning("Duplicate values have been removed from the following variable in data set ", + data.set.name, ": ", mod.var.names) + if (length(mod.var.names) <= 10) + warning("Duplicate values have been removed from the following variables in data set ", + data.set.name, ": ", paste0(mod.var.names, collapse = ", ")) + else + warning("Duplicate values have been removed from ", + length(mod.var.names), " variables in data set ", + data.set.name, ".") + } + } + data.set + }) + names(data.sets) <- data.set.names + data.sets +} + +# Determines whether lbl appears more than once in any of the value attributes +# in val.attrs.list +isLabelDuplicated <- function(lbl, val.attrs.list) +{ + any(vapply(val.attrs.list, function(val.attrs) sum(names(val.attrs) == lbl) > 1, + logical(1))) +} + # Convenience function: seq_len of nrow of matrix m seqRow <- function(m) { diff --git a/tests/testthat/test-mergedatasetsbycase.R b/tests/testthat/test-mergedatasetsbycase.R index 49af27a..75329f6 100644 --- a/tests/testthat/test-mergedatasetsbycase.R +++ b/tests/testthat/test-mergedatasetsbycase.R @@ -1174,3 +1174,6 @@ test_that("RS-9210: check that values merge correctly", { `I've heard only negative things` = 5)) expect_equal(result$value.map.list, list(NULL, NULL)) }) + +if (file.exists("Combined data set.sav")) + file.remove("Combined data set.sav") diff --git a/tests/testthat/test-mergedatasetsbyvariable.R b/tests/testthat/test-mergedatasetsbyvariable.R index ccd7b46..49ff810 100644 --- a/tests/testthat/test-mergedatasetsbyvariable.R +++ b/tests/testthat/test-mergedatasetsbyvariable.R @@ -254,3 +254,6 @@ test_that("exampleIDValues", { data.sets = data.sets), c("1", "21", "A")) }) + +if (file.exists("Combined data set.sav")) + file.remove("Combined data set.sav") diff --git a/tests/testthat/test-mergingandstackingutilities.R b/tests/testthat/test-mergingandstackingutilities.R index 741c75f..73b5826 100644 --- a/tests/testthat/test-mergingandstackingutilities.R +++ b/tests/testthat/test-mergingandstackingutilities.R @@ -121,6 +121,3 @@ test_that("readDataSets: better error message when data file is invalid", { paste0("The data file 'bad.sav' could not be parsed. ", "Check the data set for issues and try again after fixing them or removing unnecessary variables.")) }) - -if (file.exists("Combined data set.sav")) - file.remove("Combined data set.sav")