Skip to content

Commit

Permalink
Handle variables with duplicated values and labels [revdep skip]
Browse files Browse the repository at this point in the history
  • Loading branch information
JustinCCYap committed Aug 11, 2022
1 parent 3e620a8 commit fc4e0dd
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 23 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: flipData
Type: Package
Title: Functions for extracting and describing data
Version: 1.6.4
Version: 1.6.5
Author: Displayr <opensource@displayr.com>
Maintainer: Displayr <opensource@displayr.com>
Description: Functions for extracting data from formulas and
Expand Down
103 changes: 84 additions & 19 deletions R/mergedatasetsbycase.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ MergeDataSetsByCase <- function(data.set.names,
min.value.label.match.percentage = 90)
{
data.sets <- readDataSets(data.set.names, 2)
data.sets <- removeDuplicateValues(data.sets)
input.data.sets.metadata <- metadataFromDataSets(data.sets)

match.parameters <- list(auto.select.what.to.match.by = auto.select.what.to.match.by,
Expand Down Expand Up @@ -2060,21 +2061,16 @@ combineAsCategoricalVariable <- function(var.list, data.sets,
result
}

#' @description Extract the value from val.attr given a label, also dealing
#' with the special case where label == "". If it wasn't for this special case
#' we could just call unname(val.attr[label]).
#' @description Extract the values from val.attr given a label. There could be
#' more than one value as multiple values could have the same label.
#' @param val.attr A named numeric vector representing value attributes, where
#' the value names are the labels.
#' @param label A character scalar of a label for which a value is to be returned.
#' @return A numeric scalar of the value corresponding to the label.
#' @return A numeric vector of the values corresponding to the label.
#' @noRd
labelValue <- function(val.attr, label)
labelValues <- function(val.attr, label)
{
if (label != "")
unname(val.attr[label])
else
# need to do this since val.attr[""] will return NA
unname(val.attr[names(val.attr) == ""])
unname(val.attr[names(val.attr) == label])
}

# Merge value attributes in value.attributes.list into one value attribute
Expand All @@ -2086,6 +2082,7 @@ mergeValueAttributes <- function(value.attributes.list,
merged.val.attr <- numeric(0)
original.val.attr <- numeric(0)
value.map.list <- vector("list", length = length(value.attributes.list))

for (i in seq_along(value.attributes.list))
{
# 2-column matrix representing a remapping of values where the
Expand All @@ -2094,15 +2091,18 @@ mergeValueAttributes <- function(value.attributes.list,
map <- matrix(nrow = 0, ncol = 2)

val.attr <- value.attributes.list[[i]]
for (lbl in names(val.attr))
for (j in seq_along(val.attr))
{
val <- labelValue(val.attr, lbl)
val <- unname(val.attr[j])
lbl <- names(val.attr)[j]
is.label.duplicated <- isLabelDuplicated(lbl, value.attributes.list)
merged.val.attr <- mergeValueAndLabelIntoValueAttributes(val, lbl,
merged.val.attr,
original.val.attr,
map,
when.multiple.labels.for.one.value,
match.parameters)
match.parameters,
is.label.duplicated)
map <- attr(merged.val.attr, "map")
}
if (nrow(map) > 0)
Expand Down Expand Up @@ -2152,28 +2152,44 @@ mergeValueAttributes <- function(value.attributes.list,
#' different values when creating the merged variable.
#' @param when.multiple.labels.for.one.value See documentation for this in MergeDataSetsByCase
#' @param match.parameters Parameters used for fuzzy matching of names and labels.
#' @param is.label.duplicated Whether lbl appears more than once in any of the
#' value attributes that are to be merged.
#' @return Returns a possibly augmented merged.val.attr, with the attribute "map"
#' containing the matrix map.
#' @noRd
mergeValueAndLabelIntoValueAttributes <- function(val, lbl, merged.val.attr,
original.val.attr, map,
when.multiple.labels.for.one.value,
match.parameters)
match.parameters,
is.label.duplicated)
{
if (length(merged.val.attr) == 0)
merged.val.attr[lbl] <- val
else if (lbl %in% names(merged.val.attr))
{
merged.val <- labelValue(merged.val.attr, lbl)
if (val != merged.val) # same label with different values
vals.matching.lbl <- labelValues(merged.val.attr, lbl)
if (!(val %in% vals.matching.lbl)) # label exists in merged.val.attr but value doesn't match
{
map <- rbind(map, c(val, merged.val), deparse.level = 0) # use the value in merged.val.attr
if (is.label.duplicated && !(val %in% merged.val.attr))
{
# Add the new value and duplicate label.
# We do this so that duplicate labels are retained in the merged value attributes
named.val <- val
names(named.val) <- lbl
merged.val.attr <- c(merged.val.attr, named.val)
}
else
{
# use the (first) value in merged.val.attr that matches the label
map <- rbind(map, c(val, vals.matching.lbl[1]), deparse.level = 0)
}
}
# else: same label, same value, no action required as it is already in merged.val.attr
# else: value/label pair exists in merged.val.attr, no action required
}
else
{
if (length(original.val.attr) > 0)
# We don't fuzzy match duplicated labels because duplication might be lost
if (length(original.val.attr) > 0 && !is.label.duplicated)
{
lbls.to.compare.against <- names(original.val.attr)
match.percentages <- matchPercentagesForValueLabels(lbl = lbl,
Expand Down Expand Up @@ -2559,6 +2575,55 @@ omittedVariables <- function(input.data.sets.metadata, matched.names)
})
}

# Remove duplicate values from value attributes in variables in data sets
removeDuplicateValues <- function(data.sets)
{
data.set.names <- names(data.sets)
data.sets <- lapply(data.set.names, function(data.set.name) {
data.set <- data.sets[[data.set.name]]
var.names <- names(data.set)
mod.var.names <- character(0)
for (i in seq_along(data.set))
{
val.attrs <- attr(data.set[[i]], "labels", exact = TRUE)
if (!is.null(val.attrs))
{
dup <- duplicated(val.attrs)
if (any(dup))
{
val.attrs <- val.attrs[!dup]
attr(data.set[[i]], "labels") <- val.attrs
mod.var.names <- c(mod.var.names, var.names[i])
}
}
if (length(mod.var.names) > 0)
{
if (length(mod.var.names) == 1)
warning("Duplicate values have been removed from the following variable in data set ",
data.set.name, ": ", mod.var.names)
if (length(mod.var.names) <= 10)
warning("Duplicate values have been removed from the following variables in data set ",
data.set.name, ": ", paste0(mod.var.names, collapse = ", "))
else
warning("Duplicate values have been removed from ",
length(mod.var.names), " variables in data set ",
data.set.name, ".")
}
}
data.set
})
names(data.sets) <- data.set.names
data.sets
}

# Determines whether lbl appears more than once in any of the value attributes
# in val.attrs.list
isLabelDuplicated <- function(lbl, val.attrs.list)
{
any(vapply(val.attrs.list, function(val.attrs) sum(names(val.attrs) == lbl) > 1,
logical(1)))
}

# Convenience function: seq_len of nrow of matrix m
seqRow <- function(m)
{
Expand Down
3 changes: 3 additions & 0 deletions tests/testthat/test-mergedatasetsbycase.R
Original file line number Diff line number Diff line change
Expand Up @@ -1174,3 +1174,6 @@ test_that("RS-9210: check that values merge correctly", {
`I've heard only negative things` = 5))
expect_equal(result$value.map.list, list(NULL, NULL))
})

if (file.exists("Combined data set.sav"))
file.remove("Combined data set.sav")
3 changes: 3 additions & 0 deletions tests/testthat/test-mergedatasetsbyvariable.R
Original file line number Diff line number Diff line change
Expand Up @@ -254,3 +254,6 @@ test_that("exampleIDValues", {
data.sets = data.sets),
c("1", "21", "A"))
})

if (file.exists("Combined data set.sav"))
file.remove("Combined data set.sav")
3 changes: 0 additions & 3 deletions tests/testthat/test-mergingandstackingutilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,3 @@ test_that("readDataSets: better error message when data file is invalid", {
paste0("The data file 'bad.sav' could not be parsed. ",
"Check the data set for issues and try again after fixing them or removing unnecessary variables."))
})

if (file.exists("Combined data set.sav"))
file.remove("Combined data set.sav")

0 comments on commit fc4e0dd

Please sign in to comment.