From fc4e0dd6d1cd88b5215ffc4def15581cc7eaaedd Mon Sep 17 00:00:00 2001
From: Justin Yap <JustinCCYap@users.noreply.github.com>
Date: Thu, 11 Aug 2022 11:12:08 +1000
Subject: [PATCH] Handle variables with duplicated values and labels [revdep
 skip]

---
 DESCRIPTION                                   |   2 +-
 R/mergedatasetsbycase.R                       | 103 ++++++++++++++----
 tests/testthat/test-mergedatasetsbycase.R     |   3 +
 tests/testthat/test-mergedatasetsbyvariable.R |   3 +
 .../test-mergingandstackingutilities.R        |   3 -
 5 files changed, 91 insertions(+), 23 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 4716a06..2104b38 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: flipData
 Type: Package
 Title: Functions for extracting and describing data
-Version: 1.6.4
+Version: 1.6.5
 Author: Displayr <opensource@displayr.com>
 Maintainer: Displayr <opensource@displayr.com>
 Description: Functions for extracting data from formulas and
diff --git a/R/mergedatasetsbycase.R b/R/mergedatasetsbycase.R
index 5192386..4431749 100644
--- a/R/mergedatasetsbycase.R
+++ b/R/mergedatasetsbycase.R
@@ -134,6 +134,7 @@ MergeDataSetsByCase <- function(data.set.names,
                                 min.value.label.match.percentage = 90)
 {
     data.sets <- readDataSets(data.set.names, 2)
+    data.sets <- removeDuplicateValues(data.sets)
     input.data.sets.metadata <- metadataFromDataSets(data.sets)
 
     match.parameters <- list(auto.select.what.to.match.by = auto.select.what.to.match.by,
@@ -2060,21 +2061,16 @@ combineAsCategoricalVariable <- function(var.list, data.sets,
     result
 }
 
-#' @description  Extract the value from val.attr given a label, also dealing
-#'  with the special case where label == "". If it wasn't for this special case
-#'  we could just call unname(val.attr[label]).
+#' @description  Extract the values from val.attr given a label. There could be
+#' more than one value as multiple values could have the same label.
 #' @param val.attr A named numeric vector representing value attributes, where
 #'  the value names are the labels.
 #' @param label A character scalar of a label for which a value is to be returned.
-#' @return A numeric scalar of the value corresponding to the label.
+#' @return A numeric vector of the values corresponding to the label.
 #' @noRd
-labelValue <- function(val.attr, label)
+labelValues <- function(val.attr, label)
 {
-    if (label != "")
-        unname(val.attr[label])
-    else
-        # need to do this since val.attr[""] will return NA
-        unname(val.attr[names(val.attr) == ""])
+    unname(val.attr[names(val.attr) == label])
 }
 
 # Merge value attributes in value.attributes.list into one value attribute
@@ -2086,6 +2082,7 @@ mergeValueAttributes <- function(value.attributes.list,
     merged.val.attr <- numeric(0)
     original.val.attr <- numeric(0)
     value.map.list <- vector("list", length = length(value.attributes.list))
+
     for (i in seq_along(value.attributes.list))
     {
         # 2-column matrix representing a remapping of values where the
@@ -2094,15 +2091,18 @@ mergeValueAttributes <- function(value.attributes.list,
         map <- matrix(nrow = 0, ncol = 2)
 
         val.attr <- value.attributes.list[[i]]
-        for (lbl in names(val.attr))
+        for (j in seq_along(val.attr))
         {
-            val <- labelValue(val.attr, lbl)
+            val <- unname(val.attr[j])
+            lbl <- names(val.attr)[j]
+            is.label.duplicated <- isLabelDuplicated(lbl, value.attributes.list)
             merged.val.attr <- mergeValueAndLabelIntoValueAttributes(val, lbl,
                                                                      merged.val.attr,
                                                                      original.val.attr,
                                                                      map,
                                                                      when.multiple.labels.for.one.value,
-                                                                     match.parameters)
+                                                                     match.parameters,
+                                                                     is.label.duplicated)
             map <- attr(merged.val.attr, "map")
         }
         if (nrow(map) > 0)
@@ -2152,28 +2152,44 @@ mergeValueAttributes <- function(value.attributes.list,
 #'  different values when creating the merged variable.
 #' @param when.multiple.labels.for.one.value See documentation for this in MergeDataSetsByCase
 #' @param match.parameters Parameters used for fuzzy matching of names and labels.
+#' @param is.label.duplicated Whether lbl appears more than once in any of the
+#'  value attributes that are to be merged.
 #' @return Returns a possibly augmented merged.val.attr, with the attribute "map"
 #'  containing the matrix map.
 #' @noRd
 mergeValueAndLabelIntoValueAttributes <- function(val, lbl, merged.val.attr,
                                                   original.val.attr, map,
                                                   when.multiple.labels.for.one.value,
-                                                  match.parameters)
+                                                  match.parameters,
+                                                  is.label.duplicated)
 {
     if (length(merged.val.attr) == 0)
         merged.val.attr[lbl] <- val
     else if (lbl %in% names(merged.val.attr))
     {
-        merged.val <- labelValue(merged.val.attr, lbl)
-        if (val != merged.val) # same label with different values
+        vals.matching.lbl <- labelValues(merged.val.attr, lbl)
+        if (!(val %in% vals.matching.lbl)) # label exists in merged.val.attr but value doesn't match
         {
-            map <- rbind(map, c(val, merged.val), deparse.level = 0) # use the value in merged.val.attr
+            if (is.label.duplicated && !(val %in% merged.val.attr))
+            {
+                # Add the new value and duplicate label.
+                # We do this so that duplicate labels are retained in the merged value attributes
+                named.val <- val
+                names(named.val) <- lbl
+                merged.val.attr <- c(merged.val.attr, named.val)
+            }
+            else
+            {
+                # use the (first) value in merged.val.attr that matches the label
+                map <- rbind(map, c(val, vals.matching.lbl[1]), deparse.level = 0)
+            }
         }
-        # else: same label, same value, no action required as it is already in merged.val.attr
+        # else: value/label pair exists in merged.val.attr, no action required
     }
     else
     {
-        if (length(original.val.attr) > 0)
+        # We don't fuzzy match duplicated labels because duplication might be lost
+        if (length(original.val.attr) > 0 && !is.label.duplicated)
         {
             lbls.to.compare.against <- names(original.val.attr)
             match.percentages <- matchPercentagesForValueLabels(lbl = lbl,
@@ -2559,6 +2575,55 @@ omittedVariables <- function(input.data.sets.metadata, matched.names)
     })
 }
 
+# Remove duplicate values from value attributes in variables in data sets
+removeDuplicateValues <- function(data.sets)
+{
+    data.set.names <- names(data.sets)
+    data.sets <- lapply(data.set.names, function(data.set.name) {
+        data.set <- data.sets[[data.set.name]]
+        var.names <- names(data.set)
+        mod.var.names <- character(0)
+        for (i in seq_along(data.set))
+        {
+            val.attrs <- attr(data.set[[i]], "labels", exact = TRUE)
+            if (!is.null(val.attrs))
+            {
+                dup <- duplicated(val.attrs)
+                if (any(dup))
+                {
+                    val.attrs <- val.attrs[!dup]
+                    attr(data.set[[i]], "labels") <- val.attrs
+                    mod.var.names <- c(mod.var.names, var.names[i])
+                }
+            }
+            if (length(mod.var.names) > 0)
+            {
+                if (length(mod.var.names) == 1)
+                    warning("Duplicate values have been removed from the following variable in data set ",
+                            data.set.name, ": ", mod.var.names)
+                if (length(mod.var.names) <= 10)
+                    warning("Duplicate values have been removed from the following variables in data set ",
+                            data.set.name, ": ", paste0(mod.var.names, collapse = ", "))
+                else
+                    warning("Duplicate values have been removed from ",
+                            length(mod.var.names), " variables in data set ",
+                            data.set.name, ".")
+            }
+        }
+        data.set
+    })
+    names(data.sets) <- data.set.names
+    data.sets
+}
+
+# Determines whether lbl appears more than once in any of the value attributes
+# in val.attrs.list
+isLabelDuplicated <- function(lbl, val.attrs.list)
+{
+    any(vapply(val.attrs.list, function(val.attrs) sum(names(val.attrs) == lbl) > 1,
+               logical(1)))
+}
+
 # Convenience function: seq_len of nrow of matrix m
 seqRow <- function(m)
 {
diff --git a/tests/testthat/test-mergedatasetsbycase.R b/tests/testthat/test-mergedatasetsbycase.R
index 49af27a..75329f6 100644
--- a/tests/testthat/test-mergedatasetsbycase.R
+++ b/tests/testthat/test-mergedatasetsbycase.R
@@ -1174,3 +1174,6 @@ test_that("RS-9210: check that values merge correctly", {
                    `I've heard only negative things` = 5))
     expect_equal(result$value.map.list, list(NULL, NULL))
 })
+
+if (file.exists("Combined data set.sav"))
+    file.remove("Combined data set.sav")
diff --git a/tests/testthat/test-mergedatasetsbyvariable.R b/tests/testthat/test-mergedatasetsbyvariable.R
index ccd7b46..49ff810 100644
--- a/tests/testthat/test-mergedatasetsbyvariable.R
+++ b/tests/testthat/test-mergedatasetsbyvariable.R
@@ -254,3 +254,6 @@ test_that("exampleIDValues", {
                                  data.sets = data.sets),
                  c("1", "21", "A"))
 })
+
+if (file.exists("Combined data set.sav"))
+    file.remove("Combined data set.sav")
diff --git a/tests/testthat/test-mergingandstackingutilities.R b/tests/testthat/test-mergingandstackingutilities.R
index 741c75f..73b5826 100644
--- a/tests/testthat/test-mergingandstackingutilities.R
+++ b/tests/testthat/test-mergingandstackingutilities.R
@@ -121,6 +121,3 @@ test_that("readDataSets: better error message when data file is invalid", {
                  paste0("The data file 'bad.sav' could not be parsed. ",
                         "Check the data set for issues and try again after fixing them or removing unnecessary variables."))
 })
-
-if (file.exists("Combined data set.sav"))
-    file.remove("Combined data set.sav")