DS-3773: various fixes to merging of data sets (#32)

* Temp file * Update * Fixes [revdep skip] * Improve matching [revdep skip] * Fix bug found by Lena * Fix tests [revdep skip] * Fix warning [revdep skip]
Displayr · Jul 6, 2022 · f621f88 · f621f88
1 parent 8519d49
commit f621f88
Show file tree

Hide file tree

Showing 6 changed files with 127 additions and 33 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: flipData
 Type: Package
 Title: Functions for extracting and describing data
-Version: 1.6.2
+Version: 1.6.3
 Author: Displayr <opensource@displayr.com>
 Maintainer: Displayr <opensource@displayr.com>
 Description: Functions for extracting data from formulas and

diff --git a/R/DS-3773.R b/R/DS-3773.R
@@ -0,0 +1,17 @@
+# d <- haven::read_sav("wincross exported file sr wave 1.sav")
+# attr(d[[1]], "labels") <- NULL
+# haven::write_sav(d, "wincross exported file sr wave mod.sav")
+#
+# d2 <- haven::read_sav("fs22001 sr brand wave 2.sav")
+# attr(d2[["Q5r97"]], "labels") <- NULL
+# haven::write_sav(d2, "fs22001 sr brand wave mod.sav")
+#
+# a <- MergeDataSetsByCase(c("wincross exported file sr wave mod.sav", "fs22001 sr brand wave mod.sav"),
+#                          auto.select.what.to.match.by = FALSE, match.by.variable.names = TRUE,
+#                          match.by.variable.labels = FALSE, match.by.value.labels = FALSE,
+#                          ignore.non.alphanumeric = FALSE)
+
+# Issues:
+# RECORD/record variables not merging (one is numeric, one categorical)
+# Q5R97 merged into Q5_r97 instead of Q5r97
+
diff --git a/R/mergedatasetsbycase.R b/R/mergedatasetsbycase.R
@@ -1259,19 +1259,78 @@ findMatchingVariable <- function(nms, lbls, val.attrs, candidate.names,
             }, logical(1))
             arr.indices.matrix <- arr.indices.matrix[is.numbers.preserved, , drop = FALSE]
 
-            if (nrow(arr.indices.matrix) == 1)
-            {
-                result <- candidate.names[arr.indices.matrix[1, 1]]
-                attr(result, "match.percentage") <- p
-                attr(result, "is.fuzzy.match") <- !is.exact.match
-                attr(result, "matched.by") <- "Variable name"
-                return(result)
-            }
-            else if (nrow(arr.indices.matrix) > 1)
+            if (nrow(arr.indices.matrix) > 0)
             {
-                candidate.names <- candidate.names[arr.indices.matrix[, 1]]
-                candidate.labels <- candidate.labels[arr.indices.matrix[, 1]]
-                candidate.val.attrs <- candidate.val.attrs[arr.indices.matrix[, 1]]
+                candidate.indices <- unique(arr.indices.matrix[, 1])
+                if (length(candidate.indices) == 1) # best candidate found
+                {
+                    result <- candidate.names[candidate.indices]
+                    attr(result, "match.percentage") <- p
+                    attr(result, "is.fuzzy.match") <- !is.exact.match
+                    attr(result, "matched.by") <- "Variable name"
+                    return(result)
+                }
+                else # no single best candidate
+                {
+                    candidate.names <- candidate.names[candidate.indices]
+                    candidate.labels <- candidate.labels[candidate.indices]
+                    candidate.val.attrs <- candidate.val.attrs[candidate.indices]
+
+                    # If we ignore alphanumeric characters, try to break the tie
+                    # by not ignoring alphanumeric characters
+                    if (ignore.non.alphanumeric) {
+                        match.percentages.matrix <- matchPercentages(strings.1 = candidate.names,
+                                                                     strings.2 = nms,
+                                                                     ignore.case = ignore.case,
+                                                                     ignore.non.alphanumeric = FALSE,
+                                                                     min.match.percentage = min.match.percentage)
+                        arr.indices.matrix <- which(match.percentages.matrix == max(match.percentages.matrix),
+                                                    arr.ind = TRUE)
+                        candidate.indices <- unique(arr.indices.matrix[, 1])
+                        if (length(candidate.indices) == 1)
+                        {
+                            result <- candidate.names[candidate.indices]
+                            attr(result, "match.percentage") <- p
+                            attr(result, "is.fuzzy.match") <- !is.exact.match
+                            attr(result, "matched.by") <- "Variable name"
+                            return(result)
+                        }
+                        else
+                        {
+                            candidate.names <- candidate.names[candidate.indices]
+                            candidate.labels <- candidate.labels[candidate.indices]
+                            candidate.val.attrs <- candidate.val.attrs[candidate.indices]
+                        }
+                    }
+
+                    # If we ignore case, try to break the tie by not ignoring
+                    # alphanumeric characters
+                    if (ignore.case) {
+                        match.percentages.matrix <- matchPercentages(strings.1 = candidate.names,
+                                                                     strings.2 = nms,
+                                                                     ignore.case = FALSE,
+                                                                     ignore.non.alphanumeric = FALSE,
+                                                                     min.match.percentage = min.match.percentage)
+                        arr.indices.matrix <- which(match.percentages.matrix == max(match.percentages.matrix),
+                                                    arr.ind = TRUE)
+                        candidate.indices <- unique(arr.indices.matrix[, 1])
+                        if (length(unique(arr.indices.matrix[, 1])) == 1)
+                        {
+                            result <- candidate.names[candidate.indices]
+                            attr(result, "match.percentage") <- p
+                            attr(result, "is.fuzzy.match") <- !is.exact.match
+                            attr(result, "matched.by") <- "Variable name"
+                            return(result)
+                        }
+                        else
+                        {
+                            candidate.names <- candidate.names[candidate.indices]
+                            candidate.labels <- candidate.labels[candidate.indices]
+                            candidate.val.attrs <- candidate.val.attrs[candidate.indices]
+                        }
+                    }
+                    break
+                }
             }
         }
     }
@@ -1693,8 +1752,9 @@ mergedVariableNames <- function(matched.names, use.names.and.labels.from)
     # Merged names may contain duplicate variable names due to the user
     # specifying variables with the same name to not be combined or variables
     # with the same name not being combined as their types are incompatible.
+    # Variables that only differ by case are also considered duplicate.
     # We rename variables so that the names are unique.
-    dup <- which(duplicated(merged.names))
+    dup <- which(duplicated(tolower(merged.names)))
     renamed.variables <- matrix(nrow = length(dup), ncol = 2)
     colnames(renamed.variables) <- c("Original name", "New name")
     for (i in seq_along(dup))
@@ -2299,7 +2359,8 @@ combineAsNonCategoricalVariable <- function(var.list, data.sets, v.types)
         else
             return(combineAsTextVariable(var.list, data.sets, v.types))
     }
-    else if (unique.v.types == TEXT.VARIABLE.TYPE)
+    else if (length(unique.v.types) == 1 &&
+             unique.v.types == TEXT.VARIABLE.TYPE)
     {
         return(combineAsTextVariable(var.list, data.sets, v.types))
     }
@@ -2396,7 +2457,7 @@ combineAsNumericVariable <- function(var.list, data.sets, v.types)
 # v.types can be text, numeric and categorical
 combineAsTextVariable <- function(var.list, data.sets, v.types)
 {
-    do.call("c", lapply(seq_along(data.sets), function(i) {
+    result <- do.call("c", lapply(seq_along(data.sets), function(i) {
         v <- var.list[[i]]
         if (is.null(v))
             rep(NA_character_, nrow(data.sets[[i]]))
@@ -2413,6 +2474,8 @@ combineAsTextVariable <- function(var.list, data.sets, v.types)
         else # v.types[i] == TEXT.VARIABLE.TYPE
             v
     }))
+    attr(result, "labels") <- NULL
+    result
 }
 
 #' @param text.variable A character vector representing a text variable.

diff --git a/R/mergedatasetsbyvariable.R b/R/mergedatasetsbyvariable.R
@@ -140,12 +140,16 @@ MergeDataSetsByVariable <- function(data.set.names,
     result$merged.data.set.metadata <- metadataFromDataSet(merged.data.set,
                                                            merged.data.set.name)
     result$source.data.set.indices <- attr(merged.data.set.var.names,
-                                           "source.data.set.indices")
+                                           "source.data.set.indices",
+                                           exact = TRUE)
     result$omitted.variable.names.list <- attr(merged.data.set.var.names,
-                                               "omitted.variable.names.list")
+                                               "omitted.variable.names.list",
+                                               exact = TRUE)
     result$merged.id.variable.name <- attr(merged.data.set.var.names,
-                                           "merged.id.variable.name")
-    result$id.variable.names <- attr(matched.cases.matrix, "id.variable.names")
+                                           "merged.id.variable.name",
+                                           exact = TRUE)
+    result$id.variable.names <- attr(matched.cases.matrix, "id.variable.names",
+                                     exact = TRUE)
     result$example.id.values <- exampleIDValues(result$id.variable.names,
                                                 data.sets)
     result$is.saved.to.cloud <- is.saved.to.cloud
@@ -330,7 +334,7 @@ convertIDVariableType <- function(ids, id.variable.type,
         if (id.variable.type == CATEGORICAL.VARIABLE.TYPE)
         {
             converted.ids <- rep(NA_character_, length(ids))
-            val.attrs <- attr(ids, "labels")
+            val.attrs <- attr(ids, "labels", exact = TRUE)
             val.labels <- names(val.attrs)
             for (i in seq_along(val.attrs))
                 converted.ids[ids == val.attrs[i]] <- val.labels[i]
@@ -372,7 +376,7 @@ mergedDataSetVariableNames <- function(input.data.sets.metadata,
 {
     n.data.sets <- input.data.sets.metadata$n.data.sets
     v.names.list <- input.data.sets.metadata$variable.names.list
-    id.var.names <- attr(matched.cases, "id.variable.names")
+    id.var.names <- attr(matched.cases, "id.variable.names", exact = TRUE)
 
     v.names.to.include.or.omit.list <- lapply(seq_len(n.data.sets), function(i) {
         parseInputVariableTextForDataSet(variables.to.include.or.omit[[i]],
@@ -527,11 +531,14 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix,
     n.data.sets <- input.data.sets.metadata$n.data.sets
     n.merged.cases <- nrow(matched.cases.matrix)
     included.variable.names.list <- attr(merged.data.set.variable.names,
-                                         "included.variable.names.list")
-    id.variable.names <- attr(matched.cases.matrix, "id.variable.names")
+                                         "included.variable.names.list",
+                                         exact = TRUE)
+    id.variable.names <- attr(matched.cases.matrix, "id.variable.names",
+                              exact = TRUE)
     merged.id.var.name <- attr(merged.data.set.variable.names,
-                               "merged.id.variable.name")
-    merged.id.variable <- attr(matched.cases.matrix, "merged.id.variable")
+                               "merged.id.variable.name", exact = TRUE)
+    merged.id.variable <- attr(matched.cases.matrix, "merged.id.variable",
+                               exact = TRUE)
 
     merged.data.set.variables <- vector(mode = "list",
                                         length = length(merged.data.set.variable.names))
@@ -546,7 +553,8 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix,
                 nm == merged.id.var.name) # ID variable
             {
                 merged.var <- merged.id.variable
-                attr(merged.var, "label") <- attr(data.sets[[data.set.ind]][[nm]], "label")
+                attr(merged.var, "label") <- attr(data.sets[[data.set.ind]][[nm]],
+                                                  "label", exact = TRUE)
             }
             else # Non-ID variable
             {
@@ -562,7 +570,7 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix,
                 v.type <- variableType(input.var)
                 if (v.type == CATEGORICAL.VARIABLE.TYPE)
                 {
-                    val.attr <- attr(input.var, "labels")
+                    val.attr <- attr(input.var, "labels", exact = TRUE)
                     if (is.integer(merged.var))
                     {
                         val.lbls <- names(val.attr)
@@ -572,7 +580,8 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix,
                     attr(merged.var, "labels") <- val.attr
                     class(merged.var) <- c(class(merged.var), "haven_labelled")
                 }
-                attr(merged.var, "label") <- attr(input.var, "label")
+                attr(merged.var, "label") <- attr(input.var, "label",
+                                                  exact = TRUE)
             }
 
             merged.data.set.size <- merged.data.set.size + object.size(merged.var)
@@ -600,7 +609,7 @@ exampleIDValues <- function(id.variable.names, data.sets)
 
     vapply(seq_along(data.sets), function(i) {
         v <- data.sets[[i]][[id.variable.names[i]]]
-        val.attr <- attr(v, "labels")
+        val.attr <- attr(v, "labels", exact = TRUE)
         # ID variables will have non-missing values as we checked for this
         if (!is.null(val.attr))
             names(val.attr)[val.attr == removeNA(v)[1]]

diff --git a/R/mergingandstackingutilities.R b/R/mergingandstackingutilities.R
@@ -434,7 +434,7 @@ throwVariableNotFoundError <- function(var.name, data.set.index = NULL)
 }
 
 #' @description Creates a name from new.name that does not exist in existing.names by
-#'  appending a numeric suffix if necessary
+#'  appending a numeric suffix if necessary. Case is ignored when comparing names.
 #' @param new.name Character scalar containing the candidate name that may need
 #'  to be renamed to be different from the names in existing.names.
 #' @param existing.names Character vector of existing names.
@@ -447,14 +447,16 @@ throwVariableNotFoundError <- function(var.name, data.set.index = NULL)
 #' @noRd
 uniqueName <- function(new.name, existing.names, delimiter = "")
 {
-    if (!(new.name %in% existing.names))
+    lower.case.new.name <- tolower(new.name)
+    lower.case.existing.names <- tolower(existing.names)
+    if (!(lower.case.new.name %in% lower.case.existing.names))
         return (new.name)
 
     i <- 1
     repeat
     {
         candidate.name <- paste0(new.name, delimiter, i)
-        if (!(candidate.name %in% existing.names))
+        if (!(tolower(candidate.name) %in% lower.case.existing.names))
             return(candidate.name)
         i <- i + 1
     }

diff --git a/tests/testthat/test-mergingandstackingutilities.R b/tests/testthat/test-mergingandstackingutilities.R
@@ -83,6 +83,9 @@ test_that("uniqueName", {
     expect_equal(uniqueName(new.name = "Q2",
                             existing.names = c("Q1", "Q2", "Q3"),
                             delimiter = "_"), "Q2_1")
+    expect_equal(uniqueName(new.name = "q2",
+                            existing.names = c("Q1", "Q2", "Q3"),
+                            delimiter = "_"), "q2_1")
 })
 
 test_that("parseVariableWildcardForMerging", {