Skip to content

Commit

Permalink
DS-3773: various fixes to merging of data sets (#32)
Browse files Browse the repository at this point in the history
* Temp file

* Update

* Fixes [revdep skip]

* Improve matching [revdep skip]

* Fix bug found by Lena

* Fix tests [revdep skip]

* Fix warning [revdep skip]
  • Loading branch information
JustinCCYap committed Jul 6, 2022
1 parent 8519d49 commit f621f88
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 33 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: flipData
Type: Package
Title: Functions for extracting and describing data
Version: 1.6.2
Version: 1.6.3
Author: Displayr <opensource@displayr.com>
Maintainer: Displayr <opensource@displayr.com>
Description: Functions for extracting data from formulas and
Expand Down
17 changes: 17 additions & 0 deletions R/DS-3773.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# d <- haven::read_sav("wincross exported file sr wave 1.sav")
# attr(d[[1]], "labels") <- NULL
# haven::write_sav(d, "wincross exported file sr wave mod.sav")
#
# d2 <- haven::read_sav("fs22001 sr brand wave 2.sav")
# attr(d2[["Q5r97"]], "labels") <- NULL
# haven::write_sav(d2, "fs22001 sr brand wave mod.sav")
#
# a <- MergeDataSetsByCase(c("wincross exported file sr wave mod.sav", "fs22001 sr brand wave mod.sav"),
# auto.select.what.to.match.by = FALSE, match.by.variable.names = TRUE,
# match.by.variable.labels = FALSE, match.by.value.labels = FALSE,
# ignore.non.alphanumeric = FALSE)

# Issues:
# RECORD/record variables not merging (one is numeric, one categorical)
# Q5R97 merged into Q5_r97 instead of Q5r97

93 changes: 78 additions & 15 deletions R/mergedatasetsbycase.R
Original file line number Diff line number Diff line change
Expand Up @@ -1259,19 +1259,78 @@ findMatchingVariable <- function(nms, lbls, val.attrs, candidate.names,
}, logical(1))
arr.indices.matrix <- arr.indices.matrix[is.numbers.preserved, , drop = FALSE]

if (nrow(arr.indices.matrix) == 1)
{
result <- candidate.names[arr.indices.matrix[1, 1]]
attr(result, "match.percentage") <- p
attr(result, "is.fuzzy.match") <- !is.exact.match
attr(result, "matched.by") <- "Variable name"
return(result)
}
else if (nrow(arr.indices.matrix) > 1)
if (nrow(arr.indices.matrix) > 0)
{
candidate.names <- candidate.names[arr.indices.matrix[, 1]]
candidate.labels <- candidate.labels[arr.indices.matrix[, 1]]
candidate.val.attrs <- candidate.val.attrs[arr.indices.matrix[, 1]]
candidate.indices <- unique(arr.indices.matrix[, 1])
if (length(candidate.indices) == 1) # best candidate found
{
result <- candidate.names[candidate.indices]
attr(result, "match.percentage") <- p
attr(result, "is.fuzzy.match") <- !is.exact.match
attr(result, "matched.by") <- "Variable name"
return(result)
}
else # no single best candidate
{
candidate.names <- candidate.names[candidate.indices]
candidate.labels <- candidate.labels[candidate.indices]
candidate.val.attrs <- candidate.val.attrs[candidate.indices]

# If we ignore alphanumeric characters, try to break the tie
# by not ignoring alphanumeric characters
if (ignore.non.alphanumeric) {
match.percentages.matrix <- matchPercentages(strings.1 = candidate.names,
strings.2 = nms,
ignore.case = ignore.case,
ignore.non.alphanumeric = FALSE,
min.match.percentage = min.match.percentage)
arr.indices.matrix <- which(match.percentages.matrix == max(match.percentages.matrix),
arr.ind = TRUE)
candidate.indices <- unique(arr.indices.matrix[, 1])
if (length(candidate.indices) == 1)
{
result <- candidate.names[candidate.indices]
attr(result, "match.percentage") <- p
attr(result, "is.fuzzy.match") <- !is.exact.match
attr(result, "matched.by") <- "Variable name"
return(result)
}
else
{
candidate.names <- candidate.names[candidate.indices]
candidate.labels <- candidate.labels[candidate.indices]
candidate.val.attrs <- candidate.val.attrs[candidate.indices]
}
}

# If we ignore case, try to break the tie by not ignoring
# alphanumeric characters
if (ignore.case) {
match.percentages.matrix <- matchPercentages(strings.1 = candidate.names,
strings.2 = nms,
ignore.case = FALSE,
ignore.non.alphanumeric = FALSE,
min.match.percentage = min.match.percentage)
arr.indices.matrix <- which(match.percentages.matrix == max(match.percentages.matrix),
arr.ind = TRUE)
candidate.indices <- unique(arr.indices.matrix[, 1])
if (length(unique(arr.indices.matrix[, 1])) == 1)
{
result <- candidate.names[candidate.indices]
attr(result, "match.percentage") <- p
attr(result, "is.fuzzy.match") <- !is.exact.match
attr(result, "matched.by") <- "Variable name"
return(result)
}
else
{
candidate.names <- candidate.names[candidate.indices]
candidate.labels <- candidate.labels[candidate.indices]
candidate.val.attrs <- candidate.val.attrs[candidate.indices]
}
}
break
}
}
}
}
Expand Down Expand Up @@ -1693,8 +1752,9 @@ mergedVariableNames <- function(matched.names, use.names.and.labels.from)
# Merged names may contain duplicate variable names due to the user
# specifying variables with the same name to not be combined or variables
# with the same name not being combined as their types are incompatible.
# Variables that only differ by case are also considered duplicate.
# We rename variables so that the names are unique.
dup <- which(duplicated(merged.names))
dup <- which(duplicated(tolower(merged.names)))
renamed.variables <- matrix(nrow = length(dup), ncol = 2)
colnames(renamed.variables) <- c("Original name", "New name")
for (i in seq_along(dup))
Expand Down Expand Up @@ -2299,7 +2359,8 @@ combineAsNonCategoricalVariable <- function(var.list, data.sets, v.types)
else
return(combineAsTextVariable(var.list, data.sets, v.types))
}
else if (unique.v.types == TEXT.VARIABLE.TYPE)
else if (length(unique.v.types) == 1 &&
unique.v.types == TEXT.VARIABLE.TYPE)
{
return(combineAsTextVariable(var.list, data.sets, v.types))
}
Expand Down Expand Up @@ -2396,7 +2457,7 @@ combineAsNumericVariable <- function(var.list, data.sets, v.types)
# v.types can be text, numeric and categorical
combineAsTextVariable <- function(var.list, data.sets, v.types)
{
do.call("c", lapply(seq_along(data.sets), function(i) {
result <- do.call("c", lapply(seq_along(data.sets), function(i) {
v <- var.list[[i]]
if (is.null(v))
rep(NA_character_, nrow(data.sets[[i]]))
Expand All @@ -2413,6 +2474,8 @@ combineAsTextVariable <- function(var.list, data.sets, v.types)
else # v.types[i] == TEXT.VARIABLE.TYPE
v
}))
attr(result, "labels") <- NULL
result
}

#' @param text.variable A character vector representing a text variable.
Expand Down
37 changes: 23 additions & 14 deletions R/mergedatasetsbyvariable.R
Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,16 @@ MergeDataSetsByVariable <- function(data.set.names,
result$merged.data.set.metadata <- metadataFromDataSet(merged.data.set,
merged.data.set.name)
result$source.data.set.indices <- attr(merged.data.set.var.names,
"source.data.set.indices")
"source.data.set.indices",
exact = TRUE)
result$omitted.variable.names.list <- attr(merged.data.set.var.names,
"omitted.variable.names.list")
"omitted.variable.names.list",
exact = TRUE)
result$merged.id.variable.name <- attr(merged.data.set.var.names,
"merged.id.variable.name")
result$id.variable.names <- attr(matched.cases.matrix, "id.variable.names")
"merged.id.variable.name",
exact = TRUE)
result$id.variable.names <- attr(matched.cases.matrix, "id.variable.names",
exact = TRUE)
result$example.id.values <- exampleIDValues(result$id.variable.names,
data.sets)
result$is.saved.to.cloud <- is.saved.to.cloud
Expand Down Expand Up @@ -330,7 +334,7 @@ convertIDVariableType <- function(ids, id.variable.type,
if (id.variable.type == CATEGORICAL.VARIABLE.TYPE)
{
converted.ids <- rep(NA_character_, length(ids))
val.attrs <- attr(ids, "labels")
val.attrs <- attr(ids, "labels", exact = TRUE)
val.labels <- names(val.attrs)
for (i in seq_along(val.attrs))
converted.ids[ids == val.attrs[i]] <- val.labels[i]
Expand Down Expand Up @@ -372,7 +376,7 @@ mergedDataSetVariableNames <- function(input.data.sets.metadata,
{
n.data.sets <- input.data.sets.metadata$n.data.sets
v.names.list <- input.data.sets.metadata$variable.names.list
id.var.names <- attr(matched.cases, "id.variable.names")
id.var.names <- attr(matched.cases, "id.variable.names", exact = TRUE)

v.names.to.include.or.omit.list <- lapply(seq_len(n.data.sets), function(i) {
parseInputVariableTextForDataSet(variables.to.include.or.omit[[i]],
Expand Down Expand Up @@ -527,11 +531,14 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix,
n.data.sets <- input.data.sets.metadata$n.data.sets
n.merged.cases <- nrow(matched.cases.matrix)
included.variable.names.list <- attr(merged.data.set.variable.names,
"included.variable.names.list")
id.variable.names <- attr(matched.cases.matrix, "id.variable.names")
"included.variable.names.list",
exact = TRUE)
id.variable.names <- attr(matched.cases.matrix, "id.variable.names",
exact = TRUE)
merged.id.var.name <- attr(merged.data.set.variable.names,
"merged.id.variable.name")
merged.id.variable <- attr(matched.cases.matrix, "merged.id.variable")
"merged.id.variable.name", exact = TRUE)
merged.id.variable <- attr(matched.cases.matrix, "merged.id.variable",
exact = TRUE)

merged.data.set.variables <- vector(mode = "list",
length = length(merged.data.set.variable.names))
Expand All @@ -546,7 +553,8 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix,
nm == merged.id.var.name) # ID variable
{
merged.var <- merged.id.variable
attr(merged.var, "label") <- attr(data.sets[[data.set.ind]][[nm]], "label")
attr(merged.var, "label") <- attr(data.sets[[data.set.ind]][[nm]],
"label", exact = TRUE)
}
else # Non-ID variable
{
Expand All @@ -562,7 +570,7 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix,
v.type <- variableType(input.var)
if (v.type == CATEGORICAL.VARIABLE.TYPE)
{
val.attr <- attr(input.var, "labels")
val.attr <- attr(input.var, "labels", exact = TRUE)
if (is.integer(merged.var))
{
val.lbls <- names(val.attr)
Expand All @@ -572,7 +580,8 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix,
attr(merged.var, "labels") <- val.attr
class(merged.var) <- c(class(merged.var), "haven_labelled")
}
attr(merged.var, "label") <- attr(input.var, "label")
attr(merged.var, "label") <- attr(input.var, "label",
exact = TRUE)
}

merged.data.set.size <- merged.data.set.size + object.size(merged.var)
Expand Down Expand Up @@ -600,7 +609,7 @@ exampleIDValues <- function(id.variable.names, data.sets)

vapply(seq_along(data.sets), function(i) {
v <- data.sets[[i]][[id.variable.names[i]]]
val.attr <- attr(v, "labels")
val.attr <- attr(v, "labels", exact = TRUE)
# ID variables will have non-missing values as we checked for this
if (!is.null(val.attr))
names(val.attr)[val.attr == removeNA(v)[1]]
Expand Down
8 changes: 5 additions & 3 deletions R/mergingandstackingutilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ throwVariableNotFoundError <- function(var.name, data.set.index = NULL)
}

#' @description Creates a name from new.name that does not exist in existing.names by
#' appending a numeric suffix if necessary
#' appending a numeric suffix if necessary. Case is ignored when comparing names.
#' @param new.name Character scalar containing the candidate name that may need
#' to be renamed to be different from the names in existing.names.
#' @param existing.names Character vector of existing names.
Expand All @@ -447,14 +447,16 @@ throwVariableNotFoundError <- function(var.name, data.set.index = NULL)
#' @noRd
uniqueName <- function(new.name, existing.names, delimiter = "")
{
if (!(new.name %in% existing.names))
lower.case.new.name <- tolower(new.name)
lower.case.existing.names <- tolower(existing.names)
if (!(lower.case.new.name %in% lower.case.existing.names))
return (new.name)

i <- 1
repeat
{
candidate.name <- paste0(new.name, delimiter, i)
if (!(candidate.name %in% existing.names))
if (!(tolower(candidate.name) %in% lower.case.existing.names))
return(candidate.name)
i <- i + 1
}
Expand Down
3 changes: 3 additions & 0 deletions tests/testthat/test-mergingandstackingutilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ test_that("uniqueName", {
expect_equal(uniqueName(new.name = "Q2",
existing.names = c("Q1", "Q2", "Q3"),
delimiter = "_"), "Q2_1")
expect_equal(uniqueName(new.name = "q2",
existing.names = c("Q1", "Q2", "Q3"),
delimiter = "_"), "q2_1")
})

test_that("parseVariableWildcardForMerging", {
Expand Down

0 comments on commit f621f88

Please sign in to comment.