Skip to content

Commit

Permalink
Throw user friendly error when memory limit is exceeded
Browse files Browse the repository at this point in the history
  • Loading branch information
JustinCCYap committed May 23, 2023
1 parent 563ff32 commit e479ee5
Show file tree
Hide file tree
Showing 4 changed files with 212 additions and 181 deletions.
116 changes: 63 additions & 53 deletions R/mergedatasetsbycase.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,55 +133,70 @@ MergeDataSetsByCase <- function(data.set.names,
data.sets.whose.variables.are.kept = seq_along(data.set.names),
min.value.label.match.percentage = 90)
{
data.sets <- readDataSets(data.set.names, 2)
data.sets <- removeDuplicateValues(data.sets)
input.data.sets.metadata <- metadataFromDataSets(data.sets)

match.parameters <- list(auto.select.what.to.match.by = auto.select.what.to.match.by,
match.by.variable.names = match.by.variable.names,
match.by.variable.labels = match.by.variable.labels,
match.by.value.labels = match.by.value.labels,
ignore.case = ignore.case,
ignore.non.alphanumeric = ignore.non.alphanumeric,
min.match.percentage = min.match.percentage,
min.value.label.match.percentage = min.value.label.match.percentage)

matched.names <- matchVariables(input.data.sets.metadata,
match.parameters,
variables.to.combine,
variables.to.not.combine,
variables.to.keep,
variables.to.omit, data.sets,
data.sets.whose.variables.are.kept,
use.names.and.labels.from)
merged.names <- mergedVariableNames(matched.names,
tryCatch({
data.sets <- readDataSets(data.set.names, 2)
data.sets <- removeDuplicateValues(data.sets)
input.data.sets.metadata <- metadataFromDataSets(data.sets)

match.parameters <- list(auto.select.what.to.match.by = auto.select.what.to.match.by,
match.by.variable.names = match.by.variable.names,
match.by.variable.labels = match.by.variable.labels,
match.by.value.labels = match.by.value.labels,
ignore.case = ignore.case,
ignore.non.alphanumeric = ignore.non.alphanumeric,
min.match.percentage = min.match.percentage,
min.value.label.match.percentage = min.value.label.match.percentage)

matched.names <- matchVariables(input.data.sets.metadata,
match.parameters,
variables.to.combine,
variables.to.not.combine,
variables.to.keep,
variables.to.omit, data.sets,
data.sets.whose.variables.are.kept,
use.names.and.labels.from)
merged.data.set <- mergedDataSet(data.sets, matched.names, merged.names,
use.names.and.labels.from,
when.multiple.labels.for.one.value,
match.parameters)
merged.data.set.name <- correctDataSetName(merged.data.set.name,
"Combined data set.sav")

is.saved.to.cloud <- IsDisplayrCloudDriveAvailable()
writeDataSet(merged.data.set, merged.data.set.name, is.saved.to.cloud)

result <- list()
if (include.merged.data.set.in.output)
result$merged.data.set <- merged.data.set

result$input.data.sets.metadata <- input.data.sets.metadata
result$merged.data.set.metadata <- metadataFromDataSet(merged.data.set,
merged.data.set.name)
result$matched.names <- matched.names
result$merged.names <- merged.names
result$omitted.variable.names.list <- omittedVariables(input.data.sets.metadata,
matched.names)
result$input.value.attributes.list <- lapply(merged.data.set, attr,
"input.value.attributes")
result$is.saved.to.cloud <- is.saved.to.cloud
class(result) <- "MergeDataSetByCase"
result
merged.names <- mergedVariableNames(matched.names,
use.names.and.labels.from)
}, error = function(e) {
if (grepl("cannot allocate vector of size ", e$message)) {
throwInputDataSetsTooLargeError()
} else
stop(e)
})

tryCatch({
merged.data.set <- mergedDataSet(data.sets, matched.names, merged.names,
use.names.and.labels.from,
when.multiple.labels.for.one.value,
match.parameters)
merged.data.set.name <- correctDataSetName(merged.data.set.name,
"Combined data set.sav")

is.saved.to.cloud <- IsDisplayrCloudDriveAvailable()
writeDataSet(merged.data.set, merged.data.set.name, is.saved.to.cloud)

result <- list()
if (include.merged.data.set.in.output)
result$merged.data.set <- merged.data.set

result$input.data.sets.metadata <- input.data.sets.metadata
result$merged.data.set.metadata <- metadataFromDataSet(merged.data.set,
merged.data.set.name)
result$matched.names <- matched.names
result$merged.names <- merged.names
result$omitted.variable.names.list <- omittedVariables(input.data.sets.metadata,
matched.names)
result$input.value.attributes.list <- lapply(merged.data.set, attr,
"input.value.attributes")
result$is.saved.to.cloud <- is.saved.to.cloud
class(result) <- "MergeDataSetByCase"
result
}, error = function(e) {
if (grepl("cannot allocate vector of size ", e$message)) {
throwCombinedDataSetTooLargeError()
} else
stop(e)
})
}

# Performs matching of variables and returns the matched.names character matrix
Expand Down Expand Up @@ -1910,17 +1925,12 @@ mergedDataSet <- function(data.sets, matched.names, merged.names,
n.data.set.cases <- vapply(data.sets, nrow, integer(1))

merged.data.set <- vector(mode = "list", length = n.vars)
data.set.size <- 0
for (i in seq_len(n.vars))
{
v <- compositeVariable(matched.names[i, ], data.sets,
use.names.and.labels.from,
when.multiple.labels.for.one.value,
match.parameters)
# data.set.size <- data.set.size + object.size(v)
# if (data.set.size > DATA.SET.SIZE.LIMIT)
# stop("The combined data set is too large to create. ",
# "Consider omitting variables or only keeping combined variables that contain input variables from a few data sets.")
merged.data.set[[i]] <- v
}
merged.data.set <- data.frame(merged.data.set, check.names = FALSE)
Expand Down
105 changes: 57 additions & 48 deletions R/mergedatasetsbyvariable.R
Original file line number Diff line number Diff line change
Expand Up @@ -113,48 +113,64 @@ MergeDataSetsByVariable <- function(data.set.names,
# are specified.
# merged.data.set: A data frame representing the merged data set.

data.sets <- readDataSets(data.set.names, 2)
input.data.sets.metadata <- metadataFromDataSets(data.sets)

matched.cases.matrix <- matchCases(input.data.sets.metadata, id.variables,
data.sets,
only.keep.cases.matched.to.all.data.sets)
merged.data.set.var.names <- mergedDataSetVariableNames(input.data.sets.metadata,
include.or.omit.variables,
variables.to.include.or.omit,
matched.cases.matrix)
merged.data.set <- doMergeByVariable(data.sets, matched.cases.matrix,
merged.data.set.var.names,
input.data.sets.metadata)
merged.data.set.name <- correctDataSetName(merged.data.set.name,
"Combined data set.sav")
is.saved.to.cloud <- IsDisplayrCloudDriveAvailable()
writeDataSet(merged.data.set, merged.data.set.name,
is.saved.to.cloud = is.saved.to.cloud)

result <- list()
if (include.merged.data.set.in.output)
result$merged.data.set <- merged.data.set

result$input.data.sets.metadata <- input.data.sets.metadata
result$merged.data.set.metadata <- metadataFromDataSet(merged.data.set,
merged.data.set.name)
result$source.data.set.indices <- attr(merged.data.set.var.names,
"source.data.set.indices",
exact = TRUE)
result$omitted.variable.names.list <- attr(merged.data.set.var.names,
"omitted.variable.names.list",
tryCatch({
data.sets <- readDataSets(data.set.names, 2)
input.data.sets.metadata <- metadataFromDataSets(data.sets)

matched.cases.matrix <- matchCases(input.data.sets.metadata, id.variables,
data.sets,
only.keep.cases.matched.to.all.data.sets)
merged.data.set.var.names <- mergedDataSetVariableNames(input.data.sets.metadata,
include.or.omit.variables,
variables.to.include.or.omit,
matched.cases.matrix)
}, error = function(e) {
if (grepl("cannot allocate vector of size ", e$message)) {
throwInputDataSetsTooLargeError()
} else
stop(e)
})


tryCatch({
merged.data.set <- doMergeByVariable(data.sets, matched.cases.matrix,
merged.data.set.var.names,
input.data.sets.metadata)
merged.data.set.name <- correctDataSetName(merged.data.set.name,
"Combined data set.sav")
is.saved.to.cloud <- IsDisplayrCloudDriveAvailable()
writeDataSet(merged.data.set, merged.data.set.name,
is.saved.to.cloud = is.saved.to.cloud)

result <- list()
if (include.merged.data.set.in.output)
result$merged.data.set <- merged.data.set

result$input.data.sets.metadata <- input.data.sets.metadata
result$merged.data.set.metadata <- metadataFromDataSet(merged.data.set,
merged.data.set.name)
result$source.data.set.indices <- attr(merged.data.set.var.names,
"source.data.set.indices",
exact = TRUE)
result$merged.id.variable.name <- attr(merged.data.set.var.names,
"merged.id.variable.name",
exact = TRUE)
result$id.variable.names <- attr(matched.cases.matrix, "id.variable.names",
exact = TRUE)
result$example.id.values <- exampleIDValues(result$id.variable.names,
data.sets)
result$is.saved.to.cloud <- is.saved.to.cloud
class(result) <- "MergeDataSetByVariable"
result
result$omitted.variable.names.list <- attr(merged.data.set.var.names,
"omitted.variable.names.list",
exact = TRUE)
result$merged.id.variable.name <- attr(merged.data.set.var.names,
"merged.id.variable.name",
exact = TRUE)
result$id.variable.names <- attr(matched.cases.matrix, "id.variable.names",
exact = TRUE)
result$example.id.values <- exampleIDValues(result$id.variable.names,
data.sets)
result$is.saved.to.cloud <- is.saved.to.cloud
class(result) <- "MergeDataSetByVariable"
result
}, error = function(e) {
if (grepl("cannot allocate vector of size ", e$message)) {
throwCombinedDataSetTooLargeError()
} else
stop(e)
})
}

#' @param input.data.sets.metadata See data dictionary.
Expand Down Expand Up @@ -542,7 +558,6 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix,

merged.data.set.variables <- vector(mode = "list",
length = length(merged.data.set.variable.names))
merged.data.set.size <- 0

j <- 1
for (data.set.ind in seq_len(n.data.sets))
Expand Down Expand Up @@ -583,12 +598,6 @@ doMergeByVariable <- function(data.sets, matched.cases.matrix,
attr(merged.var, "label") <- attr(input.var, "label",
exact = TRUE)
}

merged.data.set.size <- merged.data.set.size + object.size(merged.var)
if (merged.data.set.size > DATA.SET.SIZE.LIMIT)
stop("The combined data set is too large to create. ",
"Consider omitting variables or only keeping combined variables that contain input variables from a few data sets.")

merged.data.set.variables[[j]] <- merged.var
j <- j + 1
}
Expand Down
12 changes: 9 additions & 3 deletions R/mergingandstackingutilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -491,9 +491,6 @@ parseVariableWildcardForMerging <- function(wildcard.text, variable.names,
variable.names[is.match]
}

# Set to 2GB as I found that memory issues start to occur beyond here
DATA.SET.SIZE.LIMIT <- 2 * 1e9

sanitizeSPSSVariableNames <- function(variable.names) {
# Can't begin with or end with a period
forbidden.period <- startsWith(variable.names, ".")
Expand Down Expand Up @@ -572,5 +569,14 @@ addSuffixFittingByteLimit <- function(string, suffix = "", byte.limit = 64) {
j <- j - 1
new.string = paste0(substr(string, 1, j), suffix)
new.string
}

# Signals a user-friendly error when the input data sets cannot be read or
# matched because memory ran out (callers invoke this after catching a
# "cannot allocate vector of size ..." error). Always throws; never returns.
throwInputDataSetsTooLargeError <- function() {
    # stop() pastes its arguments together with no separator, so the first
    # part must end with a trailing space (bug fix: previously the message
    # rendered as "...process.Consider reducing...").
    stop("The input data sets are too large to process. ",
         "Consider reducing their size or only combining a subset of the data sets.")
}

# Signals a user-friendly error when building the combined (merged) data set
# runs out of memory, advising the user how to shrink the output.
# Always throws; never returns.
throwCombinedDataSetTooLargeError <- function() {
    too.large.message <- "The combined data set is too large to create. "
    advice.message <- "Consider omitting variables from the combined data set."
    stop(too.large.message, advice.message)
}

0 comments on commit e479ee5

Please sign in to comment.