diff --git a/DESCRIPTION b/DESCRIPTION index 1b4efaa..5331ad5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: flipData Type: Package Title: Functions for extracting and describing data -Version: 1.2.5 +Version: 1.2.6 Author: Displayr Maintainer: Displayr Description: Functions for extracting data from formulas and diff --git a/R/estimationdata.R b/R/estimationdata.R index 657fb7c..b71b3f0 100644 --- a/R/estimationdata.R +++ b/R/estimationdata.R @@ -84,7 +84,7 @@ EstimationData <- function(formula = NULL, .filter <- if (weighted) subset & weights > 0 & some.data else subset & some.data # Name to avoid bug in subset.data.frame data.subset <- subset(data, .filter) data.subset <- CopyAttributes(data.subset, data) - + dummy.adjusted <- FALSE ############## ## Imputation single.imputation <- missing == "Imputation (replace missing values with estimates)" @@ -132,7 +132,10 @@ EstimationData <- function(formula = NULL, stop(paste("Unknown 'missing' method:", missing))) data.for.estimation <- CopyAttributes(data.for.estimation, data.subset) if (missing == "Dummy variable adjustment") + { # Dummy variable adjustment can be selected but sometimes not used + dummy.adjusted <- any(grepl(".dummy.var_GQ9KqD7YOf$", colnames(data.for.estimation))) data.cols <- names(data.for.estimation) %in% names(data.subset) + } else data.cols <- rep(TRUE, ncol(data.for.estimation)) levels.pre <- paste0(rep(labels, vapply(data.for.estimation[data.cols], nlevels, 0L)), ": ", @@ -163,11 +166,30 @@ EstimationData <- function(formula = NULL, if (error.if.insufficient.obs && n.estimation < length(variable.names)) stop(gettextf("There are fewer observations (%d)%s(%d)", n.estimation, " than there are variables ", length(variable.names))) - description <- SampleDescription(n.total, n.subset, n.estimation, Labels(subset), weighted, weight.label, missing, imputation.label, m, - if(HasOutcome(formula)) "predictor" else "") - + if(HasOutcome(formula)) "predictor" else "", + dummy.adjusted = dummy.adjusted) + # Add statements about removing cases with missing outcomes and/or all predictors missing + if (missing == "Dummy variable adjustment") + { + outcome.name <- OutcomeName(formula) + missing.outcomes <- is.na(data[[outcome.name]]) + outcome.index <- which(names(data) == outcome.name) + if (ncol(data) == 2) + all.predictors.missing <- rep(FALSE, nrow(data)) + else + all.predictors.missing <- apply(data[-outcome.index], 1, function(x) all(is.na(x))) + if (any(missing.outcomes | all.predictors.missing)) + { + missing.outcomes <- if (any(missing.outcomes)) "an outcome variable" else NULL + all.predictors.missing <- if (any(all.predictors.missing)) "all predictor variables" else NULL + description <- paste0(description, paste0(" cases missing ", + paste0(c(missing.outcomes, all.predictors.missing), + collapse = " or missing "), + " have been excluded;")) + } + } list(estimation.data = data.for.estimation, weights = weights, unfiltered.weights = unfiltered.weights, diff --git a/R/missingdata.R b/R/missingdata.R index a982171..51dfc50 100644 --- a/R/missingdata.R +++ b/R/missingdata.R @@ -211,7 +211,10 @@ AddDummyVariablesForNAs <- function(data, outcome.name, checks = TRUE) else return(data) } - cases.all.predictors.missing <- apply(predictor.df, 1, function(x) all(is.na(x))) + if (ncol(data) == 2) + cases.all.predictors.missing <- rep(FALSE, nrow(predictor.df)) + else + cases.all.predictors.missing <- apply(predictor.df, 1, function(x) all(is.na(x))) dummy.variable.df <- data.frame(dummy.variable.df, check.names = FALSE) names(dummy.variable.df) <- paste0(names(dummy.variable.df), ".dummy.var_GQ9KqD7YOf") diff --git a/tests/testthat/test-estimationdata.R b/tests/testthat/test-estimationdata.R index 20f2ad8..e9df435 100644 --- a/tests/testthat/test-estimationdata.R +++ b/tests/testthat/test-estimationdata.R @@ -78,16 +78,65 @@ no.level.test <- data.frame(Y = c(1, 2, 2, 3, 3), X1 = c(NA, 1, NA, 3, 4), X2 = c(NA, 1, 2, 3, 4), X3 = c(NA, 3, 2, 1, 4)) +dummy.test <- data.frame(Y = c(1:10), X1 = c(NA, 2:10), X2 = c(1, NA, 3:10), X3 = c(1:2, NA, 4:10)) + +edge.case.dummy <- data.frame(Y = 1:10, X = 1:10) +edge.case.dummy.miss.outcome <- edge.case.dummy +edge.case.dummy.miss.outcome[1, 1] <- NA + +edge.case.dummy.miss.pred <- edge.case.dummy +edge.case.dummy.miss.pred[1, 2] <- NA test_that("Dummy variable adjustment", { expect_warning(missing.level.output <- EstimationData(Y ~ X1 + X2 + X3, data = missing.level.test, - missing = "Dummy variable adjustment")$estimation.data, + missing = "Dummy variable adjustment"), "Some categories do not appear in the data: 'Y: A'") - expect_equal(missing.level.output, expected.dummy.missing.level) + expect_equal(missing.level.output$estimation.data, expected.dummy.missing.level) expect_warning(EstimationData(Y ~ X1 + X2, data = no.level.test), NA) expect_error(EstimationData(Y ~ X1 + X2, no.level.test[1:4, ]), "There are fewer observations (2) than there are variables (3)", fixed = TRUE) + dummy.test.output <- expect_error(EstimationData(Y ~ X1 + X2 + X3, data = dummy.test, + missing = "Dummy variable adjustment"), + NA) + expect_equal(dummy.test.output$description, + paste0("n = 10 cases used in estimation; ", + "missing values of predictor variables have been adjusted using ", + "dummy variables;")) + # Set one case to have missing outcome variable + dummy.test.missing.outcome <- dummy.test + dummy.test.missing.outcome[1, 1] <- NA + dummy.test.output.missing.outcome <- expect_error(EstimationData(Y ~ X1 + X2 + X3, data = dummy.test.missing.outcome, + missing = "Dummy variable adjustment"), + NA) + expect_equal(dummy.test.output.missing.outcome$description, + paste0("n = 9 cases used in estimation of a total sample size of 10; ", + "missing values of predictor variables have been adjusted using ", + "dummy variables; cases missing an outcome variable have been excluded;")) + # Remove all predictors in one case + dummy.test.with.missing.preds <- dummy.test + dummy.test.with.missing.preds[4, -1] <- NA + dummy.test.output.missing.predictors <- expect_error(EstimationData(Y ~ X1 + X2 + X3, data = dummy.test.with.missing.preds, + missing = "Dummy variable adjustment"), + NA) + expect_equal(dummy.test.output.missing.predictors$description, + paste0("n = 9 cases used in estimation of a total sample size of 10; ", + "missing values of predictor variables have been adjusted using ", + "dummy variables; cases missing all predictor variables have been excluded;")) + # Test edge case + edge.case.output <- expect_error(EstimationData(Y ~ X, data = edge.case.dummy.miss.outcome, + missing = "Dummy variable adjustment"), + NA) + expect_equal(edge.case.output$description, + paste0("n = 9 cases used in estimation of a total sample size of 10; ", + "cases missing an outcome variable have been excluded;")) + edge.case.output <- expect_error(EstimationData(Y ~ X, data = edge.case.dummy.miss.pred, + missing = "Dummy variable adjustment"), + NA) + expect_equal(edge.case.output$description, + paste0("n = 10 cases used in estimation; ", + "missing values of predictor variables have been adjusted using ", + "dummy variables;")) }) diff --git a/tests/testthat/test-missingdata.R b/tests/testthat/test-missingdata.R index 3047b4e..bafa3de 100644 --- a/tests/testthat/test-missingdata.R +++ b/tests/testthat/test-missingdata.R @@ -14,6 +14,7 @@ test_that("Missing options", expect_equal(341, suppressWarnings(nrow(EstimationData(Overall ~ Branch, bank, missing = "Exclude cases with missing data")$estimation.data))) expect_equal(823, nrow(EstimationData(Overall ~ Branch, bank, missing = "Use partial data")$estimation.data)) expect_equal(823, nrow(EstimationData(Overall ~ Branch, bank, missing = "Use partial data (pairwise correlations)")$estimation.data)) + expet_equal(759, nrow(EstimationData(Overall ~ Fees + Interest, bank, missing = "Dummy variable adjustment")$estimation.data)) }) test_that("Infinity", @@ -66,6 +67,11 @@ expected.missing.factor <- data.frame(Y = 1:3, X1 = factor(c(2, 2:3), labels = L X1.dummy.var_GQ9KqD7YOf = c(1, 0, 0)) df.with.text <- data.frame(Y = 1:2, X1 = 1:2, X3 = c(NA, LETTERS[1]), stringsAsFactors = FALSE) +edge.case <- data.frame(Y = 1:5, X = 1:5) + +expected.edge.with.missing <- structure(list(Y = 1:5, X = c(4.5, 4.5, 4.5, 4, 5), + X.dummy.var_GQ9KqD7YOf = c(1L, 1L, 1L, 0L, 0L)), + class = "data.frame", row.names = c(NA, -5L)) test_that("Dummy variable adjustment", { expect_identical(AddDummyVariablesForNAs(no.missing.df, outcome.name = "Y"), @@ -101,4 +107,11 @@ test_that("Dummy variable adjustment", { expect_error(AddDummyVariablesForNAs(df.with.text, outcome.name = "Y"), "Unexpected class when using dummy variable adjustment.") + # Check edge case with one predictor + expect_identical(AddDummyVariablesForNAs(edge.case, outcome.name = "Y"), + edge.case) + # Recode some to missing in predictor + edge.case[1:3, 2] <- NA + expect_identical(AddDummyVariablesForNAs(edge.case, outcome.name = "Y"), + expected.edge.with.missing) })