Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DS-83 3 #5

Merged
merged 4 commits into from
Mar 9, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ before_install:
- sudo add-apt-repository -y ppa:ubuntugis/ubuntugis-unstable
- sudo apt-get -qq update
- sudo apt-get install -y libgdal-dev libproj-dev python-protobuf libprotoc-dev libprotobuf-dev libv8-dev librsvg2-dev libmpfr-dev
- export flipFormat_BRANCH_NAME=DS-83
- rcode="tfile <- tempfile(); capture.output(res<-devtools::test(), file = tfile); out <- readLines(tfile); cat(out, sep = '\n'); "
- rcode+="n.fail <- as.numeric(sub('Failed:[[:space:]]', '', out[grep('Failed:[[:space:]]', out)])); "
- rcode+="res <- as.data.frame(res); out <- data.frame(file = unlist(res[['file']]), warning = unlist(res[['warning']])); "
Expand Down Expand Up @@ -47,6 +48,6 @@ notifications:
# Warning notifications and downstream package builds are implemented
# by calling R functions so they can be updated in this package without
# committing a new change to .travis.yml in each repository
after_success:
after_success:
- Rscript -e "require(flipDevTools); NotifyWarnings(); TriggerDownstreamBuilds()"
- travis_wait Rscript -e "flipDevTools::CheckCoverage()"
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: flipData
Type: Package
Title: Functions for extracting and describing data
Version: 1.2.5
Version: 1.2.6
Author: Displayr <opensource@displayr.com>
Maintainer: Displayr <opensource@displayr.com>
Description: Functions for extracting data from formulas and
Expand All @@ -11,7 +11,7 @@ Description: Functions for extracting data from formulas and
License: GPL-3
LazyData: TRUE
Imports: CVXR (>= 1.0.0),
flipFormat (>= 1.0.0),
flipFormat (>= 1.6.13),
flipImputation,
flipTime,
flipTransformations (>= 1.4.0),
Expand Down
30 changes: 26 additions & 4 deletions R/estimationdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ EstimationData <- function(formula = NULL,
.filter <- if (weighted) subset & weights > 0 & some.data else subset & some.data # Name to avoid bug in subset.data.frame
data.subset <- subset(data, .filter)
data.subset <- CopyAttributes(data.subset, data)

dummy.adjusted <- FALSE
##############
## Imputation
single.imputation <- missing == "Imputation (replace missing values with estimates)"
Expand Down Expand Up @@ -132,7 +132,10 @@ EstimationData <- function(formula = NULL,
stop(paste("Unknown 'missing' method:", missing)))
data.for.estimation <- CopyAttributes(data.for.estimation, data.subset)
if (missing == "Dummy variable adjustment")
{ # Dummy variable adjustment can be selected but sometimes not used
dummy.adjusted <- any(grepl(".dummy.var_GQ9KqD7YOf$", colnames(data.for.estimation)))
data.cols <- names(data.for.estimation) %in% names(data.subset)
}
else
data.cols <- rep(TRUE, ncol(data.for.estimation))
levels.pre <- paste0(rep(labels, vapply(data.for.estimation[data.cols], nlevels, 0L)), ": ",
Expand Down Expand Up @@ -163,11 +166,30 @@ EstimationData <- function(formula = NULL,
if (error.if.insufficient.obs && n.estimation < length(variable.names))
stop(gettextf("There are fewer observations (%d)%s(%d)", n.estimation,
" than there are variables ", length(variable.names)))

description <- SampleDescription(n.total, n.subset, n.estimation,
Labels(subset), weighted, weight.label, missing, imputation.label, m,
if(HasOutcome(formula)) "predictor" else "")

if(HasOutcome(formula)) "predictor" else "",
dummy.adjusted = dummy.adjusted)
# Add statements about removing cases with missing outcomes and/or all predictors missing
if (missing == "Dummy variable adjustment")
{
outcome.name <- OutcomeName(formula)
missing.outcomes <- is.na(data[[outcome.name]])
outcome.index <- which(names(data) == outcome.name)
if (ncol(data) == 2)
all.predictors.missing <- rep(FALSE, nrow(data))
else
all.predictors.missing <- apply(data[-outcome.index], 1, function(x) all(is.na(x)))
if (any(missing.outcomes | all.predictors.missing))
{
missing.outcomes <- if (any(missing.outcomes)) "an outcome variable" else NULL
all.predictors.missing <- if (any(all.predictors.missing)) "all predictor variables" else NULL
description <- paste0(description, paste0(" cases missing ",
paste0(c(missing.outcomes, all.predictors.missing),
collapse = " or missing "),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the " or missing " case covered by unit tests?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It wasn't, I missed that one. Thanks for picking it up. Added a unit test for that case in the latest commit 6c6ca28

" have been excluded;"))
}
}
list(estimation.data = data.for.estimation,
weights = weights,
unfiltered.weights = unfiltered.weights,
Expand Down
5 changes: 4 additions & 1 deletion R/missingdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,10 @@ AddDummyVariablesForNAs <- function(data, outcome.name, checks = TRUE)
else
return(data)
}
cases.all.predictors.missing <- apply(predictor.df, 1, function(x) all(is.na(x)))
if (ncol(data) == 2)
cases.all.predictors.missing <- rep(FALSE, nrow(predictor.df))
else
cases.all.predictors.missing <- apply(predictor.df, 1, function(x) all(is.na(x)))

dummy.variable.df <- data.frame(dummy.variable.df, check.names = FALSE)
names(dummy.variable.df) <- paste0(names(dummy.variable.df), ".dummy.var_GQ9KqD7YOf")
Expand Down
65 changes: 63 additions & 2 deletions tests/testthat/test-estimationdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,77 @@ no.level.test <- data.frame(Y = c(1, 2, 2, 3, 3),
X1 = c(NA, 1, NA, 3, 4),
X2 = c(NA, 1, 2, 3, 4),
X3 = c(NA, 3, 2, 1, 4))
dummy.test <- data.frame(Y = c(1:10), X1 = c(NA, 2:10), X2 = c(1, NA, 3:10), X3 = c(1:2, NA, 4:10))

edge.case.dummy <- data.frame(Y = 1:10, X = 1:10)
edge.case.dummy.miss.outcome <- edge.case.dummy
edge.case.dummy.miss.outcome[1, 1] <- NA

edge.case.dummy.miss.pred <- edge.case.dummy
edge.case.dummy.miss.pred[1, 2] <- NA


test_that("Dummy variable adjustment", {
expect_warning(missing.level.output <- EstimationData(Y ~ X1 + X2 + X3, data = missing.level.test,
missing = "Dummy variable adjustment")$estimation.data,
missing = "Dummy variable adjustment"),
"Some categories do not appear in the data: 'Y: A'")
expect_equal(missing.level.output, expected.dummy.missing.level)
expect_equal(missing.level.output$estimation.data, expected.dummy.missing.level)
expect_warning(EstimationData(Y ~ X1 + X2, data = no.level.test), NA)
expect_error(EstimationData(Y ~ X1 + X2, no.level.test[1:4, ]),
"There are fewer observations (2) than there are variables (3)", fixed = TRUE)
dummy.test.output <- expect_error(EstimationData(Y ~ X1 + X2 + X3, data = dummy.test,
missing = "Dummy variable adjustment"),
NA)
expect_equal(dummy.test.output$description,
paste0("n = 10 cases used in estimation; ",
"missing values of predictor variables have been adjusted using ",
"dummy variables;"))
# Set one case to have missing outcome variable
dummy.test.missing.outcome <- dummy.test
dummy.test.missing.outcome[1, 1] <- NA
dummy.test.output.missing.outcome <- expect_error(EstimationData(Y ~ X1 + X2 + X3, data = dummy.test.missing.outcome,
missing = "Dummy variable adjustment"),
NA)
expect_equal(dummy.test.output.missing.outcome$description,
paste0("n = 9 cases used in estimation of a total sample size of 10; ",
"missing values of predictor variables have been adjusted using ",
"dummy variables; cases missing an outcome variable have been excluded;"))
# Remove all predictors in one case
dummy.test.with.missing.preds <- dummy.test
dummy.test.with.missing.preds[4, -1] <- NA
dummy.test.output.missing.predictors <- expect_error(EstimationData(Y ~ X1 + X2 + X3, data = dummy.test.with.missing.preds,
missing = "Dummy variable adjustment"),
NA)
expect_equal(dummy.test.output.missing.predictors$description,
paste0("n = 9 cases used in estimation of a total sample size of 10; ",
"missing values of predictor variables have been adjusted using ",
"dummy variables; cases missing all predictor variables have been excluded;"))
# Have cases with missing outcome and all missing predictors.
dummy.test.miss.preds.outcome <- dummy.test
dummy.test.miss.preds.outcome[1, 1] <- NA
dummy.test.miss.preds.outcome[2, -1] <- NA
dummy.test.output <- expect_error(EstimationData(Y ~ X1 + X2 + X3, data = dummy.test.miss.preds.outcome,
missing = "Dummy variable adjustment"),
NA)
expect_equal(dummy.test.output$description,
paste0("n = 8 cases used in estimation of a total sample size of 10; ",
"missing values of predictor variables have been adjusted using ",
"dummy variables; cases missing an outcome variable or missing all predictor variables ",
"have been excluded;"))
# Test edge case
edge.case.output <- expect_error(EstimationData(Y ~ X, data = edge.case.dummy.miss.outcome,
missing = "Dummy variable adjustment"),
NA)
expect_equal(edge.case.output$description,
paste0("n = 9 cases used in estimation of a total sample size of 10; ",
"cases missing an outcome variable have been excluded;"))
edge.case.output <- expect_error(EstimationData(Y ~ X, data = edge.case.dummy.miss.pred,
missing = "Dummy variable adjustment"),
NA)
expect_equal(edge.case.output$description,
paste0("n = 10 cases used in estimation; ",
"missing values of predictor variables have been adjusted using ",
"dummy variables;"))
})


13 changes: 13 additions & 0 deletions tests/testthat/test-missingdata.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ test_that("Missing options",
expect_equal(341, suppressWarnings(nrow(EstimationData(Overall ~ Branch, bank, missing = "Exclude cases with missing data")$estimation.data)))
expect_equal(823, nrow(EstimationData(Overall ~ Branch, bank, missing = "Use partial data")$estimation.data))
expect_equal(823, nrow(EstimationData(Overall ~ Branch, bank, missing = "Use partial data (pairwise correlations)")$estimation.data))
expect_equal(759, nrow(EstimationData(Overall ~ Fees + Interest, bank, missing = "Dummy variable adjustment")$estimation.data))
})

test_that("Infinity",
Expand Down Expand Up @@ -66,6 +67,11 @@ expected.missing.factor <- data.frame(Y = 1:3, X1 = factor(c(2, 2:3), labels = L
X1.dummy.var_GQ9KqD7YOf = c(1, 0, 0))
df.with.text <- data.frame(Y = 1:2, X1 = 1:2, X3 = c(NA, LETTERS[1]),
stringsAsFactors = FALSE)
edge.case <- data.frame(Y = 1:5, X = 1:5)

expected.edge.with.missing <- structure(list(Y = 1:5, X = c(4.5, 4.5, 4.5, 4, 5),
X.dummy.var_GQ9KqD7YOf = c(1L, 1L, 1L, 0L, 0L)),
class = "data.frame", row.names = c(NA, -5L))

test_that("Dummy variable adjustment", {
expect_identical(AddDummyVariablesForNAs(no.missing.df, outcome.name = "Y"),
Expand Down Expand Up @@ -101,4 +107,11 @@ test_that("Dummy variable adjustment", {

expect_error(AddDummyVariablesForNAs(df.with.text, outcome.name = "Y"),
"Unexpected class when using dummy variable adjustment.")
# Check edge case with one predictor
expect_identical(AddDummyVariablesForNAs(edge.case, outcome.name = "Y"),
edge.case)
# Recode some to missing in predictor
edge.case[1:3, 2] <- NA
expect_identical(AddDummyVariablesForNAs(edge.case, outcome.name = "Y"),
expected.edge.with.missing)
})