From 5359ac47f716dae630766de044fbf72d0390af04 Mon Sep 17 00:00:00 2001 From: Luke Sonnet Date: Tue, 20 Mar 2018 12:31:16 -0700 Subject: [PATCH] spelling --- R/S3_confint.R | 30 +- R/S3_predict.R | 1 - R/S3_print.R | 22 +- R/S3_summary.R | 21 +- R/S3_tidy.R | 5 +- R/estimatr_difference_in_means.R | 555 ++++++++++--------- R/estimatr_horvitz_thompson.R | 915 +++++++++++++++---------------- R/estimatr_iv_robust.R | 11 +- R/estimatr_lm_robust.R | 2 +- R/helper_clean_model_data.R | 14 +- R/helper_condition_pr_matrix.R | 10 +- R/helper_extract.R | 17 +- R/helper_lm_robust_fit.R | 25 +- R/helper_na_omit_detailed.R | 8 +- R/zzz.R | 32 +- man/iv_robust.Rd | 8 +- man/lm_robust.Rd | 4 +- 17 files changed, 816 insertions(+), 864 deletions(-) diff --git a/R/S3_confint.R b/R/S3_confint.R index 8a42f194..d11444be 100644 --- a/R/S3_confint.R +++ b/R/S3_confint.R @@ -19,26 +19,24 @@ confint.iv_robust <- confint_lm_like #' @export -confint.difference_in_means <- - function(object, - parm = NULL, - level = NULL, - ...) { - cis <- get_ci_mat(object, level) +confint.difference_in_means <- function(object, + parm = NULL, + level = NULL, + ...) { + cis <- get_ci_mat(object, level) - return(cis) - } + return(cis) +} #' @export -confint.horvitz_thompson <- - function(object, - parm = NULL, - level = NULL, - ...) { - cis <- get_ci_mat(object, level, ttest = FALSE) +confint.horvitz_thompson <- function(object, + parm = NULL, + level = NULL, + ...) { + cis <- get_ci_mat(object, level, ttest = FALSE) - return(cis) - } + return(cis) +} ## internal method that builds confidence intervals and labels the matrix to be returned diff --git a/R/S3_predict.R b/R/S3_predict.R index 3a10e33f..564a83c9 100644 --- a/R/S3_predict.R +++ b/R/S3_predict.R @@ -119,7 +119,6 @@ predict.lm_robust <- function(object, interval <- match.arg(interval) if (se.fit || interval != "none") { - if (ncol(coefs) > 1) { stop("Can't set `se.fit` == TRUE with multivariate outcome") } diff --git a/R/S3_print.R b/R/S3_print.R index 45f31214..62d31f15 100644 --- a/R/S3_print.R +++ b/R/S3_print.R @@ -13,7 +13,8 @@ print_summary_lm_like <- function(x, ...) { cat( "\nCall:\n", paste(deparse(x$call, nlines = 5), sep = "\n", collapse = "\n"), - "\n\n", sep = "" + "\n\n", + sep = "" ) if (x$weighted) { cat("Weighted, ") @@ -67,18 +68,13 @@ print.summary.iv_robust <- function(x, } #' @export -print.difference_in_means <- - function( - x, - ...) { - cat("Design: ", x$design, "\n") - print(summarize_tidy(x)) - } +print.difference_in_means <- function(x, ...) { + cat("Design: ", x$design, "\n") + print(summarize_tidy(x)) +} #' @export -print.horvitz_thompson <- - function(x, - ...) { - print(summarize_tidy(x)) - } +print.horvitz_thompson <- function(x, ...) { + print(summarize_tidy(x)) +} diff --git a/R/S3_summary.R b/R/S3_summary.R index 47e7ba65..74121723 100644 --- a/R/S3_summary.R +++ b/R/S3_summary.R @@ -1,7 +1,5 @@ #' @export -summary.lm_robust <- function(object, - ...) { - +summary.lm_robust <- function(object, ...) { if (is.matrix(coef(object))) { ny <- ncol(coef(object)) @@ -29,7 +27,6 @@ summary.lm_robust <- function(object, all_models <- object for (i in seq(ny)) { - for (nm in names(object)) { if (nm %in% mat_objs) { object[[nm]] <- all_models[[nm]][, i, drop = TRUE] @@ -52,9 +49,7 @@ summary.lm_robust <- function(object, } #' @export -summary.iv_robust <- function(object, - ...) { - +summary.iv_robust <- function(object, ...) { summary_lm_model(object) } @@ -85,8 +80,7 @@ summary_lm_model <- function(object) { #' @export -summary.difference_in_means <- function(object, - ...) { +summary.difference_in_means <- function(object, ...) { return(list( coefficients = summarize_tidy(object), design = object$design @@ -95,12 +89,9 @@ summary.difference_in_means <- function(object, #' @export -summary.horvitz_thompson <- - function( - object, - ...) { - return(list(coefficients = summarize_tidy(object, "z"))) - } +summary.horvitz_thompson <- function(object, ...) { + return(list(coefficients = summarize_tidy(object, "z"))) +} summarize_tidy <- function(object, test = "t", ...) { remove_cols <- c("term", "outcome") diff --git a/R/S3_tidy.R b/R/S3_tidy.R index 8bb03b99..2b0f8195 100644 --- a/R/S3_tidy.R +++ b/R/S3_tidy.R @@ -83,7 +83,6 @@ tidy.horvitz_thompson <- function(object, ...) { tidy_data_frame <- function(object, digits = NULL) { - vec_cols <- c( "coefficients", @@ -94,7 +93,9 @@ tidy_data_frame <- function(object, digits = NULL) { "df" ) - tidy_mat <- do.call("cbind", lapply(vec_cols, function(x) {as.vector(object[[x]])})) + tidy_mat <- do.call("cbind", lapply(vec_cols, function(x) { + as.vector(object[[x]]) + })) vec_cols[which(vec_cols == "coefficients")] <- "estimate" colnames(tidy_mat) <- vec_cols return_frame <- data.frame( diff --git a/R/estimatr_difference_in_means.R b/R/estimatr_difference_in_means.R index e5feac58..1bac115d 100644 --- a/R/estimatr_difference_in_means.R +++ b/R/estimatr_difference_in_means.R @@ -205,278 +205,323 @@ #' lm_robust(Y ~ Z_comp, weights = w, data = dat) #' #' @export -difference_in_means <- - function(formula, - data, - blocks, - clusters, - weights, - subset, - se_type = c("default", "none"), - condition1 = NULL, - condition2 = NULL, - ci = TRUE, - alpha = .05) { - if (length(all.vars(f_rhs(eval_tidy(formula)))) > 1) { - stop( - "'formula' must have only one variable on the right-hand side: the ", - "treatment variable." - ) - } +difference_in_means <- function(formula, + data, + blocks, + clusters, + weights, + subset, + se_type = c("default", "none"), + condition1 = NULL, + condition2 = NULL, + ci = TRUE, + alpha = .05) { + if (length(all.vars(f_rhs(eval_tidy(formula)))) > 1) { + stop( + "'formula' must have only one variable on the right-hand side: the ", + "treatment variable." + ) + } - se_type <- match.arg(se_type) + se_type <- match.arg(se_type) + + datargs <- enquos( + formula = formula, + weights = weights, + subset = subset, + block = blocks, + cluster = clusters + ) + data <- enquo(data) + model_data <- clean_model_data(data = data, datargs, estimator = "dim") + + data <- data.frame( + y = model_data$outcome, + t = model_data$original_treatment, + stringsAsFactors = FALSE + ) + data$cluster <- model_data$cluster + # rescale weights for convenience + if (is.numeric(model_data$weights)) { + data$weights <- model_data$weights / mean(model_data$weights) + } + data$block <- model_data$block + + if (!is.null(data$weights) && length(unique(data$weights)) == 1 + && is.null(data$cluster) && is.null(data$block)) { + message( + "Constant `weights` passed to `difference_in_means` will ", + "unnecessarily trigger `lm_robust()` and the Welch-Satterthwaite ", + "approximation will not be used for the degrees of freedom." + ) + } + + rm(model_data) - datargs <- enquos( - formula = formula, - weights = weights, - subset = subset, - block = blocks, - cluster = clusters + # parse condition names + if (is.null(condition1) || is.null(condition2)) { + condition_names <- parse_conditions( + treatment = data$t, + condition1 = condition1, + condition2 = condition2, + estimator = "difference_in_means" ) - data <- enquo(data) - model_data <- clean_model_data(data = data, datargs, estimator = "dim") + condition2 <- condition_names[[2]] + condition1 <- condition_names[[1]] + } - data <- data.frame( - y = model_data$outcome, - t = model_data$original_treatment, - stringsAsFactors = FALSE + if (is.null(data$block)) { + return_frame <- difference_in_means_internal( + condition1 = condition1, + condition2 = condition2, + data = data, + alpha = alpha, + se_type = se_type ) - data$cluster <- model_data$cluster - # rescale weights for convenience - if (is.numeric(model_data$weights)) { - data$weights <- model_data$weights / mean(model_data$weights) + + if (is.null(data$cluster)) { + design <- "Standard" + } else { + design <- "Clustered" } - data$block <- model_data$block - - if (!is.null(data$weights) && length(unique(data$weights)) == 1 - && is.null(data$cluster) && is.null(data$block)) { - message( - "Constant `weights` passed to `difference_in_means` will ", - "unnecessarily trigger `lm_robust()` and the Welch-Satterthwaite ", - "approximation will not be used for the degrees of freedom." + } else { + pair_matched <- FALSE + + # When learning whether it is matched pairs, should only use relevant conditions + data <- subset.data.frame(data, t %in% c(condition1, condition2)) + + clust_per_block <- check_clusters_blocks(data) + + # Check if design is pair matched + if (any(clust_per_block == 1)) { + stop("All `blocks` must have multiple units (or `clusters`)") + } else if (all(clust_per_block == 2)) { + pair_matched <- TRUE + } else if (any(clust_per_block == 2) & any(clust_per_block > 2)) { + pair_matched <- TRUE + warning( + "Some `blocks` have two units/`clusters` while other blocks ", + "have more units/`clusters`. As standard variance estimates ", + "cannot be computed within blocks with two units, we use the ", + "matched pairs estimator of the variance." ) } - rm(model_data) + block_dfs <- split(data, data$block) - # parse condition names - if (is.null(condition1) || is.null(condition2)) { - condition_names <- parse_conditions( - treatment = data$t, + block_estimates <- lapply(block_dfs, function(x) { + difference_in_means_internal( + data = x, condition1 = condition1, condition2 = condition2, - estimator = "difference_in_means" - ) - condition2 <- condition_names[[2]] - condition1 <- condition_names[[1]] - } - - if (is.null(data$block)) { - return_frame <- difference_in_means_internal( - condition1 = condition1, - condition2 = condition2, - data = data, + pair_matched = pair_matched, alpha = alpha, se_type = se_type ) + }) - if (is.null(data$cluster)) { - design <- "Standard" - } else { - design <- "Clustered" - } - } else { - pair_matched <- FALSE - - # When learning whether it is matched pairs, should only use relevant conditions - data <- subset.data.frame(data, t %in% c(condition1, condition2)) - - clust_per_block <- check_clusters_blocks(data) - - # Check if design is pair matched - if (any(clust_per_block == 1)) { - stop("All `blocks` must have multiple units (or `clusters`)") - } else if (all(clust_per_block == 2)) { - pair_matched <- TRUE - } else if (any(clust_per_block == 2) & any(clust_per_block > 2)) { - pair_matched <- TRUE - warning( - "Some `blocks` have two units/`clusters` while other blocks ", - "have more units/`clusters`. As standard variance estimates ", - "cannot be computed within blocks with two units, we use the ", - "matched pairs estimator of the variance." - ) - } - - block_dfs <- split(data, data$block) + block_estimates <- do.call(rbind, block_estimates) - block_estimates <- lapply(block_dfs, function(x) { - difference_in_means_internal( - data = x, - condition1 = condition1, - condition2 = condition2, - pair_matched = pair_matched, - alpha = alpha, - se_type = se_type - ) - }) - - block_estimates <- do.call(rbind, block_estimates) + N_overall <- with(block_estimates, sum(N)) - N_overall <- with(block_estimates, sum(N)) + # Blocked design, (Gerber Green 2012, p73, eq3.10) + diff <- with(block_estimates, sum(coefficients * N / N_overall)) - # Blocked design, (Gerber Green 2012, p73, eq3.10) - diff <- with(block_estimates, sum(coefficients * N / N_overall)) + df <- NA + std.error <- NA + n_blocks <- nrow(block_estimates) - df <- NA - std.error <- NA - n_blocks <- nrow(block_estimates) + if (pair_matched) { + if (is.null(data$cluster)) { + design <- "Matched-pair" - if (pair_matched) { - if (is.null(data$cluster)) { - design <- "Matched-pair" - - # Pair matched, unit randomized (Gerber Green 2012, p77, eq3.16) - if (se_type != "none") { - std.error <- - with( - block_estimates, - sqrt((1 / (n_blocks * (n_blocks - 1))) * sum((coefficients - diff) ^ 2)) - ) - } - - } else { - design <- "Matched-pair clustered" - # Pair matched, cluster randomized (Imai, King, Nall 2009, p36, eq6) - if (se_type != "none") { - std.error <- - with( - block_estimates, - sqrt( - (n_blocks / ((n_blocks - 1) * N_overall ^ 2)) * - sum((N * coefficients - (N_overall * diff) / n_blocks) ^ 2) - ) - ) - } + # Pair matched, unit randomized (Gerber Green 2012, p77, eq3.16) + if (se_type != "none") { + std.error <- + with( + block_estimates, + sqrt((1 / (n_blocks * (n_blocks - 1))) * sum((coefficients - diff)^2)) + ) } - - # For pair matched, cluster randomized Imai et al. 2009 recommend (p. 37) - df <- n_blocks - 1 } else { - # Block randomized (Gerber and Green 2012, p. 74, footnote 17) + design <- "Matched-pair clustered" + # Pair matched, cluster randomized (Imai, King, Nall 2009, p36, eq6) if (se_type != "none") { - std.error <- with( - block_estimates, - sqrt(sum(std.error ^ 2 * (N / N_overall) ^ 2)) - ) + std.error <- + with( + block_estimates, + sqrt( + (n_blocks / ((n_blocks - 1) * N_overall^2)) * + sum((N * coefficients - (N_overall * diff) / n_blocks)^2) + ) + ) } + } - - ## we don't know if this is correct! - ## matches lm_lin, two estimates per block - if (is.null(data$cluster)) { - design <- "Blocked" - df <- nrow(data) - 2 * n_blocks - } else { - design <- "Block-clustered" - # Also matches lm_lin for even sized clusters, should be conservative - df <- sum(clust_per_block) - 2 * n_blocks - } + # For pair matched, cluster randomized Imai et al. 2009 recommend (p. 37) + df <- n_blocks - 1 + } else { + # Block randomized (Gerber and Green 2012, p. 74, footnote 17) + if (se_type != "none") { + std.error <- with( + block_estimates, + sqrt(sum(std.error^2 * (N / N_overall)^2)) + ) } - return_frame <- data.frame( - coefficients = diff, - std.error = std.error, - df = df, - N = N_overall, - stringsAsFactors = FALSE - ) - } - if (!is.null(data$weights)) { - design <- paste0(design, " (weighted)") + ## we don't know if this is correct! + ## matches lm_lin, two estimates per block + if (is.null(data$cluster)) { + design <- "Blocked" + df <- nrow(data) - 2 * n_blocks + } else { + design <- "Block-clustered" + # Also matches lm_lin for even sized clusters, should be conservative + df <- sum(clust_per_block) - 2 * n_blocks + } } - return_list <- add_cis_pvals(return_frame, alpha, ci) - - # print(c("Pair Matched? ", pair_matched)) - - return_list <- dim_like_return( - return_list, - alpha = alpha, - formula = formula, - conditions = list(condition1, condition2) + return_frame <- data.frame( + coefficients = diff, + std.error = std.error, + df = df, + N = N_overall, + stringsAsFactors = FALSE ) + } - return_list[["design"]] <- design + if (!is.null(data$weights)) { + design <- paste0(design, " (weighted)") + } - attr(return_list, "class") <- "difference_in_means" + return_list <- add_cis_pvals(return_frame, alpha, ci) - return(return_list) - } + # print(c("Pair Matched? ", pair_matched)) + return_list <- dim_like_return( + return_list, + alpha = alpha, + formula = formula, + conditions = list(condition1, condition2) + ) -difference_in_means_internal <- - function(condition1 = NULL, - condition2 = NULL, - data, - pair_matched = FALSE, - alpha = .05, - se_type = "default") { + return_list[["design"]] <- design - # Check that treatment status is uniform within cluster, checked here - # so that the treatment vector t doesn't have to be built anywhere else - if (!is.null(data$cluster)) { - if (is.factor(data$cluster)) { - data$cluster <- droplevels(data$cluster) - } + attr(return_list, "class") <- "difference_in_means" - if (any(!tapply(data$t, data$cluster, function(x) all(x == x[1])))) { - stop( - "All units within a cluster must have the same treatment condition." - ) - } - } + return(return_list) +} - Y2 <- data$y[data$t == condition2] - Y1 <- data$y[data$t == condition1] - N2 <- length(Y2) - N1 <- length(Y1) - N <- N2 + N1 +difference_in_means_internal <- function(condition1 = NULL, + condition2 = NULL, + data, + pair_matched = FALSE, + alpha = .05, + se_type = "default") { - if ((N1 == 0) || (N2 == 0)) { - stop("Must have units with both treatment conditions within each block.") + # Check that treatment status is uniform within cluster, checked here + # so that the treatment vector t doesn't have to be built anywhere else + if (!is.null(data$cluster)) { + if (is.factor(data$cluster)) { + data$cluster <- droplevels(data$cluster) } - ## Check to make sure multiple in each group if pair matched is false - if (!pair_matched & (N2 == 1 | N1 == 1)) { + if (any(!tapply(data$t, data$cluster, function(x) all(x == x[1])))) { stop( - "Must have least two treated/control units in each block if design is not ", - "pair-matched (i.e., every block is of size two). Only one treated or ", - "control unit in a block makes standard errors impossible to calculate" + "All units within a cluster must have the same treatment condition." ) } + } - df <- NA + Y2 <- data$y[data$t == condition2] + Y1 <- data$y[data$t == condition1] + + N2 <- length(Y2) + N1 <- length(Y1) + N <- N2 + N1 + + if ((N1 == 0) || (N2 == 0)) { + stop("Must have units with both treatment conditions within each block.") + } + + ## Check to make sure multiple in each group if pair matched is false + if (!pair_matched & (N2 == 1 | N1 == 1)) { + stop( + "Must have least two treated/control units in each block if design is not ", + "pair-matched (i.e., every block is of size two). Only one treated or ", + "control unit in a block makes standard errors impossible to calculate" + ) + } + + df <- NA + + if (!is.null(data$cluster) && !pair_matched) { + + # For now, all clustered cases go to lm_robust + # CR2 nests Gerber and Green 2012, p. 83, eq. 3.23 when clusters are + # equal sizes (we think) and is more appropriate when clusters are different sizes + + X <- cbind(1, t = as.numeric(data$t == condition2)) + + # print("Using lm_robust") + # TODO currently lm_robust_fit does too much, need to refactor it + # if it will be used here in the long run + cr2_out <- lm_robust_fit( + y = data$y, + X = cbind(1, t = as.numeric(data$t == condition2)), + cluster = data$cluster, + se_type = ifelse(se_type == "none", "none", "CR2"), + weights = data$weights, + ci = TRUE, + try_cholesky = TRUE, + alpha = alpha, + return_vcov = FALSE, + has_int = TRUE + ) + + diff <- coef(cr2_out)[2] + std.error <- cr2_out$std.error[2] + df <- cr2_out$df[2] + } else { + if (is.null(data$weights)) { + diff <- mean(Y2) - mean(Y1) + + if (pair_matched || se_type == "none") { + # Pair matched designs + std.error <- NA + } else { + # Non-pair matched designs, unit level randomization + var_Y2 <- var(Y2) + var_Y1 <- var(Y1) - if (!is.null(data$cluster) && !pair_matched) { + std.error <- sqrt(var_Y2 / N2 + var_Y1 / N1) - # For now, all clustered cases go to lm_robust - # CR2 nests Gerber and Green 2012, p. 83, eq. 3.23 when clusters are - # equal sizes (we think) and is more appropriate when clusters are different sizes + df <- std.error^4 / + ( + (var_Y2 / N2)^2 / (N2 - 1) + + (var_Y1 / N1)^2 / (N1 - 1) + ) + } + } else { + if (pair_matched) { + stop( + "Cannot use `weights` with matched pairs design at the moment" + ) + } X <- cbind(1, t = as.numeric(data$t == condition2)) # print("Using lm_robust") # TODO currently lm_robust_fit does too much, need to refactor it # if it will be used here in the long run - cr2_out <- lm_robust_fit( + w_hc2_out <- lm_robust_fit( y = data$y, X = cbind(1, t = as.numeric(data$t == condition2)), - cluster = data$cluster, - se_type = ifelse(se_type == "none", "none", "CR2"), + se_type = ifelse(se_type == "none", "none", "HC2"), weights = data$weights, + cluster = NULL, ci = TRUE, try_cholesky = TRUE, alpha = alpha, @@ -484,73 +529,25 @@ difference_in_means_internal <- has_int = TRUE ) - diff <- coef(cr2_out)[2] - std.error <- cr2_out$std.error[2] - df <- cr2_out$df[2] - } else { - if (is.null(data$weights)) { - diff <- mean(Y2) - mean(Y1) - - if (pair_matched || se_type == "none") { - # Pair matched designs - std.error <- NA - } else { - # Non-pair matched designs, unit level randomization - var_Y2 <- var(Y2) - var_Y1 <- var(Y1) - - std.error <- sqrt(var_Y2 / N2 + var_Y1 / N1) - - df <- std.error ^ 4 / - ( - (var_Y2 / N2) ^ 2 / (N2 - 1) + - (var_Y1 / N1) ^ 2 / (N1 - 1) - ) - } - } else { - if (pair_matched) { - stop( - "Cannot use `weights` with matched pairs design at the moment" - ) - } - - X <- cbind(1, t = as.numeric(data$t == condition2)) - - # print("Using lm_robust") - # TODO currently lm_robust_fit does too much, need to refactor it - # if it will be used here in the long run - w_hc2_out <- lm_robust_fit( - y = data$y, - X = cbind(1, t = as.numeric(data$t == condition2)), - se_type = ifelse(se_type == "none", "none", "HC2"), - weights = data$weights, - cluster = NULL, - ci = TRUE, - try_cholesky = TRUE, - alpha = alpha, - return_vcov = FALSE, - has_int = TRUE - ) - - diff <- coef(w_hc2_out)[2] - std.error <- w_hc2_out$std.error[2] - df <- w_hc2_out$df[2] - } + diff <- coef(w_hc2_out)[2] + std.error <- w_hc2_out$std.error[2] + df <- w_hc2_out$df[2] } + } - return_frame <- - data.frame( - coefficients = diff, - std.error = std.error, - df = df, - stringsAsFactors = FALSE - ) - - if (is.numeric(data$weights)) { - return_frame$N <- sum(data$weights) - } else { - return_frame$N <- N - } + return_frame <- + data.frame( + coefficients = diff, + std.error = std.error, + df = df, + stringsAsFactors = FALSE + ) - return(return_frame) + if (is.numeric(data$weights)) { + return_frame$N <- sum(data$weights) + } else { + return_frame$N <- N } + + return(return_frame) +} diff --git a/R/estimatr_horvitz_thompson.R b/R/estimatr_horvitz_thompson.R index db43e658..c8d7b77c 100644 --- a/R/estimatr_horvitz_thompson.R +++ b/R/estimatr_horvitz_thompson.R @@ -221,533 +221,452 @@ #' horvitz_thompson(y ~ z, data = dat, condition_pr_mat = arb_pr_mat) #' #' @export -horvitz_thompson <- - function(formula, - data, - blocks, - clusters, - simple = NULL, - condition_prs, - condition_pr_mat = NULL, - ra_declaration = NULL, - subset, - condition1 = NULL, - condition2 = NULL, - se_type = c("youngs", "constant", "none"), - ci = TRUE, - alpha = .05, - return_condition_pr_mat = FALSE) { - - # ----- - # Check arguments - # ----- - if (length(all.vars(f_rhs(formula))) > 1) { +horvitz_thompson <- function(formula, + data, + blocks, + clusters, + simple = NULL, + condition_prs, + condition_pr_mat = NULL, + ra_declaration = NULL, + subset, + condition1 = NULL, + condition2 = NULL, + se_type = c("youngs", "constant", "none"), + ci = TRUE, + alpha = .05, + return_condition_pr_mat = FALSE) { + + # ----- + # Check arguments + # ----- + if (length(all.vars(f_rhs(formula))) > 1) { + stop( + "'formula' must have only one variable on the right-hand side: the ", + "treatment variable" + ) + } + + se_type <- match.arg(se_type) + + # ----- + # Parse arguments, clean data + # ----- + # User can either use declaration or the arguments, not both! + if (!is.null(ra_declaration)) { + if (ncol(ra_declaration$probabilities_matrix) > 2) { stop( - "'formula' must have only one variable on the right-hand side: the ", - "treatment variable" + "Cannot use horvitz_thompson() with a `ra_declaration` with more than ", + "two treatment arms for now" ) } - se_type <- match.arg(se_type) - - # ----- - # Parse arguments, clean data - # ----- - # User can either use declaration or the arguments, not both! - if (!is.null(ra_declaration)) { - if (ncol(ra_declaration$probabilities_matrix) > 2) { - stop( - "Cannot use horvitz_thompson() with a `ra_declaration` with more than ", - "two treatment arms for now" - ) - } + if (!missing(clusters) | + !missing(condition_prs) | + !missing(blocks) | + !is.null(condition_pr_mat)) { + stop( + "Cannot use `ra_declaration` with any of `clusters`, `condition_prs`, ", + "`blocks`, `condition_pr_mat`" + ) + } - if (!missing(clusters) | - !missing(condition_prs) | - !missing(blocks) | - !is.null(condition_pr_mat)) { - stop( - "Cannot use `ra_declaration` with any of `clusters`, `condition_prs`, ", - "`blocks`, `condition_pr_mat`" - ) - } + # Add clusters, blocks, and treatment probabilities to data so they can be cleaned with clean_model_data + if (!is.null(ra_declaration$clusters)) { + .clusters_ddinternal <- ra_declaration$clusters + clusters <- quo(.clusters_ddinternal) + } - # Add clusters, blocks, and treatment probabilities to data so they can be cleaned with clean_model_data - if (!is.null(ra_declaration$clusters)) { - .clusters_ddinternal <- ra_declaration$clusters - clusters <- quo(.clusters_ddinternal) - } + if (!is.null(ra_declaration$blocks)) { + .blocks_ddinternal <- ra_declaration$blocks + blocks <- quo(.blocks_ddinternal) + } - if (!is.null(ra_declaration$blocks)) { - .blocks_ddinternal <- ra_declaration$blocks - blocks <- quo(.blocks_ddinternal) - } + if (!is.null(condition2)) { + treatnum <- + which(ra_declaration$cleaned_arguments$condition_names == condition2) - if (!is.null(condition2)) { - treatnum <- - which(ra_declaration$cleaned_arguments$condition_names == condition2) - - if (!(condition2 %in% ra_declaration$cleaned_arguments$condition_names)) { - stop( - "If `condition2` and `ra_declaration` are both specified, ", - "`condition2` must match the condition_names in `ra_declaration`.", - "\n`condition2`: ", condition2, "\n`condition_names`: ", - paste0( - ra_declaration$cleaned_arguments$condition_names, - collapse = ", " - ) + if (!(condition2 %in% ra_declaration$cleaned_arguments$condition_names)) { + stop( + "If `condition2` and `ra_declaration` are both specified, ", + "`condition2` must match the condition_names in `ra_declaration`.", + "\n`condition2`: ", condition2, "\n`condition_names`: ", + paste0( + ra_declaration$cleaned_arguments$condition_names, + collapse = ", " ) - } - - treatment_prob <- obtain( - ra_declaration, - condition2 ) - } else { - # assuming treatment is second column - treatment_prob <- ra_declaration$probabilities_matrix[, 2] } - .treatment_prob_ddinternal <- treatment_prob - condition_prs <- quo(.treatment_prob_ddinternal) - } - - ## Clean data - datargs <- enquos( - formula = formula, - subset = subset, - block = blocks, - cluster = clusters, - condition_pr = condition_prs - ) - data <- enquo(data) - model_data <- clean_model_data(data = data, datargs, estimator = "ht") - ## condition_pr_mat, if supplied, must be same length - if (!is.null(condition_pr_mat) && (2 * length(model_data$outcome) != nrow(condition_pr_mat))) { - stop( - "After cleaning the data, it has ", length(model_data$outcome), " ", - "while `condition_pr_mat` has ", nrow(condition_pr_mat), ". ", - "`condition_pr_mat` should have twice the rows" + treatment_prob <- obtain( + ra_declaration, + condition2 ) + } else { + # assuming treatment is second column + treatment_prob <- ra_declaration$probabilities_matrix[, 2] } + .treatment_prob_ddinternal <- treatment_prob + condition_prs <- quo(.treatment_prob_ddinternal) + } - data <- data.frame( - y = model_data$outcome, - t = model_data$original_treatment, - stringsAsFactors = FALSE - ) - - # Parse conditions - condition_names <- parse_conditions( - treatment = data$t, - condition1 = condition1, - condition2 = condition2, - estimator = "horvitz_thompson" + ## Clean data + datargs <- enquos( + formula = formula, + subset = subset, + block = blocks, + cluster = clusters, + condition_pr = condition_prs + ) + data <- enquo(data) + model_data <- clean_model_data(data = data, datargs, estimator = "ht") + + ## condition_pr_mat, if supplied, must be same length + if (!is.null(condition_pr_mat) && (2 * length(model_data$outcome) != nrow(condition_pr_mat))) { + stop( + "After cleaning the data, it has ", length(model_data$outcome), " ", + "while `condition_pr_mat` has ", nrow(condition_pr_mat), ". ", + "`condition_pr_mat` should have twice the rows" ) - condition2 <- condition_names[[2]] - condition1 <- condition_names[[1]] + } - data$clusters <- model_data$cluster - data$blocks <- model_data$block - if (!is.null(model_data$condition_pr)) { - data$condition_probabilities <- model_data$condition_pr - } + data <- data.frame( + y = model_data$outcome, + t = model_data$original_treatment, + stringsAsFactors = FALSE + ) + + # Parse conditions + condition_names <- parse_conditions( + treatment = data$t, + condition1 = condition1, + condition2 = condition2, + estimator = "horvitz_thompson" + ) + condition2 <- condition_names[[2]] + condition1 <- condition_names[[1]] + + data$clusters <- model_data$cluster + data$blocks <- model_data$block + if (!is.null(model_data$condition_pr)) { + data$condition_probabilities <- model_data$condition_pr + } - # ---------- - # Learn design - # ---------- + # ---------- + # Learn design + # ---------- - # Declaration is passed - if (!is.null(ra_declaration)) { + # Declaration is passed + if (!is.null(ra_declaration)) { - # Use output from clean_model_data to rebuild declaration - if (nrow(ra_declaration$probabilities_matrix) != length(data$y)) { - prob_names <- colnames(ra_declaration$probabilities_matrix) - ra_declaration$probabilities_matrix <- cbind( - 1 - data$condition_probabilities, - data$condition_probabilities - ) - colnames(ra_declaration$probabilities_matrix) <- prob_names + # Use output from clean_model_data to rebuild declaration + if (nrow(ra_declaration$probabilities_matrix) != length(data$y)) { + prob_names <- colnames(ra_declaration$probabilities_matrix) + ra_declaration$probabilities_matrix <- cbind( + 1 - data$condition_probabilities, + data$condition_probabilities + ) + colnames(ra_declaration$probabilities_matrix) <- prob_names + } - } + # If simple, just use condition probabilities shortcut + # Same if se not needed + if (ra_declaration$ra_type == "simple" || se_type == "none") { + condition_pr_mat <- NULL + } else { + # TODO to allow for declaration with multiple arms, get probability matrix + # and build it like decl$pr_mat <- cbind(decl$pr_mat[, c(cond1, cond2)]) + condition_pr_mat <- declaration_to_condition_pr_mat( + ra_declaration, + condition1, + condition2 + ) + } + } else if (is.null(condition_pr_mat)) { + # need to learn it if no declaration and not passed + # check simple arg + simple <- ifelse(is.null(simple), TRUE, simple) + + if (is.null(data$blocks) && is.null(data$clusters)) { + # no blocks or clusters + if (simple) { + # don't need condition_pr_mat, just the condition_prs + # if the user passed it, we're fine and can use just the + # marginal probabilities + # if user didn't pass, we have to guess + if (is.null(data$condition_probabilities)) { + data$condition_probabilities <- mean(data$t == condition2) - # If simple, just use condition probabilities shortcut - # Same if se not needed - if (ra_declaration$ra_type == "simple" || se_type == "none") { - condition_pr_mat <- NULL + message( + "Assuming simple random assignment with probability of treatment ", + "equal to the mean number of obs in `condition2`, which is roughly ", + round(data$condition_probabilities[1], 3) + ) + } } else { - # TODO to allow for declaration with multiple arms, get probability matrix - # and build it like decl$pr_mat <- cbind(decl$pr_mat[, c(cond1, cond2)]) - condition_pr_mat <- declaration_to_condition_pr_mat( - ra_declaration, - condition1, - condition2 - ) - } - } else if (is.null(condition_pr_mat)) { - # need to learn it if no declaration and not passed - # check simple arg - simple <- ifelse(is.null(simple), TRUE, simple) - - if (is.null(data$blocks) && is.null(data$clusters)) { - # no blocks or clusters - if (simple) { - # don't need condition_pr_mat, just the condition_prs - # if the user passed it, we're fine and can use just the - # marginal probabilities - # if user didn't pass, we have to guess - if (is.null(data$condition_probabilities)) { - data$condition_probabilities <- mean(data$t == condition2) - - message( - "Assuming simple random assignment with probability of treatment ", - "equal to the mean number of obs in `condition2`, which is roughly ", - round(data$condition_probabilities[1], 3) - ) + # If we don't know the prob, learn it + if (is.null(data$condition_probabilities)) { + data$condition_probabilities <- mean(data$t == condition2) + message( + "Learning probability of complete random assignment from data with ", + "prob = ", round(data$condition_probabilities[1], 3) + ) + if (se_type != "none") { + condition_pr_mat <- gen_pr_matrix_complete( + pr = data$condition_probabilities[1], + n_total = length(data$y) + ) } } else { - # If we don't know the prob, learn it - if (is.null(data$condition_probabilities)) { - data$condition_probabilities <- mean(data$t == condition2) - message( - "Learning probability of complete random assignment from data with ", - "prob = ", round(data$condition_probabilities[1], 3) + if (length(unique(data$condition_probabilities)) > 1) { + stop( + "Treatment probabilities must be fixed for complete randomized designs" ) - - if (se_type != "none") { - condition_pr_mat <- gen_pr_matrix_complete( - pr = data$condition_probabilities[1], - n_total = length(data$y) - ) - } - - } else { - if (length(unique(data$condition_probabilities)) > 1) { - stop( - "Treatment probabilities must be fixed for complete randomized designs" - ) - } - - if (se_type != "none") { - condition_pr_mat <- gen_pr_matrix_complete( - pr = data$condition_probabilities[1], - n_total = length(data$y) - ) - } - } - } - } else if (is.null(data$blocks)) { - # clustered case - message( - "`simple` = ", simple, ", using ", - ifelse(simple, "simple", "complete"), " cluster randomization" - ) - - if (is.null(data$condition_probabilities)) { - - # Split by cluster and get complete randomized values within each cluster - cluster_treats <- get_cluster_treats(data, condition2) - data$condition_probabilities <- mean(cluster_treats$treat_clust) - - message( - "`condition_prs` not found, estimating probability of treatment ", - "to be constant at mean of clusters in `condition2` at prob =", - data$condition_probabilities[1] - ) if (se_type != "none") { - # Some redundancy in following fn - condition_pr_mat <- gen_pr_matrix_cluster( - clusters = data$clusters, - treat_probs = data$condition_probabilities, - simple = simple + condition_pr_mat <- gen_pr_matrix_complete( + pr = data$condition_probabilities[1], + n_total = length(data$y) ) } - } else if (se_type != "none") { + } + } + } else if (is.null(data$blocks)) { + # clustered case + message( + "`simple` = ", simple, ", using ", + ifelse(simple, "simple", "complete"), " cluster randomization" + ) - # Just to check if cluster has same treatment within - get_cluster_treats(data, condition2) + if (is.null(data$condition_probabilities)) { + + # Split by cluster and get complete randomized values within each cluster + cluster_treats <- get_cluster_treats(data, condition2) + data$condition_probabilities <- mean(cluster_treats$treat_clust) - condition_pr_mat <- gen_pr_matrix_cluster( - clusters = data$clusters, - treat_probs = data$condition_probabilities, - simple = simple - ) - } - } else { - # blocked case message( - "Assuming complete random assignment of clusters within blocks. ", - "User can use `ra_declaration` or `condition_pr_mat` to have full ", - "control over the design." + "`condition_prs` not found, estimating probability of treatment ", + "to be constant at mean of clusters in `condition2` at prob =", + data$condition_probabilities[1] ) - if (is.null(data$condition_probabilities)) { - message( - "`condition_prs` not found, estimating probability of treatment ", - "to be proportion of units or clusters in condition2 in each block" - ) - condition_pr_mat <- gen_pr_matrix_block( - blocks = data$blocks, - clusters = data$clusters, - t = data$t, - condition2 = condition2 - ) - } else { - condition_pr_mat <- gen_pr_matrix_block( - blocks = data$blocks, + if (se_type != "none") { + # Some redundancy in following fn + condition_pr_mat <- gen_pr_matrix_cluster( clusters = data$clusters, - p2 = data$condition_probabilities + treat_probs = data$condition_probabilities, + simple = simple ) } - } - } - - # Need the marginal condition prs for later in rare cases where not yet set - if (is.null(data$condition_probabilities)) { - data$condition_probabilities <- - diag(condition_pr_mat)[(length(data$y) + 1):(2 * length(data$y))] - } + } else if (se_type != "none") { - # Check some things that must be true, could do this earlier - # but don't have condition_probabilities then, and unfortunatey - # this loops over data clusters a second time - if (!is.null(data$clusters)) { - if (any(!tapply( - data$condition_probabilities, - data$clusters, - function(x) all(x == x[1]) - ))) { - stop("`condition_prs` must be constant within `cluster`") - } - } + # Just to check if cluster has same treatment within + get_cluster_treats(data, condition2) - rm(model_data) - - # ----- - # Estimation - # ----- - - if (is.null(data$blocks)) { - return_frame <- - horvitz_thompson_internal( - condition_pr_mat = condition_pr_mat, - condition1 = condition1, - condition2 = condition2, - data = data, - se_type = se_type, - alpha = alpha + condition_pr_mat <- gen_pr_matrix_cluster( + clusters = data$clusters, + treat_probs = data$condition_probabilities, + simple = simple ) + } } else { - clust_per_block <- check_clusters_blocks(data) + # blocked case + message( + "Assuming complete random assignment of clusters within blocks. ", + "User can use `ra_declaration` or `condition_pr_mat` to have full ", + "control over the design." + ) - N <- nrow(data) + if (is.null(data$condition_probabilities)) { + message( + "`condition_prs` not found, estimating probability of treatment ", + "to be proportion of units or clusters in condition2 in each block" + ) + condition_pr_mat <- gen_pr_matrix_block( + blocks = data$blocks, + clusters = data$clusters, + t = data$t, + condition2 = condition2 + ) + } else { + condition_pr_mat <- gen_pr_matrix_block( + blocks = data$blocks, + clusters = data$clusters, + p2 = data$condition_probabilities + ) + } + } + } - data$index <- 1:N + # Need the marginal condition prs for later in rare cases where not yet set + if (is.null(data$condition_probabilities)) { + data$condition_probabilities <- + diag(condition_pr_mat)[(length(data$y) + 1):(2 * length(data$y))] + } - block_dfs <- split(data, data$blocks) + # Check some things that must be true, could do this earlier + # but don't have condition_probabilities then, and unfortunatey + # this loops over data clusters a second time + if (!is.null(data$clusters)) { + if (any(!tapply( + data$condition_probabilities, + data$clusters, + function(x) all(x == x[1]) + ))) { + stop("`condition_prs` must be constant within `cluster`") + } + } - block_estimates <- lapply(block_dfs, function(x) { - horvitz_thompson_internal( - data = x, - condition1 = condition1, - condition2 = condition2, - condition_pr_mat = condition_pr_mat[c(x$index, N + x$index), c(x$index, N + x$index)], - se_type = se_type, - alpha = alpha - ) - }) + rm(model_data) - block_estimates <- do.call(rbind, block_estimates) + # ----- + # Estimation + # ----- - N_overall <- with(block_estimates, sum(N)) + if (is.null(data$blocks)) { + return_frame <- + horvitz_thompson_internal( + condition_pr_mat = condition_pr_mat, + condition1 = condition1, + condition2 = condition2, + data = data, + se_type = se_type, + alpha = alpha + ) + } else { + clust_per_block <- check_clusters_blocks(data) - n_blocks <- nrow(block_estimates) + N <- nrow(data) - diff <- with(block_estimates, sum(coefficients * N / N_overall)) + data$index <- 1:N - if (se_type != "none") { - std.error <- with(block_estimates, sqrt(sum(std.error ^ 2 * (N / N_overall) ^ 2))) - } else { - std.error <- NA - } + block_dfs <- split(data, data$blocks) - return_frame <- data.frame( - coefficients = diff, - std.error = std.error, - N = N_overall, - stringsAsFactors = FALSE + block_estimates <- lapply(block_dfs, function(x) { + horvitz_thompson_internal( + data = x, + condition1 = condition1, + condition2 = condition2, + condition_pr_mat = condition_pr_mat[c(x$index, N + x$index), c(x$index, N + x$index)], + se_type = se_type, + alpha = alpha ) - } + }) - return_frame$df <- NA - return_list <- add_cis_pvals(return_frame, alpha, ci, ttest = FALSE) + block_estimates <- do.call(rbind, block_estimates) - # ----- - # Build and return output - # ----- + N_overall <- with(block_estimates, sum(N)) - return_list <- dim_like_return( - return_list, - alpha = alpha, - formula = formula, - conditions = list(condition1, condition2) - ) + n_blocks <- nrow(block_estimates) - if (return_condition_pr_mat) { - return_list[["condition_pr_mat"]] <- condition_pr_mat - } + diff <- with(block_estimates, sum(coefficients * N / N_overall)) - attr(return_list, "class") <- "horvitz_thompson" + if (se_type != "none") { + std.error <- with(block_estimates, sqrt(sum(std.error^2 * (N / N_overall)^2))) + } else { + std.error <- NA + } - return(return_list) + return_frame <- data.frame( + coefficients = diff, + std.error = std.error, + N = N_overall, + stringsAsFactors = FALSE + ) } -var_ht_total_no_cov <- - function(y, ps) { - sum((1 - ps) * ps * y ^ 2) - } + return_frame$df <- NA + return_list <- add_cis_pvals(return_frame, alpha, ci, ttest = FALSE) + # ----- + # Build and return output + # ----- -horvitz_thompson_internal <- - function(condition_pr_mat = NULL, - condition1 = NULL, - condition2 = NULL, - data, - pair_matched = FALSE, - se_type, - alpha = .05) { + return_list <- dim_like_return( + return_list, + alpha = alpha, + formula = formula, + conditions = list(condition1, condition2) + ) - # TODO, add estimator from Middleton & Aronow 2015 page 51 + if (return_condition_pr_mat) { + return_list[["condition_pr_mat"]] <- condition_pr_mat + } - t2 <- which(data$t == condition2) - t1 <- which(data$t == condition1) + attr(return_list, "class") <- "horvitz_thompson" - N <- length(t2) + length(t1) + return(return_list) +} - std.error <- NA +var_ht_total_no_cov <- function(y, ps) { + sum((1 - ps) * ps * y^2) +} - collapsed <- !is.null(data$clusters) - if (collapsed) { - if (se_type == "constant") { - stop( - "`se_type` = 'constant' only supported for simple random designs ", - "at the moment" - ) - } - # used for cluster randomized designs - k <- length(unique(data$clusters)) - y2_totals <- tapply(data$y[t2], data$clusters[t2], sum) - y1_totals <- tapply(data$y[t1], data$clusters[t1], sum) +horvitz_thompson_internal <- function(condition_pr_mat = NULL, + condition1 = NULL, + condition2 = NULL, + data, + pair_matched = FALSE, + se_type, + alpha = .05) { - to_drop <- which(duplicated(data$clusters)) - t2 <- which(data$t[-to_drop] == condition2) - t1 <- which(data$t[-to_drop] == condition1) + # TODO, add estimator from Middleton & Aronow 2015 page 51 - # reorder totals because tapply above sorts on cluster - y2_totals <- - y2_totals[as.character(data$clusters[-to_drop][t2])] - y1_totals <- - y1_totals[as.character(data$clusters[-to_drop][t1])] + t2 <- which(data$t == condition2) + t1 <- which(data$t == condition1) - prs <- data$condition_probabilities[-to_drop] - ps2 <- prs[k + t2] - ps1 <- 1 - prs[t1] + N <- length(t2) + length(t1) - # for now rescale, with joint pr need squared top alone - Y2 <- y2_totals / ps2 - Y1 <- y1_totals / ps1 + std.error <- NA - diff <- (sum(Y2) - sum(Y1)) / N + collapsed <- !is.null(data$clusters) + if (collapsed) { + if (se_type == "constant") { + stop( + "`se_type` = 'constant' only supported for simple random designs ", + "at the moment" + ) + } + # used for cluster randomized designs + k <- length(unique(data$clusters)) - if (se_type != "none") { + y2_totals <- tapply(data$y[t2], data$clusters[t2], sum) + y1_totals <- tapply(data$y[t1], data$clusters[t1], sum) - condition_pr_mat <- - condition_pr_mat[-c(to_drop, N + to_drop), -c(to_drop, N + to_drop)] + to_drop <- which(duplicated(data$clusters)) + t2 <- which(data$t[-to_drop] == condition2) + t1 <- which(data$t[-to_drop] == condition1) - std.error <- - sqrt( - sum(Y2 ^ 2) + - sum(Y1 ^ 2) + - ht_var_partial( - Y2, - condition_pr_mat[(k + t2), (k + t2), drop = FALSE] - ) + - ht_var_partial( - Y1, - condition_pr_mat[t1, t1, drop = FALSE] - ) - - 2 * ht_covar_partial( - Y2, - Y1, - condition_pr_mat[(k + t2), t1, drop = FALSE], - ps2, - ps1 - ) - ) / N - } - - } else { - # All non-clustered designs + # reorder totals because tapply above sorts on cluster + y2_totals <- + y2_totals[as.character(data$clusters[-to_drop][t2])] + y1_totals <- + y1_totals[as.character(data$clusters[-to_drop][t1])] - ps2 <- data$condition_probabilities[t2] - ps1 <- 1 - data$condition_probabilities[t1] + prs <- data$condition_probabilities[-to_drop] + ps2 <- prs[k + t2] + ps1 <- 1 - prs[t1] - Y2 <- data$y[t2] / ps2 - Y1 <- data$y[t1] / ps1 + # for now rescale, with joint pr need squared top alone + Y2 <- y2_totals / ps2 + Y1 <- y1_totals / ps1 - diff <- (sum(Y2) - sum(Y1)) / N + diff <- (sum(Y2) - sum(Y1)) / N - if (is.null(condition_pr_mat)) { - # Simple random assignment - # joint inclusion probabilities = product of marginals - if (se_type == "constant") { - # Scale again - y0 <- ifelse( - data$t == condition1, - data$y / (1 - data$condition_probabilities), - (data$y - diff) / (1 - data$condition_probabilities) - ) - y1 <- ifelse( - data$t == condition2, - data$y / data$condition_probabilities, - (data$y + diff) / data$condition_probabilities - ) + if (se_type != "none") { + condition_pr_mat <- + condition_pr_mat[-c(to_drop, N + to_drop), -c(to_drop, N + to_drop)] - - std.error <- - sqrt( - var_ht_total_no_cov(y1, data$condition_probabilities) + - var_ht_total_no_cov(y0, 1 - data$condition_probabilities) + - # TODO why is it +2 instead of - (looking at old samii/aronow) - 2 * sum(c(data$y[t2], data$y[t1] + diff) * c(data$y[t2] - diff, data$y[t1])) - ) / N - } else if (se_type == "youngs") { - # Young's inequality - std.error <- - sqrt(sum(Y2 ^ 2) + sum(Y1 ^ 2)) / N - } - } else { - # Complete random assignment - if (se_type == "constant") { - stop( - "`se_type` = 'constant' only supported for simple random designs ", - "at the moment" - ) - } else if (se_type == "youngs") { - # Young's inequality - # this is the "clustered" estimator where each unit is a cluster - # shouldn't apply to clustered designs but may if user passes a - # condition_pr_mat - varN2 <- - sum(Y2 ^ 2) + - sum(Y1 ^ 2) + + std.error <- + sqrt( + sum(Y2^2) + + sum(Y1^2) + ht_var_partial( Y2, - condition_pr_mat[(N + t2), (N + t2), drop = FALSE] + condition_pr_mat[(k + t2), (k + t2), drop = FALSE] ) + ht_var_partial( Y1, @@ -756,35 +675,107 @@ horvitz_thompson_internal <- 2 * ht_covar_partial( Y2, Y1, - condition_pr_mat[(N + t2), t1, drop = FALSE], + condition_pr_mat[(k + t2), t1, drop = FALSE], ps2, ps1 ) + ) / N + } + } else { + # All non-clustered designs - if (!is.nan(varN2)) { - if (varN2 < 0) { - warning("Variance below 0") - std.error <- NA - } else { - std.error <- sqrt(varN2) / N - } - } else { - warning( - "Variance is NaN. This is likely the result of a complex condition probability matrix" - ) + ps2 <- data$condition_probabilities[t2] + ps1 <- 1 - data$condition_probabilities[t1] + + Y2 <- data$y[t2] / ps2 + Y1 <- data$y[t1] / ps1 + + diff <- (sum(Y2) - sum(Y1)) / N + + if (is.null(condition_pr_mat)) { + # Simple random assignment + # joint inclusion probabilities = product of marginals + if (se_type == "constant") { + # Scale again + y0 <- ifelse( + data$t == condition1, + data$y / (1 - data$condition_probabilities), + (data$y - diff) / (1 - data$condition_probabilities) + ) + y1 <- ifelse( + data$t == condition2, + data$y / data$condition_probabilities, + (data$y + diff) / data$condition_probabilities + ) + + + std.error <- + sqrt( + var_ht_total_no_cov(y1, data$condition_probabilities) + + var_ht_total_no_cov(y0, 1 - data$condition_probabilities) + + # TODO why is it +2 instead of - (looking at old samii/aronow) + 2 * sum(c(data$y[t2], data$y[t1] + diff) * c(data$y[t2] - diff, data$y[t1])) + ) / N + } else if (se_type == "youngs") { + # Young's inequality + std.error <- + sqrt(sum(Y2^2) + sum(Y1^2)) / N + } + } else { + # Complete random assignment + if (se_type == "constant") { + stop( + "`se_type` = 'constant' only supported for simple random designs ", + "at the moment" + ) + } else if (se_type == "youngs") { + # Young's inequality + # this is the "clustered" estimator where each unit is a cluster + # shouldn't apply to clustered designs but may if user passes a + # condition_pr_mat + varN2 <- + sum(Y2^2) + + sum(Y1^2) + + ht_var_partial( + Y2, + condition_pr_mat[(N + t2), (N + t2), drop = FALSE] + ) + + ht_var_partial( + Y1, + condition_pr_mat[t1, t1, drop = FALSE] + ) - + 2 * ht_covar_partial( + Y2, + Y1, + condition_pr_mat[(N + t2), t1, drop = FALSE], + ps2, + ps1 + ) + + if (!is.nan(varN2)) { + if (varN2 < 0) { + warning("Variance below 0") std.error <- NA + } else { + std.error <- sqrt(varN2) / N } + } else { + warning( + "Variance is NaN. This is likely the result of a complex condition probability matrix" + ) + std.error <- NA } } } + } - return_frame <- - data.frame( - coefficients = diff, - std.error = std.error, - N = N, - stringsAsFactors = FALSE - ) + return_frame <- + data.frame( + coefficients = diff, + std.error = std.error, + N = N, + stringsAsFactors = FALSE + ) - return(return_frame) - } + return(return_frame) +} diff --git a/R/estimatr_iv_robust.R b/R/estimatr_iv_robust.R index 55fc04bb..7018af13 100644 --- a/R/estimatr_iv_robust.R +++ b/R/estimatr_iv_robust.R @@ -17,7 +17,7 @@ #' @param se_type The sort of standard error sought. If `clusters` is #' not specified the options are "HC0", "HC1" (or "stata", the equivalent), #' "HC2" (default), "HC3", or -#' "classical". If `clusters` is specified the options are "CR0", "CR2" (default), or "stata". +#' "classical". If `clusters` is specified the options are "CR0", "CR2" (default), or "stata". Can also specify "none", which may speed up estimation of the coefficients. #' @param ci logical. Whether to compute and return p-values and confidence #' intervals, TRUE by default. #' @param alpha The significance level, 0.05 by default. @@ -74,9 +74,9 @@ #' \item{k}{the number of columns in the design matrix (includes linearly dependent columns!)} #' \item{rank}{the rank of the fitted model} #' \item{vcov}{the fitted variance covariance matrix} -#' \item{r.squared}{the \eqn{R^2}} -#' \item{adj.r.squared}{the \eqn{R^2} but penalized for having more parameters, \code{rank}} -#' \item{fstatistic}{a vector with the value of the F-statistic with the numerator and denominator degrees of freedom} +#' \item{r.squared}{the \eqn{R^2} of the second stage regrssion} +#' \item{adj.r.squared}{the \eqn{R^2} of the second stage regression, but penalized for having more parameters, \code{rank}} +#' \item{fstatistic}{a vector with the value of the second stage F-statistic with the numerator and denominator degrees of freedom} #' \item{weighted}{whether or not weights were applied} #' \item{call}{the original function call} #' We also return \code{terms} with the second stage terms and \code{terms_regressors} with the first stage terms, both of which used by \code{predict}. @@ -116,7 +116,6 @@ iv_robust <- function(formula, alpha = .05, return_vcov = TRUE, try_cholesky = FALSE) { - datargs <- enquos( formula = formula, weights = weights, @@ -126,7 +125,7 @@ iv_robust <- function(formula, data <- enquo(data) model_data <- clean_model_data(data = data, datargs, estimator = "iv") - if (ncol(model_data$instrument_matrix) < ncol(model_data$design_matrix)) { + if (ncol(model_data$instrument_matrix) < ncol(model_data$design_matrix)) { warning("More regressors than instruments") } diff --git a/R/estimatr_lm_robust.R b/R/estimatr_lm_robust.R index 00c2d0a7..acdb561a 100644 --- a/R/estimatr_lm_robust.R +++ b/R/estimatr_lm_robust.R @@ -14,7 +14,7 @@ #' @param se_type The sort of standard error sought. If `clusters` is #' not specified the options are "HC0", "HC1" (or "stata", the equivalent), #' "HC2" (default), "HC3", or -#' "classical". If `clusters` is specified the options are "CR0", "CR2" (default), or "stata". +#' "classical". If `clusters` is specified the options are "CR0", "CR2" (default), or "stata". Can also specify "none", which may speed up estimation of the coefficients. #' @param ci logical. Whether to compute and return p-values and confidence #' intervals, TRUE by default. #' @param alpha The significance level, 0.05 by default. diff --git a/R/helper_clean_model_data.R b/R/helper_clean_model_data.R index 3e5db677..07aa7031 100644 --- a/R/helper_clean_model_data.R +++ b/R/helper_clean_model_data.R @@ -14,7 +14,7 @@ clean_model_data <- function(data, datargs, estimator = "") { # if data exists, evaluate it data <- if (quo_is_missing(data)) NULL else eval_tidy(data) - if(getOption("estimatr.debug.clean_model_data", FALSE)) browser() + if (getOption("estimatr.debug.clean_model_data", FALSE)) browser() mfargs <- Filter(Negate(quo_is_missing), datargs) @@ -26,7 +26,7 @@ clean_model_data <- function(data, datargs, estimator = "") { # subset is also non-standard eval to_process <- setdiff( names(mfargs), - setdiff( names(formals(stats::model.frame.default)),args_ignored) + setdiff(names(formals(stats::model.frame.default)), args_ignored) ) for (da in to_process) { @@ -38,10 +38,12 @@ clean_model_data <- function(data, datargs, estimator = "") { mfargs[["formula"]] <- Formula::as.Formula(m_formula) # Get model frame - mf <- eval_tidy(quo((stats::model.frame)(!!!mfargs, - data=data, - na.action=na.omit_detailed.data.frame, - drop.unused.levels=TRUE))) + mf <- eval_tidy(quo((stats::model.frame)( + !!!mfargs, + data = data, + na.action = na.omit_detailed.data.frame, + drop.unused.levels = TRUE + ))) local({ na.action <- attr(mf, "na.action") diff --git a/R/helper_condition_pr_matrix.R b/R/helper_condition_pr_matrix.R index 1d138bbd..06f641d6 100644 --- a/R/helper_condition_pr_matrix.R +++ b/R/helper_condition_pr_matrix.R @@ -113,7 +113,7 @@ declaration_to_condition_pr_mat <- function(ra_declaration, stop( "Cannot have `condition1 == NULL` and `condition2 != NULL`" ) - }else if (!is.null(condition1) && is.null(condition2)) { + } else if (!is.null(condition1) && is.null(condition2)) { stop( "Cannot have `condition2 == NULL` and `condition1 != NULL`" ) @@ -439,10 +439,10 @@ gen_pr_matrix_block <- function(blocks, clusters, p2 = NULL, p1 = NULL, t = NULL condition_pr_matrix[ ids, c(block_dat[[j]]$ids, n + block_dat[[j]]$ids) - ] <- tcrossprod( - c(block_dat[[i]]$p1, block_dat[[i]]$p2), - c(block_dat[[j]]$p1, block_dat[[j]]$p2) - ) + ] <- tcrossprod( + c(block_dat[[i]]$p1, block_dat[[i]]$p2), + c(block_dat[[j]]$p1, block_dat[[j]]$p2) + ) } } } diff --git a/R/helper_extract.R b/R/helper_extract.R index 0a097665..6eafd41a 100644 --- a/R/helper_extract.R +++ b/R/helper_extract.R @@ -15,9 +15,14 @@ #' @param include.rmse logical. Defaults to TRUE #' @param ... unused #' -extract.lm_robust <- function(model, include.ci = TRUE, include.rsquared = TRUE, include.adjrs = TRUE, - include.nobs = TRUE, include.fstatistic = FALSE, include.rmse = TRUE, ...) { - +extract.lm_robust <- function(model, + include.ci = TRUE, + include.rsquared = TRUE, + include.adjrs = TRUE, + include.nobs = TRUE, + include.fstatistic = FALSE, + include.rmse = TRUE, + ...) { s <- summary(model, ...) names <- rownames(s$coefficients) @@ -31,9 +36,9 @@ extract.lm_robust <- function(model, include.ci = TRUE, include.rsquared = TRUE, ciupper <- coef(s)[, 5] } - rs <- s$r.squared #extract R-squared - adj <- s$adj.r.squared #extract adjusted R-squared - n <- nobs(model) #extract number of observations + rs <- s$r.squared # extract R-squared + adj <- s$adj.r.squared # extract adjusted R-squared + n <- nobs(model) # extract number of observations gof <- numeric() gof.names <- character() diff --git a/R/helper_lm_robust_fit.R b/R/helper_lm_robust_fit.R index 9e302871..e7ea8084 100644 --- a/R/helper_lm_robust_fit.R +++ b/R/helper_lm_robust_fit.R @@ -29,7 +29,6 @@ lm_robust_fit <- function(y, return_unweighted_fit = FALSE, try_cholesky = FALSE, X_first_stage = NULL) { - y <- as.matrix(y) ny <- ncol(y) multivariate <- ny > 1 @@ -165,10 +164,9 @@ lm_robust_fit <- function(y, # ---------- if (se_type != "none" || return_fit) { - if (rank < ncol(X)) { X <- X[, covs_used, drop = FALSE] - if (weighted){ + if (weighted) { Xunweighted <- Xunweighted[, covs_used, drop = FALSE] } if (iv_second_stage) { @@ -202,7 +200,6 @@ lm_robust_fit <- function(y, } if (se_type != "none") { - if (se_type == "CR2") { vcov_fit <- lm_variance_cr2( X = X, @@ -217,8 +214,7 @@ lm_robust_fit <- function(y, ) vcov_fit[["res_var"]] <- colSums((y - X %*% fit$beta_hat)^2) / - (N - rank) - + (N - rank) } else { vcov_fit <- lm_variance( X = X, @@ -284,23 +280,22 @@ lm_robust_fit <- function(y, return_list[["rank"]] <- rank if (se_type != "none") { - if (weighted) { if (has_int) { return_list[["tot_var"]] <- colSums( - weights ^ 2 * - (yunweighted - weighted.mean(yunweighted, weights ^ 2)) ^ 2 + weights^2 * + (yunweighted - weighted.mean(yunweighted, weights^2))^2 ) * weight_mean } else { - return_list[["tot_var"]] <- colSums(y ^ 2 * weight_mean) + return_list[["tot_var"]] <- colSums(y^2 * weight_mean) } - return_list[["res_var"]] <- diag(as.matrix(colSums(ei ^ 2 * weight_mean) / (N - rank))) + return_list[["res_var"]] <- diag(as.matrix(colSums(ei^2 * weight_mean) / (N - rank))) } else { if (has_int) { - return_list[["tot_var"]] <- .rowSums(apply(y, 1, `-`, colMeans(y)) ^ 2, ny, N) + return_list[["tot_var"]] <- .rowSums(apply(y, 1, `-`, colMeans(y))^2, ny, N) } else { - return_list[["tot_var"]] <- colSums(y ^ 2) + return_list[["tot_var"]] <- colSums(y^2) } return_list[["res_var"]] <- diag(as.matrix(ifelse(vcov_fit$res_var < 0, NA, vcov_fit$res_var))) } @@ -308,8 +303,8 @@ lm_robust_fit <- function(y, return_list[["r.squared"]] <- 1 - ( - return_list[["df.residual"]] * return_list[["res_var"]] / - return_list[["tot_var"]] + return_list[["df.residual"]] * return_list[["res_var"]] / + return_list[["tot_var"]] ) return_list[["adj.r.squared"]] <- diff --git a/R/helper_na_omit_detailed.R b/R/helper_na_omit_detailed.R index b34bfa96..7182de52 100644 --- a/R/helper_na_omit_detailed.R +++ b/R/helper_na_omit_detailed.R @@ -8,14 +8,12 @@ #' column name, if any were dropped. #' #' @seealso \code{\link{na.omit}} -na.omit_detailed.data.frame <- function(object){ - +na.omit_detailed.data.frame <- function(object) { why_omit <- naomitwhy(object, function(x) is.na(x)) - if(length(why_omit)) { - object <- if(length(dim(object))) object[-why_omit, , drop=FALSE] else object[-why_omit] + if (length(why_omit)) { + object <- if (length(dim(object))) object[-why_omit, , drop = FALSE] else object[-why_omit] attr(object, "na.action") <- why_omit } object - } diff --git a/R/zzz.R b/R/zzz.R index 2bb0a09d..469b33b7 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,34 +1,14 @@ # This code modified from # https://github.com/atahk/bucky/blob/master/R/zzz.R (GPL 3.0) .onLoad <- function(libname, pkgname) { - if (suppressWarnings(requireNamespace("texreg", quietly=TRUE))) { + if (suppressWarnings(requireNamespace("texreg", quietly = TRUE))) { setGeneric("extract", function(model, ...) standardGeneric("extract"), - package = "texreg") + package = "texreg" + ) setMethod("extract", - signature = className("lm_robust", pkgname), - definition = extract.lm_robust) + signature = className("lm_robust", pkgname), + definition = extract.lm_robust + ) } invisible() } - - - - - - - - - - - - - - - - - - - - - - diff --git a/man/iv_robust.Rd b/man/iv_robust.Rd index 2f393c0a..389de9c8 100644 --- a/man/iv_robust.Rd +++ b/man/iv_robust.Rd @@ -26,7 +26,7 @@ corresponds to the clusters in the data.} \item{se_type}{The sort of standard error sought. If `clusters` is not specified the options are "HC0", "HC1" (or "stata", the equivalent), "HC2" (default), "HC3", or -"classical". If `clusters` is specified the options are "CR0", "CR2" (default), or "stata".} +"classical". If `clusters` is specified the options are "CR0", "CR2" (default), or "stata". Can also specify "none", which may speed up estimation of the coefficients.} \item{ci}{logical. Whether to compute and return p-values and confidence intervals, TRUE by default.} @@ -66,9 +66,9 @@ following components: \item{k}{the number of columns in the design matrix (includes linearly dependent columns!)} \item{rank}{the rank of the fitted model} \item{vcov}{the fitted variance covariance matrix} - \item{r.squared}{the \eqn{R^2}} - \item{adj.r.squared}{the \eqn{R^2} but penalized for having more parameters, \code{rank}} - \item{fstatistic}{a vector with the value of the F-statistic with the numerator and denominator degrees of freedom} + \item{r.squared}{the \eqn{R^2} of the second stage regrssion} + \item{adj.r.squared}{the \eqn{R^2} of the second stage regression, but penalized for having more parameters, \code{rank}} + \item{fstatistic}{a vector with the value of the second stage F-statistic with the numerator and denominator degrees of freedom} \item{weighted}{whether or not weights were applied} \item{call}{the original function call} We also return \code{terms} with the second stage terms and \code{terms_regressors} with the first stage terms, both of which used by \code{predict}. diff --git a/man/lm_robust.Rd b/man/lm_robust.Rd index 05f9a6f1..d3385c47 100644 --- a/man/lm_robust.Rd +++ b/man/lm_robust.Rd @@ -24,7 +24,7 @@ corresponds to the clusters in the data.} \item{se_type}{The sort of standard error sought. If `clusters` is not specified the options are "HC0", "HC1" (or "stata", the equivalent), "HC2" (default), "HC3", or -"classical". If `clusters` is specified the options are "CR0", "CR2" (default), or "stata".} +"classical". If `clusters` is specified the options are "CR0", "CR2" (default), or "stata". Can also specify "none", which may speed up estimation of the coefficients.} \item{ci}{logical. Whether to compute and return p-values and confidence intervals, TRUE by default.} @@ -55,7 +55,7 @@ Users who want to print the results in TeX of HTML can use the \code{\link[texreg]{extract}} function and the \pkg{texreg} package. If users specify a multivariate linear regression model (multiple outcomes), -then some of the below components will be of higher dimension to accomodate +then some of the below components will be of higher dimension to accommodate the additional models. An object of class \code{"lm_robust"} is a list containing at least the