Merge 8af9c6c into f9f5a93

DeclareDesign · Jan 23, 2018 · 256a571 · 256a571
2 parents f9f5a93 + 8af9c6c
commit 256a571
Show file tree

Hide file tree

Showing 9 changed files with 591 additions and 101 deletions.
diff --git a/R/estimatr_difference_in_means.R b/R/estimatr_difference_in_means.R
@@ -6,7 +6,7 @@
 #' @param data A data.frame.
 #' @param weights An optional bare (unquoted) name of the weights variable.
 #' @param subset An optional bare (unquoted) expression specifying a subset of observations to be used.
-#' #' @param ci A boolean for whether to compute and return pvalues and confidence intervals, TRUE by default.
+#' @param ci A boolean for whether to compute and return pvalues and confidence intervals, TRUE by default.
 #' @param alpha The significance level, 0.05 by default.
 #' @param condition1 names of the conditions to be compared. Effects are estimated with condition1 as control and condition2 as treatment. If unspecified, condition1 is the "first" condition and condition2 is the "second" according to r defaults.
 #' @param condition2 names of the conditions to be compared. Effects are estimated with condition1 as control and condition2 as treatment. If unspecified, condition1 is the "first" condition and condition2 is the "second" according to r defaults.

diff --git a/R/estimatr_lm_lin.R b/R/estimatr_lm_lin.R
@@ -1,21 +1,123 @@
 
 #' Linear regression with the Lin (2013) covariate adjustment
 #'
-#' @param formula An object of class "formula", such as Y ~ Z. Should only have the outcome and the treatment.
-#' @param covariates A one-sided formula with all of the covariates on the right hand side, such as ~ x1 + x2 + x3.
-#' @param data A data.frame.
-#' @param weights the bare (unquoted) names of the weights variable in the supplied data.
-#' @param subset An optional bare (unquoted) expression specifying a subset of observations to be used.
-#' @param clusters An optional bare (unquoted) name of the variable that corresponds to the clusters in the data.
-#' @param se_type The sort of standard error sought. Without clustering: "HC0", "HC1", "HC2" (default), "HC3", or "classical". With clustering: "BM" (default), "stata".
-#' @param ci A boolean for whether to compute and return pvalues and confidence intervals, TRUE by default.
+#' @description This function is a wrapper for \code{\link{lm_robust}} that
+#' is useful for estimating treatment effects with pre-treatment covariate
+#' data. This implements the method described by Lin (2013) to reduce the bias
+#' of such estimation
+#'
+#' @param formula an object of class formula, as in \code{\link{lm}}, such as
+#' \code{Y ~ Z} with only one variable on the right-hand side, the treatment
+#' @param covariates a right-sided formula with pre-treatment covaraites on
+#' the right hand side, such as \code{ ~ x1 + x2 + x3}.
+#' @param data A \code{data.frame}
+#' @param weights the bare (unquoted) names of the weights variable in the
+#' supplied data.
+#' @param subset An optional bare (unquoted) expression specifying a subset
+#' of observations to be used.
+#' @param clusters An optional bare (unquoted) name of the variable that
+#' corresponds to the clusters in the data.
+#' @param se_type The sort of standard error sought. Without clustering:
+#' "HC0", "HC1" (or "stata", the equivalent), "HC2" (default), "HC3", or
+#' "classical". With clustering: "CR0", "CR2" (default), or "stata". are
+#' permissible.
+#' @param ci A boolean for whether to compute and return pvalues and confidence
+#' intervals, TRUE by default.
 #' @param alpha The significance level, 0.05 by default.
-#' @param coefficient_name a character or character vector that indicates which coefficients should be reported. If left unspecified, returns all coefficients.
-#' @param return_vcov a boolean for whether to return the vcov matrix for later usage, TRUE by default.
-#' @param try_cholesky a boolean for whether to try using a cholesky decomposition to solve LS instead of a QR decomposition, FALSE by default. See 'details'.
-
-#' @export
+#' @param coefficient_name a character or character vector that indicates which
+#' coefficients should be reported. If left unspecified, returns all
+#' coefficients. Especially for models with clustering where only one
+#' coefficient is of interest, specifying a coefficient of interest may
+#' result in improvements in speed
+#' @param return_vcov a boolean for whether to return the variance-covariance
+#' matrix for later usage, TRUE by default.
+#' @param try_cholesky a boolean for whether to try using a Cholesky
+#' decomposition to solve LS instead of a QR decomposition, FALSE by default.
+#' Using a Cholesky decomposition may result in speed gains, but should only
+#' be used if users are sure their model is full-rank (i.e. there is no
+#' perfect multi-collinearity)
+#'
+#' @details
+#'
+#' This function is simply a wrapper for \code{\link{lm_robust}}. This method
+#' pre-processes the data by taking the covariates specified in the
+#' \code{`covariates`} argument, centering them by subtracting from each covariate
+#' its mean, and interacting them with the treatment. If the treatment has
+#' multiple values, a series of dummies for each value is created and each of
+#' those is interacted with the demeaned covariates. More details can be found
+#' in the
+#' \href{http://estimatr.declaredesign.org/articles/getting-started.html}{Getting Started vignette}
+#' and the
+#' \href{http://estimatr.declaredesign.org/articles/technical-notes.html}{technical notes}.
+#'
+#' @return \code{lm_lin} returns an object of class \code{"lm_robust"}.
+#'
+#' The functions \code{summary} and \code{\link{tidy}} can be used to get
+#' the results as a \code{data.frame}. To get useful data out of the return,
+#' you can use these data frames, you can use the resulting list directly, or
+#' you can use the generic accessor functions \code{coef}, \code{vcov},
+#' \code{confint}, and \code{predict}.
+#'
+#' An object of class \code{"lm_robust"} is a list containing at least the
+#' following components:
+#' \describe{
+#'   \item{est}{the estimated coefficients}
+#'   \item{se}{the estimated standard errors}
+#'   \item{df}{the estimated degrees of freedom}
+#'   \item{p}{the p-values from the t-test using \code{est}, \code{se}, and \code{df}}
+#'   \item{ci_lower}{the lower bound of the \code{1 - alpha} percent confidence interval}
+#'   \item{ci_upper}{the upper bound of the \code{1 - alpha} percent confidence interval}
+#'   \item{coefficient_name}{a character vector of coefficient names}
+#'   \item{alpha}{the significance level specified by the user}
+#'   \item{res_var}{the residual variance, used for uncertainty when using \code{predict}}
+#'   \item{N}{the number of observations used}
+#'   \item{k}{the number of columns in the design matrix (includes linearly dependent columns!)}
+#'   \item{rank}{the rank of the fitted model}
+#'   \item{vcov}{the fitted variance covariance matrix}
+#'   \item{weighted}{whether or not weights were applied}
+#'   \item{scaled_center}{the means of each of the covariates used for centering them}
+#' }
+#' We also return \code{terms} and \code{contrasts}, used by \code{predict}.
+#'
+#' @examples
+#' library(fabricatr)
+#' library(randomizr)
+#' dat <- fabricate(
+#'   N = 40,
+#'   x = rnorm(N, mean = 2.3),
+#'   x2 = rpois(N, lambda = 2),
+#'   x3 = runif(N),
+#'   y0 = rnorm(N) + x,
+#'   y1 = rnorm(N) + x + 0.35
+#' )
+#'
+#' dat$z <- simple_ra(N = nrow(dat))
+#' dat$y <- ifelse(dat$z == 1, dat$y1, dat$y0)
+#'
+#' # Same specification as `lm_robust()` with one additional argument
+#' lmlin_out <- lm_lin(y ~ z, covariates = ~ x, data = dat)
+#' tidy(lmlin_out)
+#'
+#' # Works with multiple pre-treatment covariates
+#' lm_lin(y ~ z, covariates = ~ x + x2, data = dat)
+#'
+#' # Also centers data AFTER evaluating any functions in formula
+#' lm_lin(y ~ z, covariates = ~ x + log(x3), data = dat)
+#'
+#' # Works easily with clusters
+#' dat$clusterID <- rep(1:20, each = 2)
+#' dat$z_clust <- cluster_ra(clusters = dat$clusterID)
+#'
+#' lm_lin(y ~ z_clust, covariates = ~ x, data = dat, clusters = clusterID)
 #'
+#' # Works with multi-valued treatments
+#' dat$z_multi <- sample(1:3, size = nrow(dat), replace = TRUE)
+#' lm_lin(y ~ z_multi, covariates = ~ x, data = dat)
+#'
+#' @references
+# ’ Lin, Winston. 2013. “Agnostic Notes on Regression Adjustments to Experimental Data: Reexamining Freedman’s Critique.” The Annals of Applied Statistics 7 (1). Institute of Mathematical Statistics: 295–318. \url{https://doi.org/10.1214/12-AOAS583}.
+#'
+#' @export
 lm_lin <- function(formula,
                    covariates,
                    data,
@@ -32,14 +134,14 @@ lm_lin <- function(formula,
   # Check formula
   if (length(all.vars(formula[[3]])) > 1) {
     stop(
-      "'formula' should only have one variable on the right-hand side: ",
+      "`formula` should only have one variable on the right-hand side: ",
       " the treatment variable."
     )
   }
 
   if (class(covariates) != "formula") {
     stop(
-      "'covariates' must be specified as a formula:\n",
+      "`covariates` must be specified as a formula:\n",
       "You passed an object of class ", class(covariates)
     )
   }
@@ -49,13 +151,13 @@ lm_lin <- function(formula,
   # Check covariates is right hand sided fn
   if (attr(cov_terms, "response") != 0) {
     stop(
-      "'covariates' must be right-sided formula only, such as '~ x1 + x2 + x3'"
+      "`covariates` must be right-sided formula only, such as '~ x1 + x2 + x3'"
     )
   }
 
   if (length(attr(cov_terms, "order")) == 0) {
     stop(
-      "'covariates' must have a variable on the right-hand side, not 0 or 1"
+      "`covariates` must have a variable on the right-hand side, not 0 or 1"
     )
   }
 
@@ -64,7 +166,7 @@ lm_lin <- function(formula,
     update(
       formula,
       reformulate(
-        c('.', labels(cov_terms), response = '.')
+        c(".", labels(cov_terms), response = ".")
       )
     )
 
@@ -94,34 +196,26 @@ lm_lin <- function(formula,
 
   # Check case where treatment is not factor and is not binary
   if (any(!(treatment %in% c(0, 1)))) {
-    if (ncol(treatment) > 1) {
-      stop(
-        "Treatment variable must be binary or a factor for the Lin estimator."
-      )
-    } else {
-      # create dummies for non-factor treatment variable
-
-      # Drop out first group if there is an intercept
-      vals <- sort(unique(treatment))
-      if (has_intercept) vals <- vals[-1]
-
-      n_treats <- length(vals)
-      # Should we warn if too many values?
-      # (ie. if there are as many treatments as observations)
+    # create dummies for non-factor treatment variable
 
-      names(vals) <- paste0(colnames(design_matrix)[treat_col], vals)
+    # Drop out first group if there is an intercept
+    vals <- sort(unique(treatment))
+    if (has_intercept) vals <- vals[-1]
 
+    n_treats <- length(vals)
+    # Should we warn if too many values?
+    # (ie. if there are as many treatments as observations)
 
-      # Create matrix of dummies
-      treatment <-
-        outer(
-          drop(treatment),
-          vals,
-          function(x, y) as.numeric(x == y)
-        )
+    names(vals) <- paste0(colnames(design_matrix)[treat_col], vals)
 
-    }
 
+    # Create matrix of dummies
+    treatment <-
+      outer(
+        drop(treatment),
+        vals,
+        function(x, y) as.numeric(x == y)
+      )
   }
 
   # center all covariates
@@ -145,12 +239,11 @@ lm_lin <- function(formula,
   n_covars <- ncol(demeaned_covars)
 
   # Interacted
-  #n_int_covar_cols <- n_covars * (n_treat_cols + has_intercept)
+  # n_int_covar_cols <- n_covars * (n_treat_cols + has_intercept)
   n_int_covar_cols <- n_covars * (n_treat_cols)
   interacted_covars <- matrix(0, nrow = n, ncol = n_int_covar_cols)
   interacted_covars_names <- character(n_int_covar_cols)
   for (i in 1:n_covars) {
-
     covar_name <- colnames(demeaned_covars)[i]
 
     cols <- (i - 1) * n_treat_cols + (1:n_treat_cols)
@@ -161,21 +254,26 @@ lm_lin <- function(formula,
 
   if (has_intercept) {
     # Have to manually create intercept if treatment wasn't a factor
-    X <- cbind(matrix(1, nrow = n, ncol = 1, dimnames = list(NULL, "(Intercept)")),
-               treatment,
-               demeaned_covars,
-               interacted_covars)
+    X <- cbind(
+      matrix(1, nrow = n, ncol = 1, dimnames = list(NULL, "(Intercept)")),
+      treatment,
+      demeaned_covars,
+      interacted_covars
+    )
   } else {
     # If no intercept, but treatment is only one column, need to add base terms for covariates
     if (n_treat_cols == 1) {
-      X <- cbind(treatment,
-                 demeaned_covars,
-                 interacted_covars)
+      X <- cbind(
+        treatment,
+        demeaned_covars,
+        interacted_covars
+      )
     } else {
-      X <- cbind(treatment,
-                 interacted_covars)
+      X <- cbind(
+        treatment,
+        interacted_covars
+      )
     }
-
   }
 
   return_list <-
@@ -192,9 +290,11 @@ lm_lin <- function(formula,
       try_cholesky = try_cholesky
     )
 
-  return_list <- lm_return(return_list,
-                           model_data = model_data,
-                           formula = formula)
+  return_list <- lm_return(
+    return_list,
+    model_data = model_data,
+    formula = formula
+  )
 
   return_list[["scaled_center"]] <- attr(demeaned_covars, "scaled:center")
   setNames(return_list[["scaled_center"]], original_covar_names)