Merge pull request #169 from DeclareDesign/ls/weight_lin

Fixes weighting for lm_lin and blocked difference_in_means
DeclareDesign · Mar 17, 2018 · f73db70 · f73db70
2 parents cd4f7b4 + 33e3cd7
commit f73db70
Show file tree

Hide file tree

Showing 4 changed files with 84 additions and 15 deletions.
diff --git a/R/estimatr_difference_in_means.R b/R/estimatr_difference_in_means.R
@@ -238,7 +238,10 @@ difference_in_means <-
       stringsAsFactors = FALSE
     )
     data$cluster <- model_data$cluster
-    data$weights <- model_data$weights
+    # rescale weights for convenience
+    if (is.numeric(model_data$weights)) {
+      data$weights <- model_data$weights / mean(model_data$weights)
+    }
     data$block <- model_data$block
 
     if (!is.null(data$weights) && length(unique(data$weights)) == 1
@@ -355,7 +358,7 @@ difference_in_means <-
         ## matches lm_lin, two estimates per block
         if (is.null(data$cluster)) {
           design <- "Blocked"
-          df <- N_overall - 2 * n_blocks
+          df <- nrow(data) - 2 * n_blocks
         } else {
           design <- "Block-clustered"
           # Also matches lm_lin for even sized clusters, should be conservative
@@ -520,10 +523,15 @@ difference_in_means_internal <-
       data.frame(
         coefficients = diff,
         se = se,
-        N = N,
         df = df,
         stringsAsFactors = FALSE
       )
 
+    if (is.numeric(data$weights)) {
+      return_frame$N <- sum(data$weights)
+    } else {
+      return_frame$N <- N
+    }
+
     return(return_frame)
   }
diff --git a/R/estimatr_lm_lin.R b/R/estimatr_lm_lin.R
@@ -253,16 +253,22 @@ lm_lin <- function(formula,
   # Center and interact variables
   # ----------
 
+  # Initialize as non-demeaned
   demeaned_covars <-
-    scale(
-      design_matrix[
-        ,
-        setdiff(colnames(design_matrix), c(design_mat_treatment, "(Intercept)")),
-        drop = FALSE
-      ],
-      center = TRUE,
-      scale = FALSE
-    )
+    design_matrix[
+      ,
+      setdiff(colnames(design_matrix), c(design_mat_treatment, "(Intercept)")),
+      drop = FALSE
+    ]
+
+  # Choose what to center on!
+  if (is.numeric(weights)) {
+    center <- apply(demeaned_covars, 2, weighted.mean, weights)
+  } else {
+    center <- colMeans(demeaned_covars)
+  }
+
+  demeaned_covars <- sweep(demeaned_covars, 2, center)
 
   original_covar_names <- colnames(demeaned_covars)
 
@@ -338,7 +344,7 @@ lm_lin <- function(formula,
     formula = formula
   )
 
-  return_list[["scaled_center"]] <- attr(demeaned_covars, "scaled:center")
+  return_list[["scaled_center"]] <- center
   setNames(return_list[["scaled_center"]], original_covar_names)
 
   return_list[["call"]] <- match.call()

diff --git a/tests/testthat/test-difference-in-means.R b/tests/testthat/test-difference-in-means.R
@@ -581,8 +581,6 @@ test_that("DIM matches lm_robust under certain conditions", {
   # With weights now, identical to lm_robust, HC2 by force! except for matched pairs which fails
   dat$w <- runif(nrow(dat))
 
-
-
   # simple W
   expect_equivalent(
     tidy(lm_robust(Y ~ z, data = dat, weights = w))[2, ],

diff --git a/tests/testthat/test-lm-lin.R b/tests/testthat/test-lm-lin.R
@@ -311,3 +311,60 @@ test_that("Test LM Lin", {
     c("z", "X1_c", "z:X1_c")
   )
 })
+
+test_that("lm_lin same as sampling perspective", {
+
+  # Unweighted matches sampling view
+  lmo <- lm_lin(mpg ~ am, ~ hp, data = mtcars)
+  m_hp <- mean(mtcars$hp)
+  areg <- lm(mpg ~ hp, data = mtcars, subset = am == 1)
+  breg <- lm(mpg ~ hp, data = mtcars, subset = am == 0)
+  ate <-
+    with(mtcars[mtcars$am == 1, ],
+         mean(mpg) + (m_hp - mean(hp)) * areg$coefficients[2]) -
+    with(mtcars[mtcars$am == 0, ],
+         mean(mpg) + (m_hp - mean(hp)) * breg$coefficients[2])
+
+  expect_equivalent(
+    ate,
+    lmo$coefficients["am"]
+  )
+})
+
+test_that("weighted lm_lin same as with one covar sampling view", {
+
+  # Weighted matches (one covar)
+  lmwo <- lm_lin(mpg ~ am, ~ hp, weights = wt, data = mtcars)
+  hp_wmean <- weighted.mean(mtcars$hp, mtcars$wt)
+  wareg <- lm(mpg ~ hp, data = mtcars, subset = am == 1, weights = wt)
+  wbreg <- lm(mpg ~ hp, data = mtcars, subset = am == 0, weights = wt)
+  wate <-
+    with(mtcars[mtcars$am == 1, ],
+         weighted.mean(mpg, wt) + (hp_wmean - weighted.mean(hp, wt)) * wareg$coefficients[2]) -
+    with(mtcars[mtcars$am == 0, ],
+         weighted.mean(mpg, wt) + (hp_wmean - weighted.mean(hp, wt)) * wbreg$coefficients[2])
+
+  expect_equivalent(
+    wate,
+    lmwo$coefficients["am"]
+  )
+})
+
+test_that("weighted lm_lin same as with two covar sampling view", {
+
+  # Weighted matches (two covars)
+  lmw2o <- lm_lin(mpg ~ am, ~ hp + cyl, weights = wt, data = mtcars)
+  hpcyl_wmean <- apply(mtcars[, c("hp", "cyl")], 2, weighted.mean, mtcars$wt)
+  w2areg <- lm(mpg ~ hp + cyl, data = mtcars, subset = am == 1, weights = wt)
+  w2breg <- lm(mpg ~ hp + cyl, data = mtcars, subset = am == 0, weights = wt)
+  w2ate <-
+    with(mtcars[mtcars$am == 1, ],
+         weighted.mean(mpg, wt) + (hpcyl_wmean - apply(cbind(hp, cyl), 2, weighted.mean, wt)) %*% w2areg$coefficients[2:3]) -
+    with(mtcars[mtcars$am == 0, ],
+         weighted.mean(mpg, wt) + (hpcyl_wmean - apply(cbind(hp, cyl), 2, weighted.mean, wt)) %*% w2breg$coefficients[2:3])
+
+  expect_equivalent(
+    w2ate,
+    lmw2o$coefficients["am"]
+  )
+})