Merge branch 'ls/gp' of github.com:DeclareDesign/estimatr into ls/gp

DeclareDesign · Mar 21, 2018 · e192430 · e192430
2 parents bc762ae + 0ea76fb
commit e192430
Show file tree

Hide file tree

Showing 4 changed files with 79 additions and 83 deletions.
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -29,7 +29,7 @@ lm_variance_cr2 <- function(X, Xunweighted, XtX_inv, ei, weight_mean, clusters,
     .Call(`_estimatr_lm_variance_cr2`, X, Xunweighted, XtX_inv, ei, weight_mean, clusters, J, ci, which_covs)
 }
 
-naomitwhy <- function(df, is_na_generic) {
-    .Call(`_estimatr_naomitwhy`, df, is_na_generic)
+naomitwhy <- function(df, recursive_subset) {
+    .Call(`_estimatr_naomitwhy`, df, recursive_subset)
 }
 
diff --git a/R/helper_na_omit_detailed.R b/R/helper_na_omit_detailed.R
@@ -8,12 +8,9 @@
 #' column name, if any were dropped.
 #'
 #' @seealso \code{\link{na.omit}}
-na.omit_detailed.data.frame <- function(object) {
-  why_omit <- naomitwhy(object, function(x) is.na(x))
+na.omit_detailed.data.frame <- function(object){
+
+  naomitwhy(object, function(x, w) x[w, , drop=FALSE])
 
-  if (length(why_omit)) {
-    object <- if (length(dim(object))) object[-why_omit, , drop = FALSE] else object[-why_omit]
-    attr(object, "na.action") <- why_omit
-  }
-  object
 }
+
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -107,14 +107,14 @@ BEGIN_RCPP
 END_RCPP
 }
 // naomitwhy
-IntegerVector naomitwhy(DataFrame df, Function is_na_generic);
-RcppExport SEXP _estimatr_naomitwhy(SEXP dfSEXP, SEXP is_na_genericSEXP) {
+DataFrame naomitwhy(DataFrame df, Function recursive_subset);
+RcppExport SEXP _estimatr_naomitwhy(SEXP dfSEXP, SEXP recursive_subsetSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< DataFrame >::type df(dfSEXP);
-    Rcpp::traits::input_parameter< Function >::type is_na_generic(is_na_genericSEXP);
-    rcpp_result_gen = Rcpp::wrap(naomitwhy(df, is_na_generic));
+    Rcpp::traits::input_parameter< Function >::type recursive_subset(recursive_subsetSEXP);
+    rcpp_result_gen = Rcpp::wrap(naomitwhy(df, recursive_subset));
     return rcpp_result_gen;
 END_RCPP
 }

diff --git a/src/naomit.cpp b/src/naomit.cpp
@@ -5,114 +5,113 @@
 using namespace Rcpp;
 
 
-SEXP generic_logical_subset( SEXP xin , LogicalVector w);
-
-
 template <int RTYPE>
 Vector<RTYPE> generic_logical_subset_impl( Vector<RTYPE> xin, LogicalVector w){
   return xin[w] ;
 }
 
 SEXP generic_logical_subset( SEXP xin , LogicalVector w){
-    RCPP_RETURN_VECTOR(generic_logical_subset_impl, xin, w) ;
+  RCPP_RETURN_VECTOR(generic_logical_subset_impl, xin, w) ;
 }
 
 // [[Rcpp::export]]
-IntegerVector naomitwhy(DataFrame df, Function is_na_generic) {
+DataFrame naomitwhy(DataFrame df, Function recursive_subset) {
   int m = df.nrow();
   int n = df.ncol();
-  CharacterVector df_names = df.names();
 
+  Function isna("is.na");
+
+  CharacterVector df_names = df.names();
 
   LogicalVector omit = LogicalVector(m);
 
+  int omit_count = 0;
+
   List why_omit(n);
   why_omit.names() = df_names;
   LogicalVector why_omit_idx(n);
 
-  bool anyomit = false;
-
-  for (int j =0; j < n; j++) {
-    std::string nm = as<std::string>(df_names(j));
 
-    LogicalVector isna = is_na_generic(df(j));
+  for (int j = 0; j < n; j++) {
 
-    SEXP d  = isna.attr("dim");
+    std::vector<int> why_omit_j;
 
-    if(!Rf_isNull(d) && Rf_length(d) == 2  && INTEGER(d)[1] > 1){
-      int dcols = INTEGER(d)[1];
+    LogicalVector v_isna = isna(df[j]);
 
-
-      LogicalVector na2(m);
-
-      for(int k = 0, ii=0; k < dcols; k++)
-      for(int i = 0; i < m; i++, ii++){
-        if(isna[ii]) na2[i] = true;
-      }
-
-      isna = na2;
+    for(int ii = m; ii < LENGTH(v_isna); ){
+      for(int i = 0; i < m; i++, ii++)
+        v_isna[i] |= v_isna[ii];
     }
 
-    int ii = 0;
-    IntegerVector why_omit_j(m);
-
-    for (int i = 0; i < m; i++){
-
-      if(isna[i]){
+    for(int i = 0; i < m; i++){
+      if(v_isna[i]){
         if(!omit[i]){
-          why_omit_j[ii++] = i + 1;
+          why_omit_j.push_back(i + 1);
         }
-
         omit[i] = true;
-      }
+      };
     }
 
-    if(ii > 0){
-      why_omit[j] = why_omit_j[seq(0,ii-1)];
+    if(why_omit_j.size() > 0){
+      why_omit[j] = wrap(why_omit_j);
       why_omit_idx[j] = true;
-      anyomit = true;
+      omit_count += why_omit_j.size();
     }
   }
-  if(!anyomit){ return(NULL); }
 
-  IntegerVector omit_idx = seq_len(m);
-  omit_idx = generic_logical_subset(omit_idx, omit);
-  omit_idx.attr("names") = generic_logical_subset(df.attr("row.names"), omit);
+  if(omit_count == 0){ return(df); }
+
+  IntegerVector omit_idx = IntegerVector(omit_count);
+  for(int i = 0, ii=0; ii < omit_count; i++){
+    if(omit[i]) omit_idx[ii++] = i+1;
+  }
+
+  CharacterVector rownames = df.attr("row.names");
+  omit_idx.attr("names") = rownames[omit];
+
   omit_idx.attr("why_omit") = why_omit[why_omit_idx];
   omit_idx.attr("class") = CharacterVector::create("omit", "detailed");
 
+  omit = !omit;
+
+  List out(n);
+
+  for(int i = 0; i < n; i++){
+    SEXP dfi = df(i);
+    if(LENGTH(dfi) == m){
+      out[i] = generic_logical_subset(dfi, omit);
+    } else {
+      out[i] = recursive_subset(dfi, omit);
+    }
+  }
 
+  out.names() = df_names;
+  out.attr("row.names") = rownames[omit];
+  out.attr("na.action") = omit_idx;
+  out.attr("class") = df.attr("class");
 
-  return(omit_idx);
+  return(out);
 }
 
-// # NJF 10/18
-// # Silly microbenchmark to make sure I didn't make it slower
-// # df <- expand.grid(x=c(1:100, NA), y=c(1:5, NA), z=c(1:8, NA), q=c(NA,2:5))
-//
-// # microbenchmark(stock=na.omit(df), hack1=na.omit_detailed.data.frame(df))
-// ## Unit: milliseconds
-// ## expr      min       lq     mean   median       uq        max neval
-// ## stock 6.114132 6.184318 7.744881 6.232744 6.961491 101.823530   100
-// ## hack1 5.360638 5.480531 6.525075 5.694078 7.752104   9.323943   100
-
-// > microbenchmark(
-//     +   out1 = estimatr:::naomitwhy(df),
-//     +   out2 = estimatr:::na.omit_detailed.data.frame(df)
-//     + )
-//     Unit: milliseconds
-//   expr      min        lq       mean    median       uq       max neval
-//   out1 2.830644 2.9078700 3.88764381 2.9900055 3.655617 67.368491   100
-// out2 3.463163 3.7196865 5.70407818 4.5768560 4.748749 70.595884   100
-// > df <- na.omit(df)
-//   > nrow(df)
-//   [1] 16000
-// > microbenchmark(
-//     +   out1 = estimatr:::naomitwhy(df),
-//     +   out2 = estimatr:::na.omit_detailed.data.frame(df)
-//     + )
-//     Unit: microseconds
-//   expr      min        lq       mean    median        uq       max neval
-//   out1  152.816  159.3180  214.70463  171.3470  185.2210  1023.877   100
-// out2 1200.924 1227.9935 2193.33799 1271.2825 2040.2235 64933.756   100
-// >
+
+  // require(microbenchmark)
+  // df <- expand.grid(x=c(1:100, NA), y=c(1:5, NA), z=c(1:8, NA), q=c(NA,2:5))
+  // df2 <- na.omit(df)
+  // microbenchmark(stock=na.omit(df), ours=estimatr:::na.omit_detailed.data.frame(df), unit="ms")
+  // microbenchmark(stock=na.omit(df2), ours=estimatr:::na.omit_detailed.data.frame(df2), unit="ms")
+
+  // df <- rbind(df, df2, df)
+  // df2 <- rbind(df2, df2, df2)
+  // microbenchmark(stock=na.omit(df), ours=estimatr:::na.omit_detailed.data.frame(df), unit="ms")
+  // microbenchmark(stock=na.omit(df2), ours=estimatr:::na.omit_detailed.data.frame(df2), unit="ms")
+
+  // df <- cbind(df, df,df)
+  // df2 <- cbind(df2, df2, df2)
+  // microbenchmark(stock=na.omit(df), ours=estimatr:::na.omit_detailed.data.frame(df), unit="ms")
+  // microbenchmark(stock=na.omit(df2), ours=estimatr:::na.omit_detailed.data.frame(df2), unit="ms")
+
+  // sleep[c("sleep", "foo")] = list(sleep, matrix(1:40, 20))
+  // sleep[cbind(c(1,5,9), c(2,1,3))] <- NA
+  // sleep$sleep[cbind(1+c(1,5,9), c(2,1,3))] <- NA
+  // sleep$foo[12,1] <- NA
+  // attributes(estimatr:::na.omit_detailed.data.frame(sleep))