Skip to content

Commit

Permalink
Ditch auk_clean(), no longer needed
Browse files Browse the repository at this point in the history
  • Loading branch information
mstrimas committed Sep 26, 2018
1 parent 2f1e215 commit ed60266
Show file tree
Hide file tree
Showing 19 changed files with 819 additions and 1,114 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ refer to the file name, rather than specifying the full path every time.
- "Area" protocol added to `auk_protocol()` filter.
- `auk_extent()` renamed `auk_bbox()`; `auk_extent()` deprecated and redirects to `auk_bbox()`
- `auk_zerofill()` now checks for complete checklists and gives option to not rollup
- `auk_clean()` deprecated
- Fixed package load error when `EBD_PATH` is invalid
- Fixed bug when reading files with a blank column using `readr`

Expand Down
33 changes: 8 additions & 25 deletions R/auk-clean.r
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
#' Clean an eBird data file
#' Clean an eBird data file (Deprecated)
#'
#' Some rows in the eBird Basic Dataset (EBD) may have an incorrect number of
#' columns, often resulting from tabs embedded in the comments field. This
#' function drops these problematic records. **Note that this function typically
#' takes at least 3 hours to run on the full dataset**
#' This function is no longer required by current versions of the eBird Basic
#' Dataset (EBD).
#'
#' @param f_in character; input file. If file is not found as specified, it will
#' be looked for in the directory specified by the `EBD_PATH` environment
Expand All @@ -18,15 +16,7 @@
#' increase the file size, yet are rarely valuable for analytical
#' applications, so may be removed. Setting this argument to `TRUE` can lead
#' to a significant reduction in file size.
#' @param overwrite logical; overwrite output file if it already exists
#'
#' @details
#'
#' This function can clean a basic dataset file or a sampling file.
#'
#' Calling this function requires that the command line utility AWK is
#' installed. Linux and Mac machines should have AWK by default, Windows users
#' will likely need to install [Cygwin](https://www.cygwin.com).
#' @param overwrite logical; overwrite output file if it already exists.
#'
#' @return If AWK ran without errors, the output filename is returned; however,
#'   if an error was encountered, the exit code is returned.
Expand All @@ -35,26 +25,19 @@
#' @examples
#' \dontrun{
#' # get the path to the example data included in the package
#' # in practice, provide path to ebd, e.g. f <- "data/ebd_relFeb-2018.txt"
#' f <- system.file("extdata/ebd-sample_messy.txt", package = "auk")
#' f <- system.file("extdata/ebd-sample.txt", package = "auk")
#' # output to a temp file for example
#' # in practice, provide path to output file
#' # e.g. f_out <- "output/ebd_clean.txt"
#' f_out <- tempfile()
#'
#' # clean file to remove problem rows
#' # note: this function is deprecated and no longer does anything
#' auk_clean(f, f_out)
#' # number of lines in input
#' length(readLines(f))
#' # number of lines in output
#' length(readLines(f_out))
#'
#' # note that the extra blank column has also been removed
#' ncol(read.delim(f, nrows = 5, quote = ""))
#' ncol(read.delim(f_out, nrows = 5, quote = ""))
#' }
auk_clean <- function(f_in, f_out, sep = "\t", remove_text = FALSE,
overwrite = FALSE) {
.Deprecated()
# checks
awk_path <- auk_get_awk_path()
if (is.na(awk_path)) {
Expand Down Expand Up @@ -129,7 +112,7 @@ BEGIN {
# remove end of line tab
sub(/\t$/, \"\", $0)
# only keep rows with correct number of records
if (NF == ${ncols} || NR == 1) {
if (NF != ${ncols} || NR == 1) {
print ${print_cols}
}
}
Expand Down
2 changes: 1 addition & 1 deletion R/auk-ebd.r
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ auk_ebd <- function(file, file_sampling, sep = "\t") {
file <- ebd_file(file)
# read header rows
header <- tolower(get_header(file, sep))
header <- stringr::str_replace_all(header, "_", " ")
header <- stringr::str_replace_all(header, "[^a-z0-9]+", " ")
# fix for custom download
header[header == "state province"] <- "state"
header[header == "subnational1 code"] <- "state code"
Expand Down
3 changes: 2 additions & 1 deletion R/read.r
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ read_ebd.character <- function(x, reader, sep = "\t", unique = TRUE,
reader <- choose_reader(reader)
# get header
header <- get_header(x, sep = sep)
blank <- (header[length(header)] == "")

# read using fread, read_delim, or read.delim
col_types <- get_col_types(header, reader = reader)
Expand Down Expand Up @@ -98,9 +99,9 @@ read_ebd.character <- function(x, reader, sep = "\t", unique = TRUE,
}
}
}
out <- dplyr::as.tbl(out)

# remove possible blank final column
blank <- grepl("^[xXvV][0-9]{2}$", names(out)[ncol(out)])
if (blank) {
out[ncol(out)] <- NULL
}
Expand Down
26 changes: 5 additions & 21 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ This package uses the command-line program AWK to extract subsets of the eBird B

Because the eBird dataset is so large, step 3 typically takes several hours to run. Here's a simple example that extracts all Canada Jay records from within Canada.

```{r packages, include=FALSE}
library(auk)
```


```{r quickstart, eval = FALSE}
library(auk)
# path to the ebird data file, here a sample included in the package
Expand Down Expand Up @@ -99,27 +104,6 @@ ebd_df <- read_ebd(ebd_filtered)

## Usage

### Cleaning

Some rows in the dataset may have an incorrect number of columns, typically from problematic characters in the comments fields, and the dataset has an extra blank column at the end. The function `auk_clean()` drops these erroneous records and removes the blank column.

```{r auk-clean, message = FALSE}
library(auk)
# get the path to the example data included in the package
# in practice, provide path to ebd, e.g. f <- "data/ebd_relFeb-2018.txt"
f <- system.file("extdata/ebd-sample_messy.txt", package = "auk")
# output to a temp file for example -->
# in practice, provide path to output file -->
# e.g. f_out <- "output/ebd_clean.txt" -->
f_out <- tempfile()
# remove problem records
auk_clean(f, f_out)
# number of lines in input
length(readLines(f))
# number of lines in output
length(readLines(f_out))
```

### Filtering

`auk` uses a [pipeline-based workflow](http://r4ds.had.co.nz/pipes.html) for defining filters, which can then be compiled into an AWK script. Users should start by defining a reference to the dataset file with `auk_ebd()`. Then any of the following filters can be applied:
Expand Down
27 changes: 1 addition & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,31 +137,6 @@ could be rewritten:
Usage
-----

### Cleaning

Some rows in the dataset may have an incorrect number of columns,
typically from problematic characters in the comments fields, and the
dataset has an extra blank column at the end. The function `auk_clean()`
drops these erroneous records and removes the blank column.

library(auk)
# get the path to the example data included in the package
# in practice, provide path to ebd, e.g. f <- "data/ebd_relFeb-2018.txt"
f <- system.file("extdata/ebd-sample_messy.txt", package = "auk")
# output to a temp file for example -->
# in practice, provide path to output file -->
# e.g. f_out <- "output/ebd_clean.txt" -->
f_out <- tempfile()
# remove problem records
auk_clean(f, f_out)
#> [1] "/var/folders/mg/qh40qmqd7376xn8qxd6hm5lwjyy0h2/T//Rtmp5RRL0m/filedeaea145fb9"
# number of lines in input
length(readLines(f))
#> [1] 51
# number of lines in output
length(readLines(f_out))
#> [1] 47

### Filtering

`auk` uses a [pipeline-based workflow](http://r4ds.had.co.nz/pipes.html)
Expand Down Expand Up @@ -231,7 +206,7 @@ defined, the filtering is actually conducted using `auk_filter()`.
auk_complete()
ebd
#> Input
#> EBD: /Users/mes335/projects/auk/inst/extdata/ebd-sample.txt
#> EBD: /Library/Frameworks/R.framework/Versions/3.5/Resources/library/auk/extdata/ebd-sample.txt
#>
#> Output
#> Filters not executed
Expand Down
29 changes: 1 addition & 28 deletions data-raw/ebd-samples.r
Original file line number Diff line number Diff line change
Expand Up @@ -53,19 +53,6 @@ readLines(f) %>%
writeLines(f)
stopifnot(length(tools::showNonASCII(readLines(f))) == 0)
stopifnot(all(read_ebd(f)$scientific_name %in% ebird_taxonomy$scientific_name))
# prepare a smaller sample of messy data
y <- read_tsv(f, quote = "",
col_types = cols(.default = col_character()))
y$empty_col <- NA_character_
names(y)[length(y)] <- ""
y <- sample_n(y, 50)
f <- "inst/extdata/ebd-sample_messy.txt"
write_tsv(y, f, na = "")
readLines(f) %>%
str_replace_all("\"", "") %>%
writeLines(f)
stopifnot(length(tools::showNonASCII(readLines(f))) == 0)
stopifnot(all(read_ebd(f, reader = "base")$scientific_name %in% ebird_taxonomy$scientific_name))

# filter for zero-fill example
filters <- auk_ebd(f_in, s_in) %>%
Expand Down Expand Up @@ -143,18 +130,4 @@ readLines(f) %>%
str_replace_all("\"", "") %>%
writeLines(f)
stopifnot(length(tools::showNonASCII(readLines(f))) == 0)
stopifnot(all(read_ebd(f)$scientific_name %in% ebird_taxonomy$scientific_name))

# after script: edit the messy file to introduce errors, especially tabs in comment fields
# keep top 5 rows intact
f <- "inst/extdata/ebd-sample_messy.txt"
stopifnot(length(tools::showNonASCII(readLines(f))) == 0)
stopifnot(all(read_ebd(f, reader = "base")$scientific_name %in% ebird_taxonomy$scientific_name))


# f_ebd <- system.file("extdata/zerofill-ex_ebd.txt", package = "auk")
# f_smpl <- system.file("extdata/zerofill-ex_sampling.txt", package = "auk")
# a <- c(TRUE, as.logical(read_tsv(f_ebd)[["ALL SPECIES REPORTED"]]))
# b <- c(TRUE, as.logical(read_tsv(f_smpl)[["ALL SPECIES REPORTED"]]))
# readLines(f_ebd)[a] %>% writeLines("inst/extdata/zerofill-ex_ebd.txt")
# readLines(f_smpl)[b] %>% writeLines("inst/extdata/zerofill-ex_sampling.txt")
stopifnot(all(read_ebd(f)$scientific_name %in% ebird_taxonomy$scientific_name))
Loading

0 comments on commit ed60266

Please sign in to comment.