Skip to content

Commit

Permalink
Ditch auk_clean(), no longer needed
Browse files Browse the repository at this point in the history
  • Loading branch information
mstrimas committed Sep 26, 2018
1 parent 2f1e215 commit ed60266
Show file tree
Hide file tree
Showing 19 changed files with 819 additions and 1,114 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ refer to the file name, rather than specifying the full path every time.
- "Area" protocol added to `auk_protocol()` filter.
- `auk_extent()` renamed `auk_bbox()`; `auk_extent()` deprecated and redirects to `auk_bbox()`
- `auk_zerofill()` now checks for complete checklists and gives option to not rollup
- `auk_clean()` deprecated
- Fixed package load error when `EBD_PATH` is invalid
- Fixed bug when reading files with a blank column using `readr`

Expand Down
33 changes: 8 additions & 25 deletions R/auk-clean.r
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
#' Clean an eBird data file
#' Clean an eBird data file (Deprecated)
#'
#' Some rows in the eBird Basic Dataset (EBD) may have an incorrect number of
#' columns, often resulting from tabs embedded in the comments field. This
#' function drops these problematic records. **Note that this function typically
#' takes at least 3 hours to run on the full dataset**
#' This function is no longer required by current versions of the eBird Basic
#' Dataset (EBD).
#'
#' @param f_in character; input file. If file is not found as specified, it will
#' be looked for in the directory specified by the `EBD_PATH` environment
Expand All @@ -18,15 +16,7 @@
#' increase the file size, yet are rarely valuable for analytical
#' applications, so may be removed. Setting this argument to `TRUE` can lead
#' to a significant reduction in file size.
#' @param overwrite logical; overwrite output file if it already exists
#'
#' @details
#'
#' This function can clean a basic dataset file or a sampling file.
#'
#' Calling this function requires that the command line utility AWK is
#' installed. Linux and Mac machines should have AWK by default, Windows users
#' will likely need to install [Cygwin](https://www.cygwin.com).
#' @param overwrite logical; overwrite output file if it already exists.
#'
#' @return If AWK ran without errors, the output filename is returned; however,
#'   if an error was encountered, the exit code is returned.
Expand All @@ -35,26 +25,19 @@
#' @examples
#' \dontrun{
#' # get the path to the example data included in the package
#' # in practice, provide path to ebd, e.g. f <- "data/ebd_relFeb-2018.txt"
#' f <- system.file("extdata/ebd-sample_messy.txt", package = "auk")
#' f <- system.file("extdata/ebd-sample.txt", package = "auk")
#' # output to a temp file for example
#' # in practice, provide path to output file
#' # e.g. f_out <- "output/ebd_clean.txt"
#' f_out <- tempfile()
#'
#' # clean file to remove problem rows
#' # note: this function is deprecated and no longer does anything
#' auk_clean(f, f_out)
#' # number of lines in input
#' length(readLines(f))
#' # number of lines in output
#' length(readLines(f_out))
#'
#' # note that the extra blank column has also been removed
#' ncol(read.delim(f, nrows = 5, quote = ""))
#' ncol(read.delim(f_out, nrows = 5, quote = ""))
#' }
auk_clean <- function(f_in, f_out, sep = "\t", remove_text = FALSE,
overwrite = FALSE) {
.Deprecated()
# checks
awk_path <- auk_get_awk_path()
if (is.na(awk_path)) {
Expand Down Expand Up @@ -129,7 +112,7 @@ BEGIN {
# remove end of line tab
sub(/\t$/, \"\", $0)
# only keep rows with correct number of records
if (NF == ${ncols} || NR == 1) {
if (NF != ${ncols} || NR == 1) {
print ${print_cols}
}
}
Expand Down
2 changes: 1 addition & 1 deletion R/auk-ebd.r
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ auk_ebd <- function(file, file_sampling, sep = "\t") {
file <- ebd_file(file)
# read header rows
header <- tolower(get_header(file, sep))
header <- stringr::str_replace_all(header, "_", " ")
header <- stringr::str_replace_all(header, "[^a-z0-9]+", " ")
# fix for custom download
header[header == "state province"] <- "state"
header[header == "subnational1 code"] <- "state code"
Expand Down
3 changes: 2 additions & 1 deletion R/read.r
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ read_ebd.character <- function(x, reader, sep = "\t", unique = TRUE,
reader <- choose_reader(reader)
# get header
header <- get_header(x, sep = sep)
blank <- (header[length(header)] == "")

# read using fread, read_delim, or read.delim
col_types <- get_col_types(header, reader = reader)
Expand Down Expand Up @@ -98,9 +99,9 @@ read_ebd.character <- function(x, reader, sep = "\t", unique = TRUE,
}
}
}
out <- dplyr::as.tbl(out)

# remove possible blank final column
blank <- grepl("^[xXvV][0-9]{2}$", names(out)[ncol(out)])
if (blank) {
out[ncol(out)] <- NULL
}
Expand Down
26 changes: 5 additions & 21 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ This package uses the command-line program AWK to extract subsets of the eBird B

Because the eBird dataset is so large, step 3 typically takes several hours to run. Here's a simple example that extracts all Canada Jay records from within Canada.

```{r packages, include=FALSE}
library(auk)
```


```{r quickstart, eval = FALSE}
library(auk)
# path to the ebird data file, here a sample included in the package
Expand Down Expand Up @@ -99,27 +104,6 @@ ebd_df <- read_ebd(ebd_filtered)

## Usage

### Cleaning

Some rows in the dataset may have an incorrect number of columns, typically from problematic characters in the comments fields, and the dataset has an extra blank column at the end. The function `auk_clean()` drops these erroneous records and removes the blank column.

```{r auk-clean, message = FALSE}
library(auk)
# get the path to the example data included in the package
# in practice, provide path to ebd, e.g. f <- "data/ebd_relFeb-2018.txt"
f <- system.file("extdata/ebd-sample_messy.txt", package = "auk")
# output to a temp file for example -->
# in practice, provide path to output file -->
# e.g. f_out <- "output/ebd_clean.txt" -->
f_out <- tempfile()
# remove problem records
auk_clean(f, f_out)
# number of lines in input
length(readLines(f))
# number of lines in output
length(readLines(f_out))
```

### Filtering

`auk` uses a [pipeline-based workflow](http://r4ds.had.co.nz/pipes.html) for defining filters, which can then be compiled into an AWK script. Users should start by defining a reference to the dataset file with `auk_ebd()`. Then any of the following filters can be applied:
Expand Down
27 changes: 1 addition & 26 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,31 +137,6 @@ could be rewritten:
Usage
-----

### Cleaning

Some rows in the dataset may have an incorrect number of columns,
typically from problematic characters in the comments fields, and the
dataset has an extra blank column at the end. The function `auk_clean()`
drops these erroneous records and removes the blank column.

library(auk)
# get the path to the example data included in the package
# in practice, provide path to ebd, e.g. f <- "data/ebd_relFeb-2018.txt"
f <- system.file("extdata/ebd-sample_messy.txt", package = "auk")
# output to a temp file for example -->
# in practice, provide path to output file -->
# e.g. f_out <- "output/ebd_clean.txt" -->
f_out <- tempfile()
# remove problem records
auk_clean(f, f_out)
#> [1] "/var/folders/mg/qh40qmqd7376xn8qxd6hm5lwjyy0h2/T//Rtmp5RRL0m/filedeaea145fb9"
# number of lines in input
length(readLines(f))
#> [1] 51
# number of lines in output
length(readLines(f_out))
#> [1] 47

### Filtering

`auk` uses a [pipeline-based workflow](http://r4ds.had.co.nz/pipes.html)
Expand Down Expand Up @@ -231,7 +206,7 @@ defined, the filtering is actually conducted using `auk_filter()`.
auk_complete()
ebd
#> Input
#> EBD: /Users/mes335/projects/auk/inst/extdata/ebd-sample.txt
#> EBD: /Library/Frameworks/R.framework/Versions/3.5/Resources/library/auk/extdata/ebd-sample.txt
#>
#> Output
#> Filters not executed
Expand Down
29 changes: 1 addition & 28 deletions data-raw/ebd-samples.r
Original file line number Diff line number Diff line change
Expand Up @@ -53,19 +53,6 @@ readLines(f) %>%
writeLines(f)
stopifnot(length(tools::showNonASCII(readLines(f))) == 0)
stopifnot(all(read_ebd(f)$scientific_name %in% ebird_taxonomy$scientific_name))
# prepare a smaller sample of messy data
y <- read_tsv(f, quote = "",
col_types = cols(.default = col_character()))
y$empty_col <- NA_character_
names(y)[length(y)] <- ""
y <- sample_n(y, 50)
f <- "inst/extdata/ebd-sample_messy.txt"
write_tsv(y, f, na = "")
readLines(f) %>%
str_replace_all("\"", "") %>%
writeLines(f)
stopifnot(length(tools::showNonASCII(readLines(f))) == 0)
stopifnot(all(read_ebd(f, reader = "base")$scientific_name %in% ebird_taxonomy$scientific_name))

# filter for zero-fill example
filters <- auk_ebd(f_in, s_in) %>%
Expand Down Expand Up @@ -143,18 +130,4 @@ readLines(f) %>%
str_replace_all("\"", "") %>%
writeLines(f)
stopifnot(length(tools::showNonASCII(readLines(f))) == 0)
stopifnot(all(read_ebd(f)$scientific_name %in% ebird_taxonomy$scientific_name))

# after script: edit the messy file to introduce errors, especially tabs in comment fields
# keep top 5 rows intact
f <- "inst/extdata/ebd-sample_messy.txt"
stopifnot(length(tools::showNonASCII(readLines(f))) == 0)
stopifnot(all(read_ebd(f, reader = "base")$scientific_name %in% ebird_taxonomy$scientific_name))


# f_ebd <- system.file("extdata/zerofill-ex_ebd.txt", package = "auk")
# f_smpl <- system.file("extdata/zerofill-ex_sampling.txt", package = "auk")
# a <- c(TRUE, as.logical(read_tsv(f_ebd)[["ALL SPECIES REPORTED"]]))
# b <- c(TRUE, as.logical(read_tsv(f_smpl)[["ALL SPECIES REPORTED"]]))
# readLines(f_ebd)[a] %>% writeLines("inst/extdata/zerofill-ex_ebd.txt")
# readLines(f_smpl)[b] %>% writeLines("inst/extdata/zerofill-ex_sampling.txt")
stopifnot(all(read_ebd(f)$scientific_name %in% ebird_taxonomy$scientific_name))
Loading

0 comments on commit ed60266

Please sign in to comment.