/
bootdht.R
339 lines (314 loc) · 14.6 KB
/
bootdht.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
#' Bootstrap uncertainty estimation for distance sampling models
#'
#' Performs a bootstrap for simple distance sampling models using the same data
#' structures as [`dht`][mrds::dht]. Note that only geographical stratification
#' as supported in `dht` is allowed.
#'
#' @param model a model fitted by [`ds`] or a list of models
#' @param flatfile Data provided in the flatfile format. See [`flatfile`] for
#' details. Please note, it is a current limitation of bootdht that all
#' Sample.Label identifiers must be unique across all strata, i.e.transect
#' ids must not be re-used from one strata to another. An easy way to achieve
#' this is to paste together the stratum names and transect ids.
#' @param convert_units conversion between units for abundance estimation, see
#' "Units", below. (Defaults to 1, implying all of the units are "correct"
#' already.) This takes precedence over any unit conversion stored in `model`.
#' @param resample_strata should resampling happen at the stratum
#' (`Region.Label`) level? (Default `FALSE`)
#' @param resample_obs should resampling happen at the observation (`object`)
#' level? (Default `FALSE`)
#' @param resample_transects should resampling happen at the transect
#' (`Sample.Label`) level? (Default `TRUE`)
#' @param nboot number of bootstrap replicates
#' @param summary_fun function that is used to obtain summary statistics from
#' the bootstrap, see Summary Functions below. By default
#' [`bootdht_Nhat_summarize`] is used, which just extracts abundance estimates.
#' @param select_adjustments select the number of adjustments in each
#' bootstrap, when `FALSE` the exact detection function specified in `model` is
#' fitted to each replicate. Setting this option to `TRUE` can significantly
#' increase the runtime for the bootstrap. Note that for this to work `model`
#' must have been fitted with `adjustment!=NULL`.
#' @param sample_fraction what proportion of the transects was covered (e.g.,
#' 0.5 for one-sided line transects).
#' @param multipliers `list` of multipliers. See "Multipliers" below.
#' @param progress_bar which progress bar should be used? Default "base" uses
#' `txtProgressBar`, "none" suppresses output, "progress" uses the
#' `progress` package, if installed.
#' @param cores number of CPU cores to use to compute the estimates. See "Parallelization" below.
#' @param convert.units deprecated, see same argument with underscore, above.
#'
#' @section Summary Functions:
#' The function `summary_fun` allows the user to specify what summary
#' statistics should be recorded from each bootstrap. The function should take
#' two arguments, `ests` and `fit`. The former is the output from
#' `dht2`, giving tables of estimates. The latter is the fitted detection
#' function object. The function is called once fitting and estimation has been
#' performed and should return a `data.frame`. Those `data.frame`s
#' are then concatenated using `rbind`. One can make these functions
#' return any information within those objects, for example abundance or
#' density estimates or the AIC for each model. See Examples below.
#'
#' @section Multipliers:
#' It is often the case that we cannot measure distances to individuals or
#' groups directly, but instead need to estimate distances to something they
#' produce (e.g., for whales, their blows; for elephants their dung) -- this is
#' referred to as indirect sampling. We may need to use estimates of production
#' rate and decay rate for these estimates (in the case of dung or nests) or
#' just production rates (in the case of songbird calls or whale blows). We
#' refer to these conversions between "number of cues" and "number of animals"
#' as "multipliers".
#'
#' The `multipliers` argument is a `list`, with 3 possible elements (`creation`
#' and `decay`). Each element of which is either:
#' * `data.frame` and must have at least a column named `rate`, which abundance
#' estimates will be divided by (the term "multiplier" is a misnomer, but
#' kept for compatibility with Distance for Windows). Additional columns can
#' be added to give the standard error and degrees of freedom for the rate
#' if known as `SE` and `df`, respectively. You can use a multirow
#' `data.frame` to have different rates for different geographical areas
#' (for example). In this case the rows need to have a column (or columns)
#' to `merge` with the data (for example `Region.Label`).
#' * a `function` which will return a single estimate of the relevant
#' multiplier. See [`make_activity_fn`] for a helper function for use with the
#' `activity` package.
#'
#' @section Model selection:
#' Model selection can be performed on a per-replicate basis within the
#' bootstrap. This has three variations:
#' 1. when `select_adjustments` is `TRUE` then adjustment terms are selected
#' by AIC within each bootstrap replicate (provided that `model` had the
#' `order` and `adjustment` options set to non-`NULL`.
#' 2. if `model` is a list of fitted detection functions, each of these is
#' fitted to each replicate and results generated from the one with the
#' lowest AIC.
#' 3. when `select_adjustments` is `TRUE` and `model` is a list of fitted
#' detection functions, each model fitted to each replicate and number of
#' adjustments is selected via AIC.
#' This last option can be extremely time consuming.
#'
#' @section Parallelization:
#' If `cores`>1 then the `parallel`/`doParallel`/`foreach`/`doRNG` packages
#' will be used to run the computation over multiple cores of the computer. To
#' use this component you need to install those packages using:
#' `install.packages(c("foreach", "doParallel", "doRNG"))` It is advised that
#' you do not set `cores` to be greater than one less than the number of cores
#' on your machine. The `doRNG` package is required to make analyses
#' reproducible ([`set.seed`] can be used to ensure the same answers).
#'
#' It is also hard to debug any issues in `summary_fun` so it is best to run a
#' small number of bootstraps first in parallel to check that things work. On
#' Windows systems `summary_fun` does not have access to the global environment
#' when running in parallel, so all computations must be made using only its
#' `ests` and `fit` arguments (i.e., you can not use R objects from elsewhere
#' in that function, even if they are available to you from the console).
#'
#' Another consequence of the global environment being unavailable inside
#' parallel bootstraps is that any starting values in the model object passed
#' in to `bootdht` must be hard coded (otherwise you get back 0 successful
#' bootstraps). For a worked example showing this, see the camera trap distance
#' sampling online example at
#' <https://examples.distancesampling.org/Distance-cameratraps/camera-distill.html>.
#'
#' @importFrom utils txtProgressBar setTxtProgressBar getTxtProgressBar
#' @importFrom stats as.formula AIC
#' @importFrom mrds ddf dht
#' @seealso [`summary.dht_bootstrap`] for how to summarize the results,
#' [`bootdht_Nhat_summarize`] and [`bootdht_Dhat_summarize`] for an examples of
#' summary functions.
#' @export
#' @examples
#' \dontrun{
#' # fit a model to the minke data
#' data(minke)
#' mod1 <- ds(minke)
#'
#' # summary function to save the abundance estimate
#' Nhat_summarize <- function(ests, fit) {
#' return(data.frame(Nhat=ests$individuals$N$Estimate))
#' }
#'
#' # perform 5 bootstraps
#' bootout <- bootdht(mod1, flatfile=minke, summary_fun=Nhat_summarize, nboot=5)
#'
#' # obtain basic summary information
#' summary(bootout)
#' }
bootdht <- function(model,
flatfile,
resample_strata=FALSE,
resample_obs=FALSE,
resample_transects=TRUE,
nboot=100,
summary_fun=bootdht_Nhat_summarize,
convert_units=1,
select_adjustments=FALSE,
sample_fraction=1,
multipliers=NULL,
progress_bar="base",
cores=1, convert.units=NULL){
.deprecated_args(c("convert.units"), as.list(match.call()))
if(!any(c(resample_strata, resample_obs, resample_transects))){
stop("At least one of resample_strata, resample_obs, resample_transects must be TRUE")
}
# make everything a list...
if(!inherits(model, "list")){
models <- list(model)
# yes, I am a monster
}else{
models <- model
}
if(missing(convert_units)){
convert_units <- NULL
}
# only use valid ds models
for(i in seq_along(models)){
if(!is(models[[i]], "dsmodel")){
stop("Only models fitted by Distance::ds() may be used")
}
}
dat <- flatfile
if(!("object" %in% names(dat))){
dat$object <- 1:nrow(dat)
}
# if we're using the default summary function and have Area 0 then
# we're not going to have a good time
if(missing(summary_fun) &
(is.null(flatfile$Area) || all(flatfile$Area==0))){
stop("No Area in flatfile, densities will be returned and the default summary function records only abundances. You need to write your own summary_fun.")
}
# apply the sample fraction
dat <- dht2_sample_fraction(sample_fraction, dat)
dat$Effort <- dat$Effort * dat$sample_fraction
dat$sample_fraction <- NULL
# process non-function multipliers
multipliers_nofun <- Filter(Negate(is.function), multipliers)
if(length(multipliers_nofun) > 0){
dat <- dht2_multipliers(multipliers_nofun, dat)
}else{
dat$rate <- 1
}
# get any multiplier functions
multipliers_fun <- Filter(is.function, multipliers)
# this can be generalized later on
stratum_label <- "Region.Label"
obs_label <- "object"
sample_label <- "Sample.Label"
# which resamples are we going to make?
possible_resamples <- c(stratum_label, sample_label, obs_label)
our_resamples <- possible_resamples[c(resample_strata, resample_transects,
resample_obs)]
# make sure these are characters for resampling
dat[, our_resamples] <- lapply(dat[, our_resamples, drop=FALSE], as.character)
# process models
# this resolves all symbols in the call so arguments can be accessed when
# running in parallel
models <- lapply(models, function(model){
lpars <- as.list(model$call)
for(i in seq(2, length(lpars))){
if(is.symbol(model$call[[names(lpars)[i]]])){
model$call[[names(lpars)[i]]] <- eval(lpars[[i]],
envir=parent.frame(n=3))
}
}
model
})
message(paste0("Performing ", nboot, " bootstraps\n"))
if(cores > 1 & progress_bar != "none"){
progress_bar <- "none"
message("Progress bars cannot be shown when using cores>1")
}
# decide how to report progress
if(progress_bar == "base"){
pb <- list(pb = txtProgressBar(0, nboot, style=3),
increment = function(pb){
setTxtProgressBar(pb, getTxtProgressBar(pb)+1)
},
set = function(pb, n, max) setTxtProgressBar(pb, n),
done = function(pb){
setTxtProgressBar(pb, environment(pb$up)$max)
})
}else if(progress_bar == "none"){
pb <- list(pb = NA,
set = function(pb, n, max) invisible(),
increment = function(pb) invisible(),
done = function(pb) invisible())
}else if(progress_bar == "progress"){
if (!requireNamespace("progress", quietly = TRUE)){
stop("Package 'progress' not installed!")
}else{
pb <- list(pb = progress::progress_bar$new(
format=" [:bar] :percent eta: :eta",
total=nboot, clear=FALSE, width=80),
set = function(pb, n, max) pb$update(n/max),
increment = function(pb) pb$tick(),
done = function(pb) pb$update(1))
pb$pb$tick(0)
}
}else{
stop("progress_bar must be one of \"none\", \"base\" or \"progress\"")
}
# run the bootstrap
if(cores > 1){
if (!requireNamespace("foreach", quietly = TRUE) &
!requireNamespace("doParallel", quietly = TRUE) &
!requireNamespace("doRNG", quietly = TRUE) &
!requireNamespace("parallel", quietly = TRUE)){
stop("Packages 'parallel', 'foreach' and 'doParallel' need to be installed to use multiple cores.")
}
# build the cluster
cl <- parallel::makeCluster(cores)
doParallel::registerDoParallel(cl)
# shutdown cluster when the function exits
# (this works even if the function crashes)
on.exit(parallel::stopCluster(cl))
# load the activity package in the other sessions
if(length(multipliers_fun) > 0){
packages <- c("activity")
}else{
packages <- NULL
}
# needed to avoid a syntax error/check fail
`%dopar%` <- foreach::`%dopar%`
`%dorng2%` <- doRNG::`%dorng%`
# fit the model nboot times over cores cores
# note there is a bit of fiddling here with the progress bar to get it to
# work (updates happen in this loop rather than in bootit)
boot_ests <- foreach::foreach(i=1:nboot, .packages=packages) %dorng2% {
bootit(dat, models=models, our_resamples=our_resamples,
summary_fun=summary_fun, convert_units=convert_units,
pb=list(increment=function(pb){invisible()}),
multipliers_fun=multipliers_fun, sample_label=sample_label,
select_adjustments=select_adjustments)
}
}else{
boot_ests <- replicate(nboot,
bootit(dat, models=models, our_resamples,
summary_fun, convert_units=convert_units,
pb=pb, multipliers_fun=multipliers_fun,
sample_label=sample_label,
select_adjustments=select_adjustments),
simplify=FALSE)
}
# do some post-processing
fail_fun <- function(x) inherits(x, "bootstrap_failure")
# add replicate IDs
bootids <- seq_len(length(boot_ests))
# how many failures
failures <- length(Filter(fail_fun, boot_ests))
# remove failures from IDs
bootids <- as.list(bootids[unlist(lapply(boot_ests, Negate(fail_fun)))])
# get just the "good" results
boot_ests <- Filter(Negate(fail_fun), boot_ests)
# add IDs to estimates list
boot_ests <- mapply(cbind.data.frame,
boot_ests, bootstrap_ID=bootids,
SIMPLIFY=FALSE)
# the above is then a list of thingos, do the "right" thing and assume
# they are data.frames and then rbind them all together
boot_ests <- do.call(rbind.data.frame, boot_ests)
cat("\n")
attr(boot_ests, "nboot") <- nboot
attr(boot_ests, "failures") <- failures
class(boot_ests) <- c("dht_bootstrap", "data.frame")
return(boot_ests)
}