-
Notifications
You must be signed in to change notification settings - Fork 39
/
furrr-options.R
405 lines (347 loc) · 11.3 KB
/
furrr-options.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
#' Options to fine tune furrr
#'
#' These options fine tune furrr functions, such as [future_map()]. They
#' are either used by furrr directly, or are passed on to [future::future()].
#'
#' @param ... These dots are reserved for future extensibility and must
#' be empty.
#'
#' @param stdout A logical.
#'
#' - If `TRUE`, standard output of the underlying futures is captured and
#' relayed as soon as possible.
#'
#' - If `FALSE`, output is silenced by sinking it to the null device.
#'
#' - If `NA`, output is not intercepted. This is not recommended.
#'
#' @param conditions A character string of conditions classes to be captured
#' and relayed. The default is the same as the condition argument of
#' [future::Future()]. To not intercept conditions, use
#' `conditions = character(0L)`. Errors are always relayed.
#'
#' @param globals A logical, a character vector, a named list, or `NULL` for
#' controlling how globals are handled. For details, see the
#' `Global variables` section below.
#'
#' @param packages A character vector, or `NULL`. If supplied, this specifies
#' packages that are guaranteed to be attached in the R environment where the
#' future is evaluated.
#'
#' @param lazy A logical. Specifies whether futures should be resolved
#' lazily or eagerly.
#'
#' @param seed A logical, an integer of length `1` or `7`, a list of
#' `length(.x)` with pre-generated random seeds, or `NULL`. For details, see
#' the `Reproducible random number generation (RNG)` section below.
#'
#' @param scheduling A single integer, logical, or `Inf`. This argument
#' controls the average number of futures ("chunks") per worker.
#'
#' - If `0`, then a single future is used to process all elements of `.x`.
#'
#' - If `1` or `TRUE`, then one future per worker is used.
#'
#' - If `2`, then each worker will process two futures (provided there
#' are enough elements in `.x`).
#'
#' - If `Inf` or `FALSE`, then one future per element of `.x` is used.
#'
#' This argument is only used if `chunk_size` is `NULL`.
#'
#' @param chunk_size A single integer, `Inf`, or `NULL`. This argument
#' controls the average number of elements per future (`"chunk"`). If `Inf`,
#' then all elements are processed in a single future. If `NULL`, then
#' `scheduling` is used instead to determine how `.x` is chunked.
#'
#' @param prefix A single character string, or `NULL`. If a character string,
#' then each future is assigned a label as `{prefix}-{chunk-id}`. If `NULL`,
#' no labels are used.
#'
#'
#' @section Global variables:
#'
#' `globals` controls how globals are identified, similar to the `globals`
#' argument of [future::future()]. Since all function calls use the same set of
#' globals, furrr gathers globals upfront (once), which is more efficient than
#' if it was done for each future independently.
#'
#' * If `TRUE` or `NULL`, then globals are automatically identified and
#' gathered.
#'
#' * If a character vector of names is specified, then those globals are
#' gathered.
#'
#' * If a named list, then those globals are used as is.
#'
#' * In all cases, `.f` and any `...` arguments are automatically passed as
#' globals to each future created, as they are always needed.
#'
#' @section Reproducible random number generation (RNG):
#'
#' Unless `seed = FALSE`, furrr functions are guaranteed to generate
#' the exact same sequence of random numbers _given the same initial
#' seed / RNG state_ regardless of the type of futures and scheduling
#' ("chunking") strategy.
#'
#' Setting `seed = NULL` is equivalent to `seed = FALSE`, except that the
#' `future.rng.onMisuse` option is not consulted to potentially monitor the
#' future for faulty random number usage. See the `seed` argument of
#' [future::future()] for more details.
#'
#' RNG reproducibility is achieved by pre-generating the random seeds for all
#' iterations (over `.x`) by using L'Ecuyer-CMRG RNG streams. In each
#' iteration, these seeds are set before calling `.f(.x[[i]], ...)`.
#' _Note, for large `length(.x)` this may introduce a large overhead._
#'
#' A fixed `seed` may be given as an integer vector, either as a full
#' L'Ecuyer-CMRG RNG seed of length `7`, or as a seed of length `1` that
#' will be used to generate a full L'Ecuyer-CMRG seed.
#'
#' If `seed = TRUE`, then `.Random.seed` is returned if it holds a
#' L'Ecuyer-CMRG RNG seed, otherwise one is created randomly.
#'
#' If `seed = NA`, a L'Ecuyer-CMRG RNG seed is randomly created.
#'
#' If none of the function calls `.f(.x[[i]], ...)` use random number
#' generation, then `seed = FALSE` may be used.
#'
#' In addition to the above, it is possible to specify a pre-generated
#' sequence of RNG seeds as a list such that `length(seed) == length(.x)` and
#' where each element is an integer seed that can be assigned to `.Random.seed`.
#' Use this alternative with caution. _Note that `as.list(seq_along(.x))` is
#' not a valid set of such `.Random.seed` values._
#'
#' In all cases but `seed = FALSE`, after a furrr function returns, the RNG
#' state of the calling R process is guaranteed to be "forwarded one step" from
#' the RNG state before the call. This is true regardless of the future
#' strategy / scheduling used. This is done in order to guarantee that an R
#' script calling `future_map()` multiple times should be numerically
#' reproducible given the same initial seed.
#'
#' @export
#' @examples
#' furrr_options()
furrr_options <- function(...,
stdout = TRUE,
conditions = NULL,
globals = TRUE,
packages = NULL,
lazy = FALSE,
seed = FALSE,
scheduling = 1.0,
chunk_size = NULL,
prefix = NULL) {
ellipsis::check_dots_empty()
stdout <- validate_stdout(stdout)
conditions <- validate_conditions(conditions)
globals <- validate_globals(globals)
packages <- validate_packages(packages)
lazy <- validate_lazy(lazy)
seed <- validate_seed(seed)
scheduling <- validate_scheduling(scheduling)
chunk_size <- validate_chunk_size(chunk_size)
prefix <- validate_prefix(prefix)
out <- list(
stdout = stdout,
conditions = conditions,
globals = globals,
packages = packages,
lazy = lazy,
seed = seed,
scheduling = scheduling,
chunk_size = chunk_size,
prefix = prefix
)
structure(out, class = "furrr_options")
}
#' @export
print.furrr_options <- function(x, ...) {
cat_line("<furrr_options>")
}
# ------------------------------------------------------------------------------
#' Deprecated furrr options
#'
#' @description
#' `r lifecycle::badge("deprecated")`
#'
#' `future_options()` has been deprecated in favor of [furrr_options()] as
#' of furrr 0.2.0.
#'
#' @inheritParams furrr_options
#'
#' @keywords internal
#' @export
#' @examples
#' future_options()
future_options <- function(globals = TRUE,
packages = NULL,
seed = FALSE,
lazy = FALSE,
scheduling = 1.0) {
lifecycle::deprecate_warn("0.2.0", "future_options()", "furrr_options()")
furrr_options(
globals = globals,
packages = packages,
seed = seed,
lazy = lazy,
scheduling = scheduling
)
}
# ------------------------------------------------------------------------------
assert_furrr_options <- function(x) {
if (!is_furrr_options(x)) {
abort("`.options` must be created from `furrr_options()`.")
}
invisible(x)
}
is_furrr_options <- function(x) {
inherits(x, "furrr_options")
}
# ------------------------------------------------------------------------------
validate_stdout <- function(x) {
vctrs::vec_assert(x, size = 1L, arg = "stdout")
vctrs::vec_cast(x, logical(), x_arg = "stdout")
}
validate_conditions <- function(x) {
vctrs::vec_cast(x, character(), x_arg = "conditions")
}
validate_globals <- function(x) {
if (is.null(x)) {
return(TRUE)
}
if (is.logical(x)) {
if (!is_bool(x)) {
abort("A logical `globals` must be length 1 and can't be `NA`.")
}
return(x)
}
if (is.character(x)) {
return(x)
}
if (is.list(x)) {
if (!is_named(x)) {
abort("`globals` cannot be an unnamed or partially named list.")
}
return(x)
}
abort("`globals` must be `NULL`, a logical, a character vector, or a named list.")
}
validate_packages <- function(x) {
if (is.null(x)) {
return(x)
}
x <- vctrs::vec_cast(x, character(), x_arg = "packages")
if (any(is.na(x))) {
abort("`packages` can't be `NA`.")
}
x
}
validate_lazy <- function(x) {
vctrs::vec_assert(x, size = 1, arg = "lazy")
x <- vctrs::vec_cast(x, logical(), x_arg = "lazy")
if (is.na(x)) {
abort("`lazy` can't be `NA`.")
}
x
}
validate_seed <- function(x) {
if (is.null(x)) {
return(x)
}
# Size is validated when we have `.x`
if (is.list(x)) {
x <- validate_seed_list(x)
return(x)
}
if (is.logical(x)) {
if (!is_bool(x)) {
abort("A logical `seed` must be length 1 and must not be `NA`.")
}
return(x)
}
if (is.double(x)) {
x <- vctrs::vec_cast(x, integer(), x_arg = "seed")
}
if (is.integer(x)) {
if (length(x) != 1L && length(x) != 7L) {
abort("An integer `seed` must be length 1 or 7.")
}
if (any(is.na(x))) {
abort("An integer `seed` cannot have `NA` values.")
}
return(x)
}
abort("`seed` must be a logical, integer, or list.")
}
validate_seed_list <- function(x) {
seeds_are_integers <- purrr::map_lgl(x, ~typeof(.x) == "integer")
if (!all(seeds_are_integers)) {
abort("All elements of a list `seed` must be integers.")
}
unique_lengths <- unique(lengths(x))
if (length(unique_lengths) != 1L) {
abort("All elements of a list `seed` must have the same length.")
}
if (identical(unique_lengths, 1L)) {
abort(paste0(
"All pre-generated random seed elements of a list `seed` ",
"must be valid `.Random.seed` seeds, which means they should be all ",
"integers and consists of two or more elements, not just one."
))
}
# For efficiency, just check the first seed for validity
seed <- x[[1]]
if (!is_valid_random_seed(seed)) {
abort(paste0(
"All pre-generated random seed elements of a list `seed` ",
"must be valid `.Random.seed` seeds, but the first does not seem to be."
))
}
x
}
validate_scheduling <- function(x) {
if (length(x) != 1L) {
abort("`scheduling` must be length 1.")
}
if (identical(x, Inf)) {
return(x)
}
if (is.logical(x)) {
if (!is_bool(x)) {
abort("A logical `scheduling` value can't be `NA`.")
}
return(x)
}
x <- vctrs::vec_cast(x, integer(), x_arg = "scheduling")
if (x < 0L) {
abort("`scheduling` must be greater than or equal to zero.")
}
x
}
validate_chunk_size <- function(x) {
if (is.null(x)) {
return(x)
}
if (identical(x, Inf)) {
return(x)
}
vctrs::vec_assert(x, size = 1L, arg = "chunk_size")
x <- vctrs::vec_cast(x, integer(), x_arg = "chunk_size")
if (is.na(x)) {
abort("`chunk_size` can't be `NA`.")
}
if (x <= 0L) {
abort("`chunk_size` must be greater than zero.")
}
x
}
validate_prefix <- function(x) {
if (is.null(x)) {
return(x)
}
if (!is_string(x)) {
abort("`prefix` must be a character string, or `NULL`.")
}
x
}