-
Notifications
You must be signed in to change notification settings - Fork 3
/
atlas_media.R
175 lines (167 loc) · 6.39 KB
/
atlas_media.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#' Get metadata on images, sounds and videos
#'
#' In addition to text data describing individual occurrences and their
#' attributes, ALA stores images, sounds and videos associated with a given
#' record. `atlas_media` displays metadata for any and all of the media types.
#'
#' @param request optional `data_request` object: generated by a call to
#' [galah_call()].
#' @param identify `data.frame`: generated by a call to
#' [galah_identify()].
#' @param filter `data.frame`: generated by a call to
#' [galah_filter()]
#' @param select `list`: generated by a call to [galah_select()]
#' @param geolocate `string`: generated by a call to
#' [galah_geolocate()]
#' @param data_profile `string`: generated by a call to
#' [galah_apply_profile()]
#' @details [atlas_media()] works by first finding all occurrence records
#' matching the filter which contain media, then downloading the metadata for the
#' media. To actually download the files themselves, use [collect_media()].
#' It may be beneficial when requesting a large number of records to show a progress
#' bar by setting `verbose = TRUE` in [galah_config()].
#' @return An object of class `tbl_df` and `data.frame` (aka a tibble)
#' of metadata of the requested media.
#' @seealso [atlas_counts()] to find the number of records with media; but note this
#' is not necessarily the same as the number of media files, as each record can have
#' more than one media file associated with it (see examples section for how to do this).
#'
#' @examples \dontrun{
#' # Download Regent Honeyeater records with multimedia attached
#' galah_call() |>
#' galah_identify("Regent Honeyeater") |>
#' galah_filter(year == 2011) |>
#' atlas_media()
#'
#' # Download multimedia
#' galah_call() |>
#' galah_identify("Regent Honeyeater") |>
#' galah_filter(year == 2011) |>
#' atlas_media() |>
#' collect_media(path = "folder/your-directory")
#'
#' # Specify a single media type to download
#' galah_call() |>
#' galah_identify("Eolophus roseicapilla") |>
#' galah_filter(multimedia == "Sound") |>
#' atlas_media()
#'
#' # It's good to check how many records have media files before downloading
#' galah_call() |>
#' galah_filter(multimedia == c("Image", "Sound", "Video")) |>
#' galah_group_by(multimedia) |>
#' atlas_counts()
#'
#'
#' # post version 2.0, it is possible to run all steps in sequence
#' # first, get occurrences, making sure to include media fields:
#' occurrences_df <- request_data() |>
#' identify("Regent Honeyeater") |>
#' filter(!is.na(images), year == 2011) |>
#' select(group = "media") |>
#' collect()
#'
#' # second, get media metadata
#' media_info <- request_metadata() |>
#' filter(media == occurrences_df) |>
#' collect()
#'
#' # the two steps above + `right_join()` are synonymous with `atlas_media()`
#' # third, get images
#' request_files() |>
#' filter(media == media_info) |>
#' collect(thumbnail = TRUE)
#'
#' # step three is synonymous with `collect_media()`
#'}
#' @importFrom dplyr any_of
#' @importFrom dplyr bind_rows
#' @importFrom dplyr relocate
#' @importFrom dplyr right_join
#' @importFrom dplyr join_by
#' @importFrom glue glue
#' @importFrom httr2 url_build
#' @importFrom httr2 url_parse
#' @importFrom potions pour
#' @importFrom rlang abort
#' @importFrom tibble tibble
#' @importFrom tidyr unnest_longer
#' @export
atlas_media <- function(request = NULL,
                        identify = NULL,
                        filter = NULL,
                        select = NULL,
                        geolocate = NULL,
                        data_profile = NULL
                        ) {
  # Capture the supplied arguments. This has to remain the first statement:
  # `environment()` would otherwise also contain the locals created below.
  args <- as.list(environment())
  # convert to a `data_request` object
  .query <- check_atlas_inputs(args)
  .query$type <- "occurrences" # default, but in case supplied otherwise

  # ensure a filter is present (somewhat redundant with `collapse`)
  if (is.null(.query$filter)) {
    abort("You must specify a valid `filter()` to use `atlas_media()`")
  }

  # ensure media columns are present in `select`
  media_fields <- c("images", "videos", "sounds")
  if (is.null(.query$select)) {
    # no `select` supplied: request the "basic" and "media" groups, so every
    # media field is treated as present
    .query <- update_data_request(
      .query,
      select = galah_select(group = c("basic", "media")))
    present_fields <- media_fields
  } else {
    # `select` supplied: collapse the request and read the requested fields
    # back out of the `fields` parameter of the resulting url
    x <- collapse(.query)
    # `[[` (exact matching) rather than `$` so that a differently-named
    # parameter is never partially matched
    fields_string <- url_parse(x$url)[["query"]][["fields"]]
    selected_fields <- strsplit(fields_string, split = ",")[[1]]
    # abort if no media field was requested
    if (!any(selected_fields %in% media_fields)) {
      selected_text <- paste(selected_fields, collapse = ", ")
      bullets <- c("No media fields requested by `select()`",
                   i = glue("try `galah_select({selected_text}, group = 'media')` instead"))
      abort(bullets)
    }
    present_fields <- selected_fields[selected_fields %in% media_fields]
    # from here on `.query` is the collapsed object, not a `data_request`
    .query <- x
  } # end `select` checks

  # build a filter restricting results to records that contain media of the
  # requested types; one "(field:*)" clause per field, OR-ed when several
  media_fq <- glue("({present_fields}:*)")
  if (length(present_fields) > 1) {
    media_fq <- glue("({paste(media_fq, collapse = ' OR ')})")
  }

  # update .query with the media filter
  # note that behaviour here depends on whether `collapse()` was run above
  if (inherits(.query, "data_request")) {
    .query$filter <- bind_rows(.query$filter,
                               tibble(variable = "media",
                                      logical = "==",
                                      value = paste(present_fields,
                                                    collapse = "|"),
                                      query = as.character(media_fq)))
  } else if (inherits(.query, "query")) { # i.e. already collapsed
    url <- url_parse(.query$url)
    existing_fq <- url$query$fq
    # only join with "AND" when there is something to join to; otherwise the
    # result would be a dangling "AND(...)" clause
    if (is.null(existing_fq) || identical(existing_fq, "")) {
      url$query$fq <- as.character(media_fq)
    } else {
      url$query$fq <- paste0(existing_fq, "AND", media_fq)
    }
    .query$url <- url_build(url)
    .query <- compute_occurrences(.query)
  } else {
    abort("unknown object class in `atlas_media()`")
  }

  # get occurrences, then expand the media columns to one row per entry
  occ <- .query |>
    collect(wait = TRUE) |>
    unnest_longer(col = any_of(present_fields))
  occ$media_id <- build_media_id(occ)

  # collect media metadata
  media <- request_metadata() |>
    filter(media == occ) |>
    collect()

  # join (keeping every row of the media metadata) and return with
  # `media_id` as the first column
  occ_media <- right_join(occ,
                          media,
                          by = join_by("media_id" == "image_id"))
  relocate(occ_media, "media_id", .before = 1)
}