-
Notifications
You must be signed in to change notification settings - Fork 0
/
pick_spectra.R
275 lines (264 loc) · 10.6 KB
/
pick_spectra.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
# WARNING - Generated by {fusen} from dev/dereplicate-spectra.Rmd: do not edit by hand
#' Cherry-pick Bruker MALDI Biotyper spectra
#'
#' Using the clusters information, and potential additional metadata as external
#' criteria, spectra are labeled as to be picked for each cluster. Note that some
#' spectra and therefore clusters can be explicitly removed (_masked_)
#' from the picking decision if they have been previously picked
#' or should be discarded, using logical columns in the metadata table.
#' If no metadata are provided, the reference spectra of each cluster will be picked.
#'
#' @param cluster_df A tibble with clusters information
#' from the [delineate_with_similarity] or the [import_spede_clusters] function.
#' @param metadata_df Optional tibble with relevant metadata to guide the
#' picking process (e.g., OD600).
#' @param criteria_column Optional character indicating the column in `metadata_df`
#' to be used as a criterion.
#' @param hard_mask_column Column name in the `cluster_df` or `metadata_df` tibble indicating whether the spectra, **and the clusters to which they belong** should be discarded (`TRUE`) or not (`FALSE`) before the picking decision.
#' @param soft_mask_column Column name in the `cluster_df` or `metadata_df` tibble indicating whether the spectra should be discarded (`TRUE`) or not (`FALSE`) before the picking decision.
#' @param is_descending_order Optional logical indicating whether to sort the `criteria_column` from the highest-to-lowest value (`TRUE`) or lowest-to-highest (`FALSE`).
#' @param is_sorted Optional logical to indicate that the `cluster_df` is
#' already sorted by cluster based on (usually multiple) internal criteria to
#' pick the first of each cluster. This flag is **overridden** if a `metadata_df`
#' is provided.
#'
#' @return A tibble with as many rows as `cluster_df` with an additional logical
#' column named `to_pick` to indicate whether the colony associated to the spectra
#' should be picked. If `metadata_df` is provided, then additional columns from
#' this tibble are added to the returned tibble.
#'
#' @seealso [delineate_with_similarity], [set_reference_spectra]. For a useful utility function to soft-mask specific spectra: [is_well_on_edge].
#'
#' @export
#' @examples
#' # 0. Load a toy example of a tibble of clusters created by
#' # the `delineate_with_similarity` function.
#' clusters <- readRDS(
#' system.file("clusters_tibble.RDS",
#' package = "maldipickr"
#' )
#' )
#' # 1. By default and if no other metadata are provided,
#' # the function picks reference spectra for each cluster.
#' #
#' # N.B: The spectra `name` and `to_pick` columns are moved to the left
#' # only for clarity using the `relocate()` function.
#' #
#' pick_spectra(clusters) %>%
#' dplyr::relocate(name, to_pick) # only for clarity
#'
#' # 2.1 Simulate OD600 values with uniform distribution
#' # for each of the colonies we measured with
#' # the Bruker MALDI Biotyper
#' set.seed(104)
#' metadata <- dplyr::transmute(
#' clusters,
#' name = name, OD600 = runif(n = nrow(clusters))
#' )
#' metadata
#'
#' # 2.2 Pick the spectra based on the highest
#' # OD600 value per cluster
#' pick_spectra(clusters, metadata, "OD600") %>%
#' dplyr::relocate(name, to_pick) # only for clarity
#'
#' # 3.1 Say that the wells on the right side of the plate are
#' # used for negative controls and should not be picked.
#' metadata <- metadata %>% dplyr::mutate(
#' well = gsub(".*[A-Z]([0-9]{1,2}$)", "\\1", name) %>%
#' strtoi(),
#' is_edge = is_well_on_edge(
#' well_number = well, plate_layout = 96, edges = "right"
#' )
#' )
#'
#' # 3.2 Pick the spectra after discarding (or soft masking)
#' # the spectra indicated by the `is_edge` column.
#' pick_spectra(clusters, metadata, "OD600",
#' soft_mask_column = "is_edge"
#' ) %>%
#' dplyr::relocate(name, to_pick) # only for clarity
#'
#' # 4.1 Say that some spectra were picked before
#' # (e.g., in the column F) in a previous experiment.
#' # We do not want to pick clusters with those spectra
#' # included to limit redundancy.
#' metadata <- metadata %>% dplyr::mutate(
#' picked_before = grepl("_F", name)
#' )
#' # 4.2 Pick the spectra from clusters without spectra
#' # labeled as `picked_before` (hard masking).
#' pick_spectra(clusters, metadata, "OD600",
#' hard_mask_column = "picked_before"
#' ) %>%
#' dplyr::relocate(name, to_pick) # only for clarity
pick_spectra <- function(
    cluster_df,
    metadata_df = NULL, criteria_column = NULL,
    hard_mask_column = NULL, soft_mask_column = NULL,
    is_descending_order = TRUE,
    is_sorted = FALSE) {
  # Label the spectra to pick in each cluster with a logical `to_pick` column.
  #
  # Without metadata, `to_pick` is a copy of the `is_reference` column.
  # With metadata, the spectra are optionally masked, sorted within each
  # cluster by `criteria_column`, and the first spectrum of each remaining
  # cluster is picked. Masked spectra are kept in the output with
  # `to_pick = FALSE`, and the input row order is restored.
  has_metadata <- !is.null(metadata_df)

  # Check that:
  # - the mandatory columns describing the clusters are present
  # - clusters have a reference spectra to be picked if no metadata are provided
  # - when metadata (resp. a criteria column) is provided,
  #     a criteria column (resp. metadata) should be provided as well
  # - when masking columns are indicated, the metadata tibble should be added
  # - the spectra names in the metadata table are also present in the
  #     clusters table
  if (!all(c("name", "membership") %in% colnames(cluster_df))) {
    stop(
      "'cluster_df' lacks one of the following columns: name or membership"
    )
  }
  if (!has_metadata && !"is_reference" %in% colnames(cluster_df)) {
    stop(
      "No additional metadata are provided ('metadata_df' tibble is NULL)",
      " and there is no 'is_reference' column in the 'cluster_df' tibble.\n",
      "Please do one of the following strategy:\n",
      "  1. Either run the 'set_reference_spectra' or 'import_spede_clusters'",
      " function beforehand to indicate a reference spectra\n",
      "  2. Provide a 'metadata' tibble\n"
    )
  }
  # Metadata and criteria column must be given together (both or neither)
  if (xor(has_metadata, !is.null(criteria_column))) {
    stop(
      "Additional metadata ('metadata_df' tibble) should be provided",
      " with a criteria column ('criteria_column') to be sorted.",
      " Please provide the two together."
    )
  }
  if (
    !has_metadata &&
      (!is.null(hard_mask_column) || !is.null(soft_mask_column))
  ) {
    stop(
      "Masking column(s) ('hard'/'soft') require an additional",
      " metadata ('metadata_df')"
    )
  }
  # The metadata are merged on the 'name' column, so fail early and
  # explicitly if it is absent instead of failing inside the join
  if (has_metadata && !"name" %in% colnames(metadata_df)) {
    stop(
      "'metadata_df' lacks the following column: name"
    )
  }
  if (has_metadata && !all(metadata_df[["name"]] %in% cluster_df[["name"]])) {
    stop(
      "The spectra names in the metadata ('metadata_df' tibble) table",
      " do not match the names in the clusters table ('cluster_df')."
    )
  }

  # Warn that the metadata table is prioritized over the 'is_sorted' flag
  if (is_sorted && has_metadata) {
    message(
      "The 'is_sorted' flag is set to TRUE, while a metadata table is",
      " provided. ",
      "Please note that the metadata table takes precedence and the cherry-picking",
      " step will rely on the 'criteria_column'."
    )
    # Set the flag to FALSE for consistency
    is_sorted <- FALSE
  }

  # Preparing the table by merging with the metadata if present
  # and checking the presence of the columns

  # Keep the initial order of the rows
  cluster_df <- tibble::rowid_to_column(cluster_df, "rowid")
  # And a copy of the unmasked data.frame, used at the end to bring back
  # the masked/discarded spectra
  orig_cluster_df <- cluster_df

  if (has_metadata) {
    cluster_df <- dplyr::left_join(cluster_df, metadata_df, by = "name")
    # Update the copy so that it also carries the metadata columns
    orig_cluster_df <- cluster_df

    # 'criteria_column' is guaranteed to be non-NULL here (checked above)
    if (!criteria_column %in% colnames(cluster_df)) {
      stop(
        "The 'criteria_column' is not present in the merged tibble."
      )
    }
    if (
      !is.null(soft_mask_column) &&
        !soft_mask_column %in% colnames(cluster_df)
    ) {
      stop(
        "The 'soft_mask_column' is not present in the merged tibble."
      )
    }
    if (
      !is.null(hard_mask_column) &&
        !hard_mask_column %in% colnames(cluster_df)
    ) {
      stop(
        "The 'hard_mask_column' is not present in the merged tibble."
      )
    }

    # Hard mask: discard the spectra *and* their clusters
    # Soft mask: discard the spectra only

    # Soft mask (applied first, so soft-masked spectra are not considered
    # by the hard mask below). Explicitly namespaced: a bare `filter()`
    # resolves to `stats::filter()` when dplyr is not attached.
    if (!is.null(soft_mask_column)) {
      cluster_df <- dplyr::filter(cluster_df, !.data[[soft_mask_column]])
    }

    # Hard masking is used to label a group that should be
    # discarded in the dereplication process and not be chosen
    # from:
    #  e.g. 24h in the 24h vs 48h comparison
    #  e.g. fast in the fast vs slow growers comparison
    if (!is.null(hard_mask_column)) {
      # Internally, we will label the clusters to be kept,
      # meaning the clusters that:
      #   do NOT contain spectra from the discard group
      #   AND contain only spectra from one group
      #   (one distinct mask value per cluster, i.e. n == 1)
      clusters_to_keep <- cluster_df %>%
        dplyr::select(tidyselect::all_of(c("membership", hard_mask_column))) %>%
        dplyr::distinct() %>%
        dplyr::add_count(.data$membership) %>%
        dplyr::filter(.data$n == 1 & !.data[[hard_mask_column]]) %>%
        dplyr::pull("membership")
      # Remove the clusters where picking is forbidden
      cluster_df <- dplyr::filter(
        cluster_df, .data$membership %in% clusters_to_keep
      )
    }
  }

  # Sort the spectra within each cluster using the provided metadata
  if (has_metadata && !is_sorted) {
    cluster_df <- dplyr::group_by(cluster_df, .data$membership)
    if (is_descending_order) {
      cluster_df <- dplyr::arrange(
        cluster_df, dplyr::desc(.data[[criteria_column]]),
        .by_group = TRUE
      )
    } else {
      cluster_df <- dplyr::arrange(
        cluster_df, .data[[criteria_column]],
        .by_group = TRUE
      )
    }
  }

  if (!has_metadata) {
    # Pick the reference spectra if no metadata are present
    # NOTE(review): 'is_sorted' is not consulted in this branch; without
    # metadata the picks always come from 'is_reference' -- confirm that this
    # matches the documented intent of the flag.
    cluster_df <- dplyr::mutate(
      cluster_df,
      "to_pick" = .data$is_reference
    )
  } else {
    # The first spectrum of each (sorted) cluster is the one to pick:
    # first() gives its name, which is compared to every name of the
    # cluster to obtain a logical vector
    cluster_df <- dplyr::mutate(
      cluster_df,
      "to_pick" = .data$name == dplyr::first(.data$name)
    )
  }

  # Merge with original tibble to keep potential masked/discarded spectra
  # and same row number and order
  # NA in 'to_pick' are replaced with FALSE in the case of discarded clusters
  cols <- base::intersect(
    colnames(cluster_df),
    colnames(orig_cluster_df)
  )
  cluster_df %>%
    dplyr::ungroup() %>%
    dplyr::right_join(orig_cluster_df, by = cols) %>%
    dplyr::mutate(
      "to_pick" = tidyr::replace_na(.data$to_pick, FALSE)
    ) %>%
    # Sort the tibble in the original order
    dplyr::arrange(.data$rowid) %>%
    dplyr::select(-c("rowid"))
}