(BREAKING) change the read_biotyper_report behavior

in order to make `pick_spectra()` easily usable with the taxonomic identification reports. For a single report: the first column is now read as `name` instead of `spot`. For multiple reports: the `name` column from each report is renamed `original_name`, and the `name` column is still the leftmost column.
ClavelLab · Sep 4, 2023 · 38a6141 · 38a6141
1 parent 49efefa
commit 38a6141
Show file tree

Hide file tree

Showing 9 changed files with 64 additions and 46 deletions.
diff --git a/R/identification_to_clusters.R b/R/identification_to_clusters.R
@@ -12,7 +12,7 @@
 #' @details As all unknown identification are considered unique clusters _within one input tibble_, it is important to consider whether the taxonomic identifications come from a single report or multiple reports, depending on the research question. A message is displayed to confirm from which type of reports the delineation was done.
 #'
 #' @return A tibble of *n* rows for each spectra and 3 columns:
-#' * `name`: the spectra names either from the `spot` column when the input is from [read_biotyper_report()] or from the `name` column when from [read_many_biotyper_reports()].
+#' * `name`: the spectra names from the `name` column from the output of either [read_biotyper_report()] or [read_many_biotyper_reports()].
 #' * `membership`: integers stating the cluster number to which the spectra belong to. It starts from 1 to _c_, the total number of clusters.
 #' * `cluster_size`: integers indicating the total number of spectra in the corresponding cluster.
 #'
@@ -27,12 +27,15 @@
 identification_to_clusters <- function(tibble_report) {
   # check correct names and number of columns
   single_report_cols <- c(
-    "spot", "sample_name",
+    "name", "sample_name",
     "hit_rank", "bruker_quality",
     "bruker_species", "bruker_taxid",
     "bruker_hash", "bruker_log"
   )
-  many_reports_cols <- c("name", single_report_cols)
+  many_reports_cols <- c(
+    "name",
+    gsub("^name$", "original_name", single_report_cols)
+  )
   if (identical(
     base::colnames(tibble_report),
     single_report_cols

diff --git a/R/read_biotyper_report.R b/R/read_biotyper_report.R
@@ -11,7 +11,7 @@
 #' The header-less table contains identification information for each target processed by
 #' the Biotyper device and once processed by the `read_biotyper_report`,
 #' the following seven columns are available in the tibble, _when using the `best_hits = TRUE` option_:
-#' * `spot`: an integer indicating the spot number of the MALDI target (i.e., plate)
+#' * `name`: a character indicating the name of the spot of the MALDI target (i.e., plate)
 #' * `sample_name`: the character string provided during the preparation of the MALDI target (i.e., plate)
 #' * `hit_rank`: an integer indicating the rank of the hit for the corresponding target and identification
 #' * `bruker_quality`: a character encoding the quality of the identification with potentially multiple "+" symbol or only one "-"
@@ -23,7 +23,7 @@
 #' When all hits are returned (with `best_hits = FALSE`), the default output format is the long format (`long_format = TRUE`), meaning that the previous columns remain
 #' unchanged, but all hits are now returned, thus increasing the number of rows.
 #'
-#' When all hits are returned (with `best_hits = FALSE`) _using the wide format_ (`long_format = FALSE), the two columns `spot` and `sample_name`
+#' When all hits are returned (with `best_hits = FALSE`) _using the wide format_ (`long_format = FALSE`), the two columns `name` and `sample_name`
 #' remains unchanged, but the five columns prefixed by `bruker_` contain the hit rank, **creating a tibble of 52 columns**:
 #'
 #' * `bruker_01_quality`
@@ -71,15 +71,17 @@ read_biotyper_report <- function(path, best_hits = TRUE, long_format = TRUE) {
 
   # Read in the report, usually warnings about problems and
   #  inconsistent number of columns are triggered
+  # Having name as first column always is to enable
+  #  taxonomic identification cherry-picking
   breport <- utils::read.delim(
     path,
-    col.names = c("spot", "sample_name", prep_names$col_names),
+    col.names = c("name", "sample_name", prep_names$col_names),
     sep = ";", header = FALSE,
     na = c("NA", "E1", "E2", "") # Added E1 identification in taxid as NA
   )
   no_peak_lgl <- breport$bruker_01_species == "no peaks found"
 
-  # Remove the spot for which no peaks were detected, and warn the user
+  # Remove the spot name for which no peaks were detected, and warn the user
   breport <- tibble::as_tibble(breport) %>%
     # Empty sample_name are considered logical and this is undesirable
     dplyr::mutate("sample_name" = as.character(.data$sample_name)) %>%
@@ -97,7 +99,7 @@ read_biotyper_report <- function(path, best_hits = TRUE, long_format = TRUE) {
   # Can't subset columns that don't exist (quality for instance)
   if (nrow(breport) == 0) {
     tibble::tibble(
-      "spot" = character(), "sample_name" = character(), "hit_rank" = integer(),
+      "name" = character(), "sample_name" = character(), "hit_rank" = integer(),
       "bruker_quality" = character(), "bruker_species" = character(),
       "bruker_taxid" = numeric(), "bruker_hash" = character(),
       "bruker_log" = numeric()
@@ -122,23 +124,23 @@ read_biotyper_report <- function(path, best_hits = TRUE, long_format = TRUE) {
       # Subset the table with only the character variables
       report_chr <- breport %>%
         dplyr::select(
-          c("spot", "sample_name") |
+          c("name", "sample_name") |
             tidyselect::contains("bruker") & tidyselect::where(is.character)
         ) %>%
         tidyr::pivot_longer(
-          !c("spot", "sample_name"),
+          !c("name", "sample_name"),
           names_to = c("hit_rank", "type"),
           names_pattern = "bruker_(.*)_(.*)"
         ) %>%
         tidyr::pivot_wider(names_from = "type", values_from = "value")
 
       report_num <- breport %>%
         dplyr::select(
-          tidyselect::all_of(c("spot", "sample_name")) |
+          tidyselect::all_of(c("name", "sample_name")) |
             tidyselect::contains("bruker") & tidyselect::where(is.numeric)
         ) %>%
         tidyr::pivot_longer(
-          !tidyselect::all_of(c("spot", "sample_name")),
+          !tidyselect::all_of(c("name", "sample_name")),
           names_to = c("hit_rank", "type"),
           names_pattern = "bruker_(.*)_(.*)"
         ) %>%
@@ -148,22 +150,22 @@ read_biotyper_report <- function(path, best_hits = TRUE, long_format = TRUE) {
       breport <- dplyr::full_join(
         report_chr,
         report_num,
-        by = c("spot", "sample_name", "hit_rank")
+        by = c("name", "sample_name", "hit_rank")
       ) %>%
         dplyr::mutate("hit_rank" = strtoi(.data$hit_rank, base = 10L)) %>%
         dplyr::relocate(
           c(
-            "spot", "sample_name", "hit_rank",
+            "name", "sample_name", "hit_rank",
             "quality", "species", "taxid", "hash", "log"
           )
         ) %>%
         dplyr::rename_with(
           ~ paste0("bruker_", .x),
-          !c("spot", "sample_name", "hit_rank")
+          !c("name", "sample_name", "hit_rank")
         )
     }
     # when all hits are used, pivot the wide table
-    # to have the spot sample_name hit_number and the rest of the column
+    # to have the name sample_name hit_number and the rest of the column
     if (best_hits) {
       breport %>%
         dplyr::filter(.data$hit_rank == 1) %>%

diff --git a/R/read_many_biotyper_reports.R b/R/read_many_biotyper_reports.R
@@ -7,7 +7,7 @@
 #' @param best_hits A logical indicating whether to return only the best hit in the [read_biotyper_report()] function.
 #' @param ... Name-value pairs to be passed on to [dplyr::mutate()]
 #'
-#' @return A tibble just like the [read_biotyper_report()] function except for an additional column `name` with the `report_ids` used as a prefix of the `spot` name.
+#' @return A tibble just like the one returned by the [read_biotyper_report()] function, except that the name of the spot of the MALDI target (i.e., plate) is stored in the `original_name` column (instead of the `name` column), and the `name` column consists of the provided `report_ids` prefixed to the `original_name` value.
 #'
 #' @seealso [read_biotyper_report]
 #'
@@ -30,18 +30,22 @@
 #' )
 read_many_biotyper_reports <- function(path_to_reports, report_ids, best_hits = TRUE, ...) {
   # Import the Bruker Biotyper reports as a named list
+  # Having name as first column always is to enable
+  #  taxonomic identification cherry-picking
   breports <- lapply(
     path_to_reports,
-    read_biotyper_report,
-    best_hits
+    function(path) {
+      read_biotyper_report(path, best_hits) %>%
+        dplyr::rename("original_name" = "name")
+    }
   )
   names(breports) <- report_ids
   # Conversion of a named list of dataframe to the dataframe with the name as
   #  a column is now super easy with enframe()
   tibble::enframe(breports) %>%
     tidyr::unnest("value") %>%
     dplyr::mutate(
-      "name" = paste(gsub("-", "_", .data$name), .data$spot, sep = "_"),
+      "name" = paste(gsub("-", "_", .data$name), .data$original_name, sep = "_"),
       ...
     ) %>%
     return()

diff --git a/dev/dereplicate-spectra.Rmd b/dev/dereplicate-spectra.Rmd
@@ -722,7 +722,7 @@ To do so, we must use the Bruker MALDI Biotyper report from the Compass software
 #' @details As all unknown identification are considered unique clusters _within one input tibble_, it is important to consider whether the taxonomic identifications come from a single report or multiple reports, depending on the research question. A message is displayed to confirm from which type of reports the delineation was done.
 #'
 #' @return A tibble of *n* rows for each spectra and 3 columns:
-#' * `name`: the spectra names either from the `spot` column when the input is from [read_biotyper_report()] or from the `name` column when from [read_many_biotyper_reports()].
+#' * `name`: the spectra names from the `name` column from the output of either [read_biotyper_report()] or [read_many_biotyper_reports()].
 #' * `membership`: integers stating the cluster number to which the spectra belong to. It starts from 1 to _c_, the total number of clusters.
 #' * `cluster_size`: integers indicating the total number of spectra in the corresponding cluster.
 #'
@@ -732,12 +732,15 @@ To do so, we must use the Bruker MALDI Biotyper report from the Compass software
 identification_to_clusters <- function(tibble_report) {
   # check correct names and number of columns
   single_report_cols <- c(
-    "spot", "sample_name",
+    "name", "sample_name",
     "hit_rank", "bruker_quality",
     "bruker_species", "bruker_taxid",
     "bruker_hash", "bruker_log"
   )
-  many_reports_cols <- c("name", single_report_cols)
+  many_reports_cols <- c(
+    "name",
+    gsub("^name$", "original_name", single_report_cols)
+  )
   if (identical(
     base::colnames(tibble_report),
     single_report_cols
@@ -826,7 +829,7 @@ test_that("identification_to_clusters works with correct single report tibble",
 test_that("identification_to_clusters fails modified single report tibble", {
   expect_error(
     identification_to_clusters(
-      report_unknown %>% dplyr::select(!c("spot"))
+      report_unknown %>% dplyr::select(!c("name"))
     ),
     "Unexpected format of Biotyper report."
   )

diff --git a/dev/import-data.Rmd b/dev/import-data.Rmd
@@ -76,7 +76,7 @@ After inflating the template
 #' The header-less table contains identification information for each target processed by
 #' the Biotyper device and once processed by the `read_biotyper_report`,
 #' the following seven columns are available in the tibble, _when using the `best_hits = TRUE` option_:
-#' * `spot`: an integer indicating the spot number of the MALDI target (i.e., plate)
+#' * `name`: a character indicating the name of the spot of the MALDI target (i.e., plate)
 #' * `sample_name`: the character string provided during the preparation of the MALDI target (i.e., plate)
 #' * `hit_rank`: an integer indicating the rank of the hit for the corresponding target and identification
 #' * `bruker_quality`: a character encoding the quality of the identification with potentially multiple "+" symbol or only one "-"
@@ -88,7 +88,7 @@ After inflating the template
 #' When all hits are returned (with `best_hits = FALSE`), the default output format is the long format (`long_format = TRUE`), meaning that the previous columns remain
 #' unchanged, but all hits are now returned, thus increasing the number of rows.
 #'
-#' When all hits are returned (with `best_hits = FALSE`) _using the wide format_ (`long_format = FALSE), the two columns `spot` and `sample_name`
+#' When all hits are returned (with `best_hits = FALSE`) _using the wide format_ (`long_format = FALSE`), the two columns `name` and `sample_name`
 #' remains unchanged, but the five columns prefixed by `bruker_` contain the hit rank, **creating a tibble of 52 columns**:
 #'
 #' * `bruker_01_quality`
@@ -130,15 +130,17 @@ read_biotyper_report <- function(path, best_hits = TRUE, long_format = TRUE) {
 
   # Read in the report, usually warnings about problems and
   #  inconsistent number of columns are triggered
+  # Having name as first column always is to enable
+  #  taxonomic identification cherry-picking
   breport <- utils::read.delim(
     path,
-    col.names = c("spot", "sample_name", prep_names$col_names),
+    col.names = c("name", "sample_name", prep_names$col_names),
     sep = ";", header = FALSE,
     na = c("NA", "E1", "E2", "") # Added E1 identification in taxid as NA
   )
   no_peak_lgl <- breport$bruker_01_species == "no peaks found"
 
-  # Remove the spot for which no peaks were detected, and warn the user
+  # Remove the spot name for which no peaks were detected, and warn the user
   breport <- tibble::as_tibble(breport) %>%
     # Empty sample_name are considered logical and this is undesirable
     dplyr::mutate("sample_name" = as.character(.data$sample_name)) %>%
@@ -156,7 +158,7 @@ read_biotyper_report <- function(path, best_hits = TRUE, long_format = TRUE) {
   # Can't subset columns that don't exist (quality for instance)
   if (nrow(breport) == 0) {
     tibble::tibble(
-      "spot" = character(), "sample_name" = character(), "hit_rank" = integer(),
+      "name" = character(), "sample_name" = character(), "hit_rank" = integer(),
       "bruker_quality" = character(), "bruker_species" = character(),
       "bruker_taxid" = numeric(), "bruker_hash" = character(),
       "bruker_log" = numeric()
@@ -181,23 +183,23 @@ read_biotyper_report <- function(path, best_hits = TRUE, long_format = TRUE) {
       # Subset the table with only the character variables
       report_chr <- breport %>%
         dplyr::select(
-          c("spot", "sample_name") |
+          c("name", "sample_name") |
             tidyselect::contains("bruker") & tidyselect::where(is.character)
         ) %>%
         tidyr::pivot_longer(
-          !c("spot", "sample_name"),
+          !c("name", "sample_name"),
           names_to = c("hit_rank", "type"),
           names_pattern = "bruker_(.*)_(.*)"
         ) %>%
         tidyr::pivot_wider(names_from = "type", values_from = "value")
 
       report_num <- breport %>%
         dplyr::select(
-          tidyselect::all_of(c("spot", "sample_name")) |
+          tidyselect::all_of(c("name", "sample_name")) |
             tidyselect::contains("bruker") & tidyselect::where(is.numeric)
         ) %>%
         tidyr::pivot_longer(
-          !tidyselect::all_of(c("spot", "sample_name")),
+          !tidyselect::all_of(c("name", "sample_name")),
           names_to = c("hit_rank", "type"),
           names_pattern = "bruker_(.*)_(.*)"
         ) %>%
@@ -207,22 +209,22 @@ read_biotyper_report <- function(path, best_hits = TRUE, long_format = TRUE) {
       breport <- dplyr::full_join(
         report_chr,
         report_num,
-        by = c("spot", "sample_name", "hit_rank")
+        by = c("name", "sample_name", "hit_rank")
       ) %>%
         dplyr::mutate("hit_rank" = strtoi(.data$hit_rank, base = 10L)) %>%
         dplyr::relocate(
           c(
-            "spot", "sample_name", "hit_rank",
+            "name", "sample_name", "hit_rank",
             "quality", "species", "taxid", "hash", "log"
           )
         ) %>%
         dplyr::rename_with(
           ~ paste0("bruker_", .x),
-          !c("spot", "sample_name", "hit_rank")
+          !c("name", "sample_name", "hit_rank")
         )
     }
     # when all hits are used, pivot the wide table
-    # to have the spot sample_name hit_number and the rest of the column
+    # to have the name sample_name hit_number and the rest of the column
     if (best_hits) {
       breport %>%
         dplyr::filter(.data$hit_rank == 1) %>%
@@ -338,7 +340,7 @@ Below is an example of such usage, where one report was artificially extended in
 #' @param best_hits A logical indicating whether to return only the best hit in the [read_biotyper_report()] function.
 #' @param ... Name-value pairs to be passed on to [dplyr::mutate()]
 #'
-#' @return A tibble just like the [read_biotyper_report()] function except for an additional column `name` with the `report_ids` used as a prefix of the `spot` name.
+#' @return A tibble just like the one returned by the [read_biotyper_report()] function, except that the name of the spot of the MALDI target (i.e., plate) is stored in the `original_name` column (instead of the `name` column), and the `name` column consists of the provided `report_ids` prefixed to the `original_name` value.
 #'
 #' @seealso [read_biotyper_report]
 #'
@@ -349,18 +351,22 @@ Below is an example of such usage, where one report was artificially extended in
 #' @examples
 read_many_biotyper_reports <- function(path_to_reports, report_ids, best_hits = TRUE, ...) {
   # Import the Bruker Biotyper reports as a named list
+  # Having name as first column always is to enable
+  #  taxonomic identification cherry-picking
   breports <- lapply(
     path_to_reports,
-    read_biotyper_report,
-    best_hits
+    function(path) {
+      read_biotyper_report(path, best_hits) %>%
+        dplyr::rename("original_name" = "name")
+    }
   )
   names(breports) <- report_ids
   # Conversion of a named list of dataframe to the dataframe with the name as
   #  a column is now super easy with enframe()
   tibble::enframe(breports) %>%
     tidyr::unnest("value") %>%
     dplyr::mutate(
-      "name" = paste(gsub("-", "_", .data$name), .data$spot, sep = "_"),
+      "name" = paste(gsub("-", "_", .data$name), .data$original_name, sep = "_"),
       ...
     ) %>%
     return()

diff --git a/man/identification_to_clusters.Rd b/man/identification_to_clusters.Rd