Breeding-Insight · Cristianetaniguti · Mar 27, 2026 · Mar 13, 2026 · Mar 14, 2026 · Mar 25, 2026
diff --git a/NAMESPACE b/NAMESPACE
@@ -21,7 +21,9 @@ export(merge_MADCs)
 export(solve_composition_poly)
 export(thinSNP)
 export(updog2vcf)
+export(vmsg)
 import(dplyr)
+import(ggplot2)
-import(ggplot2)
-import(ggplot2)
 import(janitor)
 import(parallel)
 import(quadprog)
@@ -34,6 +36,13 @@ importFrom(Biostrings,DNAString)
 importFrom(Biostrings,reverseComplement)
 importFrom(Rdpack,reprompt)
 importFrom(Rsamtools,bgzip)
+importFrom(dplyr,"%>%")
+importFrom(dplyr,across)
+importFrom(dplyr,group_by)
+importFrom(dplyr,mutate)
+importFrom(dplyr,select)
+importFrom(dplyr,summarise)
+importFrom(dplyr,where)
 importFrom(pwalign,nucleotideSubstitutionMatrix)
 importFrom(pwalign,pairwiseAlignment)
 importFrom(readr,read_csv)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,11 @@
+# BIGr 0.6.4
+
+- Add function `vmsg` to organize messages printed on the console
+- Add metadata to VCF header from madc2vcf_targets
+- Add argument `madc_object` to `get_countsMADC` to avoid reading the MADC file twice and to get directly the MADC fixed padding output from `check_botloci`
+- Organize messages from `madc2vcf_targets` checks
+- Add argument `collapse_matches_counts` and `verbose` to `madc2vcf_targets` function
+
 # BIGr 0.6.3
 
 - New function to check MADC files: `check_madc_sanity`. Currently, it checks for the presence of required columns, whether fixed allele IDs were assigned, the presence of IUPAC codes, lowercase sequence bases, indels, and chromosome and position information.

diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R
diff --git a/R/get_countsMADC.R b/R/get_countsMADC.R
@@ -1,31 +1,76 @@
 #' Obtain Read Counts from MADC File
 #'
-#' This function takes the MADC file as input and retrieves the ref and alt counts for each sample,
-#' and converts them to ref, alt, and size(total count) matrices for dosage calling tools. At the moment,
-#' only the read counts for the Ref and Alt target loci are obtained while the additional loci are ignored.
+#' Reads a DArTag MADC report and returns reference and total read count matrices
+#' per marker and sample. Only `Ref` and `Alt` target loci are retained;
+#' `|AltMatch` / `|RefMatch` rows are either discarded or collapsed depending on
+#' `collapse_matches_counts`.
 #'
+#' @details
+#' Either `madc_file` or `madc_object` must be provided (not both `NULL`).
+#' When `madc_object` is supplied it is passed directly to `get_counts()`,
+#' skipping file I/O. The function constructs:
+#' - `ref_matrix` — per-sample reference allele counts.
+#' - `size_matrix` — per-sample total counts (ref + alt).
+#'
+#' Markers whose `CloneID` appears only in the `Ref` or only in the `Alt` rows
+#' are removed with a warning. A summary of the proportion of zero-count
+#' data points (missing data) is reported via `vmsg()`.
+#'
+#' @param madc_file character or `NULL`. Path to the input MADC CSV file.
+#'   At least one of `madc_file` or `madc_object` must be provided.
+#' @param madc_object data frame or `NULL`. A pre-read MADC data frame
+#'   (e.g., as returned by `check_botloci()`). When supplied, file reading is
+#'   skipped. At least one of `madc_file` or `madc_object` must be provided.
+#' @param collapse_matches_counts logical. If `TRUE`, counts for `|AltMatch`
+#'   and `|RefMatch` rows are summed into their corresponding `|Ref` and `|Alt`
+#'   rows. If `FALSE` (default), `|AltMatch` and `|RefMatch` rows are discarded.
+#' @param verbose logical. Whether to print progress messages. Default is `TRUE`.
+#'
+#' @return A named list with two numeric matrices, both with markers as rows
+#'   and samples as columns:
+#'   \describe{
+#'     \item{`ref_matrix`}{Reference allele read counts.}
+#'     \item{`size_matrix`}{Total read counts (reference + alternative).}
+#'   }
 #'
-#' @param madc_file Path to MADC file
-#' @import dplyr
-#' @return A list of read count matrices for reference, alternate, and total read count values
 #' @examples
 #' # Get the path to the MADC file
 #' madc_path <- system.file("iris_DArT_MADC.csv", package = "BIGr")
 #'
 #' # Extract the read count matrices
 #' counts_matrices <- get_countsMADC(madc_path)
 #'
-#' # Access the reference, alternate, and size matrices
-#'
-#' # ref_matrix <- counts_matrices$ref_matrix
-#' # alt_matrix <- counts_matrices$alt_matrix
+#' # Access the reference and size matrices
+#' # ref_matrix  <- counts_matrices$ref_matrix
 #' # size_matrix <- counts_matrices$size_matrix
 #'
 #' rm(counts_matrices)
+#'
+#' @seealso [get_counts()], [check_madc_sanity()]
+#'
+#' @import dplyr
 #' @export
-get_countsMADC <- function(madc_file) {
+get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) {
+
+  # Add check inputs
+  if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.")
+  if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.")
+  if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.")
+
+  vmsg(paste0("Extracting read counts from ", ifelse(!is.null(madc_file), paste0("MADC file: ", madc_file), "madc_object")), verbose = verbose, level = 0, type = ">>")
+  vmsg(ifelse(collapse_matches_counts,
+              "|AltMatch and |RefMatch counts will be collapsed into their respective |Ref and |Alt alleles.",
+              "|AltMatch and |RefMatch rows will be discarded (collapse_matches_counts = FALSE)."),
+       verbose = verbose, level = 1, type = ">>")
+
   # This function takes the MADC file as input and generates a Ref and Alt counts dataframe as output
-  update_df <- get_counts(madc_file)
+  if (is.null(madc_object)) {
+    update_df <- get_counts(madc_file = madc_file, collapse_matches_counts = collapse_matches_counts, verbose = verbose)
+  } else {
+    update_df <- get_counts(madc_object = madc_object, collapse_matches_counts = collapse_matches_counts, verbose = verbose)
+  }
+  # Ensure plain data.frame so row.names<- does not trigger tibble deprecation warning
+  update_df <- as.data.frame(update_df)
 
   # Filter rows where 'AlleleID' ends with 'Ref'
   ref_df <- subset(update_df, grepl("Ref$", AlleleID))
@@ -43,9 +88,16 @@ get_countsMADC <- function(madc_file) {
 
   #Retain only the rows in common if they are not identical and provide warning
   if (same == FALSE) {
-    warning("Mismatch between Ref and Alt Markers. Markers without a Ref or Alt match removed.")
     # Find the common CloneIDs between the two dataframes
+    all_mks <- unique(c(rownames(ref_df), rownames(alt_df)))
     common_ids <- intersect(rownames(ref_df), rownames(alt_df))
+    n_singles <- length(all_mks) - length(common_ids)
+
+    vmsg(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa"), verbose = verbose, level = 2, type = ">>")
+    vmsg("Only the markers with both Ref and Alt tags will be retained for the conversion", verbose = verbose, level = 1, type = ">>")
+
+    warning(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa. Only the markers with both Ref and Alt tags will be retained for the conversion"))
+
     # Subset both dataframes to retain only the common rows
     ref_df <- ref_df[common_ids, ]
     alt_df <- alt_df[common_ids, ]
@@ -77,7 +129,7 @@ get_countsMADC <- function(madc_file) {
 
   # Print the result
   ratio_missing_data <- count_zeros / length(size_matrix)
-  message("Ratio of missing data =", ratio_missing_data, "\n")
+  vmsg(paste0("Percentage of missing data (datapoints with 0 total count): ", round(ratio_missing_data * 100, 2), "%"), verbose = verbose, level = 2, type = ">>")
 
   # Return the ref and alt matrices as a list
   matrices_list <- list(ref_matrix = ref_matrix, size_matrix = size_matrix)
@@ -86,41 +138,109 @@ get_countsMADC <- function(madc_file) {
 
 }
 
-get_counts <- function(madc_file) {
+#' Read and Pre-process a MADC File
+#'
+#' Reads a DArTag MADC CSV file (or accepts a pre-read data frame), detects the
+#' file format, and returns a filtered data frame containing only `Ref` and `Alt`
+#' haplotype rows ready for count-matrix construction.
+#'
+#' @details
+#' **Input**: either `madc_file` (path to CSV) or `madc_object` (pre-read data
+#' frame) must be supplied; at least one is required.
+#'
+#' **Format detection** (applied to file or object alike): the first seven rows
+#' of the first column are inspected:
+#' - **Standard format**: all entries are blank or `"*"` — the first 7 rows are
+#'   treated as DArT placeholder rows and skipped.
+#' - **Fixed-allele-ID format**: no filler rows — data are used as-is.
+#'
+#' **`|AltMatch` / `|RefMatch` handling** (controlled by `collapse_matches_counts`):
+#' - `FALSE` (default): these rows are simply discarded.
+#' - `TRUE`: their counts are summed into the corresponding `|Ref` or `|Alt`
+#'   row for the same `CloneID`.
+#'
+#' In all cases, trailing suffixes on `AlleleID` (e.g., `|Ref_001`, `|Alt_002`)
+#' are stripped to the canonical `|Ref` / `|Alt` form.
+#'
+#' @param madc_file character or `NULL`. Path to the input MADC CSV file.
+#'   At least one of `madc_file` or `madc_object` must be provided.
+#' @param madc_object data frame or `NULL`. A pre-read MADC data frame
+#'   (e.g., from `check_botloci()`). When supplied, file reading is skipped.
+#'   At least one of `madc_file` or `madc_object` must be provided.
+#' @param collapse_matches_counts logical. If `TRUE`, counts for `|AltMatch`
+#'   and `|RefMatch` rows are summed into their corresponding `|Ref` and `|Alt`
+#'   rows. If `FALSE` (default), those rows are discarded.
+#' @param verbose logical. Whether to print progress messages. Default is `TRUE`.
+#'
+#' @return A data frame with one row per `Ref` or `Alt` allele entry, retaining
+#'   all original columns (`AlleleID`, `CloneID`, `AlleleSequence`, sample
+#'   count columns, etc.).
+#'
+#' @importFrom dplyr mutate group_by summarise across where select
+#' @importFrom dplyr %>%
+#'
+#' @keywords internal
+get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) {
+
+  # Add check inputs
+  if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.")
+  if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.")
+  if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.")
+
   # Read the MADC file
+
+  if(!is.null(madc_file)){
   #Read only the first column for the first seven rows
   first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL"))
 
   #Check if all entries in the first column are either blank or "*"
   check_entries <- all(first_seven_rows[, 1] %in% c("", "*"))
 
+  } else {
+    check_entries <- all(madc_object[1:min(7L, nrow(madc_object)), 1] %in% c("", "*"))
+  }
+
   #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline
   if (check_entries) {
     #Note: This assumes that the first 7 rows are placeholder info from DArT processing
-
     #Read the madc file
-    madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)
-
-    #Retain only the Ref and Alt haplotypes
-    filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ]
-
-    #Remove extra text after Ref and Alt (_001 or _002)
-    filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
-    filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)
-
+    vmsg("Detected raw MADC format with 7-row header. Reading file while skipping the first 7 rows.", verbose = verbose, level = 1, type = ">>")
+    if(!is.null(madc_file)){
+      madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)
+    } else {
+      madc_df <- madc_object[-(1:7), ]
+    }
   } else {
 
     #Read the madc file
-    madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE)
-
-    # Retain only the Ref and Alt haplotypes
-    filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ]
+    vmsg("Detected fixed allele IDs MADC format", verbose = verbose, level = 1, type = ">>")
+    if(!is.null(madc_file)){
+      madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE)
+    } else {
+      madc_df <- madc_object
+    }
+  }
 
-    #Remove extra text after Ref and Alt (_001 or _002)
-    filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
-    filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)
+  if(collapse_matches_counts){
+    filtered_df <- madc_df[order(madc_df$AlleleID),] %>%
+      mutate(Type = ifelse(grepl("Alt", AlleleID), "Alt", "Ref")) %>%
+      group_by(CloneID, Type) %>%
+      summarise(
+        AlleleID = paste0(unique(CloneID), "|", unique(Type)),
+        AlleleSequence = first(AlleleSequence),
+        across(where(is.numeric), sum),
+        .groups = "drop"
+      ) %>%
+      select(AlleleID, CloneID, AlleleSequence, everything(), -Type)
 
+  } else {
+    #Retain only the Ref and Alt haplotypes
+    filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ]
   }
 
+  #Remove extra text after Ref and Alt (_001 or _002)
+  filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
+  filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)
+
   return(filtered_df)
 }
diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R
@@ -60,8 +60,11 @@ madc2vcf_all <- function(madc = NULL,
                          multiallelic_SNP_sample_thr = 0,
                          alignment_score_thr = 40,
                          out_vcf = NULL,
+                         markers_info = NULL,
                          verbose = TRUE){
 
+  vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>")
+
   # Input checks
   if(!is.null(madc) & !file.exists(madc)) stop("MADC file not found. Please provide a valid path.")
   if(!is.null(botloci_file) & !file.exists(botloci_file)) stop("Botloci file not found. Please provide a valid path.")
@@ -85,8 +88,8 @@ madc2vcf_all <- function(madc = NULL,
   bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_all=<ID=madc2vcf_all,Version="',
                       packageVersion("BIGr"), '",Data="',
                       Sys.time(),'", CommandLine="> madc2vcf_all(',deparse(substitute(madc)),', ',
-                      "botloci= ", botloci_file, ', ',
-                      "hap_seq= ", hap_seq_file, ', ',
+                      "botloci_file= ", botloci_file, ', ',
+                      "hap_seq_file= ", hap_seq_file, ', ',
                       "n.cores= ", n.cores, ', ',
                       "rm_multiallelic_SNP= ", rm_multiallelic_SNP, ', ',
                       "multiallelic_SNP_dp_thr= ", multiallelic_SNP_dp_thr, ', ',
@@ -95,7 +98,53 @@ madc2vcf_all <- function(madc = NULL,
                       "out_vcf= ", out_vcf, ', ',
                       "verbose= ", verbose,')">')
 
-  if(!is.null(madc)) report <- read.csv(madc, check.names = FALSE) else stop("Please provide a MADC file")
+  report <- read.csv(madc, check.names = FALSE)
+  checks <- check_madc_sanity(report)
+
+  messages_results <- mapply(function(check, message) {
+    if (check)  message[1] else message[2]
-    if (check)  message[1] else message[2]
+    if (isTRUE(check)) {
+      message[1]
+    } else if (isFALSE(check)) {
+      message[2]
+    } else {
+      NA_character_
+    }
-    if (check)  message[1] else message[2]
+    if (isTRUE(check)) {
+      message[1]
+    } else if (isFALSE(check)) {
+      message[2]
+    } else {
+      NA_character_
+    }
+  }, checks$checks, checks$messages)
+
+  if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){
+    idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")]))
+    stop(paste("The MADC file does not pass the sanity checks:\n",
+               paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n")))
+  }
+
+  if(any(checks$checks[c("IUPACcodes")])){
+    idx <- which((checks$checks[c("IUPACcodes")]))
+    stop(paste(messages_results[c("IUPACcodes")[idx]], collapse = "\n"))
+  }
+
+  if(any(!checks$checks[c("ChromPos")])){
+    if(is.null(markers_info)) {
+      stop(paste(messages_results[c("ChromPos")], collapse = "\n"))
+    } else {
+      mi_df <- read.csv(markers_info)
+      if(!all(c("Chr", "Pos") %in% colnames(mi_df)))
+        stop("ChromPos check failed: CloneID values do not follow the Chr_Position format. ",
+             "The markers_info file must contain 'Chr' and 'Pos' columns to supply CHROM and POS.")
+    }
+  }
+
+  if(any(checks$checks[c("Indels")])){
+    idx <- which((checks$checks[c("Indels")]))
+    if(is.null(markers_info)) {
+      stop(paste(messages_results[c("Indels")[idx]], collapse = "\n"))
+    } else {
+      mi_df <- read.csv(markers_info)
+      if(checks$checks["Indels"] &&
+         !all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df)))
+        stop("Indels detected in MADC file. ",
+             "The markers_info file must contain 'Ref', 'Alt', and 'Indel_pos' columns.")
+    }
+  }
+
+  if(checks$checks["LowerCase"]){
+    vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case.", verbose = verbose, level = 1)
+    report$AlleleSequence <- toupper(report$AlleleSequence)
+  }
+
   if(!is.null(botloci_file)) botloci <- read.csv(botloci_file, header = F) else stop("Please provide a botloci file")
   if(!is.null(hap_seq_file)) hap_seq <- read.table(hap_seq_file, header = F) else hap_seq <- NULL