Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ export(merge_MADCs)
export(solve_composition_poly)
export(thinSNP)
export(updog2vcf)
export(vmsg)
import(dplyr)
import(ggplot2)
Copy link

Copilot AI Mar 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

import(ggplot2) was added, but ggplot2 is not declared in DESCRIPTION Imports/Depends (it's not currently listed). This will fail R CMD check (namespace imports must be declared). Either add ggplot2 to DESCRIPTION Imports, or move it to Suggests and use requireNamespace() conditionally in plotting code.

Suggested change
import(ggplot2)

Copilot uses AI. Check for mistakes.
import(janitor)
import(parallel)
import(quadprog)
Expand All @@ -34,6 +36,13 @@ importFrom(Biostrings,DNAString)
importFrom(Biostrings,reverseComplement)
importFrom(Rdpack,reprompt)
importFrom(Rsamtools,bgzip)
importFrom(dplyr,"%>%")
importFrom(dplyr,across)
importFrom(dplyr,group_by)
importFrom(dplyr,mutate)
importFrom(dplyr,select)
importFrom(dplyr,summarise)
importFrom(dplyr,where)
importFrom(pwalign,nucleotideSubstitutionMatrix)
importFrom(pwalign,pairwiseAlignment)
importFrom(readr,read_csv)
Expand Down
8 changes: 8 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# BIGr 0.6.4

- Add function `vmsg` to organize messages printed on the console
- Add metadata to VCF header from madc2vcf_targets
- Add argument `madc_object` to `get_countsMADC` to avoid reading the MADC file twice and to get directly the MADC fixed padding output from `check_botloci`
- Organize messages from `madc2vcf_targets` checks
- Add argument `collapse_matches_counts` and `verbose` to `madc2vcf_targets` function

# BIGr 0.6.3

- New function to check MADC files: `check_madc_sanity`. Currently, it checks for the presence of required columns, whether fixed allele IDs were assigned, the presence of IUPAC codes, lowercase sequence bases, indels, and chromosome and position information.
Expand Down
291 changes: 238 additions & 53 deletions R/check_madc_sanity.R

Large diffs are not rendered by default.

184 changes: 152 additions & 32 deletions R/get_countsMADC.R
Original file line number Diff line number Diff line change
@@ -1,31 +1,76 @@
#' Obtain Read Counts from MADC File
#'
#' This function takes the MADC file as input and retrieves the ref and alt counts for each sample,
#' and converts them to ref, alt, and size(total count) matrices for dosage calling tools. At the moment,
#' only the read counts for the Ref and Alt target loci are obtained while the additional loci are ignored.
#' Reads a DArTag MADC report and returns reference and total read count matrices
#' per marker and sample. Only `Ref` and `Alt` target loci are retained;
#' `|AltMatch` / `|RefMatch` rows are either discarded or collapsed depending on
#' `collapse_matches_counts`.
#'
#' @details
#' Either `madc_file` or `madc_object` must be provided (not both `NULL`).
#' When `madc_object` is supplied it is passed directly to `get_counts()`,
#' skipping file I/O. The function constructs:
#' - `ref_matrix` — per-sample reference allele counts.
#' - `size_matrix` — per-sample total counts (ref + alt).
#'
#' Markers whose `CloneID` appears only in the `Ref` or only in the `Alt` rows
#' are removed with a warning. A summary of the proportion of zero-count
#' data points (missing data) is reported via `vmsg()`.
#'
#' @param madc_file character or `NULL`. Path to the input MADC CSV file.
#' At least one of `madc_file` or `madc_object` must be provided.
#' @param madc_object data frame or `NULL`. A pre-read MADC data frame
#' (e.g., as returned by `check_botloci()`). When supplied, file reading is
#' skipped. At least one of `madc_file` or `madc_object` must be provided.
#' @param collapse_matches_counts logical. If `TRUE`, counts for `|AltMatch`
#' and `|RefMatch` rows are summed into their corresponding `|Ref` and `|Alt`
#' rows. If `FALSE` (default), `|AltMatch` and `|RefMatch` rows are discarded.
#' @param verbose logical. Whether to print progress messages. Default is `TRUE`.
#'
#' @return A named list with two numeric matrices, both with markers as rows
#' and samples as columns:
#' \describe{
#' \item{`ref_matrix`}{Reference allele read counts.}
#' \item{`size_matrix`}{Total read counts (reference + alternative).}
#' }
#'
#' @param madc_file Path to MADC file
#' @import dplyr
#' @return A list of read count matrices for reference, alternate, and total read count values
#' @examples
#' # Get the path to the MADC file
#' madc_path <- system.file("iris_DArT_MADC.csv", package = "BIGr")
#'
#' # Extract the read count matrices
#' counts_matrices <- get_countsMADC(madc_path)
#'
#' # Access the reference, alternate, and size matrices
#'
#' # ref_matrix <- counts_matrices$ref_matrix
#' # alt_matrix <- counts_matrices$alt_matrix
#' # Access the reference and size matrices
#' # ref_matrix <- counts_matrices$ref_matrix
#' # size_matrix <- counts_matrices$size_matrix
#'
#' rm(counts_matrices)
#'
#' @seealso [get_counts()], [check_madc_sanity()]
#'
#' @import dplyr
#' @export
get_countsMADC <- function(madc_file) {
get_countsMADC <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) {

# Add check inputs
if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.")
if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.")
if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.")

vmsg(paste0("Extracting read counts from ", ifelse(!is.null(madc_file), paste0("MADC file: ", madc_file), "madc_object")), verbose = verbose, level = 0, type = ">>")
vmsg(ifelse(collapse_matches_counts,
"|AltMatch and |RefMatch counts will be collapsed into their respective |Ref and |Alt alleles.",
"|AltMatch and |RefMatch rows will be discarded (collapse_matches_counts = FALSE)."),
verbose = verbose, level = 1, type = ">>")

# This function takes the MADC file as input and generates a Ref and Alt counts dataframe as output
update_df <- get_counts(madc_file)
if (is.null(madc_object)) {
update_df <- get_counts(madc_file = madc_file, collapse_matches_counts = collapse_matches_counts, verbose = verbose)
} else {
update_df <- get_counts(madc_object = madc_object, collapse_matches_counts = collapse_matches_counts, verbose = verbose)
}
# Ensure plain data.frame so row.names<- does not trigger tibble deprecation warning
update_df <- as.data.frame(update_df)

# Filter rows where 'AlleleID' ends with 'Ref'
ref_df <- subset(update_df, grepl("Ref$", AlleleID))
Expand All @@ -43,9 +88,16 @@ get_countsMADC <- function(madc_file) {

#Retain only the rows in common if they are not identical and provide warning
if (same == FALSE) {
warning("Mismatch between Ref and Alt Markers. Markers without a Ref or Alt match removed.")
# Find the common CloneIDs between the two dataframes
all_mks <- unique(c(rownames(ref_df), rownames(alt_df)))
common_ids <- intersect(rownames(ref_df), rownames(alt_df))
n_singles <- length(all_mks) - length(common_ids)

vmsg(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa"), verbose = verbose, level = 2, type = ">>")
vmsg("Only the markers with both Ref and Alt tags will be retained for the conversion", verbose = verbose, level = 1, type = ">>")

warning(paste("There are", n_singles,"Ref tags without corresponding Alt tags, or vice versa. Only the markers with both Ref and Alt tags will be retained for the conversion"))

# Subset both dataframes to retain only the common rows
ref_df <- ref_df[common_ids, ]
alt_df <- alt_df[common_ids, ]
Expand Down Expand Up @@ -77,7 +129,7 @@ get_countsMADC <- function(madc_file) {

# Print the result
ratio_missing_data <- count_zeros / length(size_matrix)
message("Ratio of missing data =", ratio_missing_data, "\n")
vmsg(paste0("Percentage of missing data (datapoints with 0 total count): ", round(ratio_missing_data * 100, 2), "%"), verbose = verbose, level = 2, type = ">>")

# Return the ref and alt matrices as a list
matrices_list <- list(ref_matrix = ref_matrix, size_matrix = size_matrix)
Expand All @@ -86,41 +138,109 @@ get_countsMADC <- function(madc_file) {

}

get_counts <- function(madc_file) {
#' Read and Pre-process a MADC File
#'
#' Reads a DArTag MADC CSV file (or accepts a pre-read data frame), detects the
#' file format, and returns a filtered data frame containing only `Ref` and `Alt`
#' haplotype rows ready for count-matrix construction.
#'
#' @details
#' **Input**: either `madc_file` (path to CSV) or `madc_object` (pre-read data
#' frame) must be supplied; at least one is required.
#'
#' **Format detection** (applied to file or object alike): the first seven rows
#' of the first column are inspected:
#' - **Standard format**: all entries are blank or `"*"` — the first 7 rows are
#' treated as DArT placeholder rows and skipped.
#' - **Fixed-allele-ID format**: no filler rows — data are used as-is.
#'
#' **`|AltMatch` / `|RefMatch` handling** (controlled by `collapse_matches_counts`):
#' - `FALSE` (default): these rows are simply discarded.
#' - `TRUE`: their counts are summed into the corresponding `|Ref` or `|Alt`
#' row for the same `CloneID`.
#'
#' In all cases, trailing suffixes on `AlleleID` (e.g., `|Ref_001`, `|Alt_002`)
#' are stripped to the canonical `|Ref` / `|Alt` form.
#'
#' @param madc_file character or `NULL`. Path to the input MADC CSV file.
#' At least one of `madc_file` or `madc_object` must be provided.
#' @param madc_object data frame or `NULL`. A pre-read MADC data frame
#' (e.g., from `check_botloci()`). When supplied, file reading is skipped.
#' At least one of `madc_file` or `madc_object` must be provided.
#' @param collapse_matches_counts logical. If `TRUE`, counts for `|AltMatch`
#' and `|RefMatch` rows are summed into their corresponding `|Ref` and `|Alt`
#' rows. If `FALSE` (default), those rows are discarded.
#' @param verbose logical. Whether to print progress messages. Default is `TRUE`.
#'
#' @return A data frame with one row per `Ref` or `Alt` allele entry, retaining
#' all original columns (`AlleleID`, `CloneID`, `AlleleSequence`, sample
#' count columns, etc.).
#'
#' @importFrom dplyr mutate group_by summarise across where select
#' @importFrom dplyr %>%
#'
#' @keywords internal
get_counts <- function(madc_file = NULL, madc_object = NULL, collapse_matches_counts = FALSE, verbose = TRUE) {

# Add check inputs
if(is.null(madc_file) && is.null(madc_object)) stop("Please provide either madc_file or madc_object.")
if(!is.null(madc_file) && !file.exists(madc_file)) stop("MADC file not found. Please provide a valid path.")
if(!is.null(madc_object) && !is.data.frame(madc_object)) stop("madc_object must be a data frame.")

# Read the MADC file

if(!is.null(madc_file)){
#Read only the first column for the first seven rows
first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL"))

#Check if all entries in the first column are either blank or "*"
check_entries <- all(first_seven_rows[, 1] %in% c("", "*"))

} else {
check_entries <- all(madc_object[1:min(7L, nrow(madc_object)), 1] %in% c("", "*"))
}

#Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline
if (check_entries) {
#Note: This assumes that the first 7 rows are placeholder info from DArT processing

#Read the madc file
madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)

#Retain only the Ref and Alt haplotypes
filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ]

#Remove extra text after Ref and Alt (_001 or _002)
filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)

vmsg("Detected raw MADC format with 7-row header. Reading file while skipping the first 7 rows.", verbose = verbose, level = 1, type = ">>")
if(!is.null(madc_file)){
madc_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)
} else {
madc_df <- madc_object[-(1:7), ]
}
} else {

#Read the madc file
madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE)

# Retain only the Ref and Alt haplotypes
filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ]
vmsg("Detected fixed allele IDs MADC format", verbose = verbose, level = 1, type = ">>")
if(!is.null(madc_file)){
madc_df <- read.csv(madc_file, sep = ',', check.names = FALSE)
} else {
madc_df <- madc_object
}
}

#Remove extra text after Ref and Alt (_001 or _002)
filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)
if(collapse_matches_counts){
filtered_df <- madc_df[order(madc_df$AlleleID),] %>%
mutate(Type = ifelse(grepl("Alt", AlleleID), "Alt", "Ref")) %>%
group_by(CloneID, Type) %>%
summarise(
AlleleID = paste0(unique(CloneID), "|", unique(Type)),
AlleleSequence = first(AlleleSequence),
across(where(is.numeric), sum),
.groups = "drop"
) %>%
select(AlleleID, CloneID, AlleleSequence, everything(), -Type)

} else {
#Retain only the Ref and Alt haplotypes
filtered_df <- madc_df[!grepl("\\|AltMatch|\\|RefMatch", madc_df$AlleleID), ]
}

#Remove extra text after Ref and Alt (_001 or _002)
filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID)
filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID)

return(filtered_df)
}
55 changes: 52 additions & 3 deletions R/madc2vcf_all.R
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,11 @@ madc2vcf_all <- function(madc = NULL,
multiallelic_SNP_sample_thr = 0,
alignment_score_thr = 40,
out_vcf = NULL,
markers_info = NULL,
verbose = TRUE){

vmsg("Checking inputs", verbose = verbose, level = 0, type = ">>")

# Input checks
if(!is.null(madc) & !file.exists(madc)) stop("MADC file not found. Please provide a valid path.")
if(!is.null(botloci_file) & !file.exists(botloci_file)) stop("Botloci file not found. Please provide a valid path.")
Expand All @@ -85,8 +88,8 @@ madc2vcf_all <- function(madc = NULL,
bigr_meta <- paste0('##BIGrCommandLine.madc2vcf_all=<ID=madc2vcf_all,Version="',
packageVersion("BIGr"), '",Data="',
Sys.time(),'", CommandLine="> madc2vcf_all(',deparse(substitute(madc)),', ',
"botloci= ", botloci_file, ', ',
"hap_seq= ", hap_seq_file, ', ',
"botloci_file= ", botloci_file, ', ',
"hap_seq_file= ", hap_seq_file, ', ',
"n.cores= ", n.cores, ', ',
"rm_multiallelic_SNP= ", rm_multiallelic_SNP, ', ',
"multiallelic_SNP_dp_thr= ", multiallelic_SNP_dp_thr, ', ',
Expand All @@ -95,7 +98,53 @@ madc2vcf_all <- function(madc = NULL,
"out_vcf= ", out_vcf, ', ',
"verbose= ", verbose,')">')

if(!is.null(madc)) report <- read.csv(madc, check.names = FALSE) else stop("Please provide a MADC file")
report <- read.csv(madc, check.names = FALSE)
checks <- check_madc_sanity(report)
Comment on lines 99 to +102
Copy link

Copilot AI Mar 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

madc is allowed to be NULL by the signature, but the function now unconditionally calls read.csv(madc, ...). When madc is NULL, this will error with an unhelpful message (and your input checks currently don't stop on NULL). Add an explicit is.null(madc) guard (or make madc required).

Copilot uses AI. Check for mistakes.

messages_results <- mapply(function(check, message) {
if (check) message[1] else message[2]
Copy link

Copilot AI Mar 27, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same NA-handling issue as in madc2vcf_targets(): if (check) inside mapply() will error if any check is NA. check_madc_sanity() uses NA for skipped checks, so this needs isTRUE()/isFALSE() handling before building messages_results.

Suggested change
if (check) message[1] else message[2]
if (isTRUE(check)) {
message[1]
} else if (isFALSE(check)) {
message[2]
} else {
NA_character_
}

Copilot uses AI. Check for mistakes.
}, checks$checks, checks$messages)

if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){
idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")]))
stop(paste("The MADC file does not pass the sanity checks:\n",
paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n")))
}

if(any(checks$checks[c("IUPACcodes")])){
idx <- which((checks$checks[c("IUPACcodes")]))
stop(paste(messages_results[c("IUPACcodes")[idx]], collapse = "\n"))
}

if(any(!checks$checks[c("ChromPos")])){
if(is.null(markers_info)) {
stop(paste(messages_results[c("ChromPos")], collapse = "\n"))
} else {
mi_df <- read.csv(markers_info)
if(!all(c("Chr", "Pos") %in% colnames(mi_df)))
stop("ChromPos check failed: CloneID values do not follow the Chr_Position format. ",
"The markers_info file must contain 'Chr' and 'Pos' columns to supply CHROM and POS.")
}
}

if(any(checks$checks[c("Indels")])){
idx <- which((checks$checks[c("Indels")]))
if(is.null(markers_info)) {
stop(paste(messages_results[c("Indels")[idx]], collapse = "\n"))
} else {
mi_df <- read.csv(markers_info)
if(checks$checks["Indels"] &&
!all(c("Ref", "Alt", "Indel_pos") %in% colnames(mi_df)))
stop("Indels detected in MADC file. ",
"The markers_info file must contain 'Ref', 'Alt', and 'Indel_pos' columns.")
}
}

if(checks$checks["LowerCase"]){
vmsg("MADC Allele Sequences presented lower case characters. They were converted to upper case.", verbose = verbose, level = 1)
report$AlleleSequence <- toupper(report$AlleleSequence)
}

if(!is.null(botloci_file)) botloci <- read.csv(botloci_file, header = F) else stop("Please provide a botloci file")
if(!is.null(hap_seq_file)) hap_seq <- read.table(hap_seq_file, header = F) else hap_seq <- NULL

Expand Down
Loading