# Extract cell type marker regions as BED file

## Load required libraries

In [None]:
library(EPICATAC)
library(tidyr)

## Define output paths

In [None]:
# Define the output directory and BED file path
output_directory <- "/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/"
# Strip trailing slashes before joining: file.path() always inserts its own
# separator, so the trailing "/" above would otherwise produce a path
# containing "data//cell_type_markers.bed"
output_bed_file <- file.path(sub("/+$", "", output_directory), "cell_type_markers.bed")


## Extract cell type marker regions

In [None]:
# Extract all cell type markers from the atacRef_TME object
all_cell_types <- names(atacRef_TME$markers)

# Collect one (chrom, start, end) data frame per cell type, keyed by cell type
marker_list <- list()

for (cell_type in all_cell_types) {
  # Rewrite the separator after the chromosome name from "-" to ":" so each
  # marker reads "chr<name>:<start>-<end>"
  # (presumably the markers are stored as "chr<name>-<start>-<end>"; confirm
  # against the atacRef_TME object)
  adjusted_markers <- gsub("chr(\\w+)-", "chr\\1:", atacRef_TME$markers[[cell_type]])
  # Keep only well-formed regions whose chromosome name is made of digits,
  # X, Y or M; anything else is dropped
  adjusted_markers <- adjusted_markers[grep("^chr[0-9XYM]+:\\d+-\\d+$", adjusted_markers)]

  cell_type_df <- data.frame(Markers = adjusted_markers, stringsAsFactors = FALSE)

  # Split chromosome and positions
  cell_type_df <- tidyr::separate(cell_type_df, Markers, into = c("chrom", "start_end"), sep = ":", extra = "drop", fill = "right")
  cell_type_df <- tidyr::separate(cell_type_df, start_end, into = c("start", "end"), sep = "-", extra = "drop", fill = "right")

  # Store coordinates as integers rather than doubles: write.table() renders
  # a double such as 100000 as "1e+05", which is not a valid BED coordinate,
  # whereas integers are always written in plain decimal form.
  # The fields are digit-only at this point (guaranteed by the filter above),
  # so a conversion warning can only mean a value too large for an integer;
  # it is deliberately left visible instead of being suppressed.
  cell_type_df$start <- as.integer(cell_type_df$start)
  cell_type_df$end <- as.integer(cell_type_df$end)

  # Remove rows whose coordinates could not be converted
  cell_type_df <- cell_type_df[!is.na(cell_type_df$start) & !is.na(cell_type_df$end), ]

  # Append to list
  marker_list[[cell_type]] <- cell_type_df
}

In [None]:
# Stack the per-cell-type data frames into one table
marker_df <- do.call(rbind, marker_list)

# Reorder the rows: by chromosome first, then by start position
marker_df <- marker_df[
  order(marker_df[["chrom"]], marker_df[["start"]]),
]

In [None]:
# Display the combined, sorted marker table for a visual check
print(marker_df)

In [None]:
# Write the marker table as tab-separated text with no header row, no row
# names and no quoting, i.e. a plain BED file
write.table(
  marker_df,
  file = output_bed_file,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

# Report where the file was written
print(paste("Saved marker list in BED format:", output_bed_file))