# Extraction and formatting of cell-type-specific marker regions from EPIC-ATAC

## Import required libraries

In [None]:
library(EPICATAC)
library(tidyr)

## Define output file path

In [None]:
# Destination file for the exported marker table (CSV)
output_csv <- file.path(
  "/mnt/DATA3/daniel/project/02_cfDNA_preprocessing/data",
  "cell_types_markers.csv"
)


## Initialize data structure

In [None]:
# Collect the element names of atacRef_TME$markers; each name is later used as
# a cell-type label and as the key to look up that cell type's marker strings.
# NOTE(review): atacRef_TME is not defined in this file -- presumably it is the
# reference object made available by library(EPICATAC); confirm.
all_cell_types <- names(atacRef_TME$markers)

# Zero-row table that fixes the output schema up front: text columns for the
# cell type and chromosome, numeric columns for the region boundaries. The
# per-cell-type marker rows are combined into it further down.
all_markers_df <- data.frame(
  cell_type = character(0),
  chrom = character(0),
  start = numeric(0),
  end = numeric(0),
  stringsAsFactors = FALSE
)

## Loop through cell types, combine and save

In [None]:
# Parse every cell type's marker regions into one table and export it as CSV.
#
# Reads `atacRef_TME$markers`, `all_cell_types`, `all_markers_df` and
# `output_csv`; leaves the combined table (columns cell_type, chrom, start,
# end) in `all_markers_df` and writes it to `output_csv`.

# Collect one data frame per cell type and bind them once after the loop,
# instead of re-copying the growing table with rbind() on every iteration.
marker_tables <- vector("list", length(all_cell_types))

for (i in seq_along(all_cell_types)) {
  cell_type <- all_cell_types[[i]]

  # Rewrite "chrX-12345-67890" as "chrX:12345-67890"; strings that already use
  # the colon form are left unchanged.
  adjusted_markers <- gsub(
    "chr(\\w+)-", "chr\\1:",
    atacRef_TME$markers[[cell_type]]
  )

  # Keep only regions on numbered/X/Y/M chromosomes with digit-only bounds
  adjusted_markers <- adjusted_markers[
    grepl("^chr[0-9XYM]+:\\d+-\\d+$", adjusted_markers)
  ]

  # Convert to data frame
  cell_type_df <- data.frame(Markers = adjusted_markers, stringsAsFactors = FALSE)

  # Split "chrom:start-end" into its three components
  cell_type_df <- tidyr::separate(
    cell_type_df, Markers,
    into = c("chrom", "start_end"),
    sep = ":", extra = "drop", fill = "right"
  )
  cell_type_df <- tidyr::separate(
    cell_type_df, start_end,
    into = c("start", "end"),
    sep = "-", extra = "drop", fill = "right"
  )

  # The pattern filter above only lets digit-only bounds through, so these
  # conversions cannot yield NA and need no warning suppression or NA filter.
  cell_type_df$start <- as.numeric(cell_type_df$start)
  cell_type_df$end <- as.numeric(cell_type_df$end)

  # Use rep() so the assignment also works when no marker survived the filter:
  # assigning a length-one value to a zero-row data frame is an error.
  cell_type_df$cell_type <- rep(cell_type, nrow(cell_type_df))

  marker_tables[[i]] <- cell_type_df[, c("cell_type", "chrom", "start", "end")]
}

# Single bind of the (possibly pre-filled) accumulator and all per-type tables
all_markers_df <- do.call(rbind, c(list(all_markers_df), marker_tables))

# write.csv() prints doubles in whichever notation is shorter, so a round
# coordinate such as 100000 would be written as "1e+05". Format the bounds as
# plain digits in a copy used only for export; `all_markers_df` stays numeric.
csv_df <- all_markers_df
csv_df$start <- format(csv_df$start, scientific = FALSE, trim = TRUE)
csv_df$end <- format(csv_df$end, scientific = FALSE, trim = TRUE)

# Save as CSV file
write.csv(csv_df, output_csv, row.names = FALSE, quote = FALSE)

print(paste("Saved:", output_csv))