# Creating the new EPIC-ATAC reference and variance profiles 
- Normalize counts to TPM-like values using EPIC-ATAC
- Calculate median TPM per cell type to generate reference profile
- Compute the IQR (interquartile range) per region across the samples of each cell type to generate the variance profile

## Load required libraries

In [None]:
library(EPICATAC)
library(tidyr)
library(ggplot2)
library(data.table)
library(dplyr)
library(reshape2)  
library(ggthemes)
library(patchwork)
library(matrixStats) 

# Load the raw count matrix with new samples

In [None]:
# Absolute location of the raw count matrix (CSV file with a header row)
csv_file <- "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/raw_counts_matrix.csv"

# Read the table into a data frame, keeping text columns as character vectors
raw_counts_matrix <- utils::read.csv(
  file = csv_file,
  header = TRUE,
  stringsAsFactors = FALSE
)

# Preview the top rows of the loaded table
head(raw_counts_matrix)

# Inspect the column names and column types
str(raw_counts_matrix)


# TPM-like normalization for each sample

In [None]:
# Turn the loaded table into a region-by-sample count table: build one
# "chrom:start-end" identifier per row, use it as the row names, and keep only
# the sample count columns.

# Work on a plain data.frame so that row names can be assigned below
raw_counts_matrix <- as.data.frame(raw_counts_matrix)

# Fail early with a clear message if a coordinate column is missing; otherwise
# paste0() would silently build a single malformed identifier.
coord_cols <- c("chrom", "start", "end")
missing_cols <- setdiff(coord_cols, colnames(raw_counts_matrix))
if (length(missing_cols) > 0) {
  stop(
    "Missing coordinate column(s): ", paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

# Render numeric coordinates without scientific notation: a double such as
# 100000 would otherwise be pasted as "1e+05". Integer columns are unaffected.
format_coord <- function(x) {
  if (is.numeric(x)) {
    format(x, scientific = FALSE, trim = TRUE)
  } else {
    as.character(x)
  }
}

# Region identifiers in the format "chrom:start-end"
region_ids <- paste0(
  raw_counts_matrix[["chrom"]], ":",
  format_coord(raw_counts_matrix[["start"]]), "-",
  format_coord(raw_counts_matrix[["end"]])
)

# Make the region identifiers the row names (errors on duplicated regions)
rownames(raw_counts_matrix) <- region_ids

# Drop the coordinate columns (and any pre-existing "region" column).
# drop = FALSE keeps a data.frame even when a single sample column remains.
keep_cols <- !(colnames(raw_counts_matrix) %in% c(coord_cols, "region"))
raw_counts_matrix <- raw_counts_matrix[, keep_cols, drop = FALSE]

# Verify the final data frame
str(raw_counts_matrix)
head(raw_counts_matrix)


In [None]:
# Normalize the raw counts to TPM-like values with EPIC-ATAC's helper.
# The helper is reached through `:::`, i.e. as an internal (non-exported)
# object of the EPICATAC package.
# NOTE(review): internal functions can change between package versions;
# confirm the installed EPICATAC version still provides it.
# NOTE(review): presumably the helper derives region lengths from the
# "chrom:start-end" row names set above; verify against the package source.
tpm_counts_matrix <- EPICATAC:::get_TPMlike_counts(raw_counts_matrix)
# Preview the normalized values
head(tpm_counts_matrix)

# Build new EPIC-ATAC reference profile using the median across cell type samples

In [None]:
# Build the reference profile: the median TPM-like value per region within
# each cell type, stored as a numeric matrix with regions as rows and cell
# types as columns.

# Sample names are the column names; the cell type is the text before the
# first underscore of each sample name (for example "CD4_rep1" -> "CD4").
# vapply() guarantees a character vector whatever the input looks like.
sample_names <- colnames(tpm_counts_matrix)
cell_type_groups <- vapply(
  strsplit(sample_names, "_"),
  function(parts) parts[1],
  character(1)
)

# Map samples to their cell types
sample_to_cell_type <- setNames(cell_type_groups, sample_names)

# Work on a plain matrix; stop early if a non-numeric column slipped in
tpm_mat <- as.matrix(tpm_counts_matrix)
stopifnot(is.numeric(tpm_mat))

# Column indices of the samples that belong to each cell type
# (groups are ordered like factor levels, as tapply() would order them)
samples_by_cell_type <- split(seq_along(sample_names), sample_to_cell_type)

# Median per region, one column per cell type. Binding per-cell-type columns
# always yields a regions x cell-types matrix; a row-wise apply() + tapply()
# collapses to a plain vector when there is a single cell type.
reference_profile_mat <- do.call(
  cbind,
  lapply(samples_by_cell_type, function(idx) {
    apply(tpm_mat[, idx, drop = FALSE], 1, median, na.rm = TRUE)
  })
)
rownames(reference_profile_mat) <- rownames(tpm_mat)

# Data-frame view of the profile (kept for column renaming and inspection)
reference_profile_df <- as.data.frame(
  reference_profile_mat,
  stringsAsFactors = FALSE
)

# Rename specific columns to match the expected reference names
expected_names <- c(
  CD4 = "CD4_Tcells",
  CD8 = "CD8_Tcells",
  hepatocytes = "Hepatocytes"
)
to_rename <- colnames(reference_profile_df) %in% names(expected_names)
colnames(reference_profile_df)[to_rename] <-
  unname(expected_names[colnames(reference_profile_df)[to_rename]])

# Sanity check on the first row name: regions are expected to start with "chr"
if (!grepl("^chr", rownames(reference_profile_df)[1])) {
  stop("Row names do not match chromosome regions. Check input format.")
}

# Convert to a numeric matrix
reference_profile_mat <- as.matrix(reference_profile_df)

# Convert row names from "chr:start-end" to "chr-start-end"
rownames(reference_profile_mat) <- gsub(":", "-", rownames(reference_profile_mat))

# Verify structure
str(reference_profile_mat)

# Print first few rows to confirm
head(reference_profile_mat)


# Save in RDS format

In [None]:
# Write the reference profile matrix to disk as an RDS file
ref_profiles_rds <- "/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/refProfiles.rds"
saveRDS(object = reference_profile_mat, file = ref_profiles_rds)

# Build EPIC-ATAC variance profile across cell type samples

In [None]:
# Build the variance profile: the IQR of the TPM-like values per region within
# each cell type, stored as a data frame with regions as rows and cell types
# as columns. A cell type with a single sample gets an IQR of 0.

# The sample -> cell type mapping must cover every column of the TPM matrix,
# and the matrix must be numeric for IQR() to be meaningful.
stopifnot(
  is.numeric(tpm_mat),
  length(sample_to_cell_type) == ncol(tpm_mat)
)

# Column indices of the samples that belong to each cell type
samples_by_cell_type <- split(seq_len(ncol(tpm_mat)), sample_to_cell_type)

# IQR per region, one column per cell type. Binding per-cell-type columns
# always yields a regions x cell-types matrix; a row-wise apply() + tapply()
# collapses to a plain vector when there is a single cell type.
iqr_by_region <- do.call(
  cbind,
  lapply(samples_by_cell_type, function(idx) {
    apply(tpm_mat[, idx, drop = FALSE], 1, IQR, na.rm = TRUE)
  })
)
rownames(iqr_by_region) <- rownames(tpm_mat)

# Keep the cell-types x regions orientation this object had before, in case
# later cells rely on it
iqr_profile_mat <- t(iqr_by_region)

# Convert to a data frame (columns are already numeric, no coercion needed)
iqr_profile_df <- as.data.frame(iqr_by_region, stringsAsFactors = FALSE)

# Rename specific columns to match the expected reference names
expected_names <- c(
  CD4 = "CD4_Tcells",
  CD8 = "CD8_Tcells",
  hepatocytes = "Hepatocytes"
)
to_rename <- colnames(iqr_profile_df) %in% names(expected_names)
colnames(iqr_profile_df)[to_rename] <-
  unname(expected_names[colnames(iqr_profile_df)[to_rename]])

# Convert row names from "chr:start-end" to "chr-start-end"
rownames(iqr_profile_df) <- gsub(":", "-", rownames(iqr_profile_df))

# Verify the change
head(rownames(iqr_profile_df))

# Verify structure
str(iqr_profile_df)


# Save in RDS format

In [None]:
# Write the variance (IQR) profile to disk as an RDS file.
# NOTE(review): this object is a data.frame, whereas the reference profile
# saved earlier in this file is a matrix; confirm the consumer accepts both.
var_profiles_rds <- "/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/refProfiles.var.rds"
saveRDS(object = iqr_profile_df, file = var_profiles_rds)