# EPIC-ATAC deconvolution using synthetic cfDNA mixtures across varying coverage levels (down-sampling)

## Import required libraries

In [None]:
library(EPICATAC)
library(tidyr)
library(ggplot2)
library(data.table)
library(dplyr)
library(reshape2)
library(pheatmap)
library(IRdisplay)
library(scales)
library(Metrics)
library(viridis)  
library(gridExtra) 

# Load the samples

In [None]:
# Root folder holding the synthetic-mixture inputs, and the sub-folder with the
# coverage-reduced (down-sampled) tables.
data_dir <- "/mnt/DATA3/daniel/project/03_synthetic_samples/data"
coverage_dir <- file.path(data_dir, "synthetic_coverage_reduced_0bp")

# Ground-truth table (semicolon-separated). Its first column is read as `X`
# and moved into the row names (presumably the mixture identifiers).
df_true <- read.csv(
  file.path(data_dir, "combinations_syn_samples.csv"),
  header = TRUE, sep = ";"
)
rownames(df_true) <- df_true$X
df_true$X <- NULL

# Full-coverage input. Unlike the other files it is read with the default
# comma separator; the first column becomes the row names.
df_unbiased <- read.csv(
  file.path(data_dir, "synthetic_markers_summed.csv"),
  header = TRUE, row.names = 1
)

# Reads one coverage-reduced table given its coverage label (e.g. "0.1x").
read_coverage_dataset <- function(label) {
  read.csv(
    file.path(coverage_dir, paste0("synthetic_dataset_", label, ".csv")),
    header = TRUE, row.names = 1, sep = ";"
  )
}

df_0.1x <- read_coverage_dataset("0.1x")
df_0.3x <- read_coverage_dataset("0.3x")
df_1x <- read_coverage_dataset("1x")
df_3x <- read_coverage_dataset("3x")
df_9x <- read_coverage_dataset("9x")
df_30x <- read_coverage_dataset("30x")
df_80x <- read_coverage_dataset("80x")
df_245x <- read_coverage_dataset("245x")


In [None]:
# Preview the first rows of the ground-truth table, the full-coverage input and
# one coverage-reduced input (9x) to check that the files were parsed sensibly.
head(df_true)
head(df_unbiased)
head(df_9x)  

# Run the deconvolution

In [None]:
# Deconvolve the full-coverage mixtures with EPIC-ATAC.
# `atacRef_TME` is not defined in this notebook; presumably it is a reference
# profile exported by the EPICATAC package — TODO confirm.
# NOTE(review): `withOtherCells = FALSE` presumably disables the estimate of an
# uncharacterised "other cells" fraction — confirm against the package docs.
results_unbiased <- EPIC_ATAC(
  bulk = df_unbiased,
  reference = atacRef_TME,
  ATAC = TRUE,
  withOtherCells = FALSE
)

# Keep only the estimated cell fractions, converted to a data frame.
df_unbiased_pred <- as.data.frame(results_unbiased$cellFractions)


In [None]:
# Preview the predicted cell fractions for the full-coverage mixtures.
head(df_unbiased_pred)

In [None]:
# Coverage-reduced (down-sampled) inputs, keyed by their coverage label.
coverage_reduced_datasets <- list(
  "0.1x" = df_0.1x,
  "0.3x" = df_0.3x,
  "1x" = df_1x,
  "3x" = df_3x,
  "9x" = df_9x,
  "30x" = df_30x,
  "80x" = df_80x,
  "245x" = df_245x
)

# Returns TRUE when a dataset is neither all-zero nor constant within every
# column. Missing values are ignored so the answer is always TRUE or FALSE;
# an NA here would turn into an NA index when subsetting the list below.
has_signal <- function(df) {
  values <- as.matrix(df)
  all_zero <- all(values == 0, na.rm = TRUE)
  !(all_zero ||
    all(apply(values, 2, var, na.rm = TRUE) == 0, na.rm = TRUE))
}

# Drop datasets without signal and say which ones were skipped.
keep_dataset <- vapply(coverage_reduced_datasets, has_signal, logical(1))
skipped <- names(coverage_reduced_datasets)[!keep_dataset]
if (length(skipped) > 0) {
  message("Skipping datasets without signal: ", paste(skipped, collapse = ", "))
}
coverage_reduced_datasets_filtered <- coverage_reduced_datasets[keep_dataset]

# Run the deconvolution on every dataset that passed the filter and keep the
# predicted cell fractions, keyed by coverage label.
results_list_coverage_reduced <- list()

for (name in names(coverage_reduced_datasets_filtered)) {
  cat("Running deconvolution for:", name, "\n")

  results <- EPIC_ATAC(
    bulk = coverage_reduced_datasets_filtered[[name]],
    reference = atacRef_TME,
    ATAC = TRUE,
    withOtherCells = FALSE
  )

  results_list_coverage_reduced[[name]] <- as.data.frame(results$cellFractions)
}


In [None]:
# print(results_list_coverage_reduced)

# Performance analysis - unbiased

In [None]:
# Working copies used throughout the full-coverage performance analysis.
df_pred_sub <- df_unbiased_pred 
df_true_sub <- df_true           

# Label the predictions with the ground-truth row names.
# NOTE(review): this overwrites the names by position rather than checking
# them, so it assumes both tables list the mixtures in the same order — confirm.
rownames(df_pred_sub) <- rownames(df_true_sub)

# Root-mean-square error between predicted and observed values.
#
# Args:
#   pred: numeric vector of predicted values.
#   obs:  numeric vector of observed (true) values, same length as `pred`.
#
# Returns:
#   A one-row data frame with a single column `RMSE`, rounded to 3 decimals.
#   If the two vectors differ in length (for example `pred` is NULL because
#   the cell type is missing from the predictions), a warning is raised and
#   RMSE is NA rather than a silent NaN or a recycled comparison.
compute_rmse <- function(pred, obs) {
  if (length(pred) != length(obs)) {
    warning(
      "compute_rmse: 'pred' has length ", length(pred),
      " but 'obs' has length ", length(obs), "; returning NA",
      call. = FALSE
    )
    return(data.frame(RMSE = NA_real_))
  }

  # Same formula as Metrics::rmse(obs, pred), written out in base R.
  rmse_val <- sqrt(mean((obs - pred)^2))
  data.frame(RMSE = round(rmse_val, 3))
}

# RMSE of the full-coverage predictions for a single cell type, returned as a
# one-row data frame tagged with the cell-type name.
rmse_for_celltype <- function(ct) {
  row <- compute_rmse(df_pred_sub[[ct]], df_true_sub[[ct]])
  row$celltype <- ct
  row
}

# One row per ground-truth cell type
results_list <- lapply(colnames(df_true_sub), rmse_for_celltype)

# Stack the per-cell-type rows into a single table
results_df <- do.call(rbind, results_list)

In [None]:
# Ground truth in long format: one row per (mixture, cell type) pair
true_with_id <- mutate(df_true_sub, combo = rownames(df_true_sub))
df_long_true <- melt(
  true_with_id,
  id.vars = "combo", variable.name = "cell_type", value.name = "true_value"
)

# Full-coverage predictions in the same long layout
pred_with_id <- mutate(df_pred_sub, combo = rownames(df_pred_sub))
df_long_unbiased <- melt(
  pred_with_id,
  id.vars = "combo", variable.name = "cell_type", value.name = "pred_unbiased"
)

# Per-cell-type RMSE, with the key column renamed to match the long tables
df_metrics <- mutate(results_df, RMSE = round(RMSE, 3))
df_metrics <- select(df_metrics, celltype, RMSE)
df_metrics <- rename(df_metrics, cell_type = celltype)

# Pair truth with prediction, then attach the RMSE of each cell type
df_compare_unbiased <- left_join(
  df_long_true, df_long_unbiased,
  by = c("combo", "cell_type")
)
df_compare_unbiased <- left_join(df_compare_unbiased, df_metrics, by = "cell_type")

# Inspect the result
str(df_compare_unbiased)
head(df_compare_unbiased)


In [None]:
# Concordance correlation coefficient between two numeric vectors:
#   2 * cov(pred, obs) / (var(pred) + var(obs) + (mean(pred) - mean(obs))^2)
# using the sample (n - 1) variance and covariance from `var()` / `cov()`.
#
# Args:
#   pred: numeric vector of predicted values.
#   obs:  numeric vector of observed values, same length as `pred`.
#
# Returns:
#   A single numeric value (NaN when the denominator is zero).
compute_ccc_manual <- function(pred, obs) {
  mean_shift <- mean(pred) - mean(obs)
  total_spread <- var(pred) + var(obs) + mean_shift^2

  2 * cov(pred, obs) / total_spread
}

In [None]:
# CCC between truth and full-coverage predictions, one row per cell type.
# vapply() (instead of sapply()) guarantees a numeric vector: a non-scalar
# result raises an error instead of silently changing the column type, and an
# empty set of cell types yields an empty numeric column.
cell_types <- colnames(df_true)

df_ccc_results <- data.frame(
  cell_type = cell_types,
  CCC = vapply(
    cell_types,
    function(ct) compute_ccc_manual(df_pred_sub[[ct]], df_true[[ct]]),
    numeric(1)
  )
)

print(df_ccc_results)

In [None]:
# Attach the per-cell-type CCC to every row of the comparison table
df_compare_unbiased <- left_join(
  df_compare_unbiased,
  df_ccc_results,
  by = "cell_type"
)

In [None]:
# Size of plots rendered inline by the notebook (`repr` options); the values
# match the width/height passed to ggsave() for the full-coverage figure.
options(repr.plot.width=10, repr.plot.height=8)


In [None]:
# One annotation row per facet. Taking the labels from the full comparison
# table would draw the same text once per data point, stacked on itself.
facet_labels <- distinct(df_compare_unbiased, cell_type, CCC, RMSE)

# True vs. predicted proportions, one panel per cell type, with the identity
# line and the per-cell-type CCC / RMSE printed in the upper right.
p_full_coverage <- ggplot(
  df_compare_unbiased,
  aes(x = true_value, y = pred_unbiased)
) +
  geom_point(alpha = 0.6, color = "blue") +
  facet_wrap(~cell_type, scales = "fixed") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey") +
  geom_text(
    data = facet_labels,
    aes(label = sprintf("CCC=%.3f", CCC)),
    x = 0.60, y = 0.98,
    hjust = 0, size = 4, color = "grey30", inherit.aes = FALSE
  ) +
  geom_text(
    data = facet_labels,
    aes(label = sprintf("RMSE=%.3f", RMSE)),
    x = 0.60, y = 0.89,
    hjust = 0, size = 4, color = "grey30", inherit.aes = FALSE
  ) +
  scale_x_continuous(limits = c(0, 1.0)) +
  scale_y_continuous(limits = c(0, 1.0)) +
  theme_bw(base_size = 16) +
  labs(
    x = "True proportions",
    y = "Predicted proportions",
    title = "True vs. predicted cell-type proportions"
  ) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", size = 16, hjust = 0.5)
  )

# Show the figure in the notebook
p_full_coverage

# Save the plot in high resolution (600 dpi); the plot is passed explicitly
# instead of relying on the most recently created plot.
ggsave("full_coverage.png", plot = p_full_coverage, dpi = 600, width = 10, height = 8)


# Performance analysis - Coverage reduction

### RMSE calculation

In [None]:
# Per-cell-type RMSE for every coverage-reduced dataset, keyed by coverage
# label. Each entry is a data frame with columns `RMSE` (rounded to 3
# decimals) and `cell_type`.
rmse_results_list <- list()

cell_types <- colnames(df_true)

for (name in names(results_list_coverage_reduced)) {

  print(paste("Computing RMSE for:", name))

  pred_df <- results_list_coverage_reduced[[name]]

  # Predictions are compared with the truth row by row, so both tables must
  # describe the same number of mixtures.
  stopifnot(nrow(pred_df) == nrow(df_true))

  rmse_by_celltype <- vapply(
    cell_types,
    function(ct) round(rmse(df_true[[ct]], pred_df[[ct]]), 3),
    numeric(1)
  )

  # Stored directly in the list: the intermediate table no longer overwrites
  # the global `results_df` produced by the full-coverage analysis.
  rmse_results_list[[name]] <- data.frame(
    RMSE = unname(rmse_by_celltype),
    cell_type = cell_types,
    row.names = NULL
  )
}

print("All RMSE calculations completed.")

In [None]:
# Returns the RMSE table of one dataset with its coverage label added as a
# `dataset` column.
tag_with_dataset <- function(name) {
  tagged <- rmse_results_list[[name]]
  tagged$dataset <- name
  tagged
}

# Stack the tagged tables of all datasets for side-by-side comparison
tagged_tables <- lapply(names(rmse_results_list), tag_with_dataset)
rmse_results_df <- do.call(rbind, tagged_tables)


#  Comparison Dataframe

In [None]:
# Builds the truth-vs-prediction table (long format, with per-cell-type RMSE)
# for the coverage-reduced dataset called `name`.
build_rmse_comparison <- function(name) {
  print(paste("Processing:", name))

  # Predictions for this dataset, labelled with the ground-truth row names
  pred_wide <- results_list_coverage_reduced[[name]]
  rownames(pred_wide) <- rownames(df_true)

  # Truth and predictions, one row per (mixture, cell type) pair
  true_long <- melt(
    mutate(df_true, combo = rownames(df_true)),
    id.vars = "combo", variable.name = "cell_type", value.name = "true_value"
  )
  pred_long <- melt(
    mutate(pred_wide, combo = rownames(pred_wide)),
    id.vars = "combo", variable.name = "cell_type",
    value.name = "pred_coverage_reduced"
  )

  # RMSE of this dataset, one row per cell type
  rmse_tbl <- mutate(rmse_results_list[[name]], RMSE = round(RMSE, 3))
  rmse_tbl <- select(rmse_tbl, cell_type, RMSE)

  paired <- left_join(true_long, pred_long, by = c("combo", "cell_type"))
  left_join(paired, rmse_tbl, by = "cell_type")
}

# One comparison table per coverage-reduced dataset, keyed by coverage label
dataset_names <- names(results_list_coverage_reduced)
df_compare_coverage_reduced_list <- lapply(
  setNames(dataset_names, dataset_names),
  build_rmse_comparison
)

print("All datasets processed successfully.")


#  CCC Calculation

In [None]:
# Per-cell-type CCC for every coverage-reduced dataset, keyed by coverage
# label. Each entry is a data frame with columns `CCC` (rounded to 3 decimals)
# and `cell_type`.
ccc_results_list <- list()

cell_types <- colnames(df_true)

for (name in names(results_list_coverage_reduced)) {

  print(paste("Computing CCC for:", name))

  pred_df <- results_list_coverage_reduced[[name]]

  # Predictions are compared with the truth row by row, so both tables must
  # describe the same number of mixtures.
  stopifnot(nrow(pred_df) == nrow(df_true))

  # Arguments follow the (pred, obs) order of compute_ccc_manual(), as in the
  # full-coverage analysis.
  ccc_by_celltype <- vapply(
    cell_types,
    function(ct) round(compute_ccc_manual(pred_df[[ct]], df_true[[ct]]), 3),
    numeric(1)
  )

  ccc_results_list[[name]] <- data.frame(
    CCC = unname(ccc_by_celltype),
    cell_type = cell_types,
    row.names = NULL
  )
}

print("All CCC calculations completed.")


# Final data merging for RMSE & CCC

In [None]:
# Ground truth in long format (one row per mixture / cell type). It does not
# depend on the dataset, so it is built once, outside the loop.
df_long_true <- df_true %>%
  mutate(combo = rownames(.)) %>%
  melt(id.vars = "combo", variable.name = "cell_type", value.name = "true_value")

# Start from an empty list so that this cell does not depend on an earlier
# cell having created it, and no stale entries from a previous run survive.
df_compare_coverage_reduced_list <- list()

# Build, for every coverage-reduced dataset, the truth-vs-prediction table
# with the per-cell-type RMSE and CCC attached.
for (name in names(results_list_coverage_reduced)) {

  print(paste("Processing:", name))

  # Predictions for this dataset, labelled with the ground-truth row names
  df_coverage_reduced_pred <- results_list_coverage_reduced[[name]]
  rownames(df_coverage_reduced_pred) <- rownames(df_true)

  # Predictions in long format
  df_long_coverage_reduced <- df_coverage_reduced_pred %>%
    mutate(combo = rownames(.)) %>%
    melt(id.vars = "combo", variable.name = "cell_type", value.name = "pred_coverage_reduced")

  # Metrics were already rounded to 3 decimals when they were computed
  df_metrics <- select(rmse_results_list[[name]], cell_type, RMSE)
  df_ccc_metrics <- select(ccc_results_list[[name]], cell_type, CCC)

  # Pair truth with prediction, then attach RMSE and CCC per cell type
  df_compare_coverage_reduced <- df_long_true %>%
    left_join(df_long_coverage_reduced, by = c("combo", "cell_type")) %>%
    left_join(df_metrics, by = "cell_type") %>%
    left_join(df_ccc_metrics, by = "cell_type")

  df_compare_coverage_reduced_list[[name]] <- df_compare_coverage_reduced
}


# Plot generation

In [None]:
# Size of plots rendered inline by the notebook (`repr` options); the values
# match the width/height passed to ggsave() for the coverage-reduction grid.
options(repr.plot.width=28, repr.plot.height=36)

In [None]:
# One true-vs-predicted figure per coverage-reduced dataset, arranged row by
# row in a grid of `num_cols` columns.
plot_list <- list()

num_plots <- length(df_compare_coverage_reduced_list)
num_cols <- 2
num_rows <- ceiling(num_plots / num_cols)

for (i in seq_along(df_compare_coverage_reduced_list)) {
  name <- names(df_compare_coverage_reduced_list)[i]
  print(paste("Generating plot for:", name))

  df_compare_coverage_reduced <- df_compare_coverage_reduced_list[[name]]

  # One annotation row per facet. Taking the labels from the full table would
  # draw the same text once per data point, stacked on itself.
  facet_labels <- distinct(df_compare_coverage_reduced, cell_type, CCC, RMSE)

  # Position of this plot in the grid. The left-column test is written with
  # (i - 1) so that it also holds for a single-column layout.
  is_bottom_row <- i > (num_plots - num_cols)
  is_left_col <- ((i - 1) %% num_cols == 0)
  is_right_col <- (i %% num_cols == 0)

  p <- ggplot(df_compare_coverage_reduced, aes(x = true_value, y = pred_coverage_reduced)) +
    geom_point(alpha = 0.6, color = "blue") +
    facet_wrap(~cell_type, scales = "fixed") +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey") +

    # CCC and RMSE in the upper right of each panel, always with 3 decimals
    # (same formatting as the full-coverage figure)
    geom_text(
      data = facet_labels,
      aes(label = sprintf("CCC=%.3f", CCC)),
      x = 0.60, y = 0.95,
      hjust = 0, size = 5, color = "grey30", inherit.aes = FALSE
    ) +
    geom_text(
      data = facet_labels,
      aes(label = sprintf("RMSE=%.3f", RMSE)),
      x = 0.60, y = 0.80,
      hjust = 0, size = 5, color = "grey30", inherit.aes = FALSE
    ) +

    scale_x_continuous(limits = c(0, 1.0)) +
    scale_y_continuous(limits = c(0, 1.0)) +
    theme_bw(base_size = 18) +
    # NOTE(review): `size` for line/rect elements is deprecated since
    # ggplot2 3.4.0 in favour of `linewidth`; kept so the cell also runs on
    # older installations.
    theme(
      plot.title = element_text(face = "bold", size = 20, hjust = 0.5),
      strip.text = element_text(size = 16),
      axis.text = element_text(size = 16),
      axis.title = element_text(size = 16),

      # Force panel borders to prevent missing edges
      panel.border = element_rect(color = "black", fill = NA, size = 0.5),

      # y-axis line on the outermost columns only (with two columns this is
      # every plot)
      axis.line.y = if (is_right_col || is_left_col) element_line(color = "black", size = 0.75) else element_blank(),

      # x-axis line on the bottom row only
      axis.line.x = if (is_bottom_row) element_line(color = "black", size = 0.75) else element_blank(),

      # Tick labels only on the outer edges of the grid
      axis.text.x = if (is_bottom_row) element_text(size = 14) else element_blank(),
      axis.text.y = if (is_left_col) element_text(size = 14) else element_blank(),

      # Axis titles only on the outer edges of the grid
      axis.title.x = if (is_bottom_row) element_text(size = 16, margin = margin(t = 10)) else element_blank(),
      axis.title.y = if (is_left_col) element_text(size = 16, margin = margin(r = 10)) else element_blank()
    ) +
    labs(
      x = "True proportions",
      y = "Predicted proportions",
      title = paste(name, "coverage")
    )

  plot_list[[name]] <- p
}

# Arrange the plots (grid.arrange() also draws the grid)
grid_plot <- grid.arrange(grobs = plot_list, ncol = num_cols)

# Save with larger size and better resolution
ggsave("coverage_reduction.png", plot = grid_plot, dpi = 500, width = 28, height = 36)
