# Creating synthetic cfDNA datasets across simulated sequencing coverage levels for a down-sampling experiment
- Loads cell-type-specific reference accessibility profiles and synthetic cell composition combinations
- Simulates various sequencing depths (e.g., 245x to 0.1x) by scaling down the accessibility values
- Applies a function to generate coverage-reduced versions of the ATAC-seq reference matrix by dividing and rounding counts
- Multiplies reduced reference profiles with synthetic compositions to simulate mixed cfDNA profiles at each coverage level

## Import required libraries

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import numpy as np
import os

## Load the reference profile matrix and the synthetic combinations (syn combos)

In [None]:
# Input locations: scaled reference marker counts and the synthetic composition table
atac_path = "/mnt/DATA3/daniel/project/03_synthetic_samples/data/reference_marker_counts_scaled.csv"
syn_combo_path = "/mnt/DATA3/daniel/project/03_synthetic_samples/data/combinations_syn_samples.csv"

# Reference matrix: read with the pandas default separator and index it by
# peak_id whenever that column is present
ATAC_marker_counts_df = pd.read_csv(atac_path)
if "peak_id" in ATAC_marker_counts_df.columns:
    ATAC_marker_counts_df = ATAC_marker_counts_df.set_index("peak_id")

# Combination table: semicolon-separated; strip stray whitespace from the
# header names so they can be matched against the reference columns later
syn_combo_df = pd.read_csv(syn_combo_path, sep=";")
syn_combo_df.columns = syn_combo_df.columns.str.strip()

# Quick check: first rows of both tables
print(ATAC_marker_counts_df.head(), syn_combo_df.head(), sep="\n")



In [None]:
# Report (rows, columns) for the reference matrix, then for the combination table
print(ATAC_marker_counts_df.shape, syn_combo_df.shape, sep="\n")

## Coverage reduction function

In [None]:
def reduce_coverage(df: pd.DataFrame, divisor: float) -> pd.DataFrame:
    """
    Simulate lower sequencing coverage by scaling every value down.

    Each value in ``df`` is divided by ``divisor`` and the result is rounded
    to the nearest integer, so small values can collapse to 0.

    Args:
        df (pd.DataFrame): Numeric matrix to scale down.
        divisor (float): Positive factor to divide by; larger values mean
            lower simulated coverage.

    Returns:
        pd.DataFrame: Integer-valued frame with the same index and columns.

    Raises:
        ValueError: If ``divisor`` is not a positive number. The integer cast
            also fails if the scaled frame contains NaN or infinite values.
    """
    # `not divisor > 0` also rejects NaN. Without this guard a negative
    # divisor would silently yield negative "counts", and zero would only
    # surface later as an opaque integer-cast error.
    if not divisor > 0:
        raise ValueError(f"divisor must be a positive number, got {divisor!r}")

    # Scale to the lower mean, then round to whole counts
    return (df / divisor).round().astype(int)


## Define the coverage levels for reduction

In [None]:
# Coverage label -> divisor applied to the reference matrix.
# NOTE(review): label x divisor is ~282.7 for every entry, which suggests the
# divisors were derived from a ~282.7x reference depth — confirm with the author.
coverage_levels = {
    "245x": 1.15,
    "80x": 3.53,
    "30x": 9.42,
    "9x": 31.41,
    "3x": 94.24,
    "1x": 282.71,
    "0.3x": 942.36,
    "0.1x": 2827.08,
}



## Application

In [None]:
# Build one down-scaled copy of the ATAC-seq reference matrix per coverage
# label, keyed by that label
reduced_coverage_dfs = {
    label: reduce_coverage(ATAC_marker_counts_df, divisor)
    for label, divisor in coverage_levels.items()
}

# Show the lowest coverage level as an example
print("Example: ATAC reduced to 0.1x coverage", reduced_coverage_dfs["0.1x"].head(), sep="\n")

In [None]:
# Per-column summary statistics of the reference matrix reduced to the "1x" level
print(reduced_coverage_dfs["1x"].describe())


## Function for synthetic datasets

In [None]:
def multiply_reference_with_combinations(
    reference_df: pd.DataFrame, combo_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Weight the reference values of each cell type by its fraction in every
    synthetic combination.

    Args:
        reference_df (pd.DataFrame): Reference matrix with one row per peak
            and one column per cell type. A ``peak_id`` column, if present,
            is moved to the index; otherwise the existing index is kept.
        combo_df (pd.DataFrame): Synthetic combinations. The first column
            holds the combination name, the remaining columns hold the
            proportion of each cell type.

    Returns:
        pd.DataFrame: Frame indexed like the reference, with two-level
        columns ``(combination name, cell type)`` holding
        ``reference value * fraction``.

    Raises:
        ValueError: If combination names are duplicated, or if the reference
            and the combinations share no cell type column.
    """
    # Move peak_id into the index only when it is still a regular column; a
    # reference that is already indexed (under any name) is used as-is
    # instead of failing with a KeyError.
    if reference_df.index.name != "peak_id" and "peak_id" in reference_df.columns:
        reference_df = reference_df.set_index("peak_id")

    # Set the first column (combo names) as index for synthetic combinations
    combo_matrix = combo_df.set_index(combo_df.columns[0])

    # Duplicate names would make `.loc[name]` return several rows and would
    # collide as keys of the result, silently corrupting the output.
    if not combo_matrix.index.is_unique:
        repeated = combo_matrix.index[combo_matrix.index.duplicated()].unique().tolist()
        raise ValueError(f"Duplicate combination names in combo_df: {repeated}")

    # Only cell types present in both tables can be combined
    common_cell_types = reference_df.columns.intersection(combo_matrix.columns)
    if common_cell_types.empty:
        raise ValueError(
            "reference_df and combo_df share no cell type columns; "
            "check the column names of both tables"
        )

    # Subset to only the common cell types
    reference_matrix = reference_df[common_cell_types]
    combo_matrix = combo_matrix[common_cell_types]

    # Scale every cell type column by its fraction, one combination at a time
    synthetic_results = {
        f"{combo}": reference_matrix.multiply(combo_matrix.loc[combo], axis=1)
        for combo in combo_matrix.index
    }

    # Dict keys become the outer column level: (combination, cell type)
    synthetic_df = pd.concat(synthetic_results, axis=1)

    return synthetic_df


## Create synthetic datasets

In [None]:
# Initialize a dictionary to store synthetic datasets for each coverage level
synthetic_datasets = {}

# Loop through each coverage-reduced dataset
for cov_label, df_reduced in reduced_coverage_dfs.items():
    print(f"Processing synthetic dataset for coverage level: {cov_label}")

    # Generate synthetic dataset using the reduced ATAC-seq data; the result
    # has two-level columns (combination, cell type)
    synthetic_multiplied_df = multiply_reference_with_combinations(df_reduced, syn_combo_df)

    # Sum across cell type columns for each combo.
    # DataFrame.groupby(..., axis=1) is deprecated (pandas >= 2.1) and not
    # supported by newer releases, so group the transposed frame by its outer
    # level and transpose back; the result is the same peaks x combos table.
    synthetic_summed_df = synthetic_multiplied_df.T.groupby(level=0).sum().T

    # Store the result
    synthetic_datasets[cov_label] = synthetic_summed_df

    # Print preview
    print(f"Preview of synthetic dataset at {cov_label} coverage:")
    print(synthetic_summed_df.head())

## Save the files

In [None]:
# Destination folder for the per-coverage CSV files (created on demand)
output_dir = "/mnt/DATA3/daniel/project/03_synthetic_samples/data/synthetic_coverage_reduced_0bp/"
os.makedirs(output_dir, exist_ok=True)

# Write one semicolon-separated file per coverage level, keeping the row index
for cov_label, df_synthetic in synthetic_datasets.items():
    file_name = f"synthetic_dataset_{cov_label}.csv"
    file_path = os.path.join(output_dir, file_name)
    df_synthetic.to_csv(file_path, sep=";", index=True)
    print(f"Saved: {file_path}")