# Creating full-coverage synthetic cfDNA mixtures from reference marker profiles
- Loads reference marker profiles and synthetic cell type proportions
- Multiplies marker signals with cell type fractions to simulate synthetic cfDNA mixtures
- Sums across cell types to get total signal per synthetic sample

## Import required libraries

In [None]:
import pandas as pd
import numpy as np

## Load the reference profile matrix and the synthetic combinations

In [None]:
# Input locations: reference marker matrix and synthetic mixing proportions
file_path_reference_matrix = "/mnt/DATA3/daniel/project/03_synthetic_samples/data/reference_marker_counts.csv"
file_path_syn_combo = "/mnt/DATA3/daniel/project/03_synthetic_samples/data/combinations_syn_samples.csv"

# Reference profiles are read with the default (comma) separator
reference_counts_df = pd.read_csv(file_path_reference_matrix)

# The combinations file is semicolon-separated; trim any whitespace around
# its header names so they can be matched against the reference columns
syn_combo_df = pd.read_csv(file_path_syn_combo, sep=";")
syn_combo_df.columns = [column_name.strip() for column_name in syn_combo_df.columns]

# Preview the first rows of both tables
print("reference_counts_df:\n", reference_counts_df.head(), "\n")
print("syn_combo_df:\n", syn_combo_df.head(), "\n")

## Create synthetic datasets

In [None]:
def multiply_reference_with_combinations(reference_df, combo_df):
    """
    Scale the reference cell-type columns by the fractions of every combination.

    For each row of ``combo_df`` (one synthetic combination), every cell-type
    column shared with ``reference_df`` is multiplied by that combination's
    fraction for the same cell type. No summation across cell types is done here.

    Args:
        reference_df (pd.DataFrame): Reference matrix with a ``peak_id`` column
            and one column per cell type.
        combo_df (pd.DataFrame): Synthetic combinations. The first column holds
            the combination names; the other columns hold per-cell-type fractions.

    Returns:
        pd.DataFrame: Indexed by ``peak_id`` with two-level columns
        ``(combination name, cell type)``. Combination names are converted
        to ``str``.

    Raises:
        ValueError: If ``combo_df`` has no rows, contains duplicate combination
            names, or shares no cell-type column with ``reference_df``.

    Note:
        Cell types present in only one of the two inputs are dropped, so the
        fractions that remain for a combination may no longer add up to their
        original total.
    """
    # Set peak_id as index for reference matrix
    reference_matrix = reference_df.set_index("peak_id")

    # Set the first column (combo names) as index for synthetic combinations
    combo_matrix = combo_df.set_index(combo_df.columns[0])

    if len(combo_matrix.index) == 0:
        raise ValueError("combo_df contains no combinations.")

    # A repeated name would make .loc[combo] return a DataFrame instead of a
    # Series and the per-combo results would overwrite each other in the dict.
    duplicate_names = combo_matrix.index[combo_matrix.index.duplicated()].unique()
    if len(duplicate_names) > 0:
        raise ValueError(
            f"Duplicate combination names in combo_df: {list(duplicate_names)}"
        )

    # Keep only the cell types that both inputs know about
    common_cell_types = reference_matrix.columns.intersection(combo_matrix.columns)
    if common_cell_types.empty:
        raise ValueError(
            "reference_df and combo_df share no cell type columns; "
            "check the column names of both inputs."
        )

    reference_matrix = reference_matrix[common_cell_types]
    combo_matrix = combo_matrix[common_cell_types]

    # axis=1 aligns each combo's fractions (a Series indexed by cell type)
    # with the reference columns, scaling every cell-type column separately
    synthetic_results = {
        f"{combo}": reference_matrix.multiply(combo_matrix.loc[combo], axis=1)
        for combo in combo_matrix.index
    }

    # The dict keys become the outer level of the column MultiIndex
    synthetic_df = pd.concat(synthetic_results, axis=1)

    return synthetic_df


In [None]:
# Scale the reference cell-type columns by each combination's fractions;
# the result has two-level columns: (combination name, cell type)
synthetic_multiplied_df = multiply_reference_with_combinations(reference_counts_df, syn_combo_df)

In [None]:
# Inspect the per-cell-type contributions before they are summed per combination
print(synthetic_multiplied_df)

In [None]:
# Sum across cell type columns for each combo.
# Columns are (combo, cell type) pairs, so grouping on level 0 adds up all
# cell-type contributions belonging to one combo. DataFrame.groupby(axis=1)
# is deprecated since pandas 2.1, so the frame is transposed, grouped along
# the rows, and transposed back, which gives the same peaks x combos table.
# groupby sorts group keys by default, so the combo columns come out sorted.
synthetic_summed_df = synthetic_multiplied_df.T.groupby(level=0).sum().T

# Display the result
print(synthetic_summed_df)

In [None]:
# Compute basic statistics for the synthetic matrix.
# describe() summarises each column, i.e. each synthetic combination,
# over all rows (peaks) of the summed table.
stats_summary = synthetic_summed_df.describe()
print(stats_summary)

## Save the dataframe as a CSV file

In [None]:
# Write the summed synthetic matrix to disk; index=True keeps the row index
# (the peak_id values) as the first column of the CSV
synthetic_summed_df.to_csv("/mnt/DATA3/daniel/project/03_synthetic_samples/data/synthetic_markers_summed.csv",  index=True)