# Determine scaling factors for down-sampling in synthetic datasets
- Scales the reference markers so that the minimum non-zero value equals 1
- Computes the total summed reference marker signal
- Computes the total fragment count of each cfDNA sample and the mean of these totals across samples
- Estimates the expected total counts for the different coverage levels based on the cfDNA mean
- Calculates scaling factors to simulate synthetic datasets at the desired coverage

## Import required libraries

In [None]:
import pandas as pd
import glob

## Load the cfDNA files

In [None]:
# Root directory that holds one sub-directory per cfDNA sample
base_dir = "/mnt/DATA3/daniel/project/02_cfDNA_preprocessing/data/03_intersect_mapped/cfDNA_healthy_original/"

# Samples to load
sample_ids = ["EE87922", "EE87925", "EE87927", "EE87932", "EE87933"]

# Coordinate columns present in every BED file; they double as the merge keys
coordinate_columns = ["chrom", "start", "end"]

# Read each sample's headerless, tab-separated BED file into its own DataFrame.
# The fourth column is named after the sample so the columns stay distinct after merging.
dfs = {
    sample: pd.read_csv(
        f"{base_dir}/{sample}/mapped_counts/Gfeller/{sample}_summed_marker_counts.bed",
        sep="\t",
        header=None,
        names=[*coordinate_columns, sample],
    )
    for sample in sample_ids
}

# Start from the first sample and outer-join the others on the coordinates,
# so a region missing from one sample is kept (with NaN) rather than dropped
first_sample, *remaining_samples = sample_ids
df_merged = dfs[first_sample]
for sample in remaining_samples:
    df_merged = df_merged.merge(dfs[sample], on=coordinate_columns, how="outer")

# Preview the merged table
print(df_merged.head())


## Load the reference marker file

In [None]:
# Location of the reference marker count matrix (CSV)
marker_file = "/mnt/DATA3/daniel/project/03_synthetic_samples/data/reference_marker_counts.csv"

# Read the reference marker matrix into a DataFrame
df_marker = pd.read_csv(filepath_or_buffer=marker_file)

In [None]:
# Show the reference marker matrix as loaded, for a visual sanity check
print(df_marker)

In [None]:
# Drop the 'peak_id' identifier column by name (not by position), so only the
# signal columns are summed regardless of where 'peak_id' sits in the file
df_numeric = df_marker.drop(columns="peak_id")

# Total signal in the reference marker matrix: per-column sums, then their sum
total_reference_signal = df_numeric.sum().sum()

print(f"Total summed signal in reference marker matrix: {total_reference_signal:.2f}")


## Scaling the reference marker matrix to replicate fragment counts

In [None]:
# Move peak_id into the index so that only the signal columns take part in the arithmetic
df_scaled = df_marker.set_index("peak_id")

# Smallest strictly positive value across all regions and columns
# (entries <= 0 are masked to NaN, which min() skips)
min_signal = df_scaled[df_scaled > 0].min().min()

# Without any positive value the division below would silently turn everything into NaN
if pd.isna(min_signal):
    raise ValueError("Reference marker matrix contains no positive values to scale by")

# Scale the matrix so that its minimum positive signal becomes 1
df_scaled = df_scaled / min_signal

# peak_id deliberately stays in the index: moving it back to a column would put a
# non-numeric column into the sums and statistics computed on df_scaled afterwards
print(df_scaled.head())


In [None]:
# Add up every entry of the scaled matrix: per-column totals first, then their sum
per_column_totals_scaled = df_scaled.sum()
total_reference_signal_scaled = per_column_totals_scaled.sum()

print(f"Total summed signal in scaled reference marker matrix: {total_reference_signal_scaled:.2f}")


In [None]:
# Print per-column summary statistics of the scaled matrix
print(df_scaled.describe())

In [None]:
# Define output file path
# output_file = "/mnt/DATA3/daniel/project/2_cfDNA_data/data/reference_marker_counts_scaled.csv"

# Save DataFrame as CSV, keeping the index
# df_scaled.to_csv(output_file, index=True)

# print(f"Scaled reference marker matrix saved to: {output_file}")


## cfDNA calculations

In [None]:
# Total summed counts per sample: the sum of that sample's column in the merged table
sample_totals = {}
for sample in sample_ids:
    sample_totals[sample] = df_merged[sample].sum()

# Report each sample's total
for sample_id, sample_total in sample_totals.items():
    print(f"Total summed counts for {sample_id}: {sample_total}")


In [None]:
# Average of the per-sample totals
n_samples = len(sample_totals)
mean_total_signal = sum(sample_totals.values()) / n_samples

print(f"Mean summed counts across healthy cfDNA samples: {mean_total_signal}")


In [None]:
# Mean summed counts divided by a fixed region count.
# NOTE(review): 716 is presumably the number of marker regions — confirm against
# the data (e.g. the row count of the merged table) before relying on it.
n_regions = 716
mean_per_region = mean_total_signal / n_regions
print(mean_per_region)

## Determine the equivalent fragment signal

In [None]:
# Coverage levels (x) for which an equivalent signal is computed
target_coverages = [0.1, 0.3, 1, 3, 9, 30, 80, 245]

# Given coverage of the cfDNA samples; mean_total_signal is treated as the
# signal observed at this coverage
cfDNA_coverage = 2.7

# Rescale the mean cfDNA signal linearly from the cfDNA coverage to each target coverage
equivalent_signal = {}
for target in target_coverages:
    equivalent_signal[target] = (target / cfDNA_coverage) * mean_total_signal

# Report the results from the highest to the lowest coverage
print("Computed equivalent signal:")
for target in sorted(equivalent_signal, reverse=True):
    print(f"Target coverage: {target}x, Computed equivalent signal: {equivalent_signal[target]:.2f}")


## Determine the divisor

In [None]:
# Divisor per target coverage: total scaled reference signal relative to the
# signal expected at that coverage
divisors = {}
for target in target_coverages:
    divisors[target] = total_reference_signal_scaled / equivalent_signal[target]

# Report the results from the highest to the lowest coverage
print("\nComputed divisors for synthetic sample creation:")
for target in sorted(divisors, reverse=True):
    print(f"Target coverage: {target}x, Computed divisor: {divisors[target]:.2f}")