# Pre-processing for cfDNA fragment center counts
- Aggregate counts into 2000 bp intervals
- z-score normalization
- Smoothing with Whittaker smoothing followed by Gaussian filtering

## Import required libraries

In [None]:
import os
import pandas as pd
import numpy as np
from whittaker_eilers import WhittakerSmoother
import scipy.ndimage
from scipy.stats import zscore

## Smoothing function (Whittaker and Gaussian smoothing)

In [None]:
def smooth_fragment_centers(fragment_center_array, lmbda=1000, sigma=30):
    """Smooth a 1-D signal with a Whittaker smoother, then a Gaussian filter.

    Args:
        fragment_center_array: Sequence of values to smooth; its length is
            passed to the Whittaker smoother as ``data_length``.
        lmbda: Value forwarded to ``WhittakerSmoother`` as ``lmbda``.
        sigma: Standard deviation forwarded to ``gaussian_filter1d``.

    Returns:
        The array returned by ``scipy.ndimage.gaussian_filter1d`` applied to
        the Whittaker-smoothed signal.
    """
    n_points = len(fragment_center_array)

    # First pass: second-order Whittaker smoother sized to this signal.
    smoother = WhittakerSmoother(lmbda=lmbda, order=2, data_length=n_points)
    whittaker_pass = np.array(smoother.smooth(fragment_center_array))

    # Second pass: Gaussian filter over the Whittaker output.
    gaussian_pass = scipy.ndimage.gaussian_filter1d(whittaker_pass, sigma)
    return gaussian_pass


## Function to load, normalize, smooth and save cfDNA fragment center counts

In [None]:
def process_sample(sample_path, interval_size=2000):
    """Load, z-score normalize, smooth and pickle one sample's fragment counts.

    Reads ``<sample_path>/mapped_counts/<sample>_all_counts.bed`` (tab-separated,
    no header, columns ``chrom``/``start``/``end``/``count``), groups consecutive
    rows into windows of ``interval_size`` rows, z-scores every window, smooths
    it with ``smooth_fragment_centers`` and writes the resulting DataFrame (one
    row per window, indexed by ``"chrom:start-end"``) to
    ``<sample_path>/<sample>_smoothed.pkl``.

    Args:
        sample_path: Directory of a single sample; its base name is used as the
            sample name in the input and output file names.
        interval_size: Number of consecutive BED rows that form one window.

    Returns:
        None. A missing input file is reported on stdout and skipped.

    Raises:
        ValueError: If ``interval_size`` is not positive, or if the number of
            rows in the BED file is not a multiple of ``interval_size``.
    """
    if interval_size <= 0:
        raise ValueError(f"interval_size must be positive, got {interval_size}")

    # normpath drops a trailing separator, which would otherwise make
    # basename() return an empty sample name.
    sample_name = os.path.basename(os.path.normpath(sample_path))
    matrix_file = os.path.join(sample_path, 'mapped_counts', f"{sample_name}_all_counts.bed")
    if not os.path.exists(matrix_file):
        print(f"Missing file: {matrix_file}, skipping...")
        return

    # Load and format cfDNA raw fragment counts
    df_matrix = pd.read_csv(matrix_file, sep="\t", header=None, names=['chrom', 'start', 'end', 'count'])
    counts = df_matrix['count'].to_numpy()

    # Fail early with a message naming the sample instead of letting reshape()
    # raise a generic NumPy error.
    if counts.size % interval_size != 0:
        raise ValueError(
            f"Mismatch in reshaped data and BED regions for {sample_path}: "
            f"{counts.size} rows in {matrix_file} is not a multiple of "
            f"interval_size={interval_size}"
        )
    windows = counts.reshape(-1, interval_size)

    # One label per window, taken from the first row of each window.
    # NOTE(review): end = start + interval_size assumes each BED row spans 1 bp
    # and that the rows of a window are contiguous -- confirm against the
    # upstream file format.
    window_heads = df_matrix.iloc[::interval_size]
    window_ends = window_heads['start'] + interval_size
    region_labels = [
        f"{chrom}:{start}-{end}"
        for chrom, start, end in zip(window_heads['chrom'], window_heads['start'], window_ends)
    ]

    # Z-score every window along its own axis in one call; constant windows
    # produce NaN (zero standard deviation) and are set to 0.
    z_scores = np.asarray(zscore(windows, axis=1), dtype=float)
    z_scores = np.where(np.isnan(z_scores), 0.0, z_scores)

    # Apply smoothing window by window
    smoothed_rows = [smooth_fragment_centers(row) for row in z_scores]
    df_smoothed = pd.DataFrame(
        smoothed_rows, index=region_labels, columns=pd.RangeIndex(interval_size)
    )

    # Save as pickle
    output_file = os.path.join(sample_path, f"{sample_name}_smoothed.pkl")
    df_smoothed.to_pickle(output_file)
    print(f"Processed and saved: {output_file}")


## Main function for processing

In [None]:
def main(base_dir="/mnt/DATA3/daniel/project/02_cfDNA_preprocessing/data/03_intersect_mapped/cfDNA_healthy_new/"):
    """Process every sample directory found directly under ``base_dir``.

    Each immediate sub-directory of ``base_dir`` is treated as one sample and
    handed to ``process_sample``; plain files are ignored. Entries are visited
    in sorted order so repeated runs process samples in the same sequence.

    Args:
        base_dir: Directory whose sub-directories are the samples to process.

    Raises:
        OSError: Propagated from ``os.listdir`` if ``base_dir`` cannot be read.
    """
    # Iterate over each sample directory
    for sample in sorted(os.listdir(base_dir)):
        sample_path = os.path.join(base_dir, sample)
        if not os.path.isdir(sample_path):
            continue
        print(f"Processing: {sample}")
        process_sample(sample_path)

    print("Processing complete for all samples.")


if __name__ == "__main__":
    main()
