# Post-processing for cfDNA fragment center counts
- Trimming to 100 bp
- Row-wise averaging within the trimmed data
- Re-assignment to the marker regions

## Import required libraries

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

## Load smoothed cfDNA data from directory

In [None]:
# Load smoothed cfDNA data
def load_smoothed_data(base_directory):
    """Load every per-sample smoothed pickle found under ``base_directory``.

    For each entry ``<sample_id>`` in ``base_directory`` the file
    ``<base_directory>/<sample_id>/<sample_id>_smoothed.pkl`` is read with
    ``pandas.read_pickle`` when it exists; entries without such a file are
    silently skipped.

    Entries are visited in sorted order. ``os.listdir`` returns names in
    arbitrary order, which would otherwise make the order of the returned
    dict (and of any columns derived from it) vary between runs/filesystems.

    Args:
        base_directory: Directory containing one sub-directory per sample.

    Returns:
        dict mapping sample id to the object loaded from its pickle file.
    """
    df_dict = {}
    # NOTE: unpickling can execute arbitrary code; only point this at
    # directories whose contents are trusted.
    for sample_id in sorted(os.listdir(base_directory)):
        sample_path = os.path.join(base_directory, sample_id, f"{sample_id}_smoothed.pkl")
        if os.path.exists(sample_path):
            df_dict[sample_id] = pd.read_pickle(sample_path)
            print(f"Loaded: {sample_id}, Shape: {df_dict[sample_id].shape}")
    return df_dict


## Trim each cfDNA sample to the central 100 bp 

In [None]:
# Trim cfDNA to 100 bp
def trim_cfDNA(df_dict, start=950, end=1050):
    """Keep only the columns at positions ``start`` (inclusive) to ``end`` (exclusive).

    Each sample's DataFrame is sliced positionally along the column axis,
    copied, and its columns are relabelled ``0 .. n-1``. The input frames are
    not modified.

    Args:
        df_dict: dict mapping sample id to a DataFrame.
        start: first column position to keep.
        end: column position at which to stop (not included).

    Returns:
        dict mapping sample id to the trimmed, relabelled copy.
    """
    window = slice(start, end)
    trimmed_by_sample = {}
    for sample_id in df_dict:
        sub = df_dict[sample_id].iloc[:, window].copy()
        # Relabel so downstream code sees positions relative to the window.
        sub.columns = range(len(sub.columns))
        trimmed_by_sample[sample_id] = sub
    return trimmed_by_sample


## Compute row-wise mean fragment center signal for each sample

In [None]:
# Calculate row-wise mean for trimmed cfDNA data
def calculate_row_means(df_trimmed_dict):
    """Average each row across its columns, separately for every sample.

    Args:
        df_trimmed_dict: dict mapping sample id to a DataFrame.

    Returns:
        dict mapping sample id to the Series produced by ``df.mean(axis=1)``
        (one mean value per row of that sample's DataFrame).
    """
    row_means_dict = {}
    for sample_id, frame in df_trimmed_dict.items():
        row_means_dict[sample_id] = frame.mean(axis=1)
    return row_means_dict


## Load marker regions from BED file

In [None]:
# Load marker regions
def load_marker_regions(marker_file_path):
    """Read marker regions (chrom, start, end) from a headerless tab-separated file.

    Only the first three columns are read. Without ``usecols``, a file with
    extra columns (e.g. BED name/score/strand fields) would make pandas treat
    the leading column(s) as an implicit index, shifting the data so that
    ``chrom``/``start``/``end`` are taken from the wrong fields.

    Args:
        marker_file_path: Path to the tab-separated marker file (no header row).

    Returns:
        DataFrame with columns ``chrom`` (whitespace-stripped str),
        ``start`` (int) and ``end`` (int), one row per line of the file.
    """
    df_markers = pd.read_csv(
        marker_file_path,
        sep="\t",
        header=None,
        usecols=[0, 1, 2],
        names=["chrom", "start", "end"],
    )
    df_markers["chrom"] = df_markers["chrom"].astype(str).str.strip()
    df_markers["start"] = df_markers["start"].astype(int)
    df_markers["end"] = df_markers["end"].astype(int)
    print(f"Loaded {len(df_markers)} marker regions.")
    return df_markers


## Assign mean cfDNA signals to corresponding marker regions

In [None]:
# Assign cfDNA data to original marker regions
def assign_original_regions(df_markers, row_means_dict):
    """Attach each sample's row means as a new column next to the marker regions.

    Values are assigned positionally (via ``.values``), so the index of each
    Series is ignored: the i-th value is placed on the i-th marker row.
    ``df_markers`` itself is left untouched; a copy is extended and returned.

    Args:
        df_markers: DataFrame of marker regions.
        row_means_dict: dict mapping sample id to a Series of per-row means.

    Returns:
        Copy of ``df_markers`` with one additional column per sample id.
    """
    combined = df_markers.copy()
    for sample_id in row_means_dict:
        # Positional assignment: bypass index alignment on purpose.
        combined[sample_id] = row_means_dict[sample_id].values
    return combined


## Invert the signal direction, then min-max scale each sample column to [0, 1]

In [None]:
# Normalize and scale data
def transform_data(df_combined):
    """Negate all sample columns and min-max scale them into [0, 1].

    Every column other than ``chrom``, ``start`` and ``end`` is treated as a
    sample column. Those columns are first multiplied by -1 and then passed
    through ``MinMaxScaler(feature_range=(0, 1))``, which rescales each
    column independently.

    The frame is modified in place and also returned.

    Args:
        df_combined: DataFrame with marker metadata columns plus one column
            per sample.

    Returns:
        The same DataFrame object with its sample columns transformed.
    """
    reserved = ("chrom", "start", "end")
    sample_cols = []
    for name in df_combined.columns:
        if name not in reserved:
            sample_cols.append(name)

    # Flip the sign first so that the scaling below runs on inverted values.
    flipped = df_combined[sample_cols].mul(-1)
    df_combined[sample_cols] = flipped

    df_combined[sample_cols] = MinMaxScaler(feature_range=(0, 1)).fit_transform(
        df_combined[sample_cols]
    )
    return df_combined


## Add marker start/end coordinates to the final DataFrame

In [None]:
# Assign original marker regions directly
def assign_marker_regions(df_combined, marker_file_path):
    """Add ``marker_start`` / ``marker_end`` columns read from the marker file.

    The marker file is read as headerless tab-separated text and its start/end
    values are attached to ``df_combined`` positionally (row i of the file to
    row i of the frame). Only the first three columns of the file are read:
    without ``usecols``, a file carrying extra columns would make pandas use
    the leading column(s) as an implicit index and shift the coordinates into
    the wrong fields.

    ``df_combined`` is modified in place and also returned.

    Args:
        df_combined: DataFrame with one row per marker region.
        marker_file_path: Path to the tab-separated marker file (no header row).

    Returns:
        ``df_combined`` with ``marker_start`` and ``marker_end`` columns added.

    Raises:
        ValueError: If the marker file and ``df_combined`` differ in row count.
    """
    df_markers = pd.read_csv(
        marker_file_path,
        sep="\t",
        header=None,
        usecols=[0, 1, 2],
        names=["chrom", "marker_start", "marker_end"],
    )

    if df_combined.shape[0] != df_markers.shape[0]:
        raise ValueError(f"Mismatch: df_combined has {df_combined.shape[0]} rows, but df_markers has {df_markers.shape[0]} rows.")

    # Positional assignment (.values) so the two frames' indexes need not match.
    df_combined["marker_start"] = df_markers["marker_start"].values
    df_combined["marker_end"] = df_markers["marker_end"].values

    return df_combined


## Save the final transformed DataFrame to a BED file

In [None]:
# Save final transformed data
def save_combined_data(df_combined, output_bed_file):
    """Write the DataFrame to ``output_bed_file`` as tab-separated text.

    A header row with the column names is written; the index is not.

    Args:
        df_combined: DataFrame to write.
        output_bed_file: Destination path.
    """
    # header=True is the to_csv default, so the column names are still emitted.
    df_combined.to_csv(path_or_buf=output_bed_file, sep="\t", index=False)
    print(f"Saved transformed df to: {output_bed_file}")


## Main function for post-processing

In [None]:
def main():
    """Run the whole post-processing pipeline with the hard-coded project paths.

    Order of operations: load the smoothed samples, trim them, compute row
    means, attach those means to the marker regions, transform the sample
    columns, add the marker coordinates, and write the result into the base
    directory.
    """
    base_directory = "/mnt/DATA3/daniel/project/02_cfDNA_preprocessing/data/03_intersect_mapped/cfDNA_healthy_new/"
    marker_file_path = "/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/new_pairwise_cell_types_markers.bed"
    output_bed_file = os.path.join(base_directory, "cfDNA_healthy_samples_new.bed")

    # Per-sample signal: load -> trim -> one mean value per row.
    row_means_dict = calculate_row_means(trim_cfDNA(load_smoothed_data(base_directory)))

    # Put the per-sample means next to the marker regions they belong to.
    df_combined = assign_original_regions(load_marker_regions(marker_file_path), row_means_dict)

    # Transform the sample columns, then add the marker coordinates.
    df_final = assign_marker_regions(transform_data(df_combined), marker_file_path)

    save_combined_data(df_final, output_bed_file)

    print("Processing complete.")


In [None]:
# Run the pipeline only when this file is executed as the main program,
# not when it is imported.
if __name__ == "__main__":
    main()