# Visualization of cfDNA fragment center profiles across universal open/closed and cell-type specific regions
- Loads smoothed cfDNA fragment count matrices (z-score normalized) for selected samples.
- Loads cell-type-specific marker regions, universal open regions, and universal closed regions.
- Extracts matching regions from cfDNA data for universal open and closed regions.
- Merges original genomic regions with smoothed cfDNA profiles for all samples.
- Filters merged data to keep only cell-type marker regions.
- Computes average fragment profiles per cell type and sample.
- Combines and aggregates profiles across samples for each cell type.
- Computes mean ± standard deviation for universal open, closed, and cell-type-specific regions.
- Generates subplots visualizing cfDNA fragment patterns across cell types and open/closed states.

## Import required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.ndimage
import re
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import os

## Import pickled smoothed, z-score normalized cfDNA counts

In [None]:
# Root directory that holds one sub-folder per cfDNA sample
base_directory = "/mnt/DATA3/daniel/project/02_cfDNA_preprocessing/data/03_intersect_mapped/cfDNA_healthy_original/"

# Samples included in this analysis
sample_ids = ["EE87922", "EE87925", "EE87927", "EE87932", "EE87933"]

# Read <base_directory>/<sample_id>/<sample_id>_smoothed_sub.pkl for every sample.
# A sample whose file is missing is reported and left out of df_dict.
df_dict = {}
for sample_id in sample_ids:
    file_path = os.path.join(base_directory, sample_id, f"{sample_id}_smoothed_sub.pkl")

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    df_dict[sample_id] = pd.read_pickle(file_path)
    print(f"Loaded: {sample_id}, Shape: {df_dict[sample_id].shape}")


# Import cell type marker file

In [None]:
# Define the path to the cell-type marker CSV file
cell_type_markers = '/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/subset_trimmed_data_markers/all_cell_types_markers.csv'

# Load the CSV file into a DataFrame
# NOTE(review): later cells read the columns chrom, start, end and cell_type
# from this table - confirm the CSV header provides them
df_markers = pd.read_csv(cell_type_markers)
# Print the first rows as a quick check that the file parsed as expected
print(df_markers.head())

# Import universal closed and open regions

In [None]:
# Samples whose universal-closed profiles are loaded
sample_ids = ["EE87922", "EE87925", "EE87927", "EE87932", "EE87933"]

# sample ID -> smoothed universal-closed DataFrame
df_universal_closed_dict = {}

# Expected file: <base_directory>/<sample_id>/<sample_id>_universal_closed_smoothed.pkl
for sample_id in sample_ids:
    file_path = os.path.join(base_directory, sample_id, f"{sample_id}_universal_closed_smoothed.pkl")

    # Report and skip samples that have no universal-closed file
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    df_universal_closed_dict[sample_id] = pd.read_pickle(file_path)
    print(f"Loaded: {sample_id}, Shape: {df_universal_closed_dict[sample_id].shape}")


In [None]:
# Display the first rows of the universal-closed DataFrame for one sample
# (raises KeyError if the file for sample EE87922 was not found above)
print(df_universal_closed_dict["EE87922"].head())

In [None]:
# Path to the BED file listing the universal open regions
universal_open_file = "/mnt/DATA3/daniel/project/02_cfDNA_preprocessing/data/02_split_chromosomes/universal_open_regions/trimmed_universal_accessible_regions.bed"

# Read it as a tab-separated table without a header row; the columns are
# named chrom / start / end
df_universal_open = pd.read_csv(
    universal_open_file,
    sep="\t",
    header=None,
    names=["chrom", "start", "end"],
)

# Preview the first rows as a sanity check
print(df_universal_open.head())


# Import the original region file

In [None]:
# Path to the BED file with the original (sorted) reference regions
original_regions_file = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/original_reference_regions_sorted.bed"

# Tab-separated, no header row; the columns are named chrom / start / end
df_original = pd.read_csv(
    original_regions_file,
    sep="\t",
    header=None,
    names=["chrom", "start", "end"],
)

# Preview the first rows as a sanity check
print(df_original.head())

# Extract universal open regions from cfDNA smoothed files

In [None]:
# Dictionary to store, per sample, the rows of the smoothed cfDNA matrix whose
# coordinates match a universal open region (sample ID -> DataFrame)
df_open_extracted_dict = {}

# Loop through each sample and extract the universal open regions
for sample_id in sample_ids:
    if sample_id in df_dict:
        # Reuse the matrix already loaded into df_dict instead of unpickling
        # the same file a second time
        df_smoothed = df_dict[sample_id]
    else:
        # Fall back to disk for samples that are not in df_dict
        file_path = os.path.join(base_directory, sample_id, f"{sample_id}_smoothed_sub.pkl")
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            continue
        df_smoothed = pd.read_pickle(file_path)

    # Row labels are expected to look like "chrom:start-end". Parse them straight
    # from the index so the result does not depend on the index being unnamed
    # (reset_index() only names the inserted column "index" for an unnamed index).
    region_labels = pd.Series(df_smoothed.index.astype(str))
    chrom_coords = region_labels.str.rsplit(':', n=1, expand=True)
    start_end = chrom_coords[1].str.split('-', n=1, expand=True)

    # reset_index(drop=True) returns a new frame, so the matrix stored in
    # df_dict is left untouched when the coordinate columns are appended
    df_smoothed = df_smoothed.reset_index(drop=True)
    df_smoothed['chrom'] = chrom_coords[0]
    df_smoothed['start'] = start_end[0].astype(int)
    df_smoothed['end'] = start_end[1].astype(int)

    # Keep only rows whose coordinates exactly match a universal open region
    df_open_extracted = df_smoothed.merge(df_universal_open, on=['chrom', 'start', 'end'], how='inner')

    # Store in dictionary
    df_open_extracted_dict[sample_id] = df_open_extracted
    print(f"Extracted {df_open_extracted.shape[0]} universal open regions for {sample_id}")


# Assign original regions to the matrices

In [None]:
# Dictionary to store merged DataFrames for each sample
# (sample ID -> original chrom/start/end columns followed by the smoothed profile columns)
df_merged_dict = {}

# Loop through each sample and merge with original regions
for sample_id in sample_ids:
    if sample_id not in df_dict:
        # Sample was not loaded earlier; nothing to merge
        continue

    df_smoothed_sub = df_dict[sample_id]

    # The two tables are paired purely by row position (the matrix index is
    # discarded below), so a differing row count means coordinates would be
    # attached to the wrong profiles. Make that visible instead of silent.
    if len(df_smoothed_sub) != len(df_original):
        print(
            f"Warning: {sample_id} has {len(df_smoothed_sub)} rows but the original "
            f"region file has {len(df_original)}; rows may be misaligned"
        )

    # Place the original coordinates and the profile values side by side.
    # pd.concat builds a new frame, so df_original itself is not modified.
    df_merged = pd.concat([df_original, df_smoothed_sub.reset_index(drop=True)], axis=1)

    # Store merged DataFrame in dictionary
    df_merged_dict[sample_id] = df_merged

    print(f"Merged {sample_id}: {df_merged.shape}")

# Display one of the merged DataFrames
print(df_merged_dict["EE87922"].head())


# Extract the cell type marker regions from the smoothed cfDNA samples

In [None]:
# Columns that identify a region; the coordinate columns are cast to int on
# both sides so the merge keys have the same type
coord_cols = ['start', 'end']
region_keys = ['chrom'] + coord_cols

df_markers[coord_cols] = df_markers[coord_cols].astype(int)

# sample ID -> rows of the merged table that coincide with a marker region
df_filtered_markers_dict = {}

for sample_id, df_merged in df_merged_dict.items():
    # Same integer cast for the per-sample table (updates the frame held in df_merged_dict)
    df_merged[coord_cols] = df_merged[coord_cols].astype(int)

    # Inner join: keep only regions present in both tables; the marker
    # table's extra columns are carried along
    df_filtered_markers = pd.merge(df_merged, df_markers, how='inner', on=region_keys)

    df_filtered_markers_dict[sample_id] = df_filtered_markers
    print(f"Filtered markers for {sample_id}: {df_filtered_markers.shape}")

# Preview the filtered table of one sample
print(df_filtered_markers_dict["EE87922"].head())


# Compute the mean profiles and plot them

In [None]:
# sample ID -> table of mean profiles with one column per cell type
df_mean_per_sample_dict = {}

for sample_id, df_filtered_markers in df_filtered_markers_dict.items():
    # Average every numeric column within each cell type, then transpose so
    # that cell types become the columns.
    # NOTE(review): numeric_only=True presumably also averages the start/end
    # coordinate columns; such non-bin rows are removed later when the row
    # labels are coerced to numbers.
    df_mean_per_cell = df_filtered_markers.groupby('cell_type').mean(numeric_only=True).T
    df_mean_per_sample_dict[sample_id] = df_mean_per_cell

# Stack the per-sample tables vertically; each row label then occurs once per sample
df_combined_means = pd.concat(df_mean_per_sample_dict.values(), axis=0)

# Group rows that share a label and summarise them across samples
rows_by_label = df_combined_means.groupby(df_combined_means.index)
df_mean_across_samples = rows_by_label.mean()
df_std_across_samples = rows_by_label.std()

In [None]:
# sample ID -> column-wise mean over that sample's universal-open rows
df_universal_open_mean_dict = {}

for sample_id, df_open_extracted in df_open_extracted_dict.items():
    # Restrict to numeric columns, then average down the rows
    mean_profile = df_open_extracted.select_dtypes(include=[np.number]).mean(axis=0)
    df_universal_open_mean_dict[sample_id] = mean_profile

# One column per sample; summarise across the sample columns
df_universal_open_combined = pd.DataFrame(df_universal_open_mean_dict)
universal_open_std = df_universal_open_combined.std(axis=1)
universal_open_mean = df_universal_open_combined.mean(axis=1)


In [None]:
# sample ID -> column-wise mean over that sample's universal-closed rows
df_universal_closed_mean_dict = {}

for sample_id, df_closed in df_universal_closed_dict.items():
    # Restrict to numeric columns, then average down the rows
    mean_profile = df_closed.select_dtypes(include=[np.number]).mean(axis=0)
    df_universal_closed_mean_dict[sample_id] = mean_profile

# One column per sample; summarise across the sample columns
df_universal_closed_combined = pd.DataFrame(df_universal_closed_mean_dict)
universal_closed_std = df_universal_closed_combined.std(axis=1)
universal_closed_mean = df_universal_closed_combined.mean(axis=1)

In [None]:
# Add the universal open / closed profiles as two extra columns of the mean
# and std tables (values are matched to rows by index label)
df_mean_across_samples["Universal_Open"] = universal_open_mean
df_std_across_samples["Universal_Open"] = universal_open_std
df_mean_across_samples["Universal_Closed"] = universal_closed_mean
df_std_across_samples["Universal_Closed"] = universal_closed_std

# Turn the row labels into numbers; labels that are not numeric become NaN
df_mean_across_samples.index = pd.to_numeric(df_mean_across_samples.index, errors='coerce')
df_std_across_samples.index = pd.to_numeric(df_std_across_samples.index, errors='coerce')

# Discard rows whose label became NaN, then order the remaining rows by label
df_mean_across_samples = df_mean_across_samples.loc[df_mean_across_samples.index.notna()].sort_index()
df_std_across_samples = df_std_across_samples.loc[df_std_across_samples.index.notna()].sort_index()

In [None]:
# Put the two universal profiles first and keep every other column in its
# existing order
universal_cols = ["Universal_Open", "Universal_Closed"]
remaining_cols = [col for col in df_mean_across_samples.columns if col not in universal_cols]
col_order = universal_cols + remaining_cols

# Apply the same column order to the mean and the std tables
df_reordered_mean = df_mean_across_samples.loc[:, col_order]
df_reordered_std = df_std_across_samples.loc[:, col_order]

In [None]:
# Draw one panel per column of df_reordered_mean: the mean profile as a line
# and mean +/- 1 standard deviation as a shaded band.
df_cols = df_reordered_mean.columns
n_plots = len(df_cols)
ncols = 3
# Derive the row count from the number of profiles (ceiling division) so that
# no column is silently left unplotted; keep at least one row so the grid
# passed to plt.subplots() is always valid.
nrows = max(1, (n_plots + ncols - 1) // ncols)

fig, axes = plt.subplots(nrows, ncols, figsize=(18, 5*nrows), sharex=False, sharey=True)
axes = axes.flatten()

# Define custom x-range: five ticks evenly spaced over the index range,
# relabelled as -1000 ... 1000
original_ticks = np.linspace(df_reordered_mean.index.min(),
                             df_reordered_mean.index.max(),
                             num=5)
new_tick_labels = np.linspace(-1000, 1000, num=5)

for ax, column_name in zip(axes, df_cols):
    ax.set_title(column_name, fontsize=16)

    mean_values = df_reordered_mean[column_name]
    std_values  = df_reordered_std[column_name]

    # Plot the mean line
    ax.plot(df_reordered_mean.index, mean_values, label=column_name, color="blue")

    # Shaded area: mean ± 1 std
    ax.fill_between(
        df_reordered_mean.index,
        mean_values - std_values,
        mean_values + std_values,
        alpha=0.2, color="blue", label="±1 SD"
    )

    # Set x-ticks
    ax.set_xticks(original_ticks)
    ax.set_xticklabels([f"{int(x)}" for x in new_tick_labels])
    # Increase the tick label font size on both axes to 16
    plt.setp(ax.get_xticklabels(), fontsize=16)
    plt.setp(ax.get_yticklabels(), fontsize=16)

    ax.grid(True, linestyle="--", alpha=0.5)

# Hide the grid cells left over after the last profile. Slicing by n_plots
# avoids relying on a loop variable that is undefined when there are no columns.
for unused_ax in axes[n_plots:]:
    unused_ax.set_visible(False)

# Add a figure-level title
fig.suptitle("Smoothed z-score normalized cfDNA fragment center count profiles", fontsize=20, fontweight="bold")

# Adjust layout, leaving room at the top for the figure title
plt.tight_layout(rect=[0, 0, 1, 0.98])

# Save the figure
plt.savefig("smoothed_zscore_cfDNA_profiles.png", dpi=600, bbox_inches="tight")

plt.show()
