# Average replicates in bigwig files from Corces et al.

## Import required libraries

In [None]:
import os
import subprocess
import glob
import re
import tempfile

## Define directories and load files

In [None]:
# Paths
# Output directory for the merged tracks; created on first run if missing.
merged_files_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/corces/LUSC_merged_samples/"
os.makedirs(merged_files_dir, exist_ok=True)

# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# every later step that depends on list order (duplicate resolution when
# building dictionaries, processing order) reproducible between runs.
bw_files = sorted(glob.glob("/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/corces/LUSC_bigWig_data/LUSC_*.bw"))

In [None]:
# Filter files
# Split the bigWig listing by the "_T1_" / "_T2_" tag found anywhere in the
# path. A path carrying both tags ends up in both lists; a path with neither
# tag is ignored.
t1_files, t2_files = [], []
for bw_path in bw_files:
    if "_T1_" in bw_path:
        t1_files.append(bw_path)
    if "_T2_" in bw_path:
        t2_files.append(bw_path)

In [None]:
def get_sample_prefix(filename):
    """Return the part of a file's base name that precedes ``_L<digits>_``.

    Only the base name is inspected; directory components are ignored.

    Args:
        filename: Path (or bare name) of the file.

    Returns:
        The base name truncated at the first ``_L<digits>_`` token, or the
        whole base name when no such token occurs.
    """
    base_name = os.path.basename(filename)
    marker = re.search(r"_L\d+_", base_name)
    if marker is None:
        return base_name
    return base_name[: marker.start()]

In [None]:
# Create file dictionaries
# Map sample prefix -> bigWig path, separately for the T1 and the T2 files.
# When several files share a prefix, the one that comes last in the list wins.
t1_dict = dict(zip(map(get_sample_prefix, t1_files), t1_files))
t2_dict = dict(zip(map(get_sample_prefix, t2_files), t2_files))

In [None]:
# Match T1 and T2 files
# Keep only the prefixes that have both a T1 and a T2 file; each value is the
# tuple (T1 path, T2 path). Prefixes present on one side only are dropped.
pairs = {
    shared: (t1_dict[shared], t2_dict[shared])
    for shared in t1_dict
    if shared in t2_dict
}

## Average replicates
- For each interval, check whether both replicates have a score; if so, write the average of the two scores.

In [None]:
# Process each pair
def _average_replicate_pair(t1_file, t2_file, output_bg):
    """Write the per-interval mean of two bigWig tracks as a bedGraph file.

    Both bigWigs are converted to bedGraph with ``bigWigToBedGraph``, combined
    with ``bedtools unionbedg`` and reduced with ``awk``. Intervals that have
    no score in one of the two inputs are dropped.

    Args:
        t1_file: Path to the first bigWig file.
        t2_file: Path to the second bigWig file.
        output_bg: Path of the bedGraph to write (overwritten if it exists).

    Raises:
        subprocess.CalledProcessError: If any external tool exits non-zero.
        FileNotFoundError: If an external tool is not on ``PATH``.
    """
    # All intermediates live in one scratch directory, which is removed when
    # the block exits -- including when one of the tools fails.
    with tempfile.TemporaryDirectory() as workdir:
        t1_bg = os.path.join(workdir, "t1.bedGraph")
        t2_bg = os.path.join(workdir, "t2.bedGraph")
        union_bg = os.path.join(workdir, "union.bedGraph")

        # Convert BigWig to BedGraph
        for bigwig, bedgraph in ((t1_file, t1_bg), (t2_file, t2_bg)):
            subprocess.run(["bigWigToBedGraph", bigwig, bedgraph], check=True)

        # Combine BedGraphs using unionbedg.
        # unionbedg writes "0" for intervals an input does not cover unless a
        # filler is given; "-filler ." marks them so the awk filter below can
        # actually tell "missing" apart from a real score.
        with open(union_bg, "w") as union_out:
            subprocess.run(
                ["bedtools", "unionbedg", "-i", t1_bg, t2_bg, "-filler", "."],
                stdout=union_out,
                check=True,
            )

        # Filter for regions present in both replicates and calculate the mean
        # score. awk reads the union file directly, so the data is streamed
        # instead of being loaded into Python's memory first.
        with open(output_bg, "w") as out:
            subprocess.run(
                [
                    "awk",
                    '{OFS="\t"; if ($4 != "." && $5 != ".") {mean=($4+$5)/2; print $1,$2,$3,mean}}',
                    union_bg,
                ],
                stdout=out,
                check=True,
            )


for prefix, (t1_file, t2_file) in pairs.items():
    output_bg = os.path.join(merged_files_dir, f"{prefix}_merged.bedGraph")
    print(f"Processing sample: {prefix}")
    _average_replicate_pair(t1_file, t2_file, output_bg)

# Report the directory that was actually written to.
print(f"Merging completed. Check {merged_files_dir} for output files.")