## ATAC-seq preprocessing and quality control (post-alignment)
- Removes reads overlapping blacklisted regions and reads mapped to chrM
- Filters out low-quality reads (MAPQ < 5) and reads that are not properly paired

## Import required libraries

In [None]:
import os
import subprocess

## Define paths

In [None]:
# Input/output locations for this preprocessing run
aligned_bam_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/aligned_trimmed_data/"
output_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/preprocessed_QC__trimmed_data/"
blacklist_bed = "/mnt/DATA1/resources/reference_genomes/blacklist/ENCFF356LFX-hg38.bed"

# Create the output directory if it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# Collect the aligned, sorted BAM files to process, skipping every file whose
# name starts with "hepatocytes". os.listdir() returns entries in arbitrary
# order, so the selection is sorted to give a reproducible processing order
# (and reproducible logs) from one run to the next.
bam_files = sorted(
    f for f in os.listdir(aligned_bam_dir)
    if f.endswith("_aligned_sorted.bam") and not f.startswith("hepatocytes")
)

## Process each sample

In [None]:
# Per-sample post-alignment QC and filtering.
#
# All external tools are started from argument lists (no shell), so paths that
# contain spaces or shell metacharacters are passed through unchanged, and
# every stage of a pipe chain has its exit status checked.


def _run_to_file(cmd, out_path):
    """Run *cmd* without a shell and write its standard output to *out_path*.

    Args:
        cmd: Program and arguments as a list of strings.
        out_path: File that receives the command's stdout (overwritten).

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    with open(out_path, "wb") as out_handle:
        subprocess.run(cmd, stdout=out_handle, check=True)


def _run_pipeline(commands):
    """Run *commands* as a pipe chain and fail if ANY stage fails.

    A plain ``sh -c "a | b | c"`` only reports the exit status of ``c``, so a
    crash in ``a`` or ``b`` would go unnoticed and leave truncated output.
    Here each stage is a separate process and all exit codes are inspected.

    Args:
        commands: List of argument lists; the stdout of each command is
            connected to the stdin of the next one. The last command inherits
            this process's stdout.

    Raises:
        subprocess.CalledProcessError: For the first stage (in pipeline
            order) that exited with a non-zero status.
    """
    procs = []
    upstream = None
    try:
        for position, cmd in enumerate(commands):
            is_last = position == len(commands) - 1
            proc = subprocess.Popen(
                cmd,
                stdin=upstream,
                stdout=None if is_last else subprocess.PIPE,
            )
            procs.append(proc)
            if upstream is not None:
                # Drop the parent's copy of the read end so the producer gets
                # SIGPIPE (instead of blocking) if its consumer exits early.
                upstream.close()
            upstream = proc.stdout
    except Exception:
        # A stage could not be started: stop the ones already running so they
        # do not block forever on a pipe nobody reads.
        if upstream is not None:
            upstream.close()
        for proc in procs:
            proc.kill()
            proc.wait()
        raise

    exit_codes = [proc.wait() for proc in procs]
    for cmd, code in zip(commands, exit_codes):
        if code != 0:
            raise subprocess.CalledProcessError(code, cmd)


if not bam_files:
    print(f"No BAM files to process in {aligned_bam_dir}")

for bam_file in bam_files:
    sample_name = bam_file.replace("_aligned_sorted.bam", "")
    input_bam = os.path.join(aligned_bam_dir, bam_file)

    print(f"\n### Processing {sample_name} ###")

    # Define output file paths
    idxstats_file = os.path.join(output_dir, f"{sample_name}.idxstats")
    stats_file = os.path.join(output_dir, f"{sample_name}.stats")
    blacklisted_bam = os.path.join(output_dir, f"{sample_name}.blacklist_filt.bam")
    no_chrM_bam = os.path.join(output_dir, f"{sample_name}.blacklist_nochrM_filt.bam")
    final_bam = os.path.join(output_dir, f"{sample_name}.final_preprocessed.bam")

    # Per-reference read counts of the unfiltered alignment
    _run_to_file(["samtools", "idxstats", input_bam], idxstats_file)
    print(f"Generated idxstats: {idxstats_file}")

    # Mapping statistics of the unfiltered alignment
    _run_to_file(["samtools", "stats", input_bam], stats_file)
    print(f"Generated mapping stats: {stats_file}")

    # Remove reads in blacklisted regions (-v keeps only records of -a that
    # have no overlap with -b).
    # NOTE(review): bedtools evaluates each alignment record on its own, so one
    # mate of a pair may be dropped while the other is kept - confirm that this
    # is acceptable for downstream paired-end analysis.
    _run_to_file(
        ["bedtools", "intersect", "-v", "-a", input_bam, "-b", blacklist_bed],
        blacklisted_bam,
    )
    print(f"Filtered blacklisted regions: {blacklisted_bam}")

    # Remove chrM reads:
    #   1. decode the BAM to SAM including the header (-h),
    #   2. drop the chrM @SQ header line,
    #   3. keep header lines plus records whose reference (column 3) and mate
    #      reference (column 7) are both not chrM,
    #   4. re-encode the remaining SAM stream as BAM.
    _run_pipeline([
        ["samtools", "view", "-h", blacklisted_bam],
        ["grep", "-v", "^@SQ.*SN:chrM"],
        ["awk", '$3 != "chrM" && $7 != "chrM" || $1 ~ /^@/'],
        ["samtools", "view", "-b", "-o", no_chrM_bam, "-"],
    ])
    print(f"Removed chrM reads: {no_chrM_bam}")

    # Remove low-quality reads: keep only properly paired reads (-f 0x2) with
    # mapping quality of at least 5 (-q 5)
    subprocess.run(
        ["samtools", "view", "-b", "-f", "0x2", "-q", "5", "-o", final_bam, no_chrM_bam],
        check=True,
    )
    print(f"Removed low-quality reads: {final_bam}")

    # Index the final BAM file
    subprocess.run(["samtools", "index", final_bam], check=True)
    print(f"Indexed: {final_bam}.bai")

    # Generate final QC report
    _run_to_file(["samtools", "flagstat", final_bam], f"{final_bam}.flagstat")
    print(f"Generated flagstat report: {final_bam}.flagstat")

print("\n### QC Processing Completed for All Samples! ###")
