## Alignment to hg38 with Bowtie2 on the adapter-trimmed data

## Import required libraries

In [None]:
import subprocess
import os

## Define paths

In [None]:
# Input and output locations for the alignment step.
# NOTE: despite its name, raw_data_dir points at the adapter-trimmed FASTQ
# files (one sub-directory per sample), not at the raw reads.
raw_data_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/trimmed_data/"
output_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/aligned_trimmed_data/"
# Path prefix of the Bowtie2 index; it is handed to `bowtie2 -x` later on
bowtie2_index_base = "/mnt/DATA1/resources/reference_genomes/hg38"

# Ensure output directory exists (no error if it is already there)
os.makedirs(output_dir, exist_ok=True)

# Every sub-directory of raw_data_dir is treated as one sample.
# os.listdir returns entries in arbitrary order, so sort them to make the
# processing order (and therefore the printed log) reproducible between runs.
sample_dirs = sorted(
    d for d in os.listdir(raw_data_dir)
    if os.path.isdir(os.path.join(raw_data_dir, d))
)

# Filled in later with one dict of input/output paths per usable sample
samples = []


## Process each sample

In [None]:
# Build one record per sample folder that contains both trimmed mate files.
# The folder name doubles as the sample name and as the file-name prefix.
for sample_folder in sample_dirs:
    folder_path = os.path.join(raw_data_dir, sample_folder)
    mate1 = os.path.join(folder_path, f"{sample_folder}_1.trimmed.fastq")
    mate2 = os.path.join(folder_path, f"{sample_folder}_2.trimmed.fastq")

    # Guard clause: a sample is only usable when both mates are present
    if not os.path.exists(mate1) or not os.path.exists(mate2):
        print(f"Skipping {sample_folder} (missing FASTQ files)")
        continue

    # Output BAM paths are derived from the sample name and live in output_dir
    samples.append(dict(
        sample_name=sample_folder,
        fastq1=mate1,
        fastq2=mate2,
        sorted_bam=os.path.join(output_dir, f"{sample_folder}_aligned_sorted.bam"),
        dedup_bam=os.path.join(output_dir, f"{sample_folder}_aligned_sorted_dedup.bam"),
    ))

# Thread count handed to bowtie2 and to the multi-threaded samtools steps
n_threads = "16"

# Names of the samples whose pipeline did not run to completion
failed_samples = []

# Process each sample: align -> fixmate -> sort -> remove duplicates -> index.
# A failure in one sample is reported and the loop moves on to the next one.
for sample in samples:
    sample_name = sample['sample_name']
    fastq1 = sample['fastq1']
    fastq2 = sample['fastq2']
    sorted_bam = sample['sorted_bam']
    dedup_bam = sample['dedup_bam']
    # Receives the stderr of bowtie2 and fixmate (including bowtie2's
    # alignment summary) so that it is kept instead of being discarded.
    pipeline_log = os.path.join(output_dir, f"{sample_name}_bowtie2.log")

    print(f"\n### Processing {sample_name} with Bowtie2 ###")

    # Bowtie2 command (paired-end, SAM written to stdout)
    bowtie2_command = [
        "bowtie2",
        "-p", n_threads,
        "--very-sensitive",
        "-x", bowtie2_index_base,
        "-1", fastq1,
        "-2", fastq2,
    ]

    # Samtools fixmate command (stdin -> stdout).
    # `-m` adds the mate tags that `samtools markdup` requires; it has to run
    # before the coordinate sort, while mates are still next to each other.
    samtools_fixmate_command = ["samtools", "fixmate", "-m", "-", "-"]

    # Samtools sort command (reads stdin because no input file is given)
    samtools_sort_command = [
        "samtools", "sort",
        "-@", n_threads,
        "-o", sorted_bam,
    ]

    try:
        # bowtie2 | samtools fixmate | samtools sort
        # Only the last stage keeps a stderr PIPE, and it is drained by
        # communicate(); the earlier stages write stderr to the log file so an
        # unread pipe can never fill up and stall the pipeline.
        with open(pipeline_log, "wb") as log_handle, \
             subprocess.Popen(bowtie2_command, stdout=subprocess.PIPE,
                              stderr=log_handle) as bowtie2_process, \
             subprocess.Popen(samtools_fixmate_command, stdin=bowtie2_process.stdout,
                              stdout=subprocess.PIPE, stderr=log_handle) as fixmate_process, \
             subprocess.Popen(samtools_sort_command, stdin=fixmate_process.stdout,
                              stderr=subprocess.PIPE) as samtools_process:

            # Drop the parent's copies of the pipe ends so that an upstream
            # tool receives SIGPIPE if its downstream consumer exits early.
            bowtie2_process.stdout.close()
            fixmate_process.stdout.close()

            _, samtools_err = samtools_process.communicate()
            sort_rc = samtools_process.returncode
            fixmate_rc = fixmate_process.wait()
            bowtie2_rc = bowtie2_process.wait()

        # Every stage must succeed; a sorted BAM built from a truncated
        # alignment stream would otherwise be silently accepted.
        if bowtie2_rc != 0 or fixmate_rc != 0 or sort_rc != 0:
            print(
                f"Error: alignment pipeline failed for {sample_name} "
                f"(exit codes: bowtie2={bowtie2_rc}, fixmate={fixmate_rc}, sort={sort_rc}); "
                f"see {pipeline_log}\n{samtools_err.decode('utf-8', errors='replace')}"
            )
            failed_samples.append(sample_name)
            continue

        print(f"Alignment and sorting completed for {sample_name}.")

        # Remove duplicates using Samtools markdup (-r drops them from the output)
        print(f"Removing duplicates for {sample_name}...")
        samtools_markdup_command = [
            "samtools", "markdup",
            "-r", "-@", n_threads, sorted_bam, dedup_bam,
        ]
        subprocess.run(samtools_markdup_command, check=True)
        print(f"Duplicate removal completed for {sample_name}.")

        # Run Samtools index on the de-duplicated BAM
        samtools_index_command = ["samtools", "index", dedup_bam]
        subprocess.run(samtools_index_command, check=True)

        print(f"Indexing completed for {sample_name}.")

    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: markdup/index returned non-zero.
        # OSError: a tool is not installed or a path could not be opened.
        print(f"An error occurred while processing {sample_name}: {e}")
        failed_samples.append(sample_name)

print("\n### All samples have been processed! ###")
if failed_samples:
    print(f"Samples that did NOT complete successfully: {', '.join(failed_samples)}")