## Trimming paired-end FASTQ files using Trimmomatic
- Uses Trimmomatic to perform adapter removal and quality trimming.

## Import required libraries

In [None]:
import os
import subprocess

## Define paths

In [None]:
# Locations used by the trimming step

# Trimmomatic resources: the Nextera paired-end adapter FASTA and the executable jar
adapter_file = "/home/ubuntu/miniconda3/envs/daniel_env_py/share/trimmomatic-0.39-2/adapters/NexteraPE-PE.fa"
trimmomatic_jar = "/home/ubuntu/miniconda3/envs/daniel_env_py/share/trimmomatic-0.39-2/trimmomatic.jar"

# Data directories: trimmed FASTQ output and raw FASTQ input
trimmed_data_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/trimmed_data/"
raw_data_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/raw_data"

# Create the output directory up front; no error is raised if it already exists
os.makedirs(trimmed_data_dir, exist_ok=True)

## Process each sample

In [None]:
# Run Trimmomatic (paired-end mode) on every sample folder found in raw_data_dir.
#
# For each sub-directory <sample> of raw_data_dir, the inputs are expected at
#   <raw_data_dir>/<sample>/<last "_"-separated token of sample>_1.fastq and _2.fastq
# and the paired, trimmed reads are written to
#   <trimmed_data_dir>/<sample>/<sample>_1.trimmed.fastq and _2.trimmed.fastq
# Unpaired reads are discarded. Samples that cannot be processed are collected
# and reported at the end instead of being silently counted as successful.

# Get list of sample directories. Sorted so that the processing order (and the
# log output) is reproducible; os.listdir() returns entries in arbitrary order.
sample_dirs = sorted(
    d for d in os.listdir(raw_data_dir)
    if os.path.isdir(os.path.join(raw_data_dir, d))
)

# Names of samples that were skipped or whose trimming failed
failed_samples = []

# Process each sample
for sample_dir in sample_dirs:
    sample_name = sample_dir  # Use folder name as sample name

    # Input files are named after the last "_"-separated token of the folder name
    fastq_prefix = sample_name.split("_")[-1]
    fastq1 = os.path.join(raw_data_dir, sample_dir, f"{fastq_prefix}_1.fastq")
    fastq2 = os.path.join(raw_data_dir, sample_dir, f"{fastq_prefix}_2.fastq")

    # Check the inputs before creating any output folder or launching Java,
    # so a missing file gives a clear message and leaves no empty folder behind.
    missing_inputs = [path for path in (fastq1, fastq2) if not os.path.isfile(path)]
    if missing_inputs:
        print(f"Skipping {sample_name}: missing input file(s): {', '.join(missing_inputs)}")
        failed_samples.append(sample_name)
        continue

    # Define cell type folder
    cell_type_folder = os.path.join(trimmed_data_dir, sample_name)
    os.makedirs(cell_type_folder, exist_ok=True)

    # Define output trimmed FASTQ file paths inside cell type folder
    trimmed_fastq1 = os.path.join(cell_type_folder, f"{sample_name}_1.trimmed.fastq")
    trimmed_fastq2 = os.path.join(cell_type_folder, f"{sample_name}_2.trimmed.fastq")

    # Trimmomatic command (argument list, no shell involved)
    trimmomatic_command = [
        "java", "-jar", trimmomatic_jar, "PE", "-threads", "8",
        fastq1, fastq2,  # Input paired-end FASTQ files
        trimmed_fastq1, "/dev/null",  # Forward reads (trimmed) + discard unpaired
        trimmed_fastq2, "/dev/null",  # Reverse reads (trimmed) + discard unpaired
        "ILLUMINACLIP:" + adapter_file + ":2:30:10",  # Adapter trimming
        "LEADING:3",  # Remove leading low-quality bases
        "TRAILING:3",  # Remove trailing low-quality bases
        "SLIDINGWINDOW:4:15",  # Sliding window quality filter
        "MINLEN:36",  # Minimum read length
    ]

    # Run trimming
    print(f"Processing {sample_name} with Trimmomatic...")
    try:
        subprocess.run(trimmomatic_command, check=True)
    except subprocess.CalledProcessError as e:
        # Trimmomatic started but exited with a non-zero status
        print(f"Error processing {sample_name}: {e}")
        failed_samples.append(sample_name)
    except OSError as e:
        # The command could not be launched at all (e.g. "java" is not on PATH)
        print(f"Error processing {sample_name}: could not launch Trimmomatic: {e}")
        failed_samples.append(sample_name)
    else:
        print(f"Trimming completed for {sample_name}. Trimmed files saved in: {cell_type_folder}")

# Final summary: only claim success when every sample was actually trimmed
if failed_samples:
    print(
        f"\n### {len(failed_samples)} of {len(sample_dirs)} sample(s) were NOT trimmed: "
        f"{', '.join(failed_samples)} ###"
    )
else:
    print("\n### All samples have been trimmed and stored in their respective cell type folders! ###")
