## Adapter detection in ATAC-Seq raw data

## Import required libraries

In [None]:
import os
import subprocess
import multiprocessing
import json

## Run fastp on the ATAC-Seq raw data

In [None]:
# Where the raw reads live and where the fastp reports are written
raw_data_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/raw_data"
fastp_output_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/fastp_reports"

# Create the report directory up front (no-op when it already exists)
os.makedirs(fastp_output_dir, exist_ok=True)

# Walk the raw-data tree and collect the full path of every FASTQ file
fastq_files = []
for dirpath, _subdirs, names in os.walk(raw_data_dir):
    fastq_files.extend(
        os.path.join(dirpath, name)
        for name in names
        if name.endswith((".fastq", ".fq"))
    )

print(f"Found {len(fastq_files)} FASTQ files.")

# Function to run fastp on a single file
def run_fastp(fastq_file):
    """Run fastp on one FASTQ file and write its JSON and HTML reports.

    The reports are written into ``fastp_output_dir`` as
    ``<base_name>_fastp.json`` and ``<base_name>_fastp.html``, where
    ``<base_name>`` is the file name without its FASTQ extension. The
    processed reads are sent to ``/dev/null``; only the reports are kept.
    A non-zero exit status from fastp is printed rather than raised.

    Args:
        fastq_file: Path to the FASTQ file to analyse.
    """
    # Strip only a *trailing* FASTQ extension. A blanket str.replace() would
    # also delete ".fastq"/".fq" occurring in the middle of the name, which
    # can make two different inputs share one report name. Compressed
    # suffixes are listed first so "x.fastq.gz" maps to "x".
    base_name = os.path.basename(fastq_file)
    for suffix in (".fastq.gz", ".fq.gz", ".fastq", ".fq"):
        if base_name.endswith(suffix):
            base_name = base_name[: -len(suffix)]
            break

    json_output = os.path.join(fastp_output_dir, f"{base_name}_fastp.json")
    html_output = os.path.join(fastp_output_dir, f"{base_name}_fastp.html")

    print(f"\n### Processing {fastq_file} with fastp ###")

    # NOTE(review): only -i is passed, so each file is handled on its own;
    # --detect_adapter_for_pe targets paired-end input — confirm the flag is
    # still wanted here.
    command = [
        "fastp",
        "-i", fastq_file,
        "-o", "/dev/null",
        "--detect_adapter_for_pe",
        "--json", json_output,
        "--html", html_output,
    ]

    try:
        # check=True turns a non-zero fastp exit status into CalledProcessError
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error processing {fastq_file}: {e}")
    else:
        print(f"Completed: {fastq_file} → Results saved in {json_output} and {html_output}")

# Use multiprocessing to run fastp in parallel (at most 4 workers)
num_threads = min(4, len(fastq_files))

if num_threads > 0:
    with multiprocessing.Pool(num_threads) as pool:
        pool.map(run_fastp, fastq_files)

    print("\n### FastP Adapter Detection Completed for All Files! ###")
else:
    # multiprocessing.Pool() rejects a worker count of 0 with ValueError, so
    # skip the pool entirely when no FASTQ files were found.
    print("\nNo FASTQ files to process; skipping fastp.")


In [None]:
# Directory containing the fastp reports
fastp_reports_dir = "/mnt/DATA3/daniel/project/01_ATAC_preprocessing/data/fastp_reports"

# Keep only the JSON reports from the directory listing
json_files = [entry for entry in os.listdir(fastp_reports_dir) if entry.endswith(".json")]

# Load each report and print its "adapter_cutting" section when present
for report_name in json_files:
    with open(os.path.join(fastp_reports_dir, report_name), "r") as handle:
        report = json.load(handle)

    # Reports without the section are announced and skipped
    if "adapter_cutting" not in report:
        print(f"\nNo adapters found in {report_name}")
        continue

    print(f"\n### Adapters found in {report_name} ###")
    print(json.dumps(report["adapter_cutting"], indent=4))


# Overall, the adapter sequences detected in the FASTQ files match the Nextera transposase adapters (CTGTCTCTTATACACATCT...).