In [10]:
# Import necessary libraries
import subprocess

# Adapter/quality trimming of the paired-end reads with cutadapt.
# The cutadapt console output (its trimming summary) is redirected to report_path.

# Define paths
fastq_data_forward = "/home/abozar/pathogenereads/dashtnaz/GH_1.fq.gz"
fastq_data_reverse = "/home/abozar/pathogenereads/dashtnaz/GH_2.fq.gz"
outputs_dir = "/home/abozar/pathogenereads/dashtnaz"
cutadapt_path = "/home/abozar/anaconda3/bin/cutadapt"
report_path = "/home/abozar/pathogenereads/Reports/trimreport.txt"

# Replace with your actual adapter sequences if available.
# NOTE(review): these look like the standard Illumina TruSeq adapters - confirm
# against the library prep kit that was actually used.
adapter_forward = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA"
adapter_reverse = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT"

# Command to run cutadapt (argument list, so no shell quoting is involved)
command = [
    cutadapt_path,
    "-a", adapter_forward,                    # adapter for the forward reads
    "-A", adapter_reverse,                    # adapter for the reverse reads
    "-o", f"{outputs_dir}/trimmed_R1.fq.gz",  # trimmed forward reads
    "-p", f"{outputs_dir}/trimmed_R2.fq.gz",  # trimmed reverse reads
    "-q", "20",                               # quality-trimming cutoff
    "--minimum-length", "50",                 # length filter applied after trimming
    "--cores", "2",
    fastq_data_forward,
    fastq_data_reverse
]

# Run cutadapt; stdout and stderr both go into the text report
with open(report_path, 'w') as report_file:
    result = subprocess.run(command, stdout=report_file, stderr=subprocess.STDOUT)

# Do not claim success when cutadapt failed: downstream cells would otherwise
# silently work on missing or stale trimmed files.
if result.returncode != 0:
    raise RuntimeError(
        f"cutadapt exited with status {result.returncode}; see {report_path} for details."
    )

print("Trimming completed. Text report saved as trimreport.txt.")


Trimming completed. Text report saved as trimreport.txt.


In [None]:
# Paired end: interleave the trimmed forward/reverse reads into one FASTQ file
# (forward record, then its reverse mate, repeated for every pair).

import gzip
import os
from itertools import zip_longest

# Set the paths and filenames
data_folder = "/home/abozar/pathogenereads/dashtnaz"
output_folder = "/home/abozar/pathogenereads/dashtnaz"
forward_filename = "trimmed_R1.fq.gz"
reverse_filename = "trimmed_R2.fq.gz"
output_filename = "paired_reads.fastq"

forward_path = f"{data_folder}/{forward_filename}"
reverse_path = f"{data_folder}/{reverse_filename}"
output_path = f"{output_folder}/{output_filename}"
# Write to a temporary name first so a failed run never leaves a truncated
# paired_reads.fastq behind for the alignment step to pick up.
partial_path = output_path + ".partial"


def read_fastq_records(handle):
    """Yield successive FASTQ records from an open text handle.

    Each record is a list of 4 consecutive lines. Reading is streamed, so the
    whole file is never held in memory.

    Raises:
        ValueError: if the file ends in the middle of a record.
    """
    while True:
        record = [handle.readline() for _ in range(4)]
        if not record[0]:
            return  # clean end of file
        if not all(record):
            raise ValueError("Error: FASTQ file ends with an incomplete record.")
        yield record


pair_count = 0
try:
    with gzip.open(forward_path, "rt") as forward_file, \
            gzip.open(reverse_path, "rt") as reverse_file, \
            open(partial_path, "w") as output_file:
        paired_records = zip_longest(
            read_fastq_records(forward_file),
            read_fastq_records(reverse_file),
        )
        for forward_record, reverse_record in paired_records:
            # zip_longest pads the shorter stream with None, which means the
            # two files do not contain the same number of records.
            if forward_record is None or reverse_record is None:
                raise ValueError(
                    "Error: The number of lines in the forward and reverse files do not match."
                )
            output_file.writelines(forward_record)
            output_file.writelines(reverse_record)
            pair_count += 1
    # Only publish the result once every pair was written successfully.
    os.replace(partial_path, output_path)
finally:
    # After a successful os.replace the partial file no longer exists;
    # after a failure this removes the incomplete output.
    if os.path.exists(partial_path):
        os.remove(partial_path)

print(f"Wrote {pair_count} read pairs to {output_path}")

In [12]:
import subprocess
import os

# Align the interleaved reads to the reference genome with minimap2, extract the
# unmapped reads as FASTA with samtools, and write a flagstat summary report.

# Define paths
fastq_data = "/home/abozar/pathogenereads/dashtnaz/paired_reads.fastq"
reference_genome = "/home/abozar/pathogenereads/GCF_000317415.1_Csi_valencia_1.0_genomic.fna"
minimap2_path = "/usr/bin/minimap2"
samtools_path = "/usr/bin/samtools"
outputs_dir = "/home/abozar/pathogenereads/outputs"
report_folder = "/home/abozar/pathogenereads/Reports"

# Make sure the destination folders exist before any tool tries to write there
os.makedirs(outputs_dir, exist_ok=True)
os.makedirs(report_folder, exist_ok=True)

# Define output filenames
sam_file = os.path.join(outputs_dir, "aligned.sam")
unmapped_fasta = os.path.join(outputs_dir, "unmapped.fasta")
report_file = os.path.join(report_folder, "analysis_report.txt")

# Build minimap2 command (short-read preset, SAM output written to sam_file)
minimap2_command = [
    minimap2_path,
    "-ax", "sr",
    reference_genome,
    fastq_data,
    "-o", sam_file,
]

# Run minimap2 alignment.
# Failures are raised as exceptions instead of calling exit(): inside a notebook
# exit() acts on the kernel itself rather than reporting an error for this cell.
try:
    subprocess.run(minimap2_command, check=True)
except subprocess.CalledProcessError as e:
    raise RuntimeError(f"Minimap2 alignment failed: {e}") from e

# Validate the alignment output before handing it to samtools
if os.path.getsize(sam_file) == 0:
    raise RuntimeError("Error: Aligned SAM file is empty.")

# Extract unmapped reads (SAM flag 4) using samtools and convert to FASTA format
samtools_command = [samtools_path, "fasta", "-f", "4", sam_file]
try:
    with open(unmapped_fasta, "w") as unmapped_file:
        subprocess.run(samtools_command, stdout=unmapped_file, check=True)
except subprocess.CalledProcessError as e:
    raise RuntimeError(f"Samtools failed: {e}") from e

if os.path.getsize(unmapped_fasta) == 0:
    raise RuntimeError("Error: Unmapped FASTA file is empty.")

# Generate statistics using samtools
stats_command = [samtools_path, "flagstat", sam_file]
try:
    process = subprocess.run(stats_command, capture_output=True, text=True, check=True)
except subprocess.CalledProcessError as e:
    # e.stderr holds the captured samtools diagnostics because capture_output=True
    raise RuntimeError(f"Samtools flagstat failed: {e}\n{e.stderr}") from e
stats_output = process.stdout

# Write statistics to the report
with open(report_file, "w") as report:
    report.write("Alignment Statistics Report\n")
    report.write("===========================\n\n")
    report.write(stats_output)

# Print confirmation message
print("Analysis report saved to:", report_file)



[M::mm_idx_gen::3.466*1.49] collected minimizers
[M::mm_idx_gen::4.085*1.72] sorted minimizers
[M::main::4.086*1.72] loaded/built the index for 4844 target sequence(s)
[M::mm_mapopt_update::4.086*1.72] mid_occ = 1000
[M::mm_idx_stat] kmer size: 21; skip: 11; is_hpc: 0; #seq: 4844
[M::mm_idx_stat::4.351*1.67] distinct minimizers: 36311109 (86.23% are singletons); average occurrences: 1.386; average spacing: 6.514; total length: 327829540
[M::worker_pipeline::11.694*2.53] mapped 338626 sequences
[M::worker_pipeline::16.505*2.69] mapped 338602 sequences
[M::worker_pipeline::23.488*2.79] mapped 338590 sequences
[M::worker_pipeline::30.509*2.86] mapped 338552 sequences
[M::worker_pipeline::37.311*2.88] mapped 338352 sequences
[M::worker_pipeline::43.482*2.91] mapped 338488 sequences
[M::worker_pipeline::48.529*2.93] mapped 338542 sequences
[M::worker_pipeline::55.634*2.94] mapped 338532 sequences
[M::worker_pipeline::61.944*2.96] mapped 338520 sequences
[M::worker_pipeline::66.779*2.97] map

Analysis report saved to: /home/abozar/pathogenereads/Reports/analysis_report.txt
