In [64]:
import subprocess
import os
import time
import pandas as pd
from Bio import SeqIO
from Bio import pairwise2


from helpers.consensus_utils import select_best_alignment
from helpers.fastq import concatenate_fastq

In [71]:
def run_command(command: str, windows: bool = False) -> "tuple[subprocess.CompletedProcess, float]":
    """Run a shell command, echo its captured output, and time it.

    Args:
        command: Command line handed to the system shell (``shell=True``),
            so shell features such as ``>`` redirection are honoured.
        windows: When true, the command is prefixed with ``wsl `` before it
            is executed.

    Returns:
        A ``(result, elapsed_time)`` tuple: the ``CompletedProcess`` holding
        the captured text ``stdout``/``stderr`` and the duration of the call
        in seconds.

    Note:
        A non-zero exit status is reported on stdout but does not raise;
        callers that need to stop on failure must check ``result.returncode``.
    """
    if windows:
        command = "wsl " + command
    print(f"Running command: {command}")

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments happening while the command runs.
    start_time = time.perf_counter()
    result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    elapsed_time = time.perf_counter() - start_time

    print(f"Command '{command}' took {elapsed_time:.2f} seconds to run.")
    if result.stdout:
        print(result.stdout)

    # Many tools write progress logs to stderr, so stderr content alone is not
    # an error: only the exit status decides whether the run failed.
    failed = result.returncode != 0
    if failed:
        print(f"Error: command exited with status {result.returncode}.")
    if result.stderr:
        print("Error:" if failed else "Stderr:", result.stderr)
    return result, elapsed_time

In [51]:
def split_fastq(input_path, output_dir, base_name, percentile=20):
    """Split a FASTQ file into its longest reads and all the others.

    The records are ordered from longest to shortest; the first
    ``len(records) * percentile // 100`` of them go to the "top" file and the
    rest to the "remaining" file.

    Args:
        input_path: Path of the FASTQ file to read.
        output_dir: Directory in which both output files are written.
        base_name: Prefix used for the two output file names.
        percentile: Percentage of records (by count) placed in the top file.

    Returns:
        A ``(top_sequences_path, remaining_sequences_path)`` tuple.
    """
    # Longest reads first.
    records = sorted(SeqIO.parse(input_path, "fastq"), key=len, reverse=True)

    # Floor division: the top group is rounded down to a whole number of reads.
    cutoff = len(records) * percentile // 100

    top_sequences_path = os.path.join(output_dir, f"{base_name}_top{percentile}.fastq")
    remaining_sequences_path = os.path.join(output_dir, f"{base_name}_remaining{100-percentile}.fastq")

    for subset, destination in (
        (records[:cutoff], top_sequences_path),
        (records[cutoff:], remaining_sequences_path),
    ):
        SeqIO.write(subset, destination, "fastq")

    return top_sequences_path, remaining_sequences_path

In [87]:
def pipeline_consensus(input_name: str, preprocess: bool = False, windows: bool = False):
    """Build a consensus sequence from the reads of ``assets/input/<input_name>``.

    Steps:
        1. Split the reads into the longest 20% and the remaining 80%.
        2. Overlap the top 20% against themselves with minimap2
           (``-x ava-ont`` preset) and build a draft consensus with racon.
        3. Map the remaining 80% onto that draft with minimap2
           (``-x map-ont`` preset) and run racon again to produce the final
           consensus.

    All outputs are written under ``assets/output/<input_name>``; the final
    consensus is ``<input_name>_final_consensus.fasta``.

    Args:
        input_name: Base name of the input. ``assets/input/<input_name>.fastq``
            is read (or created first when ``preprocess`` is true).
        preprocess: When true, ``assets/input/<input_name>`` must be a
            directory; its content is passed to ``concatenate_fastq`` to
            produce ``assets/input/<input_name>.fastq``.
        windows: When true, backslashes in the commands are replaced by
            forward slashes and the flag is forwarded to ``run_command``.

    Raises:
        NotADirectoryError: ``preprocess`` is true and the input directory
            does not exist.
        FileNotFoundError: The ``.fastq`` input file does not exist.
    """
    input_fastq_path = os.path.join("assets", "input", input_name)

    # In case the fastq are not yet concatenated, we do so here before proceeding with the rest of the code
    if preprocess:
        if not os.path.isdir(input_fastq_path):
            raise NotADirectoryError(f"Expected a directory of FASTQ files at '{input_fastq_path}'.")

        concatenate_fastq(input_fastq_path, input_fastq_path + ".fastq")

    input_fastq_path = input_fastq_path + ".fastq"
    if not os.path.isfile(input_fastq_path):
        raise FileNotFoundError(f"Input FASTQ file not found: '{input_fastq_path}'.")

    output_dir = os.path.join("assets", "output", input_name)
    os.makedirs(output_dir, exist_ok=True)

    # Split the file into top 20% and remaining 80%
    top_sequences_path, remaining_sequences_path = split_fastq(input_fastq_path, output_dir, input_name, percentile=20)

    # Time is accumulated per tool so the final report covers every
    # invocation of each tool, not only the most recent one.
    tool_times = {"minimap2": 0.0, "racon": 0.0}

    def run_timed(tool: str, command: str, message: str) -> None:
        """Announce, run and time one external command for ``tool``."""
        if windows:
            command = command.replace('\\', '/')
        print(message)
        _, elapsed = run_command(command, windows)
        tool_times[tool] += elapsed

    # Step 1: Align and generate consensus from top 20% sequences
    top_paf_path = os.path.join(output_dir, f"{input_name}_top20_reads.paf")
    top_consensus_path = os.path.join(output_dir, f"{input_name}_top20_consensus.fasta")
    run_timed(
        "minimap2",
        f"minimap2 -x ava-ont {top_sequences_path} {top_sequences_path} > {top_paf_path}",
        "Running read alignment with minimap2 on top 20% sequences...",
    )
    run_timed(
        "racon",
        f"racon -m 8 -x -6 -g -8 -w 500 {top_sequences_path} {top_paf_path} {top_sequences_path} > {top_consensus_path}",
        "Generating consensus sequence with racon on top 20% sequences...",
    )

    # Step 1 bis: If racon wrote more than one sequence, keep only the one
    # returned by select_best_alignment, called with the first sequence as
    # its reference argument, and overwrite the draft consensus file with it.
    consensus_sequences = list(SeqIO.parse(top_consensus_path, "fasta"))
    if len(consensus_sequences) > 1:
        print(f"Multiple sequences found in {top_consensus_path}. Selecting the best alignment...")
        best_sequence = select_best_alignment(consensus_sequences, consensus_sequences[0])
        SeqIO.write(best_sequence, top_consensus_path, "fasta")

    # Step 2: Align the remaining 80% sequences to the top 20% consensus
    remaining_paf_path = os.path.join(output_dir, f"{input_name}_remaining80_reads.paf")
    final_consensus_path = os.path.join(output_dir, f"{input_name}_final_consensus.fasta")
    run_timed(
        "minimap2",
        f"minimap2 -x map-ont {top_consensus_path} {remaining_sequences_path} > {remaining_paf_path}",
        "Running read alignment with minimap2 on remaining 80% sequences...",
    )

    # Step 3: Generate the final consensus sequence with racon
    run_timed(
        "racon",
        f"racon -m 8 -x -6 -g -8 -w 500 {remaining_sequences_path} {remaining_paf_path} {top_consensus_path} > {final_consensus_path}",
        "Generating final consensus sequence with racon...",
    )

    minimap2_total = tool_times["minimap2"]
    racon_total = tool_times["racon"]
    total_time_taken = minimap2_total + racon_total

    # Print out the total time for each step
    print(f"Minimap2 alignment took {minimap2_total:.2f} seconds.")
    print(f"Total Racon iterations took {racon_total:.2f} seconds.")

    # Print out the total time for the pipeline
    print(f"Total time taken for the pipeline: {total_time_taken:.2f} seconds.")


In [90]:
def pipeline_identification(input_name: str, database: str = None, windows: bool = False):
    """Run blastn on the final consensus of ``input_name``.

    The query is ``assets/output/<input_name>/<input_name>_final_consensus.fasta``
    and one report per database is written to
    ``assets/output/blastn/<input_name>/<database>.txt``.

    Args:
        input_name: Base name used by the consensus step for its outputs.
        database: Name passed to blastn's ``-db`` option. When ``None``, the
            four default databases (ITS, matK, psbA-trnH, rbcL) are queried
            one after the other.
        windows: When true, backslashes in the command are replaced by
            forward slashes and the flag is forwarded to ``run_command``,
            consistently with the consensus pipeline.
    """
    input_fasta_path = os.path.join("assets", "output", input_name, input_name + "_final_consensus.fasta")
    output_blastn = os.path.join("assets", "output", "blastn", input_name)
    os.makedirs(output_blastn, exist_ok=True)

    # A tuple (not a set) keeps the order of the runs, and of their logs,
    # identical from one execution to the next.
    if database is None:
        databases = ("ITS_seqs", "matK_seqs", "psbA-trnH_seqs", "rbcL_seqs")
    else:
        databases = (database,)

    for db in databases:
        output_blastn_path = os.path.join(output_blastn, db + ".txt")
        blastn_cmd = f"blastn -query {input_fasta_path} -db {db} -out {output_blastn_path} -max_target_seqs 5"
        if windows:
            blastn_cmd = blastn_cmd.replace("\\", "/")
        # Forward the flag: without it run_command never applies its Windows handling to blastn.
        run_command(blastn_cmd, windows)

    print("You can find identification ouput at ", output_blastn)


In [82]:
def main(input_name: str, preprocess: bool = False, database: str = None, windows: bool = False):
    """Run the whole workflow: consensus building, then blastn identification.

    Args:
        input_name: Base name of the input, forwarded to both pipelines.
        preprocess: Forwarded to ``pipeline_consensus``.
        database: Forwarded to ``pipeline_identification``.
        windows: Forwarded to both pipelines.
    """
    # Stage 1: consensus sequence.
    print("Debuting consensus sequencing : ")
    pipeline_consensus(input_name, preprocess=preprocess, windows=windows)
    print("Consensus sequencing finished")

    # Stage 2: identification of the consensus produced by stage 1.
    print("Debuting Identification with blastn : ")
    pipeline_identification(input_name, database=database, windows=windows)
    print("Identification finished")

In [91]:
# Run the full workflow on the already-concatenated tomato rbcL sample,
# against the rbcL database only, with the Windows flag enabled.
main(
    "rbcL_Qiagen_tomato_5000",
    preprocess=False,
    database="rbcL_seqs",
    windows=True,
)

Debuting consensus sequencing : 
Running read alignment with minimap2 on top 20% sequences...
Running command: wsl minimap2 -x ava-ont assets/output/rbcL_Qiagen_tomato_5000/rbcL_Qiagen_tomato_5000_top20.fastq assets/output/rbcL_Qiagen_tomato_5000/rbcL_Qiagen_tomato_5000_top20.fastq > assets/output/rbcL_Qiagen_tomato_5000/rbcL_Qiagen_tomato_5000_top20_reads.paf
Command 'wsl minimap2 -x ava-ont assets/output/rbcL_Qiagen_tomato_5000/rbcL_Qiagen_tomato_5000_top20.fastq assets/output/rbcL_Qiagen_tomato_5000/rbcL_Qiagen_tomato_5000_top20.fastq > assets/output/rbcL_Qiagen_tomato_5000/rbcL_Qiagen_tomato_5000_top20_reads.paf' took 2.38 seconds to run.
Error: [M::mm_idx_gen::0.056*0.37] collected minimizers
[M::mm_idx_gen::0.066*0.66] sorted minimizers
[M::main::0.066*0.66] loaded/built the index for 1000 target sequence(s)
[M::mm_mapopt_update::0.068*0.67] mid_occ = 608
[M::mm_idx_stat] kmer size: 15; skip: 5; is_hpc: 0; #seq: 1000
[M::mm_idx_stat::0.069*0.68] distinct minimizers: 61942 (78.57%