In [4]:
import os
from Bio.Blast.Applications import NcbiblastnCommandline, NcbiblastxCommandline

# Registry of the local BLAST databases that can be searched.
# Each entry maps a short database key to:
#   path    - location (prefix) of the formatted BLAST database on disk
#   command - Biopython command-line wrapper class used to query it
# Entry order is preserved and determines the order in which a loop over
# this mapping visits the databases.
blast_databases = dict(
    nucleotide=dict(
        path="/home/abozar/pathogenereads/Blastdatabase/Viralnuclotide/viral2024",
        command=NcbiblastnCommandline,
    ),
    viroids=dict(
        path="/home/abozar/pathogenereads/Blastdatabase/Viroids/ref_viroids_rep_genomes",
        command=NcbiblastnCommandline,
    ),
    protein=dict(
        path="/home/abozar/pathogenereads/Blastdatabase/viralprotein/viral1protein",
        command=NcbiblastxCommandline,
    ),
)

def blast_sequence(sequence_file, database_key, output_file, evalue,
                   word_size=None, gapopen=None, gapextend=None):
    """Run one BLAST search of ``sequence_file`` against a registered database.

    The database path and the Biopython wrapper class are looked up in the
    module-level ``blast_databases`` mapping. Results are written to
    ``output_file`` in BLAST XML format (``outfmt=5``).

    Parameters
    ----------
    sequence_file : str
        Path to the FASTA file holding the query sequences.
    database_key : str
        Key into ``blast_databases``. The key ``"protein"`` selects blastx;
        every other key selects blastn.
    output_file : str
        Path of the XML file BLAST should write.
    evalue : float
        E-value threshold passed to BLAST.
    word_size : int, optional
        Word size. When ``None``, 3 is used for the protein database and the
        option is omitted for nucleotide databases so that BLAST applies its
        own default.
    gapopen, gapextend : int, optional
        Gap opening / extension costs. When ``None``, 11/1 is used for the
        protein database and 5/2 for nucleotide databases.

    Raises
    ------
    KeyError
        If ``database_key`` is not present in ``blast_databases``.

    Notes
    -----
    Explicitly supplied ``word_size``/``gapopen``/``gapextend`` values are
    honoured. They used to be silently replaced by the per-database defaults,
    which made a parameter sweep over them run identical searches under
    differently labelled output files.
    """
    # Retrieve the database path and command based on the database key
    database_entry = blast_databases[database_key]
    database_path = database_entry["path"]
    blast_command = database_entry["command"]

    # Per-database defaults, applied only where the caller gave no value
    is_protein = database_key == "protein"
    if is_protein:
        default_word_size, default_gapopen, default_gapextend = 3, 11, 1
    else:
        default_word_size, default_gapopen, default_gapextend = None, 5, 2

    if word_size is None:
        word_size = default_word_size
    if gapopen is None:
        gapopen = default_gapopen
    if gapextend is None:
        gapextend = default_gapextend

    # Assemble the command-line options
    options = {
        "cmd": "/usr/bin/blastx" if is_protein else "/usr/bin/blastn",
        "query": sequence_file,
        "db": database_path,
        "outfmt": 5,  # XML format
        "out": output_file,
        "evalue": evalue,
        "gapopen": gapopen,
        "gapextend": gapextend,
    }
    # Only pass a word size when there is one; an unset option must be left
    # off the command line rather than passed as None.
    if word_size is not None:
        options["word_size"] = word_size

    blast_cline = blast_command(**options)

    # Execute the blast command. The Biopython wrapper is documented to raise
    # ApplicationError on a non-zero exit status, so text on stderr here comes
    # from a run that completed and is reported as a warning, not an error.
    _stdout, stderr = blast_cline()
    if stderr:
        print(f"BLAST reported warnings: {stderr}")
    print(f"BLAST search with e-value {evalue}, word size {word_size}, gapopen {gapopen}, and gapextend {gapextend} has been completed. Results are saved in {output_file}")

# Set the paths
contig_list_nucleotide = "/home/abozar/pathogenereads/outputscontig_6051/final.contigs.fa"
output_directory = "/home/abozar/pathogenereads/blast2"
merged_output_file = "/home/abozar/pathogenereads/blast2/merged_results.xml"

# Create the output directory if it does not exist yet. exist_ok=True replaces
# the separate existence check, so there is no gap between checking and creating.
try:
    os.makedirs(output_directory, exist_ok=True)
except PermissionError as e:
    print(f"Permission denied: {e}")
    # Re-raise instead of calling exit(1): inside an IPython/Jupyter kernel
    # `exit` is IPython's own exit helper rather than the interpreter builtin
    # and does not reliably abort the running cell, so the code below would
    # still try to write into a directory that could not be created.
    raise

# Define the range of parameters to test
evalue_thresholds = [1e-100, 1e-50, 1e-10, 1]
word_sizes_nucleotide = [11, 15, 20]  # Word sizes for BLASTN
word_sizes_protein = [3]              # Word size for BLASTX

# Open the merged output file.
# NOTE(review): the merged file is a plain concatenation of the per-run XML
# documents, so it is not a single well-formed XML document. It is intended
# for Bio.Blast.NCBIXML.parse - confirm any other consumer accepts
# concatenated documents.
with open(merged_output_file, 'w') as merged_file:
    # Loop over each combination of parameters
    for evalue in evalue_thresholds:
        for db_key in blast_databases:
            # Pick the word sizes and gap penalties that apply to this database
            if db_key == "protein":
                word_sizes = word_sizes_protein
                gap_penalties = [(11, 1)]  # Default for BLASTX with BLOSUM62
            else:
                word_sizes = word_sizes_nucleotide
                gap_penalties = [(5, 2)]  # Default for BLASTN

            # Loop over each word size and gap penalty
            for word_size in word_sizes:
                for gapopen, gapextend in gap_penalties:
                    # Output file name encodes the database and the parameters
                    output_file_name = f"blast_{db_key}_evalue{evalue}_wordsize{word_size}_gapopen{gapopen}_gapextend{gapextend}.xml"
                    # Full path to the output file
                    output_file = os.path.join(output_directory, output_file_name)
                    # Perform BLAST with the current set of parameters and database
                    blast_sequence(contig_list_nucleotide, db_key, output_file, evalue, word_size, gapopen, gapextend)
                    # Append this run's results to the merged file, streaming
                    # line by line so a large result is never held in memory
                    with open(output_file, 'r') as file:
                        for line in file:
                            merged_file.write(line)

# Print a success message
print(f"All BLAST results have been merged into {merged_output_file}")


BLAST search with e-value 1e-100, word size 11, gapopen 5, and gapextend 2 has been completed. Results are saved in /home/abozar/pathogenereads/blast2/blast_nucleotide_evalue1e-100_wordsize11_gapopen5_gapextend2.xml
BLAST search with e-value 1e-100, word size 15, gapopen 5, and gapextend 2 has been completed. Results are saved in /home/abozar/pathogenereads/blast2/blast_nucleotide_evalue1e-100_wordsize15_gapopen5_gapextend2.xml
BLAST search with e-value 1e-100, word size 20, gapopen 5, and gapextend 2 has been completed. Results are saved in /home/abozar/pathogenereads/blast2/blast_nucleotide_evalue1e-100_wordsize20_gapopen5_gapextend2.xml
BLAST search with e-value 1e-100, word size 11, gapopen 5, and gapextend 2 has been completed. Results are saved in /home/abozar/pathogenereads/blast2/blast_viroids_evalue1e-100_wordsize11_gapopen5_gapextend2.xml
BLAST search with e-value 1e-100, word size 15, gapopen 5, and gapextend 2 has been completed. Results are saved in /home/abozar/pathogener

In [5]:
import pandas as pd
from Bio.Blast import NCBIXML

def _count_gap_openings(aligned_sequence):
    """Return the number of gap runs (stretches of consecutive '-') in an aligned sequence."""
    openings = 0
    previous = ""
    for symbol in aligned_sequence or "":
        if symbol == "-" and previous != "-":
            openings += 1
        previous = symbol
    return openings


def blast_xml_to_excel(xml_file, excel_file):
    """Convert BLAST XML output into an Excel sheet with one row per HSP.

    Parameters
    ----------
    xml_file : str
        Path to the BLAST XML file (``outfmt=5``) to read.
    excel_file : str
        Path of the Excel workbook to write.

    The columns follow the BLAST tabular layout:

    * ``Mismatches`` counts aligned positions that are neither identical nor
      gaps (``align_length - identities - gaps``); gap positions are no
      longer counted as mismatches.
    * ``Gap Opens`` counts gap openings, i.e. runs of ``-`` in the aligned
      query and subject strings, rather than the total number of gap
      characters.

    The header row is written even when the XML contains no hits.
    """
    columns = [
        'Query ID',
        'Query Def',
        'Subject ID',
        'Subject Def',
        'Percent Identity',
        'Alignment Length',
        'Mismatches',
        'Gap Opens',
        'Query Start',
        'Query End',
        'Subject Start',
        'Subject End',
        'E-value',
        'Bit Score',
    ]

    # Collect one dictionary per HSP (high-scoring pair)
    blast_results = []

    # Parse the BLAST XML output; the handle is only needed while iterating
    with open(xml_file) as result_handle:
        for blast_record in NCBIXML.parse(result_handle):
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    # NOTE(review): hsp.gaps may be a non-integer placeholder
                    # when the XML carries no gap count for this HSP; treat
                    # that as zero gaps - confirm against the Biopython version
                    # in use.
                    total_gaps = hsp.gaps if isinstance(hsp.gaps, int) else 0
                    gap_openings = (
                        _count_gap_openings(hsp.query)
                        + _count_gap_openings(hsp.sbjct)
                    )
                    blast_results.append({
                        'Query ID': blast_record.query_id,
                        'Query Def': blast_record.query,
                        'Subject ID': alignment.hit_id,
                        'Subject Def': alignment.hit_def,
                        'Percent Identity': hsp.identities / hsp.align_length * 100,
                        'Alignment Length': hsp.align_length,
                        'Mismatches': hsp.align_length - hsp.identities - total_gaps,
                        'Gap Opens': gap_openings,
                        'Query Start': hsp.query_start,
                        'Query End': hsp.query_end,
                        'Subject Start': hsp.sbjct_start,
                        'Subject End': hsp.sbjct_end,
                        'E-value': hsp.expect,
                        'Bit Score': hsp.bits,
                    })

    # Passing the column list fixes the column order and keeps the header row
    # present when there are no hits at all.
    df = pd.DataFrame(blast_results, columns=columns)

    # Write the DataFrame to an Excel file
    df.to_excel(excel_file, index=False)

# Input: the merged BLAST XML file; output: the Excel workbook to create
xml_file_path = "/home/abozar/pathogenereads/blast2/merged_results.xml"
excel_file_path = "/home/abozar/pathogenereads/blast2/merged_results.xlsx"

# Run the XML -> Excel conversion
blast_xml_to_excel(xml_file=xml_file_path, excel_file=excel_file_path)
