In [1]:
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import time

In [2]:
# email address required to access GenBank
# NOTE(review): this is a hard-coded personal address that ships with the notebook;
# consider reading it from an environment variable or prompting for it instead.
Entrez.email = "megzlives@gmail.com"

In [3]:
# function to retrieve genomic data
def retrieve_genomic_data(accession, max_attempts=3, retry_delay=2):
    """Download and parse the GenBank record(s) for one nucleotide accession.

    Args:
        accession: accession identifier passed to ``Entrez.efetch`` as ``id``.
        max_attempts: how many times to try the download before giving up.
        retry_delay: seconds to wait between two consecutive attempts.

    Returns:
        A list with every record parsed from the response (the response is read
        with ``SeqIO.parse`` so multi-record files are supported), or ``None``
        when all attempts raised an error.
    """
    print(f"Attempting to retrieve data for {accession}")
    for attempt in range(1, max_attempts + 1):
        try:
            handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gbwithparts", retmode="text")
            try:
                print(f"Downloading data for {accession}...")
                # Use SeqIO.parse() to read multi-sequence GenBank files
                records = list(SeqIO.parse(handle, "genbank"))
            finally:
                # Close the handle even when parsing fails, so a bad response
                # does not leak the underlying connection.
                handle.close()
            print(f"Successfully retrieved data for {accession}")
            return records  # This will be a list of records
        except Exception as e:
            # Deliberately broad: any failure counts as one failed attempt.
            print(f"An error occurred while fetching {accession}: {e}")
            if attempt < max_attempts:
                # Only announce and wait when another attempt will actually follow.
                print("Attempting to retry...")
                time.sleep(retry_delay)
    print(f"Failed to retrieve data for {accession} after multiple attempts.")
    return None

In [4]:
# D. radiodurans replicons, downloaded one after another
accession_numbers = [
    "NZ_CP038663.1",  # Chromosome I
    "NZ_CP038664.1",  # Chromosome II
    "NZ_CP038666.1",  # Plasmid pCP1
    "NZ_CP038665.1",  # Plasmid pMP1
]
# accession -> list of parsed records; only accessions that yielded records are stored
d_radiodurans_sequences = {}

for accession in accession_numbers:
    records = retrieve_genomic_data(accession)
    if not records:
        # None (all attempts failed) or an empty list: nothing to store
        print(f"Could not retrieve records for {accession}.")
        continue
    d_radiodurans_sequences[accession] = records
    print(f"Records for {accession} added to the database.")

Attempting to retrieve data for NZ_CP038663.1
Downloading data for NZ_CP038663.1...
Successfully retrieved data for NZ_CP038663.1
Records for NZ_CP038663.1 added to the database.
Attempting to retrieve data for NZ_CP038664.1
Downloading data for NZ_CP038664.1...
Successfully retrieved data for NZ_CP038664.1
Records for NZ_CP038664.1 added to the database.
Attempting to retrieve data for NZ_CP038666.1
Downloading data for NZ_CP038666.1...
Successfully retrieved data for NZ_CP038666.1
Records for NZ_CP038666.1 added to the database.
Attempting to retrieve data for NZ_CP038665.1
Downloading data for NZ_CP038665.1...
Successfully retrieved data for NZ_CP038665.1
Records for NZ_CP038665.1 added to the database.


In [5]:
# retrieve genomic data for Thermus thermophilus
# note: this is only chromosome data, does not include pTT8 or pTT27
# The result is whatever retrieve_genomic_data returns: a list of parsed
# records, or None if every download attempt failed.
t_thermophilus_seq = retrieve_genomic_data("NC_006461.1")

Attempting to retrieve data for NC_006461.1
Downloading data for NC_006461.1...
Successfully retrieved data for NC_006461.1


In [6]:
# function to perform BLAST analysis for a gene sequence from D. radiodurans against an organism
def perform_blast(sequence, database="nt", organism="Thermus thermophilus"):
    """Submit a blastn search restricted to a single organism.

    Args:
        sequence: query sequence handed to ``NCBIWWW.qblast``.
        database: name of the BLAST database to search.
        organism: organism name used to limit the hits.

    Returns:
        The result handle returned by ``NCBIWWW.qblast``, or ``None`` if the
        call raised (the error is printed, not re-raised).
    """
    # Entrez limits are written as a field-qualified term; the previous
    # `organism="..."` form is not a field-qualified Entrez term, so it did not
    # reliably restrict the search to the organism.
    organism_filter = f'"{organism}"[Organism]'
    try:
        result_handle = NCBIWWW.qblast("blastn", database, sequence, entrez_query=organism_filter)
        return result_handle
    except Exception as e:
        # Callers test the return value for None, so report and swallow here.
        print(f"Error performing BLAST: {str(e)}")
        return None

In [7]:
# function to parse BLAST results
def parse_blast_results(blast_result_handle):
    """Flatten one BLAST XML result into a list of per-HSP summary dicts.

    The handle is read with ``NCBIXML.read``; every HSP of every alignment
    yields one dict with the keys ``title``, ``score``, ``e_value``,
    ``identities`` and ``align_length``. Order follows the record.
    """
    record = NCBIXML.read(blast_result_handle)
    return [
        {
            'title': hit.title,
            'score': segment.score,
            'e_value': segment.expect,
            'identities': segment.identities,
            'align_length': segment.align_length,
        }
        for hit in record.alignments
        for segment in hit.hsps
    ]

In [8]:
# dictionary of target genes in D. radiodurans; includes gene name and locus tags
# Keys are the display names printed by the loops below; values are the locus
# tags those loops pass to extract_gene_sequence. A value of None marks a gene
# whose locus tag has not been filled in yet; the loops skip such entries.
target_genes = {
    "PprA": "E5E91_RS15025",
    "RecA": "E5E91_RS11810",
    "DdrA": "E5E91_RS02140",
    "DdrB": "E5E91_RS00360",
    "DdrC": "E5E91_RS00015",
    "NADH-quinone oxidoreductase subunit N": "E5E91_RS07520",
    "LigD": None,  # Replace with the actual locus tag when found
    "Dps": "E5E91_RS11440",
    "Bcp": None,   # Replace with the actual locus tag when found
    "IrrE": "E5E91_RS00855",
    "Ssb": None,   # Replace with the actual locus tag when found
    "PolA": "E5E91_RS08585",
    "ThyA": "E5E91_RS13290",
    "PNPase": "E5E91_RS10905"
}


In [9]:
# TODO: drop the debug dump once it is no longer needed (pass dump_path=None to skip it today)
# function for extracting gene sequence
# change search_type to gene to search for gene name or locus_tag to search by locus tag
def extract_gene_sequence(search_term, search_type='locus_tag', sequences=None,
                          dump_path="gene_and_locus_tags.txt"):
    """Return the sequence of the first gene/CDS feature matching a qualifier value.

    Args:
        search_term: value to look for (e.g. a locus tag or a gene name).
        search_type: name of the feature qualifier to search in.
        sequences: mapping of accession -> list of records to scan. Defaults to
            the notebook-level ``d_radiodurans_sequences``.
        dump_path: path of a debug file that receives one
            ``gene<TAB>locus_tag<TAB>GeneID`` line per gene/CDS feature visited.
            The file is rewritten on every call and stops at the match. Pass
            ``None`` to skip writing it.

    Returns:
        ``feature.extract(record.seq)`` for the first matching feature, or
        ``None`` when no feature matches.
    """
    if sequences is None:
        sequences = d_radiodurans_sequences
    if dump_path is None:
        return _find_gene_sequence(search_term, search_type, sequences, None)
    with open(dump_path, "w") as dump_file:  # Open a text file for writing
        return _find_gene_sequence(search_term, search_type, sequences, dump_file)


def _find_gene_sequence(search_term, search_type, sequences, dump_file):
    """Scan records for the first matching gene/CDS feature; log visited features to dump_file if given."""
    for accession, records in sequences.items():
        for record in records:
            for feature in record.features:
                if feature.type not in ("gene", "CDS"):
                    continue
                if dump_file is not None:
                    # Extract gene name, locus tag, and gene ID for the debug dump
                    gene_name = feature.qualifiers.get("gene", ["No gene name"])[0]
                    locus_tag = feature.qualifiers.get("locus_tag", ["No locus tag"])[0]
                    gene_id_list = feature.qualifiers.get("db_xref", [])
                    gene_id = next(
                        (id_part.split(":")[1] for id_part in gene_id_list if id_part.startswith("GeneID:")),
                        "No Gene ID",
                    )
                    dump_file.write(f"{gene_name}\t{locus_tag}\t{gene_id}\n")
                # Qualifier values are lists, so test membership rather than equality
                if search_term in feature.qualifiers.get(search_type, ()):
                    print(f"Found {search_type} {search_term} in {accession}, Record ID: {record.id}")
                    return feature.extract(record.seq)
            # Reaching this line means the record had no match (a match returns above)
            print(f"Gene with {search_type} {search_term} not found in record {record.id}.")
    print(f"Search completed. Gene with {search_type} {search_term} not found in any of the GenBank records.")
    return None

In [10]:
# dictionary to store BLAST results for each gene
# Filled by the BLAST cell further down: gene name -> record parsed by NCBIXML.read.
blast_results = {}

In [11]:
# walk the target genes and pull each sequence out by its locus tag
for gene, locus_tag in target_genes.items():
    if not locus_tag:
        # No locus tag recorded for this gene yet (value is None)
        print(f"Locus tag not found for gene: {gene}")
        continue
    gene_sequence = extract_gene_sequence(locus_tag, search_type='locus_tag')
    if not gene_sequence:
        print(f"Sequence extraction failed for gene: {gene}")
        continue
    # Placeholder for the code to proceed with BLAST or other processing;
    # for now only a short prefix of the sequence is shown.
    print(f"Gene sequence found for {gene}: {gene_sequence[:10]}...")

Gene with locus_tag E5E91_RS15025 not found in record NZ_CP038663.1.
Found locus_tag E5E91_RS15025 in NZ_CP038664.1, Record ID: NZ_CP038664.1
Gene sequence found for PprA: GTGCTACCCC...
Found locus_tag E5E91_RS11810 in NZ_CP038663.1, Record ID: NZ_CP038663.1
Gene sequence found for RecA: ATGAGCAAGG...
Found locus_tag E5E91_RS02140 in NZ_CP038663.1, Record ID: NZ_CP038663.1
Gene sequence found for DdrA: ATGAAGCTGA...
Found locus_tag E5E91_RS00360 in NZ_CP038663.1, Record ID: NZ_CP038663.1
Gene sequence found for DdrB: ATGTTGCAGA...
Found locus_tag E5E91_RS00015 in NZ_CP038663.1, Record ID: NZ_CP038663.1
Gene sequence found for DdrC: ATGAAGAACG...
Found locus_tag E5E91_RS07520 in NZ_CP038663.1, Record ID: NZ_CP038663.1
Gene sequence found for NADH-quinone oxidoreductase subunit N: ATGAATCTGG...
Locus tag not found for gene: LigD
Found locus_tag E5E91_RS11440 in NZ_CP038663.1, Record ID: NZ_CP038663.1
Gene sequence found for Dps: ATGACGAAGA...
Locus tag not found for gene: Bcp
Found locus

In [None]:
# extract gene sequences based on gene name
#for gene_name in target_genes.keys():
    #gene_sequence = extract_gene_sequence(gene_name, search_type='gene')
    #if gene_sequence:
        # Proceed with BLAST, etc.
    #else:
        #print(f"Sequence extraction failed for gene: {gene_name}")

In [None]:
# Perform BLAST and parse results
# For every target gene with a locus tag: extract its sequence, BLAST it against
# Thermus thermophilus, and keep the parsed record in blast_results.
for gene_name, locus_tag in target_genes.items():
    if locus_tag:
        gene_sequence = extract_gene_sequence(locus_tag, search_type='locus_tag')
        if gene_sequence:
            print(f"Performing BLAST search for {gene_name}...")
            result_handle = perform_blast(str(gene_sequence), organism="Thermus thermophilus")
            if result_handle:
                # Parse the BLAST result handle and store it in the blast_results dictionary;
                # close the handle whether or not parsing succeeds.
                try:
                    blast_record = NCBIXML.read(result_handle)
                finally:
                    result_handle.close()
                blast_results[gene_name] = blast_record
                print(f"BLAST search completed for {gene_name}")
            else:
                print(f"BLAST search failed for gene: {gene_name}")
        else:
            print(f"Sequence extraction failed for gene: {gene_name}")
    else:
        print(f"Locus tag not found for gene: {gene_name}")

# Show which genes produced a result. The detailed listing is printed by the
# cell that defines display_blast_results; calling it here would raise a
# NameError on a fresh kernel because that definition comes later.
print("BLAST results keys:", blast_results.keys())

Gene with locus_tag E5E91_RS15025 not found in record NZ_CP038663.1.
Found locus_tag E5E91_RS15025 in NZ_CP038664.1, Record ID: NZ_CP038664.1
Performing BLAST search for PprA...
BLAST search completed for PprA
Found locus_tag E5E91_RS11810 in NZ_CP038663.1, Record ID: NZ_CP038663.1
Performing BLAST search for RecA...


In [None]:
# debugging - see if dictionary is empty or not
# Prints the gene names that currently have an entry in blast_results.
print("BLAST results keys:", blast_results.keys())

In [None]:
# Function to display BLAST results
def display_blast_results(blast_results):
    """Print a plain-text report of every HSP for every gene.

    ``blast_results`` maps a gene name to a parsed BLAST record. For each HSP
    the alignment title, alignment length and e-value are printed, followed by
    the first 75 characters of the query, match and subject lines. A blank
    separator follows each gene.
    """
    preview = slice(0, 75)
    for gene in blast_results:
        record = blast_results[gene]
        print(f"Results for {gene}:")
        hit_segments = ((hit, segment) for hit in record.alignments for segment in hit.hsps)
        for hit, segment in hit_segments:
            print("****Alignment****")
            print(f"sequence: {hit.title}")
            print(f"length: {hit.length}")
            print(f"e value: {segment.expect}")
            for row in (segment.query, segment.match, segment.sbjct):
                print(row[preview] + "...")
        print("\n")

# Assuming blast_results contains the BLAST results
# (gene name -> parsed BLAST record, as stored by the BLAST cell above)
display_blast_results(blast_results)