In [1]:
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML
import time

In [2]:
# email address required to access GenBank
# NOTE(review): a personal address is hard-coded here; consider reading it from
# an environment variable or a config file before sharing this notebook.
Entrez.email = "megzlives@gmail.com"

In [3]:
# function to retrieve genomic data
def retrieve_genomic_data(accession, max_attempts=3, retry_delay=2):
    """Fetch and parse the GenBank record(s) for one nucleotide accession.

    The record is requested from the Entrez "nucleotide" database as
    "gbwithparts" text and parsed with SeqIO.parse, so a multi-record
    response is supported.

    Args:
        accession: Accession identifier passed to Entrez.efetch as ``id``.
        max_attempts: How many times to try the download before giving up.
        retry_delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        A list of parsed records on success, or None if every attempt failed.
    """
    print(f"Attempting to retrieve data for {accession}")
    for attempt in range(1, max_attempts + 1):
        try:
            handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gbwithparts", retmode="text")
            try:
                print(f"Downloading data for {accession}...")
                # Use SeqIO.parse() to read multi-sequence GenBank files
                records = list(SeqIO.parse(handle, "genbank"))
            finally:
                # Release the connection even when parsing fails part-way.
                handle.close()
            print(f"Successfully retrieved data for {accession}")
            return records  # This will be a list of records
        except Exception as e:
            # Deliberately broad: any network or parse failure triggers a retry.
            print(f"An error occurred while fetching {accession}: {e}")
            if attempt < max_attempts:
                # Only announce and wait when another attempt will follow.
                print("Attempting to retry...")
                time.sleep(retry_delay)
    print(f"Failed to retrieve data for {accession} after multiple attempts.")
    return None

In [4]:
# Fetch every D. radiodurans replicon one after another and cache the parsed
# records in a dictionary keyed by accession number.
accession_numbers = [
    "CP038663.1",  # Chromosome I
    "CP038664.1",  # Chromosome II
    "CP038666.1",  # Plasmid pCP1
    "CP038665.1",  # Plasmid pMP1
]
d_radiodurans_sequences = {}

for accession in accession_numbers:
    records = retrieve_genomic_data(accession)
    if not records:
        # None (all attempts failed) or an empty list: nothing to store.
        print(f"Could not retrieve records for {accession}.")
        continue
    d_radiodurans_sequences[accession] = records
    print(f"Records for {accession} added to the database.")

Attempting to retrieve data for CP038663.1
Downloading data for CP038663.1...
Successfully retrieved data for CP038663.1
Records for CP038663.1 added to the database.
Attempting to retrieve data for CP038664.1
Downloading data for CP038664.1...
Successfully retrieved data for CP038664.1
Records for CP038664.1 added to the database.
Attempting to retrieve data for CP038666.1
Downloading data for CP038666.1...
Successfully retrieved data for CP038666.1
Records for CP038666.1 added to the database.
Attempting to retrieve data for CP038665.1
Downloading data for CP038665.1...
Successfully retrieved data for CP038665.1
Records for CP038665.1 added to the database.


In [5]:
# retrieve genomic data for Thermus thermophilus
# retrieve_genomic_data returns a list of parsed records, or None when every
# download attempt failed. The result is not referenced again in the cells
# visible here.
t_thermophilus_seq = retrieve_genomic_data("NC_006461.1")

Attempting to retrieve data for NC_006461.1
Downloading data for NC_006461.1...
Successfully retrieved data for NC_006461.1


In [6]:
# function to perform BLAST analysis for a gene sequence
def perform_blast(sequence, database="nt", organism="Thermus thermophilus"):
    """Submit a blastn search to NCBI, optionally restricted to one organism.

    Args:
        sequence: Query sequence passed straight to NCBIWWW.qblast.
        database: Name of the NCBI database to search.
        organism: Organism name used to restrict the hits. A falsy value
            (None or "") disables the restriction.

    Returns:
        The result handle returned by NCBIWWW.qblast, or None if the call
        raised an exception (the error is printed).
    """
    # Entrez filters use the `term[Field]` form; `organism="..."` is not a
    # field qualifier. "(none)" is qblast's own default for "no filter".
    entrez_query = f'"{organism}"[Organism]' if organism else "(none)"
    try:
        return NCBIWWW.qblast("blastn", database, sequence, entrez_query=entrez_query)
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this query.
        print(f"Error performing BLAST: {str(e)}")
        return None

In [7]:
# Function to parse BLAST results
def parse_blast_results(blast_result_handle):
    """Flatten a BLAST XML result into one summary dict per HSP.

    Args:
        blast_result_handle: Handle passed to NCBIXML.read.

    Returns:
        A list of dicts with the keys 'title', 'score', 'e_value',
        'identities' and 'align_length', in alignment order and then HSP
        order. The list is empty when there are no alignments.
    """
    parsed = NCBIXML.read(blast_result_handle)
    return [
        {
            'title': hit.title,
            'score': segment.score,
            'e_value': segment.expect,
            'identities': segment.identities,
            'align_length': segment.align_length,
        }
        for hit in parsed.alignments
        for segment in hit.hsps
    ]

In [8]:
# Dictionary for the actual locus tags for the target genes in D. radiodurans
# Maps a gene/protein label to the locus tag that is searched for in the
# downloaded records. Entries set to None have no tag yet and are skipped by
# the BLAST loop, which only processes truthy tags.
# NOTE(review): the recorded run output shows E5E91_RS15025 was not found in
# any of the four fetched CP0386xx records. The "_RS" infix presumably marks
# RefSeq-style tags, so these may belong to a different annotation of the same
# replicons than the one downloaded above -- confirm against the records.
target_genes = {
    "PprA": "E5E91_RS15025",
    "RecA": "E5E91_RS11810",
    "DdrA": "E5E91_RS02140",
    "DdrB": "E5E91_RS00360",
    "DdrC": "E5E91_RS00015",
    "NADH-quinone oxidoreductase subunit N": "E5E91_RS07520",
    "LigD": None,  # Replace with the actual locus tag when found
    "Dps": "E5E91_RS11440",
    "Bcp": None,   # Replace with the actual locus tag when found
    "IrrE": "E5E91_RS00855",
    "Ssb": None,   # Replace with the actual locus tag when found
    "PolA": "E5E91_RS08585",
    "ThyA": "E5E91_RS13290",
    "PNPase": "E5E91_RS10905"
}


In [9]:
def extract_gene_sequence(search_term, search_type='locus_tag', sequences=None,
                          log_path="gene_and_locus_tags.txt"):
    """Return the sequence of the first gene/CDS feature matching a qualifier.

    Every "gene" and "CDS" feature of every record is scanned in order. Each
    scanned feature's gene name and locus tag are written to the log file, and
    progress messages are both printed and logged. The search stops at the
    first feature whose ``search_type`` qualifier contains ``search_term``.

    Args:
        search_term: Qualifier value to look for (e.g. a locus tag).
        search_type: Qualifier key to compare against. Defaults to 'locus_tag'.
        sequences: Mapping of accession -> list of records to search. When
            None, the notebook-level ``d_radiodurans_sequences`` is used.
        log_path: File that receives the search log. It is opened in write
            mode, so each call replaces the log of the previous call.

    Returns:
        The result of ``feature.extract(record.seq)`` for the first match, or
        None when no feature matches.
    """
    if sequences is None:
        # Fall back to the cache filled by the retrieval cell.
        sequences = d_radiodurans_sequences

    with open(log_path, "w") as file:  # Open a text file for writing

        def report(message):
            """Echo one progress message to stdout and to the log file."""
            print(message)
            file.write(f"{message}\n")

        for accession, records in sequences.items():
            report(f"Starting search in accession: {accession}")
            for record in records:
                report(f"Looking in record: {record.id}, Description: {record.description}")
                for feature in record.features:
                    if feature.type not in ("gene", "CDS"):
                        continue
                    gene_name = feature.qualifiers.get("gene", ["No gene name"])[0]
                    locus_tag = feature.qualifiers.get("locus_tag", ["No locus tag"])[0]
                    file.write(f"{gene_name}\t{locus_tag}\n")  # Write gene names and locus tags to file
                    # Qualifier values are lists; compare against every entry
                    # so multi-valued qualifiers match and an empty list
                    # cannot raise IndexError.
                    if search_term in feature.qualifiers.get(search_type, ()):
                        report(f"Found {search_type} {search_term} in {accession}, Record ID: {record.id}")
                        return feature.extract(record.seq)
                report(f"Gene with {search_type} {search_term} not found in record {accession}.")
        report(f"Search completed. Gene with {search_type} {search_term} not found in any of the GenBank records.")
    return None

In [10]:
# dictionary to store BLAST results for each gene
# Keys are the gene labels from target_genes; each value is the list of HSP
# summary dicts returned by parse_blast_results.
blast_results = {}

In [11]:
# Perform BLAST analysis for each gene
# For every target gene with a known locus tag: extract its sequence from the
# downloaded records, BLAST it against Thermus thermophilus, and store the
# parsed hits. Each failure mode is reported and the loop moves on.
for gene, locus_tag in target_genes.items():
    if not locus_tag:
        print(f"Locus tag not found for gene: {gene}")
        continue

    gene_sequence = extract_gene_sequence(locus_tag)
    if not gene_sequence:
        print(f"Sequence extraction failed for gene: {gene}")
        continue

    result_handle = perform_blast(str(gene_sequence), organism="Thermus thermophilus")
    if not result_handle:
        print(f"BLAST search failed for gene: {gene}")
        continue

    try:
        blast_results[gene] = parse_blast_results(result_handle)
    finally:
        # The handle is owned by this loop; release it once it has been parsed.
        result_handle.close()

# The blast_results dictionary now contains the parsed BLAST output for each gene

Starting search in accession: CP038663.1
Looking in record: CP038663.1, Description: Deinococcus radiodurans R1 = ATCC 13939 = DSM 20539 strain ATCC 13939 chromosome I, complete sequence
Gene with locus_tag E5E91_RS15025 not found in record CP038663.1.
Starting search in accession: CP038664.1
Looking in record: CP038664.1, Description: Deinococcus radiodurans R1 = ATCC 13939 = DSM 20539 strain ATCC 13939 chromosome II, complete sequence
Gene with locus_tag E5E91_RS15025 not found in record CP038664.1.
Starting search in accession: CP038666.1
Looking in record: CP038666.1, Description: Deinococcus radiodurans R1 = ATCC 13939 = DSM 20539 strain ATCC 13939 plasmid pCP1, complete sequence
Gene with locus_tag E5E91_RS15025 not found in record CP038666.1.
Starting search in accession: CP038665.1
Looking in record: CP038665.1, Description: Deinococcus radiodurans R1 = ATCC 13939 = DSM 20539 strain ATCC 13939 plasmid pMP1, complete sequence
Gene with locus_tag E5E91_RS15025 not found in record