In [1]:
from Bio import Entrez, SeqIO
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

In [2]:
# email address required to access GenBank
Entrez.email = "megzlives@gmail.com"

In [3]:
# function to retrieve genomic data
def retrieve_genomic_data(accession):
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    record = SeqIO.read(handle, "fasta")
    handle.close()
    return record.seq

In [4]:
# retrieve genomic data for Dienococcus radiodurans
d_radiodurans_seq = retrieve_genomic_data("NC_001263.1") 

# retrieve genomic data for Thermus thermophilus
t_thermophilus_seq = retrieve_genomic_data("NC_006461.1")

In [5]:
# function to perform BLAST analysis for a gene sequence
def perform_blast(sequence, database="nt", organism="Thermus thermophilus"):
    try:
        # We use the entrez_query parameter to filter the search for the specific organism
        result_handle = NCBIWWW.qblast("blastn", database, sequence, entrez_query=f'organism="{organism}"')
        return result_handle
    except Exception as e:
        print(f"Error performing BLAST: {str(e)}")
        return None

In [6]:
# Function to parse BLAST results
def parse_blast_results(blast_result_handle):
    blast_records = NCBIXML.read(blast_result_handle)
    # Extract relevant information from the BLAST records
    results = []
    for alignment in blast_records.alignments:
        for hsp in alignment.hsps:
            result = {
                'title': alignment.title,
                'score': hsp.score,
                'e_value': hsp.expect,
                'identities': hsp.identities,
                'align_length': hsp.align_length
            }
            results.append(result)
    return results

In [9]:
# If no records found, possibly need to search for the gene in the genome rather than in the 
# nucleotide database. This part of the code might need to be adjusted based on the actual 
# data available in GenBank

def extract_gene_sequence(accession, gene_name):
    # Search for the gene in the nucleotide database to get its GenBank ID
    search_handle = Entrez.esearch(db="nucleotide", term=f"{accession}[Accession] AND {gene_name}[Gene]")
    search_results = Entrez.read(search_handle)
    search_handle.close()
    # Get the list of GenBank IDs
    genbank_ids = search_results["IdList"]
    
    if not genbank_ids:
        print(f"No GenBank records found for gene: {gene_name}")
        return None
    
    # Fetch the GenBank record for the first ID in the list
    genbank_id = genbank_ids[0]
    genbank_handle = Entrez.efetch(db="nucleotide", id=genbank_id, rettype="gb", retmode="text")
    genbank_record = SeqIO.read(genbank_handle, "genbank")
    genbank_handle.close()
    
    # Extract the gene sequence from the GenBank record
    for feature in genbank_record.features:
        if feature.type == "gene" and 'gene' in feature.qualifiers:
            if feature.qualifiers['gene'][0].lower() == gene_name.lower():
                # Get the sequence within the feature location
                gene_sequence = feature.extract(genbank_record.seq)
                return gene_sequence

    print(f"Gene {gene_name} not found in the GenBank record with ID {genbank_id}")
    return None


In [10]:
# list of target genes involved in radiation resistance
target_genes = [
    "PprA", "RecA", "DdrA", "DdrB", "DdrC", "Ku", "LigD", "DR0423", "Ssb", "PolA", "ThyA", "PNPase"
    # Note: Gene names should be the standard codes used in the database.
]

In [11]:
# dictionary to store BLAST results for each gene
blast_results = {}

In [12]:
# Perform BLAST analysis for each gene
for gene in target_genes:
    # Extract the gene sequence from D. radiodurans genome
    gene_sequence = extract_gene_sequence(d_radiodurans_seq, gene)
    if gene_sequence:
        # Perform BLAST search against the T. thermophilus genome
        result_handle = perform_blast(str(gene_sequence), organism="Thermus thermophilus")
        
        # Parse BLAST results and store in the dictionary
        if result_handle:
            blast_results[gene] = parse_blast_results(result_handle)
        else:
            print(f"BLAST search failed for gene: {gene}")
    else:
        print(f"Sequence extraction failed for gene: {gene}")

# After this point, blast_results will contain the parsed BLAST output for each gene

No GenBank records found for gene: PprA
Sequence extraction failed for gene: PprA
No GenBank records found for gene: RecA
Sequence extraction failed for gene: RecA
No GenBank records found for gene: DdrA
Sequence extraction failed for gene: DdrA
No GenBank records found for gene: DdrB
Sequence extraction failed for gene: DdrB
No GenBank records found for gene: DdrC
Sequence extraction failed for gene: DdrC
No GenBank records found for gene: Ku
Sequence extraction failed for gene: Ku
No GenBank records found for gene: LigD
Sequence extraction failed for gene: LigD
No GenBank records found for gene: DR0423
Sequence extraction failed for gene: DR0423
No GenBank records found for gene: Ssb
Sequence extraction failed for gene: Ssb
No GenBank records found for gene: PolA
Sequence extraction failed for gene: PolA
No GenBank records found for gene: ThyA
Sequence extraction failed for gene: ThyA
No GenBank records found for gene: PNPase
Sequence extraction failed for gene: PNPase
