In [1]:
from Bio.Blast import NCBIWWW, NCBIXML
from Bio import SeqIO
import requests
import os
import xml.etree.ElementTree as ET

def search_sequence(sequence, num_results=5):
    """Run a protein BLAST search and return the accessions of the best hits.

    The query is submitted with ``blastp`` against the ``refseq_protein``
    database. For each of the top alignments of the first BLAST record, the
    second ``|``-separated field of the hit title is taken as the accession.

    NOTE(review): because the search targets ``refseq_protein``, the returned
    identifiers are presumably RefSeq accessions rather than UniProt
    accessions -- confirm before passing them to a UniProt lookup.

    Args:
        sequence: Protein sequence to search for.
        num_results: Maximum number of top alignments to consider.

    Returns:
        A list of unique accession strings in hit order. The list is empty
        when BLAST returns no records or no hit title can be parsed.
    """
    result_handle = NCBIWWW.qblast("blastp", "refseq_protein", sequence)
    try:
        blast_records = list(NCBIXML.parse(result_handle))
    finally:
        # Release the handle even when parsing the response fails.
        result_handle.close()

    if not blast_records:
        print(f"No results found for sequence: {sequence}")
        return []

    accessions = []
    for hit in blast_records[0].alignments[:num_results]:
        # One accession per hit: the title does not vary between the HSPs of a
        # hit, so iterating over the HSPs would only produce duplicates.
        fields = hit.title.split("|")
        accession = fields[1].strip() if len(fields) > 1 else ""
        if not accession:
            print(f"Could not parse an accession from hit title: {hit.title}")
            continue
        if accession not in accessions:
            accessions.append(accession)

    return accessions

def get_pdb_ids_from_uniprot(uniprot_id):
    """Return the PDB IDs cross-referenced by a UniProt entry.

    The entry is fetched as XML from the UniProt REST API and every
    ``dbReference`` element with ``type="PDB"`` contributes its ``id``.

    Args:
        uniprot_id: UniProt accession to look up.

    Returns:
        A list of PDB ID strings in document order. The list is empty when
        the request fails, the server does not answer with HTTP 200, the
        response is not well-formed XML, or the entry has no PDB references.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.xml"
    try:
        # A timeout keeps one unresponsive request from hanging the notebook.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve data for UniProt ID: {uniprot_id} ({exc})")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve data for UniProt ID: {uniprot_id}")
        return []

    try:
        root = ET.fromstring(response.text)
    except ET.ParseError as exc:
        print(f"Failed to parse UniProt XML for UniProt ID: {uniprot_id} ({exc})")
        return []

    pdb_ids = []
    for element in root.iter():
        tag = element.tag
        if not isinstance(tag, str):
            continue
        # Compare only the local tag name so the lookup does not depend on
        # the exact namespace URI used by the XML document.
        if tag.rsplit("}", 1)[-1] != "dbReference":
            continue
        pdb_id = element.get("id")
        if element.get("type") == "PDB" and pdb_id:
            pdb_ids.append(pdb_id)

    return pdb_ids

def download_pdb_file(pdb_id, output_dir):
    """Download the ``.pdb`` file for a PDB ID from files.rcsb.org.

    The file is written to ``<output_dir>/<pdb_id>.pdb``; ``output_dir`` is
    created first if it does not exist yet.

    Args:
        pdb_id: PDB identifier of the structure to download.
        output_dir: Directory the file is saved into.

    Returns:
        The path of the written file, or ``None`` when the request fails or
        the server does not answer with HTTP 200.
    """
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    try:
        # A timeout keeps one unresponsive request from hanging the notebook.
        response = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        print(f"Failed to download PDB file for PDB ID: {pdb_id} ({exc})")
        return None

    if response.status_code != 200:
        print(f"Failed to download PDB file for PDB ID: {pdb_id}")
        return None

    # Do not rely on a separate notebook cell having created the directory.
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{pdb_id}.pdb")
    with open(output_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded PDB file: {output_path}")
    return output_path

In [3]:
# Notebook inputs: the query FASTA file and the folder that receives downloads.
sequence_file = "regression/data/fastas/davis/sequence_358.fasta"  # Replace with your input sequence file
output_dir = "pdb_files"  # Directory to save downloaded PDB files

# Parse the FASTA record and keep its sequence as a plain string.
sequence = None
with open(sequence_file) as fasta_handle:
    record = SeqIO.read(fasta_handle, "fasta")
sequence = str(record.seq)

# An empty sequence is only reported here; nothing stops later cells from running.
if not sequence:
    print("Failed to read sequence from file.")

In [4]:
# Step 1: BLAST the query sequence (search_sequence runs blastp against
# refseq_protein) and keep the accession parsed from each top hit.
# NOTE(review): the values displayed further down are XP_... identifiers,
# which look like RefSeq rather than UniProt accessions -- confirm before
# treating them as UniProt IDs.
uniprot_ids = search_sequence(sequence)
if not uniprot_ids:
    print("No UniProt IDs found for the sequence.")

In [5]:
# Step 2: Make sure the download directory exists before any PDB file is saved.
# exist_ok=True makes re-running this cell safe and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

In [6]:
# Display the identifiers returned by the BLAST search.
uniprot_ids

['XP_002831615.3',
 'XP_034806557.2',
 'XP_049499709.1',
 'XP_032114390.1',
 'XP_039331268.1']

In [7]:
# Step 3: look up the PDB entries for every identifier and download each
# structure found; identifiers without PDB entries are only reported.
for uniprot_id in uniprot_ids:
    pdb_ids = get_pdb_ids_from_uniprot(uniprot_id)
    if pdb_ids:
        for pdb_id in pdb_ids:
            download_pdb_file(pdb_id, output_dir)
    else:
        print(f"No PDB IDs found for UniProt ID: {uniprot_id}")

Failed to retrieve data for UniProt ID: XP_002831615.3
No PDB IDs found for UniProt ID: XP_002831615.3
Failed to retrieve data for UniProt ID: XP_034806557.2
No PDB IDs found for UniProt ID: XP_034806557.2
Failed to retrieve data for UniProt ID: XP_049499709.1
No PDB IDs found for UniProt ID: XP_049499709.1
Failed to retrieve data for UniProt ID: XP_032114390.1
No PDB IDs found for UniProt ID: XP_032114390.1
Failed to retrieve data for UniProt ID: XP_039331268.1
No PDB IDs found for UniProt ID: XP_039331268.1
