In [6]:
import pandas as pd
from Bio import Entrez
from Bio import SeqIO
from tqdm import tqdm
import time

# ---- Parameters ---- #
# All inputs, outputs and NCBI credentials for this cell are configured here.
# Excel workbook and sheet that are loaded with pandas.read_excel below.
EXCEL_FILE = "data/RITA_and_ABT_pos_selection_screens.xlsx"
SHEET_NAME = "RITA"
# Only rows whose TYPE_COLUMN value is in this list are kept.
# NOTE(review): presumably "VP"/"VT" mark viral entries (the output file is
# named all_virus_proteins) -- confirm against the workbook.
TYPE_FILTER = ["VP", "VT"]
# Column holding the identifiers that are passed to Entrez.efetch.
ID_COLUMN = "NCBI_id"
TYPE_COLUMN = "type"
# Every fetched record is written to this single FASTA file.
OUTPUT_FASTA = "data/all_virus_proteins.fasta"
# Contact e-mail assigned to Entrez.email.
ENTREZ_EMAIL = "phitro@bu.edu"  
# Assigned to Entrez.api_key when non-empty.
# NOTE(review): this is a personal credential committed in plain text;
# consider loading it from an environment variable and rotating the key.
ENTREZ_API_KEY = "9bb9af72db60905930367f8f543e5ef0d108"       

# ---- Load Data ---- #
# Read the configured sheet, keep only rows of the requested types, and
# collect the distinct, non-missing identifiers as sorted strings.
df = pd.read_excel(EXCEL_FILE, sheet_name=SHEET_NAME, engine="openpyxl")

type_mask = df[TYPE_COLUMN].isin(TYPE_FILTER)
filtered = df.loc[type_mask]

id_strings = filtered[ID_COLUMN].dropna().astype(str)
unique_ids = sorted(id_strings.unique())

print(f"Found {len(unique_ids)} unique NCBI IDs of type {TYPE_FILTER}")

# ---- Set up Entrez ---- #
# Identify this client to NCBI; the API key is optional and is only set when
# a non-empty value is configured.
Entrez.email = ENTREZ_EMAIL
if ENTREZ_API_KEY:
    Entrez.api_key = ENTREZ_API_KEY

# NCBI: <=10 requests/sec with API key, <=3/sec without. The pause between
# requests therefore has to depend on whether a key is configured.
REQUEST_DELAY = 0.11 if ENTREZ_API_KEY else 0.34

# ---- Download FASTA sequences ---- #
# seqs maps (FASTA header id, sequence string) -> record, so identical
# id/sequence pairs collapse into a single entry.
seqs = {}
# IDs whose request or parsing raised; reported once after the loop.
failed_ids = []
for ncbi_id in tqdm(unique_ids, desc="Fetching FASTA"):
    try:
        # IDs are looked up in the protein database only (no fallback).
        handle = Entrez.efetch(db="protein", id=ncbi_id, rettype="fasta", retmode="text")
        try:
            records = list(SeqIO.parse(handle, "fasta"))
        finally:
            # Always release the handle, even when parsing fails.
            handle.close()
    except Exception as e:
        # Best effort: a single bad ID must not abort the whole download.
        print(f"Failed for ID: {ncbi_id}\nError: {e}")
        failed_ids.append(ncbi_id)
    else:
        if records:
            for rec in records:
                rec.id = f"{ncbi_id}|{rec.id}"      # FASTA header = NCBI_id|original_id
                # Deduplicate by new ID/sequence
                seqs[(rec.id, str(rec.seq))] = rec
        else:
            print(f"ID {ncbi_id}: No sequences found.")
    time.sleep(REQUEST_DELAY)

all_seq_records = list(seqs.values())
print(f"Downloaded {len(all_seq_records)} unique protein sequences.")
if failed_ids:
    # Per-ID messages are interleaved with the progress bar and easy to
    # miss, so repeat the failures in one place.
    print(f"{len(failed_ids)} ID(s) failed: {', '.join(failed_ids)}")

# ---- Write to single FASTA file ---- #
# SeqIO.write accepts a file path as well as an open handle; given a path it
# opens the file for writing and closes it when done.
SeqIO.write(all_seq_records, OUTPUT_FASTA, "fasta")

print(f"Wrote sequences to {OUTPUT_FASTA}")

Found 1522 unique NCBI IDs of type ['VP', 'VT']


Fetching FASTA: 100%|██████████| 1522/1522 [10:28<00:00,  2.42it/s]

Downloaded 1522 unique protein sequences.
Wrote sequences to data/all_virus_proteins.fasta



