In [1]:
import os
import time

import pandas as pd
from Bio import Entrez
from Bio import SeqIO
from tqdm import tqdm

# ---- Parameters ---- #
FILE = "data/VP_library_all_sequences.csv"  # input CSV with the library entries
TYPE_FILTER = ["VP", "VT"]                  # values of TYPE_COLUMN to keep
ID_COLUMN = "NCBI_id"                       # column holding the NCBI identifiers
TYPE_COLUMN = "code"                        # column matched against TYPE_FILTER
OUTPUT_FASTA = "data/full_library_virus_proteins.fasta"
OUTPUT_CSV = "data/full_library_virus_proteins.csv"
ENTREZ_EMAIL = "phitro@bu.edu"
# The NCBI API key is a personal credential and must not be committed to
# source control. It is read from the NCBI_API_KEY environment variable;
# when the variable is unset the key is empty and requests are made
# without a key.
ENTREZ_API_KEY = os.environ.get("NCBI_API_KEY", "")

# ---- Load Data ---- #
# Read the library table and keep only rows whose type code is in TYPE_FILTER.
df = pd.read_csv(FILE)
type_mask = df[TYPE_COLUMN].isin(TYPE_FILTER)
filtered = df[type_mask]

# Drop missing IDs, normalise them to strings, and sort the distinct values.
id_strings = filtered[ID_COLUMN].dropna().astype(str)
unique_ids = sorted(id_strings.unique())

print(f"Found {len(unique_ids)} unique NCBI IDs of type {TYPE_FILTER}")

# ---- Set up Entrez ---- #
Entrez.email = ENTREZ_EMAIL
if ENTREZ_API_KEY:
    Entrez.api_key = ENTREZ_API_KEY

# NCBI: <=10 requests/sec with an API key, <=3/sec without one, so the pause
# between requests depends on whether a key is configured.
REQUEST_DELAY = 0.11 if ENTREZ_API_KEY else 0.34

# ---- Download FASTA sequences ---- #
seqs = {}         # (FASTA header id, sequence string) -> record; deduplicates
csv_rows = []     # one [header id, sequence] row per fetched record
failed_ids = []   # IDs whose fetch or parse raised an exception

for ncbi_id in tqdm(unique_ids, desc="Fetching FASTA"):
    try:
        # Only the protein database is queried; there is no fallback database.
        handle = Entrez.efetch(
            db="protein", id=ncbi_id, rettype="fasta", retmode="text"
        )
        try:
            records = list(SeqIO.parse(handle, "fasta"))
        finally:
            # Close the handle even when parsing fails.
            handle.close()
    except Exception as e:
        # Best effort: report the failure and continue with the next ID.
        print(f"Failed for ID: {ncbi_id}\nError: {e}")
        failed_ids.append(ncbi_id)
    else:
        if not records:
            print(f"ID {ncbi_id}: No sequences found.")
        for rec in records:
            rec.id = f"{ncbi_id}|{rec.id}"      # FASTA header = NCBI_id|original_id
            seq_text = str(rec.seq)
            # Deduplicate by new ID/sequence
            seqs[(rec.id, seq_text)] = rec
            # Note: the 'NCBI_id' column receives the combined header id.
            csv_rows.append([rec.id, seq_text])
    time.sleep(REQUEST_DELAY)

# Build the table once instead of growing a DataFrame row by row.
seq_df = pd.DataFrame(csv_rows, columns=['NCBI_id', 'Sequence'])

all_seq_records = list(seqs.values())
print(f"Downloaded {len(all_seq_records)} unique protein sequences.")
if failed_ids:
    print(f"{len(failed_ids)} IDs failed to download: {failed_ids}")

# ---- Write to single FASTA file ---- #
# SeqIO.write accepts a file path and opens/closes the file itself.
SeqIO.write(all_seq_records, OUTPUT_FASTA, "fasta")

# Save the collected ID/sequence table as CSV, without the index column.
seq_df.to_csv(OUTPUT_CSV, index=False)

print(f"Wrote sequences to {OUTPUT_FASTA}")

Found 1550 unique NCBI IDs of type ['VP', 'VT']


Fetching FASTA: 100%|██████████| 1550/1550 [10:56<00:00,  2.36it/s]

Downloaded 1550 unique protein sequences.
Wrote sequences to data/full_library_virus_proteins.fasta



