Fetch NCBI sequences for a list of NCBI accessions using the Entrez API via Biopython.

In [1]:
from Bio import Entrez

In [2]:
# Contact address that Biopython's Entrez module sends to NCBI with each request.
# NOTE(review): 'my@email.ca' looks like a placeholder — confirm a real address is used.
Entrez.email = 'my@email.ca'

In [3]:
# Load the accession list: one accession per line, with surrounding whitespace
# trimmed and blank lines dropped.
with open('2023-03-28-usher-ncbi-matching-accessions-qc-wtd.txt') as f:
    trimmed_lines = (raw.strip() for raw in f)
    accessions = [entry for entry in trimmed_lines if entry]

In [5]:
# Spot-check: echo the first accession read from the file.
accessions[0]

'MZ778090.1'

In [6]:
# Number of accessions loaded (non-blank lines in the input file).
len(accessions)

91

In [25]:
from Bio import SeqIO

In [26]:
# Request GenBank flat-file records for all accessions from the 'nuccore'
# database in one efetch call, then index the parsed records by their ID
# (e.g. 'MZ778090.1').
gb_recs = {}
with Entrez.efetch('nuccore', id=accessions, rettype='genbank', retmode='text') as handle:
    for record in SeqIO.parse(handle, format='genbank'):
        gb_recs[record.id] = record

In [28]:
# Bind the record for the first accession in the list to `r`
# (presumably for interactive inspection of its features).
r = gb_recs['MZ778090.1']

In [35]:
import pandas as pd

In [12]:
import re

In [38]:
# Build a metadata table with one row per GenBank record.
# Each row comes from the qualifiers of the record's FIRST feature
# (presumably the `source` feature — confirm). For every qualifier the first
# value of a non-empty list is kept; anything else becomes an empty string.
d = {}
for rec_id, rec in gb_recs.items():
    first_feature_quals = dict(rec.features[0].qualifiers)
    row = {}
    for key, value in first_feature_quals.items():
        if isinstance(value, list) and len(value) > 0:
            row[key] = value[0]
        else:
            row[key] = ""
    d[rec_id] = row
# DataFrame(d) puts records in columns; flip so that records are rows.
df = pd.DataFrame(d).T

In [42]:
# Reduce each FASTA header in `recs` to its leading accession.version token
# (e.g. 'MZ778090.1'), preserving the order of `recs`.
# `recs` comes from the FASTA efetch cell (In [8]), which appears further down
# in this export but has a lower execution count than this cell.
# A header that does not match the pattern is passed through unchanged by
# re.sub, so such entries would show up verbatim in `accs`.
accs = [re.sub(r'([A-Z]{2}\d{6}\.\d+) .*', r'\1', header) for header in recs]

In [44]:
# Select the metadata rows in the same order as `accs` and save them as a
# tab-separated file.
rows_in_fasta_order = df.loc[accs, :]
rows_in_fasta_order.to_csv('ncbi-metadata.tsv', sep='\t')

In [7]:
from Bio.SeqIO.FastaIO import SimpleFastaParser

In [8]:
# Fetch the same accessions as FASTA text and map each full header line
# (title) to its sequence.
with Entrez.efetch('nuccore', id=accessions, rettype='fasta', retmode='text') as handle:
    recs = dict(SimpleFastaParser(handle))

In [9]:
# Count the FASTA records fetched, for comparison with the number of
# accessions requested.
len(recs)

91

In [19]:
# Print one "accession,country-state,strain" line per FASTA header.
#   accession : leading accession.version token, e.g. 'MZ778090.1'
#   strain    : the '<country>/<id>/<year>' part following ' SARS-CoV-2/<x>/'
#   region    : '<country>-<two-character code>' derived from the strain name
# re.sub returns its input unchanged when a pattern does not match, so an
# unexpected header format shows up as an unparsed field rather than an error.
for header in recs:
    accession = re.sub(r'([A-Z]{2}\d{6}\.\d+) .*', r'\1', header)
    strain = re.sub(r'.* SARS-CoV-2\/[^\/]+\/([^\/]+\/[^\/]+\/\d+) ?.*', r'\1', header)
    region = re.sub(r'(\w+)\/(\w{2})-.*', r'\1-\2', strain)
    print(accession, region, strain, sep=',')

MZ778090.1,USA-VT,USA/VT-CDCBI-CRSP_5IW5UTDXUN7AOCQU/2021
MZ778136.1,USA-VT,USA/VT-CDCBI-CRSP_SO2IHR6YNWKCJN4Y/2021
MZ801696.1,USA-VT,USA/VT-CDCBI-CRSP_AUR4GTIC2ZMQ3ZLV/2021
MZ831124.1,USA-VT,USA/VT-CDCBI-CRSP_4IHFAXEAE5F6TANS/2021
MZ831131.1,USA-VT,USA/VT-CDCBI-CRSP_5H6RL6A32MFYNCN2/2021
MZ831138.1,USA-VT,USA/VT-CDCBI-CRSP_7VCLRORHOIPFO66B/2021
MZ831153.1,USA-VT,USA/VT-CDCBI-CRSP_EUTCVQKUL5VOZBCL/2021
MZ831164.1,USA-VT,USA/VT-CDCBI-CRSP_IVJSX6MOP5EQIZFH/2021
MZ831171.1,USA-VT,USA/VT-CDCBI-CRSP_LZKAT6PIIUOE7ORV/2021
MZ839391.1,USA-VT,USA/VT-CDCBI-CRSP_24YRWZD4TXLNFGGR/2021
MZ839428.1,USA-VT,USA/VT-CDCBI-CRSP_KWP3WLUGM2KQGZQJ/2021
MZ839443.1,USA-VT,USA/VT-CDCBI-CRSP_SHIKTLGSKTR4CFY6/2021
MZ839450.1,USA-VT,USA/VT-CDCBI-CRSP_UQS4ILC3ZJ7RH5AW/2021
MZ839452.1,USA-VT,USA/VT-CDCBI-CRSP_V6FHLZEXJSDAYK7S/2021
MZ934190.1,USA-VT,USA/VT-CDCBI-CRSP_2CDYU2RU63EZEM2L/2021
MZ934210.1,USA-VT,USA/VT-CDCBI-CRSP_7TPUX2HH3L3HKIYE/2021
MZ934236.1,USA-VT,USA/VT-CDCBI-CRSP_FXJRSVFWTISHNWTH/2021
MZ934251.1,USA

In [22]:
# Write the fetched sequences to a FASTA file whose headers are reduced to the
# bare accession.version token (e.g. '>MZ778090.1'), each followed by its
# sequence on the next line.
# Only the accession is needed here, so the strain and country/state fields
# parsed in the summary cell are not recomputed.
with open('2023-03-28-NCBI-related-seqs-to-QC-WTD.fasta', 'w') as fout:
    for header, seq in recs.items():
        # An unmatched header is returned unchanged by re.sub and written as-is.
        acc = re.sub(r'([A-Z]{2}\d{6}\.\d+) .*', r'\1', header)
        fout.write(f'>{acc}\n{seq}\n')

In [23]:
# IPython shell escape: concatenate the QC-WTD FASTA, the GISAID-related FASTA
# and the NCBI FASTA written above into a single combined FASTA file.
!cat qc-wtd.fasta 2023-03-22-GISAID-related-seqs-to-QC-WTD.fasta 2023-03-28-NCBI-related-seqs-to-QC-WTD.fasta > 2023-03-29-QC-WTD-and-related-GISAID-NCBI-seqs.fasta