In [1]:
def get_assembly_summary(id):
    """Get the full esummary record for an Entrez assembly id.

    Args:
        id: id of a record in the Entrez "assembly" database
    Returns:
        The esummary response parsed by ``Entrez.read``.
    """
    from Bio import Entrez
    esummary_handle = Entrez.esummary(db="assembly", id=id, report="full")
    try:
        esummary_record = Entrez.read(esummary_handle)
    finally:
        # Release the handle even when parsing the response fails.
        esummary_handle.close()
    return esummary_record

In [6]:
def get_assemblies(term, download=True, path='assemblies'):
    """Download RefSeq genome assemblies for a given search term.

    Searches the Entrez "assembly" database (at most 200 hits), builds the
    link to the ``_genomic.fna.gz`` file of every hit that has a RefSeq FTP
    path, and optionally downloads each file.

    Args:
        term: search term, usually organism name
        download: whether to download the results
        path: folder to save to; created on demand when a file is downloaded
    Returns:
        List of the links that were found. Hits without a RefSeq FTP path
        are skipped.
    """
    import os
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request
    from Bio import Entrez
    # Keep an address the caller has already configured; otherwise fall back
    # to the placeholder - provide your own mail here.
    if not Entrez.email:
        Entrez.email = "A.N.Other@example.com"
    handle = Entrez.esearch(db="assembly", term=term, retmax='200')
    try:
        record = Entrez.read(handle)
    finally:
        handle.close()
    ids = record['IdList']
    print (f'found {len(ids)} ids')
    links = []
    for id in ids:
        #get summary
        summary = get_assembly_summary(id)
        #get ftp link
        url = summary['DocumentSummarySet']['DocumentSummary'][0]['FtpPath_RefSeq']
        if url == '':
            continue
        # URLs always use '/', so split and join explicitly rather than with
        # os.path, whose separator depends on the operating system.
        url = url.rstrip('/')
        label = url.rsplit('/', 1)[-1]
        #get the fasta link - change this to get other formats
        link = f'{url}/{label}_genomic.fna.gz'
        print (link)
        links.append(link)
        if download:
            #download link into the requested folder
            os.makedirs(path, exist_ok=True)
            urllib.request.urlretrieve(link, os.path.join(path, f'{label}.fna.gz'))
    return links

In [11]:
# get samples
import pandas as pd
# Parse the tab-separated sample table and display it.
samples = pd.read_csv('progenomes_cluster1969.tsv', sep='\t', header='infer')
samples

Unnamed: 0,species,sample,rep,cluster,genes,contigs
0,Mycoplasma pneumoniae,SAMN03142252,No,specI_v3_Cluster1969,1340,29
1,Mycoplasma pneumoniae,SAMN03142253,No,specI_v3_Cluster1969,1401,20
2,Mycoplasma pneumoniae,SAMN03142254,No,specI_v3_Cluster1969,1364,44
3,Mycoplasma pneumoniae,SAMN03142256,No,specI_v3_Cluster1969,1391,16
4,Mycoplasma pneumoniae,SAMN03142257,No,specI_v3_Cluster1969,1397,18
...,...,...,...,...,...,...
71,Mycoplasma pneumoniae M2592,SAMN03284393,No,specI_v3_Cluster1969,743,1
72,Mycoplasma pneumoniae M29,SAMN02582400,No,specI_v3_Cluster1969,1503,1
73,Mycoplasma pneumoniae MAC,SAMN03284394,No,specI_v3_Cluster1969,742,1
74,Mycoplasma pneumoniae PI 1428,SAMN02471520,No,specI_v3_Cluster1969,746,1


In [12]:
# Fetch the assemblies for every accession in the `sample` column.
# The links found for each sample are kept in `links_by_sample`; previously
# `links` was overwritten on every pass, so only the last sample's links
# survived the loop. `links` is still bound to the most recent result.
links_by_sample = {}
for x in samples['sample']:
    print(x)
    links = get_assemblies(x,
                           download=True)
    links_by_sample[x] = links


SAMN03142252
found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/455/635/GCF_001455635.1_ASM145563v1/GCF_001455635.1_ASM145563v1_genomic.fna.gz
SAMN03142253
found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/455/795/GCF_001455795.1_ASM145579v1/GCF_001455795.1_ASM145579v1_genomic.fna.gz
SAMN03142254
found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/455/625/GCF_001455625.1_ASM145562v1/GCF_001455625.1_ASM145562v1_genomic.fna.gz
SAMN03142256
found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/455/675/GCF_001455675.1_ASM145567v1/GCF_001455675.1_ASM145567v1_genomic.fna.gz
SAMN03142257
found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/455/685/GCF_001455685.1_ASM145568v1/GCF_001455685.1_ASM145568v1_genomic.fna.gz
SAMN03142258
found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/455/695/GCF_001455695.1_ASM145569v1/GCF_001455695.1_ASM145569v1_genomic.fna.gz
SAMN03142260
found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/455/745/GCF_001455745.

SAMN06699949
found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/096/015/GCF_002096015.1_ASM209601v1/GCF_002096015.1_ASM209601v1_genomic.fna.gz
SAMN06700070
found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/095/995/GCF_002095995.1_ASM209599v1/GCF_002095995.1_ASM209599v1_genomic.fna.gz
SAMN06700077
found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/096/035/GCF_002096035.1_ASM209603v1/GCF_002096035.1_ASM209603v1_genomic.fna.gz
SAMN02471521
found 2 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/387/745/GCF_000387745.1_19294_v1/GCF_000387745.1_19294_v1_genomic.fna.gz
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/387/745/GCF_000387745.2_ASM38774v2/GCF_000387745.2_ASM38774v2_genomic.fna.gz
SAMD00012611
found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/283/755/GCF_000283755.1_ASM28375v1/GCF_000283755.1_ASM28375v1_genomic.fna.gz
SAMN03284385
found 1 ids
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/272/715/GCF_001272715.1_ASM127271v1/GCF_001272715.1_ASM12