In [1]:
import os
import pandas as pd
import json
import threading
import time
import plotly.express as px
import ncbi_genome_download as ngd
import ssl
# NOTE(review): this swaps the default HTTPS context factory for the unverified
# one, so any HTTPS request that relies on the default context skips TLS
# certificate verification for the rest of the kernel session (presumably this
# includes the URL reads done through pd.read_csv -- confirm).
# This looks like a workaround for a local certificate-store problem; verify it
# is still required, since repairing the CA bundle is safer than leaving it on.
ssl._create_default_https_context = ssl._create_unverified_context


# from Bio import Entrez, SeqIO
# from Bio.Seq import Seq
# from Bio.SeqRecord import SeqRecord
# from Bio.SeqFeature import SeqFeature, FeatureLocation

# Notebook-wide pandas display settings.
pd.options.display.max_rows = 500       # show up to 500 rows before a frame is truncated
pd.options.display.max_columns = 500    # show up to 500 columns before a frame is truncated
pd.set_option('display.width', 0)       # display width of 0 (presumably disables fixed-width wrapping -- TODO confirm)
pd.options.display.max_colwidth = 199   # show at most 199 characters per cell

In [2]:
# Remote NCBI tables: the GenBank assembly summary and the prokaryote genome report.
target_url = "https://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt"
target_url_2 = 'https://ftp.ncbi.nlm.nih.gov/genomes/GENOME_REPORTS/prokaryotes.txt'

# Both files are tab-separated. For the assembly summary the first line is
# skipped and the following line supplies the column names.
genbank_df = pd.read_csv(target_url, sep='\t', skiprows=1, header=0, low_memory=False)
ncbi_data = pd.read_csv(target_url_2, sep='\t', skiprows=0, low_memory=False)

# Restrict the dataset to prokaryotes with complete genomes.

# BioProject accessions that occur in both the prokaryote report and the
# GenBank assembly summary.
bioprojects = list(set(ncbi_data['BioProject Accession']) & set(genbank_df['bioproject']))

# Keep only the GenBank assemblies that belong to one of those BioProjects ...
dataset = genbank_df.loc[genbank_df['bioproject'].isin(bioprojects)]

# ... and, of those, only the assemblies whose level is 'Complete Genome'.
dataset = dataset.loc[dataset['assembly_level'] == 'Complete Genome']
# Organism-specific alternative to the line above (here: Escherichia only):
#dataset  = dataset[(dataset.assembly_level == 'Complete Genome') & (dataset.organism_name.str.contains("Escherichia"))]


In [8]:
# Download a small random sample of the filtered assemblies for testing.
SAMPLE_SIZE = 10   # arbitrary; kept small so the test download stays manageable
SAMPLE_SEED = 42   # fixed seed so the same assemblies are drawn on every run

# Cap n at the number of available rows: DataFrame.sample raises ValueError when
# asked for more rows than the frame holds (sampling without replacement).
new_dataset = dataset.sample(n=min(SAMPLE_SIZE, len(dataset)), random_state=SAMPLE_SEED)

# Retrieve the files for every sampled accession into 'Genbank_Organisms' with a
# single ngd.download() call (comma-separated accession list) instead of one
# call per accession, so the per-call setup work is not repeated for each one.
sampled_accessions = ','.join(new_dataset['# assembly_accession'])

# Skip the call entirely when nothing was sampled: an empty accession filter
# must not be passed on, as it would no longer restrict the download.
if sampled_accessions:
    exit_code = ngd.download(section='genbank', file_formats='all',
                             assembly_accessions=sampled_accessions,
                             output='Genbank_Organisms')
    # ngd.download reports failure through its return value rather than by raising.
    if exit_code != 0:
        print("ncbi-genome-download returned non-zero exit code:", exit_code)

No entry for file ending in '_rm.out.gz'
No entry for file ending in '_wgsmaster.gbff.gz'
No entry for file ending in '_rna.fna.gz'
No entry for file ending in '_rm.out.gz'
No entry for file ending in '_wgsmaster.gbff.gz'
No entry for file ending in '_rna.fna.gz'
No entry for file ending in '_rm.out.gz'
No entry for file ending in '_feature_table.txt.gz'
No entry for file ending in '_genomic.gff.gz'
No entry for file ending in '_protein.faa.gz'
No entry for file ending in '_protein.gpff.gz'
No entry for file ending in '_wgsmaster.gbff.gz'
No entry for file ending in '_cds_from_genomic.fna.gz'
No entry for file ending in '_rna.fna.gz'
No entry for file ending in '_rna_from_genomic.fna.gz'
No entry for file ending in '_rm.out.gz'
No entry for file ending in '_wgsmaster.gbff.gz'
No entry for file ending in '_rna.fna.gz'
No entry for file ending in '_rm.out.gz'
No entry for file ending in '_feature_table.txt.gz'
No entry for file ending in '_genomic.gff.gz'
No entry for file ending in '_pr

In [7]:
# Demo area: shows how `dataset` can be narrowed to a single organism group.
# The organism filter is a case-sensitive literal substring match:
#   regex=False -> "Bacillus" is matched as plain text, not as a regular expression
#   na=False    -> rows with a missing organism name count as non-matches instead
#                  of putting NaN into the boolean mask
dataset2 = dataset[
    (dataset.assembly_level == 'Complete Genome')
    & (dataset.organism_name.str.contains("Bacillus", na=False, regex=False))
]
dataset2

Unnamed: 0,# assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate,version_status,assembly_level,release_type,genome_rep,seq_rel_date,asm_name,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material,asm_not_live_date
181,GCA_000007825.1,PRJNA384,SAMN02603340,,na,226900,1396,Bacillus cereus ATCC 14579,strain=ATCC 14579,,latest,Complete Genome,Major,Full,2004/07/06,ASM782v1,INRA,GCF_000007825.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/007/825/GCA_000007825.1_ASM782v1,,assembly from type material,na
182,GCA_000007845.1,PRJNA309,SAMN02603432,,na,198094,1392,Bacillus anthracis str. Ames,strain=Ames,,latest,Complete Genome,Major,Full,2003/04/30,ASM784v1,J. Craig Venter Institute,GCF_000007845.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/007/845/GCA_000007845.1_ASM784v1,,,na
189,GCA_000008005.1,PRJNA74,SAMN02603979,,na,222523,1396,Bacillus cereus ATCC 10987,strain=ATCC 10987,,latest,Complete Genome,Major,Full,2004/02/23,ASM800v1,TIGR,GCF_000008005.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/008/005/GCA_000008005.1_ASM800v1,,,na
197,GCA_000008165.1,PRJNA10878,SAMN02598266,,na,260799,1392,Bacillus anthracis str. Sterne,strain=Sterne,,latest,Complete Genome,Major,Full,2004/06/24,ASM816v1,DOE Joint Genome Institute,GCF_000008165.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/008/165/GCA_000008165.1_ASM816v1,,,na
210,GCA_000008425.1,PRJNA13082,SAMN02603292,,na,279010,1402,Bacillus licheniformis DSM 13 = ATCC 14580,strain=DSM 13,,latest,Complete Genome,Major,Full,2004/09/20,ASM842v1,Gottingen Genomics Laboratory,GCF_000008425.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/008/425/GCA_000008425.1_ASM842v1,,assembly from type material,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1129261,GCA_900186955.1,PRJEB6403,SAMEA4076707,,na,1408,1408,Bacillus pumilus,strain=NCTC10337,,latest,Complete Genome,Major,Full,2017/08/15,49386_E02,SC,GCF_900186955.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/900/186/955/GCA_900186955.1_49386_E02,,assembly from type material,na
1146719,GCA_900635765.1,PRJEB6403,SAMEA2771256,,na,1648923,1648923,Bacillus paralicheniformis,strain=NCTC8721,,latest,Complete Genome,Major,Full,2018/12/20,33763_B01,SC,GCF_900635765.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/900/635/765/GCA_900635765.1_33763_B01,,,na
1146835,GCA_900636945.1,PRJEB6403,SAMEA3919789,,na,1783501,1783501,Bacillus freudenreichii,strain=NCTC4823,,latest,Complete Genome,Major,Full,2018/12/20,45532_G01,SC,GCF_900636945.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/900/636/945/GCA_900636945.1_45532_G01,unverified source organism,,na
1146887,GCA_900637645.1,PRJEB6403,SAMEA4556067,,na,1402,1402,Bacillus licheniformis,strain=NCTC10341,,latest,Complete Genome,Major,Full,2018/12/20,52363_G01,SC,GCF_900637645.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/900/637/645/GCA_900637645.1_52363_G01,many frameshifted proteins,assembly from type material,na
