# Imports, settings and paths

In [1]:
import pandas as pd
import os
import csv
import traceback
from datetime import datetime

In [2]:
from Bio import SeqIO
from tqdm import tqdm_notebook as tqdm
from ete3 import NCBITaxa

In [3]:
today = f"{datetime.now():%Y-%m-%d}"
path_ncbi = "/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq"
path_scanning_id = "/home/ubuntu/Disks/HDD1000/NCBI/ncbi_scanning_id.txt"
# path_ncbi_csv = f"/home/ubuntu/Disks/HDD1000/NCBI/ncbi_{today}.csv"
# path_ncbi_pd = f"/home/ubuntu/Disks/HDD1000/NCBI/ncbi_{today}.pd"
path_ncbi_csv = f"/home/ubuntu/Disks/HDD1000/NCBI/ncbi_2019-11-27.csv"
path_ncbi_pd = f"/home/ubuntu/Disks/HDD1000/NCBI/ncbi_2019-11-27.pd"

In [4]:
path_ncbi_pd

'/home/ubuntu/Disks/HDD1000/NCBI/ncbi_2019-11-27.pd'

In [5]:
pd.set_option('display.max_columns', 50)

In [6]:
ncbi = NCBITaxa()

In [7]:
desired_ranks = ['superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

In [8]:
def get_desired_ranks(taxid, desired_ranks, tolist=False):
    """ From stackoverflow
        https://stackoverflow.com/questions/36503042/how-to-get-taxonomic-specific-ids-for-kingdom-phylum-class-order-family-gen
    """
    try:
        lineage = ncbi.get_lineage(taxid)
        lineage2ranks = ncbi.get_rank(lineage)
        ranks2lineage = dict((rank, taxid) for (taxid, rank) in lineage2ranks.items())
        if tolist: return [ranks2lineage.get(rank, 0) for rank in desired_ranks]
        else:      return {f'{rank}_id': ranks2lineage.get(rank, 0) for rank in desired_ranks}
    except:
        print(f"retrieval of the lineage of {taxid} failed")
        if tolist: return [0 for rank in desired_ranks]
        else:      return {f'{rank}_id': 0 for rank in desired_ranks}

# Retrieve missing .taxon

In [9]:
def retrieve_taxid_from_gff(path_fna, taxo_ext="gff"):
    taxo = ""
    path_taxon = path_fna.replace(".fna", ".taxon")
    if os.path.isfile(path_taxon):
        with open(path_taxon) as f:
            taxo = f.read()
            
    if str.isdigit(taxo):
        return int(taxo)
    else:
        path_gbk = path_fna.replace(".fna", f".{taxo_ext}")
        assert os.path.isfile(path_gbk), f"<{taxo}> wasn't a tax id and {path_fna} " \
                                         f"DOESN'T have a .{taxo_ext} file ??"

        with open(path_gbk) as gbk:
            description = [next(gbk) for i in range(9)][-1]

        if taxo_ext == "gbk":
            identificator = 'db_xref="taxon:'
        elif taxo_ext == "gff":
            identificator = 'Taxonomy/Browser/wwwtax.cgi?id='
            terminator = "\n"
            if identificator not in description[:2000]:
                identificator = "Dbxref=taxon:" 
                terminator = ";"
        taxo_start = description.find(identificator, 0, 2000)
        assert 0 < taxo_start < 2000, f"the value of taxo_start is out of bounds: {taxo_start} \n" \
                                      f"{description[:2010]}"
        taxo = description[taxo_start+len(identificator):
                           taxo_start+description[taxo_start:].find(terminator)]
            
        assert 1 <= len(taxo) <= 9 and str.isdigit(taxo), \
            f"The taxo id search failed, found an id of length {len(taxo)}, \n" \
            f"for the file: {path_gbk} \n found string: {taxo[:min(50, len(taxo))]} ..."
        
        with open(path_taxon, "w") as f:
            f.write(taxo)
        return int(taxo)

# Loop through NCBI genomes to collect data into csv

In [None]:
i = 0
imax = 200000
can_be = ["plasmid", "chloroplaste", "scaffold", "contig", 
          "chromosome", "complete genome", "whole genome shotgun sequence", ]  # to extend
headers = ["taxon_id", "taxon_name", "rank", "type", "length", 
           "record_id", "record_description", "path_folder", "path_file", ]
[headers.append(f"id_{v}") for v in desired_ranks]
[headers.append(f"name_{v}") for v in desired_ranks]

with open(path_scanning_id, 'r') as f:
    scanning_id = int(f.read())

with open(path_ncbi_csv, 'a', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter='\t')
    if scanning_id == 0: 
        writer.writerow(headers)

    for dir_path, dir_names, files in tqdm(os.walk(path_ncbi)):
        for file in files:
            if file.endswith("fna"):
                i += 1
                if i <= scanning_id: continue
                    
                # Read record / .fna file
                path_file = os.path.join(dir_path, file)
                records_to_csv = []
                # Read taxo file if exists
                taxon_id = retrieve_taxid_from_gff(path_file)

                for record in SeqIO.parse(path_file, "fasta"):
                    try:
                        # Is it a genome, chromosome, plasmid ?
                        what = "undefined"
                        descr = record.description
                        if len(descr) > 300: descr = descr[:300]
                        for v in can_be:
                            if v in descr.lower():
                                what = v
                                break

                        # Get the lineage for main ranks
                        try:
                            rank = ncbi.get_rank([taxon_id])[taxon_id]
                        except:
                            rank = "no rank"
                        ranks_ids = get_desired_ranks(taxon_id, desired_ranks, tolist=True)
                        ranks_names = ncbi.translate_to_names(ranks_ids)

                        # The file path (take advantage of pandas categorical)
                        path_base, path_rec_relative = path_file.split("refseq/")
                        path_base += "refseq/"

                        # Gather the data in a tuple
                        record_data = (taxon_id, ncbi.translate_to_names([taxon_id])[0], 
                                       rank, what, len(record.seq), record.id, descr, 
                                       path_base, path_rec_relative, *ranks_ids, *ranks_names)
                        # add to tuple ?  str(record.seq.upper())
                        records_to_csv.append(record_data)
                    
                    except Exception as e:
                        print(path_file)
                        traceback.print_exc()
                        raise NotImplementedError("Need to check file: " + path_file)
                        
                for record in records_to_csv:
                    writer.writerow(record)
                # Keep track which file has been processed
                with open(path_scanning_id, 'w') as f:
                    f.write(str(i))
                if i > imax: break
        if i > imax: break

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

# Read the csv and transform to pandas, plus column dtype conversion

Might take a while if file is big...

In [42]:
df = pd.read_csv(path_ncbi_csv, sep="\t", )

  interactivity=interactivity, compiler=compiler, result=result)


In [43]:
# df.info(memory_usage='deep')
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'2536.1 MB'

In [44]:
df.shape

(1923707, 25)

In [45]:
for col in df.columns:
    if "id_" in col:
        print(col)
        df[col].fillna(0, inplace=True)
        df[col] = df[col].astype(int)

id_superkingdom
id_kingdom
id_phylum
id_class
id_order
id_family
id_genus
id_species


In [46]:
col_categories = ['rank', 'type', 'path_folder', 
                  'id_superkingdom', 'name_superkingdom', 'id_kingdom', 'name_kingdom', 'id_phylum', 'name_phylum', 
                  'id_class', 'name_class', 'id_order', 'name_order', 'id_family', 'name_family', 
                  'id_genus', 'name_genus', 'id_species', 'name_species']

for col in tqdm(col_categories):
    df[col] = df[col].astype('category')

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




In [47]:
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'996.5 MB'

In [48]:
df

Unnamed: 0,taxon_id,taxon_name,rank,type,length,record_id,record_description,path_folder,path_file,id_superkingdom,id_kingdom,id_phylum,id_class,id_order,id_family,id_genus,id_species,name_superkingdom,name_kingdom,name_phylum,name_class,name_order,name_family,name_genus,name_species
0,456320,Methanococcus voltae A3,no rank,complete genome,1936387,NC_014222.1,"NC_014222.1 Methanococcus voltae A3, complete ...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,archaea/GCF_000006175.1/GCF_000006175.1_ASM617...,2157,0,28890,183939,2182,2183,2184,2188,Archaea,0,Euryarchaeota,Methanococci,Methanococcales,Methanococcaceae,Methanococcus,Methanococcus voltae
1,64091,Halobacterium salinarum NRC-1,no rank,complete genome,2014239,NC_002607.1,"NC_002607.1 Halobacterium sp. NRC-1, complete ...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,archaea/GCF_000006805.1/GCF_000006805.1_ASM680...,2157,0,28890,183963,2235,2236,2239,2242,Archaea,0,Euryarchaeota,Halobacteria,Halobacteriales,Halobacteriaceae,Halobacterium,Halobacterium salinarum
2,64091,Halobacterium salinarum NRC-1,no rank,plasmid,191346,NC_001869.1,NC_001869.1 Halobacterium sp. NRC-1 plasmid pN...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,archaea/GCF_000006805.1/GCF_000006805.1_ASM680...,2157,0,28890,183963,2235,2236,2239,2242,Archaea,0,Euryarchaeota,Halobacteria,Halobacteriales,Halobacteriaceae,Halobacterium,Halobacterium salinarum
3,64091,Halobacterium salinarum NRC-1,no rank,plasmid,365425,NC_002608.1,NC_002608.1 Halobacterium sp. NRC-1 plasmid pN...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,archaea/GCF_000006805.1/GCF_000006805.1_ASM680...,2157,0,28890,183963,2235,2236,2239,2242,Archaea,0,Euryarchaeota,Halobacteria,Halobacteriales,Halobacteriaceae,Halobacterium,Halobacterium salinarum
4,273057,Saccharolobus solfataricus P2,no rank,complete genome,2992245,NC_002754.1,"NC_002754.1 Sulfolobus solfataricus P2, comple...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,archaea/GCF_000007005.1/GCF_000007005.1_ASM700...,2157,0,28889,183924,2281,118883,2100760,2287,Archaea,0,Crenarchaeota,Thermoprotei,Sulfolobales,Sulfolobaceae,Saccharolobus,Saccharolobus solfataricus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1923702,159148,Shuni orthobunyavirus,species,undefined,4351,NC_043697.1,NC_043697.1 Shuni virus strain SAE1809 polypro...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,viral/GCF_006298405.1/GCF_006298405.1_ASM62984...,10239,0,2497569,2497576,1980410,1980416,11572,159148,Viruses,0,Negarnaviricota,Ellioviricetes,Bunyavirales,Peribunyaviridae,Orthobunyavirus,Shuni orthobunyavirus
1923703,159148,Shuni orthobunyavirus,species,undefined,851,NC_043698.1,NC_043698.1 Shuni virus strain SAE1809 nucleoc...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,viral/GCF_006298405.1/GCF_006298405.1_ASM62984...,10239,0,2497569,2497576,1980410,1980416,11572,159148,Viruses,0,Negarnaviricota,Ellioviricetes,Bunyavirales,Peribunyaviridae,Orthobunyavirus,Shuni orthobunyavirus
1923704,1970374,Acinetobacter phage vB_AbaS_Loki,species,complete genome,41308,NC_042137.1,NC_042137.1 Acinetobacter phage vB_AbaS_Loki g...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,viral/GCF_900010585.1/GCF_900010585.1_vB_AbaS_...,10239,0,0,0,28883,10699,0,1970374,Viruses,0,0,0,Caudovirales,Siphoviridae,0,Acinetobacter phage vB_AbaS_Loki
1923705,1880822,Klebsiella phage PMBT1,no rank,complete genome,175206,NC_042138.1,NC_042138.1 Klebsiella phage PMBT1 genome asse...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,viral/GCF_900095325.1/GCF_900095325.1_PMBT1_ge...,10239,0,0,0,28883,10662,1985328,2560544,Viruses,0,0,0,Caudovirales,Myoviridae,Slopekvirus,Klebsiella virus PMBT1


In [49]:
df.to_pickle(path_ncbi_pd)

# Need to fix the species name

In [None]:
from ete3 import NCBITaxa
ncbi = NCBITaxa()

ncbi.get_fuzzy_name_translation("Clostridioides difficile")

ncbi.get_name_translator(["Clostridioides difficile"])

# End

In [206]:
headers = ["taxon_id", "taxon_name", "rank", "type", "length", "record_id", "record_description",
           "path_folder", "path_file", ]
[headers.extend([f"id_{v}", f"name_{v}"]) for v in desired_ranks]

In [234]:
"['" + "', '".join("taxon_name	rank	type	length	record_id	record_description	path_folder	path_file	id_superkingdom	name_superkingdom	id_kingdom	name_kingdom	id_phylum	name_phylum	id_class	name_class	id_order	name_order	id_family	name_family	id_genus	name_genus	id_species	name_species".split("\t")) + "']"

"['taxon_name', 'rank', 'type', 'length', 'record_id', 'record_description', 'path_folder', 'path_file', 'id_superkingdom', 'name_superkingdom', 'id_kingdom', 'name_kingdom', 'id_phylum', 'name_phylum', 'id_class', 'name_class', 'id_order', 'name_order', 'id_family', 'name_family', 'id_genus', 'name_genus', 'id_species', 'name_species']"

In [213]:
record_data

(272569,
 'Haloarcula marismortui ATCC 43049',
 'no rank',
 'plasmid',
 410554,
 'NC_006395.1',
 'NC_006395.1 Haloarcula marismortui ATCC 43049 plasmid pNG700, complete sequence',
 '/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/',
 'archaea/GCF_000011085.1/GCF_000011085.1_ASM1108v1_genomic.fna',
 2157,
 'Archaea',
 2235,
 'Halobacteriales',
 2237,
 'Haloarcula',
 2238,
 'Haloarcula marismortui',
 28890,
 'Euryarchaeota',
 183963,
 'Halobacteria',
 1963268,
 'Haloarculaceae')

In [114]:
path_file = "/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/bacteria/GCF_000006845.1/GCF_000006845.1_ASM684v1_genomic.fna"

In [56]:
record

SeqRecord(seq=Seq('AATTTAAAGATTAAAATTAGTAGACTGTCGATTTACAATATCATATTTATGAGT...TAA', SingleLetterAlphabet()), id='NC_014222.1', name='NC_014222.1', description='NC_014222.1 Methanococcus voltae A3, complete genome', dbxrefs=[])

In [59]:
len(record.seq)

1936387

In [85]:
record.seq.upper()

Seq('AATTTAAAGATTAAAATTAGTAGACTGTCGATTTACAATATCATATTTATGAGT...TAA', SingleLetterAlphabet())

In [73]:
print(record)

ID: NC_014222.1
Name: NC_014222.1
Description: NC_014222.1 Methanococcus voltae A3, complete genome
Number of features: 0
Seq('AATTTAAAGATTAAAATTAGTAGACTGTCGATTTACAATATCATATTTATGAGT...TAA', SingleLetterAlphabet())


In [62]:
dir(record)

['__add__',
 '__bool__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__le___',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_per_letter_annotations',
 '_seq',
 '_set_per_letter_annotations',
 '_set_seq',
 'annotations',
 'dbxrefs',
 'description',
 'features',
 'format',
 'id',
 'letter_annotations',
 'lower',
 'name',
 'reverse_complement',
 'seq',
 'translate',
 'upper']

In [86]:
record.description

'NC_014222.1 Methanococcus voltae A3, complete genome'

In [176]:
get_desired_ranks(2, desired_ranks)

{'superkingdom_id': 2,
 'kingdom_id': 'NaN',
 'phylum_id': 'NaN',
 'class_id': 'NaN',
 'order_id': 'NaN',
 'family_id': 'NaN',
 'genus_id': 'NaN',
 'species_id': 'NaN'}

In [154]:
get_desired_ranks(9606, desired_ranks, tolist=True)

[2759, 33208, 7711, 40674, 9443, 9604, 9605, 9606]

In [155]:
ncbi.translate_to_names(get_desired_ranks(9606, desired_ranks, tolist=True))

['Eukaryota',
 'Metazoa',
 'Chordata',
 'Mammalia',
 'Primates',
 'Hominidae',
 'Homo',
 'Homo sapiens']

In [184]:
ranks_id_str = ncbi.get_taxid_translator(get_desired_ranks(6, desired_ranks, tolist=True))
ranks_id_str

{2: 'Bacteria',
 6: 'Azorhizobium',
 356: 'Rhizobiales',
 1224: 'Proteobacteria',
 28211: 'Alphaproteobacteria',
 335928: 'Xanthobacteraceae'}

In [185]:
l=[]
[l.extend([k,v]) for k,v in ranks_id_str.items()]

[None, None, None, None, None, None]

In [186]:
l

[2,
 'Bacteria',
 6,
 'Azorhizobium',
 356,
 'Rhizobiales',
 1224,
 'Proteobacteria',
 28211,
 'Alphaproteobacteria',
 335928,
 'Xanthobacteraceae']