# Imports, settings and paths

In [1]:
import pandas as pd
import os
from copy import deepcopy
from random import randint

In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import SingleLetterAlphabet
from tqdm import tqdm_notebook as tqdm
from ete3 import NCBITaxa

In [6]:
ncbi = NCBITaxa()

In [4]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 500)

In [3]:
# path_ncbi = "/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq"
path_ncbi_csv = "/home/ubuntu/Disks/HDD1000/NCBI/ncbi.csv"
path_ncbi_pd = "/home/ubuntu/Disks/HDD1000/NCBI/ncbi_2019-11-26.pd"

# SyntRead Class

In [23]:
class SyntRead:
    count = 0
    
    def __init__(self, record_id, taxon, genome, what, s_kingdom, path_complete, description):  # folder, file,
        self.object_id    = SyntRead.counter()
        self.record_id    = record_id
        self.taxon        = taxon
        self.genome       = genome
        self.gen_chr_plas = what
        self.s_kingdom    = s_kingdom
        self.path         = path_complete  # os.path.join(folder, file)
        self.description  = description
        self.description_original = description
        
        self.seq          = None
        self.seq_len      = None
        self.start        = None
        self.stop         = None
        self.read_len     = None
        self.read         = None 
        self.k_window_segments  = 0
        
    def counter():
        SyntRead.count += 1
        return SyntRead.count - 1
        
    def synthetize_read(self, seq="", synt_length=0, k_window_segments=0, window=10000, min_len=10**3, max_len=10**4):
        assert isinstance(k_window_segments, int)
        if k_window_segments > 0:
            self.k_window_segments = k_window_segments
        
        if seq == "":
            records = list(SeqIO.parse(self.path, "fasta"))
            for r in records:
                if r.id == self.record_id:
                    seq = str(r.seq)
                    break
                else:
                    raise FileNotFoundError("File error, couldn't find the right record")
            self.seq_len = len(seq)
        
        if self.k_window_segments > 0:
            self.start     = randint(0, int(self.seq_len / window) - self.k_window_segments) * window
            self.stop      = self.start + self.k_window_segments * window
        else:
            if synt_length == 0:
                synt_length = randint(min_len, max_len)
            self.start     = randint(0, self.seq_len - synt_length)
            self.stop      = self.start + synt_length
            
        self.read      = seq[self.start:self.stop]  
        self.read_len  = self.stop - self.start
        self.update_description()
        
    def deep_random_copy(self, n):
        seq = self.fetch_seq()
        l = []
        for i in range(n):
            new_item = deepcopy(self)
            new_item.object_id = SyntRead.counter()
            new_item.synthetize_read(seq)
            l.append(new_item)
        return l
    
    def update_description(self):
        self.description = f"tax_id={self.taxon},start={self.start},{self.genome.split()[0]} : " \
            + self.description_original \
            + f", synthetic read (sr_id={self.object_id}) from genome (tax_id={self.taxon}), "\
              f"from nucleotide {self.start} to {self.stop} (len={self.read_len})"
        
    def fetch_seq(self):
        records = list(SeqIO.parse(self.path, "fasta"))
        for r in records:
            if r.id == self.record_id:
                seq = str(r.seq)
                break
            else:
                raise FileNotFoundError("File error, couldn't find the right record")
        return seq
    
    def __repr__(self):
        return f"Synthetic read <{self.object_id}> from {self.genome}, len={self.read_len}, [{self.start}:{self.stop}]"
        
    def save_ground_truth(list_sr, path_pandas):
        if os.path.isfile(path_pandas): input(f"The file {path_pandas} already exists, continue ? ")
        rows = []
        for sr in tqdm(list_sr):
            rows.append([sr.record_id, 10**6, sr.read_len, sr.taxon, sr.genome, sr.gen_chr_plas, sr.start, sr.stop,
                         sr.description, sr.path, ])
        df = pd.DataFrame(rows, columns=["record_id", "quality", "length", "taxon", "name", "gen_chr_plas", "start", "stop", 
                                         "description", "file_path"])
        for col in ["taxon", "name", "gen_chr_plas", "record_id", "description", "file_path"]:
            df[col] = df[col].astype('category')
        df.to_pickle(path_pandas)
        print(f"File writen at: {path_pandas}")
        return df
        
    def to_fastq(list_sr, path_fastq):
        if os.path.isfile(path_fastq): input(f"The file {path_fastq} already exists, continue ? ")
        records = []
        for sr in tqdm(list_sr):
            records.append(SeqRecord(Seq(sr.read, SingleLetterAlphabet), 
                                     id=sr.record_id, name=sr.genome, description=sr.description))
        with open(path_fastq, "w") as f:
            SeqIO.write(records, f, "fasta")
        print(f"File filled with reads at: {path_fastq}")

# Create List of SyntReads

In [8]:
df = pd.read_pickle(path_ncbi_pd)

In [9]:
print(df.shape, df.taxon_id.unique().shape)

(1923707, 25) (15545,)


In [10]:
filtered = df[(df.type == "complete genome") 
               & ( ((df.name_superkingdom == "Bacteria") & (df.length > 130000))
                 | ((df.name_superkingdom == "Archea") & (df.length > 800000))
                 | ((df.name_superkingdom == "Virus") & (df.length > 800)))]
filtered.shape

(4424, 25)

## Select particular species for gut microbia
Based on https://en.wikipedia.org/wiki/Human_gastrointestinal_microbiota#Composition <br>


In [30]:
filtered[filtered.id_genus == 286]["id_species"].value_counts()[:3]

287    72
303    19
294    12
Name: id_species, dtype: int64

In [12]:
filtered[filtered.id_species == 823]

Unnamed: 0,taxon_id,taxon_name,rank,type,length,record_id,record_description,path_folder,path_file,id_superkingdom,id_kingdom,id_phylum,id_class,id_order,id_family,id_genus,id_species,name_superkingdom,name_kingdom,name_phylum,name_class,name_order,name_family,name_genus,name_species
978,435591,Parabacteroides distasonis ATCC 8503,no rank,complete genome,4811379,NC_009615.1,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000012845.1/GCF_000012845.1_ASM12...,2,0,976,200643,171549,2005525,375288,823,Bacteria,0,Bacteroidetes,Bacteroidia,Bacteroidales,Tannerellaceae,Parabacteroides,Parabacteroides distasonis


In [13]:
gut_ids_always = [1351, 562, 1280, 573 ]  # Take 3 strains each
gut_ids_often = [817, 550, 216816, 1590]   # 1 strain each
gut_ids_rare = [1502, 584, 287, 823]

rows = []
for gut_id in gut_ids_always:
    print(ncbi.get_taxid_translator([gut_id]), filtered[filtered.id_species == gut_id].shape[0])
    rows.append(filtered[filtered.id_species == gut_id].sample(3))
for gut_id in gut_ids_often + gut_ids_rare:
    print(ncbi.get_taxid_translator([gut_id]), filtered[filtered.id_species == gut_id].shape[0])
    rows.append(filtered[filtered.id_species == gut_id].sample(1))

selected_species = pd.concat(rows)
selected_species

{1351: 'Enterococcus faecalis'} 11
{562: 'Escherichia coli'} 264
{1280: 'Staphylococcus aureus'} 144
{573: 'Klebsiella pneumoniae'} 94
{817: 'Bacteroides fragilis'} 3
{550: 'Enterobacter cloacae'} 16
{216816: 'Bifidobacterium longum'} 17
{1590: 'Lactobacillus plantarum'} 21
{1502: 'Clostridium perfringens'} 7
{584: 'Proteus mirabilis'} 8
{287: 'Pseudomonas aeruginosa'} 72
{823: 'Parabacteroides distasonis'} 1


Unnamed: 0,taxon_id,taxon_name,rank,type,length,record_id,record_description,path_folder,path_file,id_superkingdom,id_kingdom,id_phylum,id_class,id_order,id_family,id_genus,id_species,name_superkingdom,name_kingdom,name_phylum,name_class,name_order,name_family,name_genus,name_species
10643,1351,Enterococcus faecalis,species,complete genome,3026009,NZ_CP015410.2,NZ_CP015410.2 Enterococcus faecalis strain KB1...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001689055.2/GCF_001689055.2_ASM16...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis
15495,1351,Enterococcus faecalis,species,complete genome,2704865,NZ_AP017623.1,"NZ_AP017623.1 Enterococcus faecalis DNA, compl...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_002355755.1/GCF_002355755.1_ASM23...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis
11728,1351,Enterococcus faecalis,species,complete genome,2668255,NZ_CP018102.1,NZ_CP018102.1 Enterococcus faecalis strain L12...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001886675.1/GCF_001886675.1_ASM18...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis
11973,562,Escherichia coli,species,complete genome,5399183,NZ_CP010235.1,"NZ_CP010235.1 Escherichia coli strain S40, com...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001900945.1/GCF_001900945.1_ASM19...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
8488,562,Escherichia coli,species,complete genome,5059732,NZ_CP012631.1,"NZ_CP012631.1 Escherichia coli strain SF-173, ...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001280405.1/GCF_001280405.1_ASM12...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
14426,562,Escherichia coli,species,complete genome,4940441,NZ_CP021689.1,"NZ_CP021689.1 Escherichia coli strain AR_0058,...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_002180195.1/GCF_002180195.1_ASM21...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
9101,46170,Staphylococcus aureus subsp. aureus,subspecies,complete genome,2987124,NZ_CP012015.1,NZ_CP012015.1 Staphylococcus aureus subsp. aur...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001515665.1/GCF_001515665.1_ASM15...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus
4811,1074919,Staphylococcus aureus subsp. aureus ST228,no rank,complete genome,2759328,NC_020568.1,NC_020568.1 Staphylococcus aureus subsp. aureu...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000383005.1/GCF_000383005.1_ASM38...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus
12799,1280,Staphylococcus aureus,species,complete genome,2809136,NZ_CP019563.1,NZ_CP019563.1 Staphylococcus aureus strain SR4...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001986135.1/GCF_001986135.1_ASM19...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus
6914,72407,Klebsiella pneumoniae subsp. pneumoniae,subspecies,complete genome,5228295,NZ_CP009876.1,NZ_CP009876.1 Klebsiella pneumoniae subsp. pne...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000785005.1/GCF_000785005.1_ASM78...,2,0,1224,1236,91347,543,570,573,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Klebsiella,Klebsiella pneumoniae


In [13]:
def choose_genomes(tax_ids):
    """From a list of taxonomy ids, select these species with the information of each genome (name path length...)"""
    rows = []
    for gut_id in gut_ids_always:
        print(ncbi.get_taxid_translator([gut_id]), filtered[filtered.id_species == gut_id].shape[0])
        rows.append(filtered[filtered.id_species == gut_id].sample(1))
    selected_species = pd.concat(rows)
    selected_species["path"] = selected_species.path_folder.str.cat(selected_species.path_file)
    selected_species.drop(columns=["path_folder", "path_file"], inplace=True)
    return selected_species
    
gut_ids_always = [1351, 562, 1280, 573 ]  # Take 3 strains each
gut_ids_often = [817, 550, 216816, 1590]   # 1 strain each
gut_ids_rare = [1502, 584, 287, 823]
choose_genomes(gut_ids_always*3 + gut_ids_often + gut_ids_rare)

{1351: 'Enterococcus faecalis'} 11
{562: 'Escherichia coli'} 264
{1280: 'Staphylococcus aureus'} 144
{573: 'Klebsiella pneumoniae'} 94
{817: 'Bacteroides fragilis'} 3
{550: 'Enterobacter cloacae'} 16
{216816: 'Bifidobacterium longum'} 17
{1590: 'Lactobacillus plantarum'} 21
{1502: 'Clostridium perfringens'} 7
{584: 'Proteus mirabilis'} 8
{287: 'Pseudomonas aeruginosa'} 72
{823: 'Parabacteroides distasonis'} 1


Unnamed: 0,taxon_id,taxon_name,rank,type,length,record_id,record_description,path_folder,path_file,id_superkingdom,id_kingdom,id_phylum,id_class,id_order,id_family,id_genus,id_species,name_superkingdom,name_kingdom,name_phylum,name_class,name_order,name_family,name_genus,name_species
10643,1351,Enterococcus faecalis,species,complete genome,3026009,NZ_CP015410.2,NZ_CP015410.2 Enterococcus faecalis strain KB1...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001689055.2/GCF_001689055.2_ASM16...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis
15495,1351,Enterococcus faecalis,species,complete genome,2704865,NZ_AP017623.1,"NZ_AP017623.1 Enterococcus faecalis DNA, compl...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_002355755.1/GCF_002355755.1_ASM23...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis
11728,1351,Enterococcus faecalis,species,complete genome,2668255,NZ_CP018102.1,NZ_CP018102.1 Enterococcus faecalis strain L12...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001886675.1/GCF_001886675.1_ASM18...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis
11973,562,Escherichia coli,species,complete genome,5399183,NZ_CP010235.1,"NZ_CP010235.1 Escherichia coli strain S40, com...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001900945.1/GCF_001900945.1_ASM19...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
8488,562,Escherichia coli,species,complete genome,5059732,NZ_CP012631.1,"NZ_CP012631.1 Escherichia coli strain SF-173, ...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001280405.1/GCF_001280405.1_ASM12...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
14426,562,Escherichia coli,species,complete genome,4940441,NZ_CP021689.1,"NZ_CP021689.1 Escherichia coli strain AR_0058,...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_002180195.1/GCF_002180195.1_ASM21...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
9101,46170,Staphylococcus aureus subsp. aureus,subspecies,complete genome,2987124,NZ_CP012015.1,NZ_CP012015.1 Staphylococcus aureus subsp. aur...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001515665.1/GCF_001515665.1_ASM15...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus
4811,1074919,Staphylococcus aureus subsp. aureus ST228,no rank,complete genome,2759328,NC_020568.1,NC_020568.1 Staphylococcus aureus subsp. aureu...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000383005.1/GCF_000383005.1_ASM38...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus
12799,1280,Staphylococcus aureus,species,complete genome,2809136,NZ_CP019563.1,NZ_CP019563.1 Staphylococcus aureus strain SR4...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001986135.1/GCF_001986135.1_ASM19...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus
6914,72407,Klebsiella pneumoniae subsp. pneumoniae,subspecies,complete genome,5228295,NZ_CP009876.1,NZ_CP009876.1 Klebsiella pneumoniae subsp. pne...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000785005.1/GCF_000785005.1_ASM78...,2,0,1224,1236,91347,543,570,573,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Klebsiella,Klebsiella pneumoniae


In [14]:
selected_species["path"] = selected_species.path_folder.str.cat(selected_species.path_file)
selected_species.drop(columns=["path_folder", "path_file"], inplace=True)
# selected_species

In [24]:
seeds = []
for row in tqdm(selected_species.itertuples()):
#     print(row.taxon_id)
    #  record_id, taxon, what, s_kingdom, folder, file    row["type"]
    #  record_id, taxon, genome, what, s_kingdom, path_complete, description
    sr = SyntRead(row.record_id, row.taxon_id, row.taxon_name, row.type, row.name_superkingdom, row.path, row.record_description)
    sr.synthetize_read(k_window_segments=1)
    print(sr)
    seeds.append(sr)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Synthetic read <0> from Enterococcus faecalis, len=10000, [230000:240000]
Synthetic read <1> from Enterococcus faecalis, len=10000, [2200000:2210000]
Synthetic read <2> from Enterococcus faecalis, len=10000, [2230000:2240000]
Synthetic read <3> from Escherichia coli, len=10000, [620000:630000]
Synthetic read <4> from Escherichia coli, len=10000, [1270000:1280000]
Synthetic read <5> from Escherichia coli, len=10000, [1180000:1190000]
Synthetic read <6> from Staphylococcus aureus subsp. aureus, len=10000, [1960000:1970000]
Synthetic read <7> from Staphylococcus aureus subsp. aureus ST228, len=10000, [160000:170000]
Synthetic read <8> from Staphylococcus aureus, len=10000, [980000:990000]
Synthetic read <9> from Klebsiella pneumoniae subsp. pneumoniae, len=10000, [1430000:1440000]
Synthetic read <10> from Klebsiella pneumoniae KCTC 2242, len=10000, [4800000:4810000]
Synthetic read <11> from Klebsiella pneumoniae subsp. pneumoniae 1158, len=10000, [2070000:2080000]
Synthetic read <12> from

In [25]:
synt_reads = []
for seed in tqdm(seeds):
    synt_reads.extend([seed] + seed.deep_random_copy(int(nb_reads/len(seeds)) -1))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))




In [26]:
path_out_syntreads = f"/home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/" \
                     f"2019-12-04_{nb_reads}-SyntReads_{len(seeds)}-BacGut.fastq"
SyntRead.to_fastq(synt_reads, path_out_syntreads)

The file /home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/2019-12-04_100000-SyntReads_20-BacGut.fastq already exists, continue ? 


HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


File filled with reads at: /home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/2019-12-04_100000-SyntReads_20-BacGut.fastq
output at:  /home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/2019-12-04_100000-SyntReads_20-BacGut.fastq


In [28]:
path_out_ground_truth = f"/home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/" \
                     f"2019-12-04_{nb_reads}-SyntReads_{len(seeds)}-BacGut.GT.pd"
df = SyntRead.save_ground_truth(synt_reads, path_out_ground_truth)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


File writen at: /home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/2019-12-04_100000-SyntReads_20-BacGut.GT.pd


In [29]:
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'34.5 MB'

In [20]:
df

Unnamed: 0,taxon,genome,gen_chr_plas,read_len,start,stop,record_id,object_id,description,path
0,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,9676,2588160,2597836,NC_019770.1,0,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
1,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,6270,940429,946699,NC_019770.1,20,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
2,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,7470,1466599,1474069,NC_019770.1,21,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
3,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,3294,184431,187725,NC_019770.1,22,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
4,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,2879,844312,847191,NC_019770.1,23,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
...,...,...,...,...,...,...,...,...,...,...
99995,435591,Parabacteroides distasonis ATCC 8503,complete genome,2287,98603,100890,NC_009615.1,99995,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
99996,435591,Parabacteroides distasonis ATCC 8503,complete genome,7327,3373009,3380336,NC_009615.1,99996,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
99997,435591,Parabacteroides distasonis ATCC 8503,complete genome,9824,3633633,3643457,NC_009615.1,99997,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
99998,435591,Parabacteroides distasonis ATCC 8503,complete genome,4971,2241208,2246179,NC_009615.1,99998,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...


# End

In [79]:
# df.info(memory_usage='deep')
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'461.5 MB'

In [39]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 500)

In [76]:
df[(df["type"] == "undefined") 
   & ~(df.record_description.str.lower().str.contains("drosophila melanogaster")) 
   & ~(df.record_description.str.lower().str.contains("plasmid")) 
   & ~(df.record_description.str.lower().str.contains("scaffold")) 
   & ~(df.record_description.str.lower().str.contains("whole genome shotgun sequence")) 
   & ~(df.record_description.str.lower().str.contains("contig"))][["record_description", "length"]] 

Unnamed: 0,record_description,length
27,"NC_005791.1 Methanococcus maripaludis strain S2, complete sequence",1661137
236,NC_023044.1 Methanobacterium sp. MB1 complete sequence,2029766
249,NZ_CP007551.1 Haloferax mediterranei ATCC 33500 genome,2946877
257,NZ_CP008822.1 Metallosphaera sedula strain CuR1 genome,2191492
327,NZ_CP013695.1 Sulfolobus acidocaldarius strain NG05B_CO5_07 genome,2217426
356,NZ_CP019470.1 Methanopyrus sp. KOL6 genome,1430309
368,NZ_CP015193.1 Complete genome sequence of Thermococcus chitonophagus type strain GC74,1961979
379,NZ_CP017881.1 Methanohalophilus portucalensis strain FDF-1T genome,2084975
525,NC_002655.2 Escherichia coli O157:H7 str. EDL933 genome,5528445
595,"NC_002940.2 [Haemophilus] ducreyi 35000HP, complete sequence",1698955


PD from CSV

In [76]:
# path_ncbi_csv
df = pd.read_csv("/home/ubuntu/Disks/HDD1000/NCBI/ncbi_2019-11-26.csv", sep="\t", )

  interactivity=interactivity, compiler=compiler, result=result)


In [77]:
# df.info(memory_usage='deep')
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'1137.2 MB'

In [78]:
df.shape

(877820, 25)

In [79]:
for col in df.columns:
    if "id_" in col:
        print(col)
        df[col].fillna(0, inplace=True)
        df[col] = df[col].astype(int)

id_superkingdom
id_kingdom
id_phylum
id_class
id_order
id_family
id_genus
id_species


In [80]:
col_categories = ['rank', 'type', 'path_folder', 
                  'id_superkingdom', 'name_superkingdom', 'id_kingdom', 'name_kingdom', 'id_phylum', 'name_phylum', 
                  'id_class', 'name_class', 'id_order', 'name_order', 'id_family', 'name_family', 
                  'id_genus', 'name_genus', 'id_species', 'name_species']

for col in tqdm(col_categories):
    df[col] = df[col].astype('category')
print(f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB")

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))


438.1 MB


In [81]:
# path_ncbi_pd
df.to_pickle('/home/ubuntu/Disks/HDD1000/NCBI/ncbi_2019-11-26.pd')

In [6]:
desired_ranks = ['superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
headers = ["taxon_id", "taxon_name", "rank", "type", "length", 
           "record_id", "record_description", "path_folder", "path_file", ]
[headers.extend([f"id_{v}", f"name_{v}"]) for v in desired_ranks]
headers

['taxon_id',
 'taxon_name',
 'rank',
 'type',
 'length',
 'record_id',
 'record_description',
 'path_folder',
 'path_file',
 'id_superkingdom',
 'name_superkingdom',
 'id_kingdom',
 'name_kingdom',
 'id_phylum',
 'name_phylum',
 'id_class',
 'name_class',
 'id_order',
 'name_order',
 'id_family',
 'name_family',
 'id_genus',
 'name_genus',
 'id_species',
 'name_species']

# Read the csv and transform to pandas, plus column dtype conversion

Might take a while if file is big...

# End

In [144]:
path_file = "/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/bacteria/GCF_000006845.1/GCF_000006845.1_ASM684v1_genomic.fna"

In [150]:
record = SeqIO.parse(path_file, "fasta")

In [151]:
r = next(record)
r

SeqRecord(seq=Seq('ATAAATTTTTGCACGGGTTGTGGATAAAATATCGGCGAGTCGGTATAATCGGTT...TGG', SingleLetterAlphabet()), id='NC_002946.2', name='NC_002946.2', description='NC_002946.2 Neisseria gonorrhoeae FA 1090 chromosome, complete genome', dbxrefs=[])

In [59]:
len(record.seq)

1936387

In [85]:
record.seq.upper()

Seq('AATTTAAAGATTAAAATTAGTAGACTGTCGATTTACAATATCATATTTATGAGT...TAA', SingleLetterAlphabet())

In [73]:
print(record)

ID: NC_014222.1
Name: NC_014222.1
Description: NC_014222.1 Methanococcus voltae A3, complete genome
Number of features: 0
Seq('AATTTAAAGATTAAAATTAGTAGACTGTCGATTTACAATATCATATTTATGAGT...TAA', SingleLetterAlphabet())


In [62]:
dir(record)

['__add__',
 '__bool__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__le___',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_per_letter_annotations',
 '_seq',
 '_set_per_letter_annotations',
 '_set_seq',
 'annotations',
 'dbxrefs',
 'description',
 'features',
 'format',
 'id',
 'letter_annotations',
 'lower',
 'name',
 'reverse_complement',
 'seq',
 'translate',
 'upper']

In [86]:
record.description

'NC_014222.1 Methanococcus voltae A3, complete genome'

In [176]:
get_desired_ranks(2, desired_ranks)

{'superkingdom_id': 2,
 'kingdom_id': 'NaN',
 'phylum_id': 'NaN',
 'class_id': 'NaN',
 'order_id': 'NaN',
 'family_id': 'NaN',
 'genus_id': 'NaN',
 'species_id': 'NaN'}

In [154]:
get_desired_ranks(9606, desired_ranks, tolist=True)

[2759, 33208, 7711, 40674, 9443, 9604, 9605, 9606]

In [155]:
ncbi.translate_to_names(get_desired_ranks(9606, desired_ranks, tolist=True))

['Eukaryota',
 'Metazoa',
 'Chordata',
 'Mammalia',
 'Primates',
 'Hominidae',
 'Homo',
 'Homo sapiens']

In [184]:
ranks_id_str = ncbi.get_taxid_translator(get_desired_ranks(6, desired_ranks, tolist=True))
ranks_id_str

{2: 'Bacteria',
 6: 'Azorhizobium',
 356: 'Rhizobiales',
 1224: 'Proteobacteria',
 28211: 'Alphaproteobacteria',
 335928: 'Xanthobacteraceae'}

In [185]:
l=[]
[l.extend([k,v]) for k,v in ranks_id_str.items()]

[None, None, None, None, None, None]

In [186]:
l

[2,
 'Bacteria',
 6,
 'Azorhizobium',
 356,
 'Rhizobiales',
 1224,
 'Proteobacteria',
 28211,
 'Alphaproteobacteria',
 335928,
 'Xanthobacteraceae']