# Imports, settings and paths

In [1]:
import pandas as pd
import os
from copy import deepcopy
from random import randint

In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import SingleLetterAlphabet
from tqdm import tqdm_notebook as tqdm
from ete3 import NCBITaxa

In [3]:
ncbi = NCBITaxa()

In [4]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 500)

In [5]:
# path_ncbi = "/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq"
path_ncbi_csv = "/home/ubuntu/Disks/HDD1000/NCBI/ncbi.csv"
path_ncbi_pd = "/home/ubuntu/Disks/HDD1000/NCBI/ncbi_2019-11-26.pd"

# SyntRead Class

In [22]:
class SyntRead:
    count = 0
    
    def __init__(self, record_id, taxon, genome, what, s_kingdom, path_complete, description):  # folder, file,
        self.object_id    = SyntRead.counter()
        self.record_id    = record_id
        self.taxon        = taxon
        self.genome       = genome
        self.gen_chr_plas = what
        self.s_kingdom    = s_kingdom
        self.path         = path_complete  # os.path.join(folder, file)
        self.description  = description
        self.description_original = description
        
        self.seq          = None
        self.seq_len      = None
        self.start        = None
        self.stop         = None
        self.read_len     = None
        self.read         = None 
        self.k_window_segments  = 0
        
    def counter():
        SyntRead.count += 1
        return SyntRead.count - 1
        
    def synthetize_read(self, seq="", synt_length=0, k_window_segments=0, window=10000, min_len=10**3, max_len=10**4):
        """ to avoid storing the whole genome sequence multiple time, in each instance, 
            the sequence can be passed as an argument. 
            Method can be used to generate reads of multiples of windows size or of real random location (k_window_segments). 
            Length is a multiple of the window size in this case, or set by min_len / max_len.
            
        """
        assert isinstance(k_window_segments, int)
        if k_window_segments > 0:
            self.k_window_segments = k_window_segments
        
        if seq == "":
            records = list(SeqIO.parse(self.path, "fasta"))
            for r in records:
                if r.id == self.record_id:
                    seq = str(r.seq)
                    break
            if seq == "":
                raise FileNotFoundError(f"Sequence not found in \n{', '.join([r.id for r in records])} \n{self.path}")
            self.seq_len = len(seq)
        
        if self.k_window_segments > 0:
            self.start     = randint(0, int(self.seq_len / window) - self.k_window_segments) * window
            self.stop      = self.start + self.k_window_segments * window
        else:
            if synt_length == 0:
                synt_length = randint(min_len, max_len)
            self.start     = randint(0, self.seq_len - synt_length)
            self.stop      = self.start + synt_length
            
        self.read      = seq[self.start:self.stop]  
        self.read_len  = self.stop - self.start
        self.update_description()
        
    def deep_random_copy(self, n):
        seq = self.fetch_seq()
        l = []
        for i in range(n):
            new_item = deepcopy(self)
            new_item.object_id = SyntRead.counter()
            new_item.synthetize_read(seq)
            l.append(new_item)
        return l
    
    def update_description(self):
        self.description = f"tax_id={self.taxon},start={self.start},{self.genome.split()[0]} : " \
            + self.description_original \
            + f", synthetic read (sr_id={self.object_id}) from genome (tax_id={self.taxon}), "\
              f"from nucleotide {self.start} to {self.stop} (len={self.read_len})"
        
    def fetch_seq(self):
        records = list(SeqIO.parse(self.path, "fasta"))
        for r in records:
            if r.id == self.record_id:
                seq = str(r.seq)
                break
            else:
                raise FileNotFoundError("File error, couldn't find the right record")
        return seq
    
    def __repr__(self):
        return f"Synthetic read <{self.object_id}> from {self.genome}, len={self.read_len}, [{self.start}:{self.stop}]"
        
    def save_ground_truth(list_sr, path_pandas):
        if os.path.isfile(path_pandas): input(f"The file {path_pandas} already exists, continue ? ")
        rows = []
        for sr in tqdm(list_sr):
            rows.append([sr.record_id, 10**6, sr.read_len, sr.taxon, sr.genome, sr.gen_chr_plas, sr.start, sr.stop,
                         sr.description, sr.path, ])
        df = pd.DataFrame(rows, columns=["record_id", "quality", "length", "taxon", "name", "gen_chr_plas", "start", "stop", 
                                         "description", "file_path"])
        for col in ["taxon", "name", "gen_chr_plas", "record_id", "description", "file_path"]:
            df[col] = df[col].astype('category')
        df.to_pickle(path_pandas)
        print(f"File writen at: {path_pandas}")
        return df
        
    def to_fastq(list_sr, path_fastq):
        if os.path.isfile(path_fastq): input(f"The file {path_fastq} already exists, continue ? ")
        records = []
        for sr in tqdm(list_sr):
            records.append(SeqRecord(Seq(sr.read, SingleLetterAlphabet), 
                                     id=sr.record_id, name=sr.genome, description=sr.description))
        with open(path_fastq, "w") as f:
            SeqIO.write(records, f, "fasta")
        print(f"File filled with reads at: {path_fastq}")

# Create List of SyntReads

In [7]:
df = pd.read_pickle(path_ncbi_pd)

In [8]:
print(df.shape, df.taxon_id.unique().shape)

(1923707, 25) (15545,)


In [9]:
filtered = df[(df.type == "complete genome") 
               & ( ((df.name_superkingdom == "Bacteria") & (df.length > 130000))
                 | ((df.name_superkingdom == "Archea") & (df.length > 800000))
                 | ((df.name_superkingdom == "Virus") & (df.length > 800)))]
filtered.shape

(4424, 25)

## Select particular species for gut microbia
Based on https://en.wikipedia.org/wiki/Human_gastrointestinal_microbiota#Composition <br>


In [10]:
filtered[filtered.id_genus == 286]["id_species"].value_counts()[:3]

287    72
303    19
294    12
Name: id_species, dtype: int64

In [11]:
filtered[filtered.id_species == 823]

Unnamed: 0,taxon_id,taxon_name,rank,type,length,record_id,record_description,path_folder,path_file,id_superkingdom,id_kingdom,id_phylum,id_class,id_order,id_family,id_genus,id_species,name_superkingdom,name_kingdom,name_phylum,name_class,name_order,name_family,name_genus,name_species
978,435591,Parabacteroides distasonis ATCC 8503,no rank,complete genome,4811379,NC_009615.1,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000012845.1/GCF_000012845.1_ASM12...,2,0,976,200643,171549,2005525,375288,823,Bacteria,0,Bacteroidetes,Bacteroidia,Bacteroidales,Tannerellaceae,Parabacteroides,Parabacteroides distasonis


In [19]:
gut_ids_always = [1351, 562, 1280, 573 ]  # Take 3 strains each
gut_ids_often = [817, 550, 216816, 1590]   # 1 strain each
gut_ids_rare = [1502, 584, 287, 823]

rows = []
for gut_id in gut_ids_always:
    print(ncbi.get_taxid_translator([gut_id]), filtered[filtered.id_species == gut_id].shape[0])
    rows.append(filtered[filtered.id_species == gut_id].sample(3))
for gut_id in gut_ids_often + gut_ids_rare:
    print(ncbi.get_taxid_translator([gut_id]), filtered[filtered.id_species == gut_id].shape[0])
    rows.append(filtered[filtered.id_species == gut_id].sample(1))

selected_species = pd.concat(rows)
selected_species

{1351: 'Enterococcus faecalis'} 11
{562: 'Escherichia coli'} 264
{1280: 'Staphylococcus aureus'} 144
{573: 'Klebsiella pneumoniae'} 94
{817: 'Bacteroides fragilis'} 3
{550: 'Enterobacter cloacae'} 16
{216816: 'Bifidobacterium longum'} 17
{1590: 'Lactobacillus plantarum'} 21
{1502: 'Clostridium perfringens'} 7
{584: 'Proteus mirabilis'} 8
{287: 'Pseudomonas aeruginosa'} 72
{823: 'Parabacteroides distasonis'} 1


Unnamed: 0,taxon_id,taxon_name,rank,type,length,record_id,record_description,path_folder,path_file,id_superkingdom,id_kingdom,id_phylum,id_class,id_order,id_family,id_genus,id_species,name_superkingdom,name_kingdom,name_phylum,name_class,name_order,name_family,name_genus,name_species
2523,474186,Enterococcus faecalis OG1RF,no rank,complete genome,2739625,NC_017316.1,"NC_017316.1 Enterococcus faecalis OG1RF, compl...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000172575.2/GCF_000172575.2_ASM17...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis
6635,1201292,Enterococcus faecalis ATCC 29212,no rank,complete genome,2939973,NZ_CP008816.1,NZ_CP008816.1 Enterococcus faecalis ATCC 29212...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000742975.1/GCF_000742975.1_ASM74...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis
11728,1351,Enterococcus faecalis,species,complete genome,2668255,NZ_CP018102.1,NZ_CP018102.1 Enterococcus faecalis strain L12...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001886675.1/GCF_001886675.1_ASM18...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis
7904,562,Escherichia coli,species,complete genome,4605301,NZ_CP011324.1,"NZ_CP011324.1 Escherichia coli strain SQ2203, ...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000988465.1/GCF_000988465.1_ASM98...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
7859,83333,Escherichia coli K-12,no rank,complete genome,4609629,NZ_CP010444.1,NZ_CP010444.1 Escherichia coli K-12 strain ER3...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000974865.1/GCF_000974865.1_ASM97...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
11164,562,Escherichia coli,species,complete genome,5176750,NZ_CP015834.1,"NZ_CP015834.1 Escherichia coli strain MS6198, ...",/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001721525.1/GCF_001721525.1_ASM17...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
7738,1074919,Staphylococcus aureus subsp. aureus ST228,no rank,complete genome,2759510,NC_020529.1,NC_020529.1 Staphylococcus aureus subsp. aureu...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000967325.1/GCF_000967325.1_ASM96...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus
11203,1280,Staphylococcus aureus,species,complete genome,2903376,NZ_CP012692.1,NZ_CP012692.1 Staphylococcus aureus strain FOR...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_001725965.1/GCF_001725965.1_ASM17...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus
2319,548473,Staphylococcus aureus subsp. aureus TCH60,no rank,complete genome,2802675,NC_017342.1,NC_017342.1 Staphylococcus aureus subsp. aureu...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_000159535.2/GCF_000159535.2_ASM15...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus
13834,573,Klebsiella pneumoniae,species,complete genome,5264852,NZ_CP015120.1,NZ_CP015120.1 Klebsiella pneumoniae strain kp7...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/,bacteria/GCF_002116885.1/GCF_002116885.1_ASM21...,2,0,1224,1236,91347,543,570,573,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Klebsiella,Klebsiella pneumoniae


In [20]:
selected_species["path"] = selected_species.path_folder.str.cat(selected_species.path_file)
selected_species.drop(columns=["path_folder", "path_file"], inplace=True)
# selected_species

In [23]:
def choose_genomes(tax_ids):
    """From a list of taxonomy ids, select these species with the information of each genome (name path length...)"""
    rows = []
    for gut_id in tax_ids:
        print(ncbi.get_taxid_translator([gut_id]), filtered[filtered.id_species == gut_id].shape[0])
        rows.append(filtered[filtered.id_species == gut_id].sample(1))
    selected_species = pd.concat(rows)
    selected_species["path"] = selected_species.path_folder.str.cat(selected_species.path_file)
    selected_species.drop(columns=["path_folder", "path_file"], inplace=True)
    return selected_species
    
gut_ids_always = [1351, 562, 1280, 573 ]  # Take 3 strains each
gut_ids_often = [817, 550, 216816, 1590]   # 1 strain each
gut_ids_rare = [1502, 584, 287, 823]
selected_species= choose_genomes(gut_ids_always*3 + gut_ids_often + gut_ids_rare)

{1351: 'Enterococcus faecalis'} 11
{562: 'Escherichia coli'} 264
{1280: 'Staphylococcus aureus'} 144
{573: 'Klebsiella pneumoniae'} 94
{1351: 'Enterococcus faecalis'} 11
{562: 'Escherichia coli'} 264
{1280: 'Staphylococcus aureus'} 144
{573: 'Klebsiella pneumoniae'} 94
{1351: 'Enterococcus faecalis'} 11
{562: 'Escherichia coli'} 264
{1280: 'Staphylococcus aureus'} 144
{573: 'Klebsiella pneumoniae'} 94
{817: 'Bacteroides fragilis'} 3
{550: 'Enterobacter cloacae'} 16
{216816: 'Bifidobacterium longum'} 17
{1590: 'Lactobacillus plantarum'} 21
{1502: 'Clostridium perfringens'} 7
{584: 'Proteus mirabilis'} 8
{287: 'Pseudomonas aeruginosa'} 72
{823: 'Parabacteroides distasonis'} 1


In [24]:
selected_species

Unnamed: 0,taxon_id,taxon_name,rank,type,length,record_id,record_description,id_superkingdom,id_kingdom,id_phylum,id_class,id_order,id_family,id_genus,id_species,name_superkingdom,name_kingdom,name_phylum,name_class,name_order,name_family,name_genus,name_species,path
22108,1351,Enterococcus faecalis,species,complete genome,2893216,NZ_AP018538.1,NZ_AP018538.1 Enterococcus faecalis KUB3006 DN...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
3549,566546,Escherichia coli W,no rank,complete genome,4897452,NC_017664.1,"NC_017664.1 Escherichia coli W, complete genome",2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
7078,1280,Staphylococcus aureus,species,complete genome,2872771,NZ_CP010300.1,NZ_CP010300.1 Staphylococcus aureus strain 27b...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
20755,573,Klebsiella pneumoniae,species,complete genome,5478620,NZ_AP018671.1,NZ_AP018671.1 Klebsiella pneumoniae GSU10-3 DN...,2,0,1224,1236,91347,543,570,573,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Klebsiella,Klebsiella pneumoniae,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
15495,1351,Enterococcus faecalis,species,complete genome,2704865,NZ_AP017623.1,"NZ_AP017623.1 Enterococcus faecalis DNA, compl...",2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
11863,562,Escherichia coli,species,complete genome,5041399,NZ_CP018206.1,NZ_CP018206.1 Escherichia coli strain MRSN3466...,2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
8540,46170,Staphylococcus aureus subsp. aureus,subspecies,complete genome,2792802,NZ_CP010402.1,NZ_CP010402.1 Staphylococcus aureus subsp. aur...,2,0,1239,91061,1385,90964,1279,1280,Bacteria,0,Firmicutes,Bacilli,Bacillales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
12194,573,Klebsiella pneumoniae,species,complete genome,5373057,NZ_CP018695.1,NZ_CP018695.1 Klebsiella pneumoniae strain Kp_...,2,0,1224,1236,91347,543,570,573,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Klebsiella,Klebsiella pneumoniae,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
6635,1201292,Enterococcus faecalis ATCC 29212,no rank,complete genome,2939973,NZ_CP008816.1,NZ_CP008816.1 Enterococcus faecalis ATCC 29212...,2,0,1239,91061,186826,81852,1350,1351,Bacteria,0,Firmicutes,Bacilli,Lactobacillales,Enterococcaceae,Enterococcus,Enterococcus faecalis,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
1413,439855,Escherichia coli SMS-3-5,no rank,complete genome,5068389,NC_010498.1,"NC_010498.1 Escherichia coli SMS-3-5, complete...",2,0,1224,1236,91347,543,561,562,Bacteria,0,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...


In [25]:
seeds = []
for row in tqdm(selected_species.itertuples()):
#     print(row.taxon_id)
    #  record_id, taxon, what, s_kingdom, folder, file    row["type"]
    #  record_id, taxon, genome, what, s_kingdom, path_complete, description
    sr = SyntRead(row.record_id, row.taxon_id, row.taxon_name, row.type, row.name_superkingdom, row.path, row.record_description)
    sr.synthetize_read(k_window_segments=1)
    print(sr)
    seeds.append(sr)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Synthetic read <0> from Enterococcus faecalis, len=10000, [230000:240000]
Synthetic read <1> from Escherichia coli W, len=10000, [3730000:3740000]
Synthetic read <2> from Staphylococcus aureus, len=10000, [230000:240000]
Synthetic read <3> from Klebsiella pneumoniae, len=10000, [1820000:1830000]
Synthetic read <4> from Enterococcus faecalis, len=10000, [1460000:1470000]
Synthetic read <5> from Escherichia coli, len=10000, [510000:520000]
Synthetic read <6> from Staphylococcus aureus subsp. aureus, len=10000, [2710000:2720000]
Synthetic read <7> from Klebsiella pneumoniae, len=10000, [3050000:3060000]
Synthetic read <8> from Enterococcus faecalis ATCC 29212, len=10000, [1960000:1970000]
Synthetic read <9> from Escherichia coli SMS-3-5, len=10000, [4700000:4710000]
Synthetic read <10> from Staphylococcus aureus, len=10000, [630000:640000]
Synthetic read <11> from Klebsiella pneumoniae, len=10000, [2060000:2070000]
Synthetic read <12> from Bacteroides fragilis YCH46, len=10000, [890000:90

In [27]:
nb_reads = 10**5
synt_reads = []
for seed in tqdm(seeds):
    synt_reads.extend([seed] + seed.deep_random_copy(int(nb_reads/len(seeds)) -1))

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))




In [28]:
path_out_syntreads = f"/home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/" \
                     f"2019-12-05_{nb_reads}-WindowReads_{len(seeds)}-BacGut.fastq"
SyntRead.to_fastq(synt_reads, path_out_syntreads)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


File filled with reads at: /home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/2019-12-05_100000-WindowReads_20-BacGut.fastq


In [29]:
path_out_ground_truth = f"/home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/" \
                     f"2019-12-05_{nb_reads}-WindowReads_{len(seeds)}-BacGut.GT.pd"
df = SyntRead.save_ground_truth(synt_reads, path_out_ground_truth)

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))


File writen at: /home/ubuntu/Data/Segmentation/Test-Data/Synthetic_from_Genomes/2019-12-05_100000-WindowReads_20-BacGut.GT.pd


In [30]:
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'33.5 MB'

In [20]:
df

Unnamed: 0,taxon,genome,gen_chr_plas,read_len,start,stop,record_id,object_id,description,path
0,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,9676,2588160,2597836,NC_019770.1,0,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
1,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,6270,940429,946699,NC_019770.1,20,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
2,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,7470,1466599,1474069,NC_019770.1,21,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
3,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,3294,184431,187725,NC_019770.1,22,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
4,1261557,Enterococcus faecalis str. Symbioflor 1,complete genome,2879,844312,847191,NC_019770.1,23,NC_019770.1 Enterococcus faecalis str. Symbiof...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
...,...,...,...,...,...,...,...,...,...,...
99995,435591,Parabacteroides distasonis ATCC 8503,complete genome,2287,98603,100890,NC_009615.1,99995,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
99996,435591,Parabacteroides distasonis ATCC 8503,complete genome,7327,3373009,3380336,NC_009615.1,99996,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
99997,435591,Parabacteroides distasonis ATCC 8503,complete genome,9824,3633633,3643457,NC_009615.1,99997,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...
99998,435591,Parabacteroides distasonis ATCC 8503,complete genome,4971,2241208,2246179,NC_009615.1,99998,NC_009615.1 Parabacteroides distasonis ATCC 85...,/home/ubuntu/Disks/HDD1000/NCBI/20190704/refse...


# End

In [79]:
# df.info(memory_usage='deep')
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'461.5 MB'

In [39]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', 500)

In [76]:
df[(df["type"] == "undefined") 
   & ~(df.record_description.str.lower().str.contains("drosophila melanogaster")) 
   & ~(df.record_description.str.lower().str.contains("plasmid")) 
   & ~(df.record_description.str.lower().str.contains("scaffold")) 
   & ~(df.record_description.str.lower().str.contains("whole genome shotgun sequence")) 
   & ~(df.record_description.str.lower().str.contains("contig"))][["record_description", "length"]] 

Unnamed: 0,record_description,length
27,"NC_005791.1 Methanococcus maripaludis strain S2, complete sequence",1661137
236,NC_023044.1 Methanobacterium sp. MB1 complete sequence,2029766
249,NZ_CP007551.1 Haloferax mediterranei ATCC 33500 genome,2946877
257,NZ_CP008822.1 Metallosphaera sedula strain CuR1 genome,2191492
327,NZ_CP013695.1 Sulfolobus acidocaldarius strain NG05B_CO5_07 genome,2217426
356,NZ_CP019470.1 Methanopyrus sp. KOL6 genome,1430309
368,NZ_CP015193.1 Complete genome sequence of Thermococcus chitonophagus type strain GC74,1961979
379,NZ_CP017881.1 Methanohalophilus portucalensis strain FDF-1T genome,2084975
525,NC_002655.2 Escherichia coli O157:H7 str. EDL933 genome,5528445
595,"NC_002940.2 [Haemophilus] ducreyi 35000HP, complete sequence",1698955


PD from CSV

In [76]:
# path_ncbi_csv
df = pd.read_csv("/home/ubuntu/Disks/HDD1000/NCBI/ncbi_2019-11-26.csv", sep="\t", )

  interactivity=interactivity, compiler=compiler, result=result)


In [77]:
# df.info(memory_usage='deep')
f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB"

'1137.2 MB'

In [78]:
df.shape

(877820, 25)

In [79]:
for col in df.columns:
    if "id_" in col:
        print(col)
        df[col].fillna(0, inplace=True)
        df[col] = df[col].astype(int)

id_superkingdom
id_kingdom
id_phylum
id_class
id_order
id_family
id_genus
id_species


In [80]:
col_categories = ['rank', 'type', 'path_folder', 
                  'id_superkingdom', 'name_superkingdom', 'id_kingdom', 'name_kingdom', 'id_phylum', 'name_phylum', 
                  'id_class', 'name_class', 'id_order', 'name_order', 'id_family', 'name_family', 
                  'id_genus', 'name_genus', 'id_species', 'name_species']

for col in tqdm(col_categories):
    df[col] = df[col].astype('category')
print(f"{df.memory_usage(deep=True).sum()/1000000:.1f} MB")

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))


438.1 MB


In [81]:
# path_ncbi_pd
df.to_pickle('/home/ubuntu/Disks/HDD1000/NCBI/ncbi_2019-11-26.pd')

In [6]:
desired_ranks = ['superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
headers = ["taxon_id", "taxon_name", "rank", "type", "length", 
           "record_id", "record_description", "path_folder", "path_file", ]
[headers.extend([f"id_{v}", f"name_{v}"]) for v in desired_ranks]
headers

['taxon_id',
 'taxon_name',
 'rank',
 'type',
 'length',
 'record_id',
 'record_description',
 'path_folder',
 'path_file',
 'id_superkingdom',
 'name_superkingdom',
 'id_kingdom',
 'name_kingdom',
 'id_phylum',
 'name_phylum',
 'id_class',
 'name_class',
 'id_order',
 'name_order',
 'id_family',
 'name_family',
 'id_genus',
 'name_genus',
 'id_species',
 'name_species']

# Read the csv and transform to pandas, plus column dtype conversion

Might take a while if file is big...

# End

In [144]:
path_file = "/home/ubuntu/Disks/HDD1000/NCBI/20190704/refseq/bacteria/GCF_000006845.1/GCF_000006845.1_ASM684v1_genomic.fna"

In [150]:
record = SeqIO.parse(path_file, "fasta")

In [151]:
r = next(record)
r

SeqRecord(seq=Seq('ATAAATTTTTGCACGGGTTGTGGATAAAATATCGGCGAGTCGGTATAATCGGTT...TGG', SingleLetterAlphabet()), id='NC_002946.2', name='NC_002946.2', description='NC_002946.2 Neisseria gonorrhoeae FA 1090 chromosome, complete genome', dbxrefs=[])

In [59]:
len(record.seq)

1936387

In [85]:
record.seq.upper()

Seq('AATTTAAAGATTAAAATTAGTAGACTGTCGATTTACAATATCATATTTATGAGT...TAA', SingleLetterAlphabet())

In [73]:
print(record)

ID: NC_014222.1
Name: NC_014222.1
Description: NC_014222.1 Methanococcus voltae A3, complete genome
Number of features: 0
Seq('AATTTAAAGATTAAAATTAGTAGACTGTCGATTTACAATATCATATTTATGAGT...TAA', SingleLetterAlphabet())


In [62]:
dir(record)

['__add__',
 '__bool__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__le___',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_per_letter_annotations',
 '_seq',
 '_set_per_letter_annotations',
 '_set_seq',
 'annotations',
 'dbxrefs',
 'description',
 'features',
 'format',
 'id',
 'letter_annotations',
 'lower',
 'name',
 'reverse_complement',
 'seq',
 'translate',
 'upper']

In [86]:
record.description

'NC_014222.1 Methanococcus voltae A3, complete genome'

In [176]:
get_desired_ranks(2, desired_ranks)

{'superkingdom_id': 2,
 'kingdom_id': 'NaN',
 'phylum_id': 'NaN',
 'class_id': 'NaN',
 'order_id': 'NaN',
 'family_id': 'NaN',
 'genus_id': 'NaN',
 'species_id': 'NaN'}

In [154]:
get_desired_ranks(9606, desired_ranks, tolist=True)

[2759, 33208, 7711, 40674, 9443, 9604, 9605, 9606]

In [155]:
ncbi.translate_to_names(get_desired_ranks(9606, desired_ranks, tolist=True))

['Eukaryota',
 'Metazoa',
 'Chordata',
 'Mammalia',
 'Primates',
 'Hominidae',
 'Homo',
 'Homo sapiens']

In [184]:
ranks_id_str = ncbi.get_taxid_translator(get_desired_ranks(6, desired_ranks, tolist=True))
ranks_id_str

{2: 'Bacteria',
 6: 'Azorhizobium',
 356: 'Rhizobiales',
 1224: 'Proteobacteria',
 28211: 'Alphaproteobacteria',
 335928: 'Xanthobacteraceae'}

In [185]:
l=[]
[l.extend([k,v]) for k,v in ranks_id_str.items()]

[None, None, None, None, None, None]

In [186]:
l

[2,
 'Bacteria',
 6,
 'Azorhizobium',
 356,
 'Rhizobiales',
 1224,
 'Proteobacteria',
 28211,
 'Alphaproteobacteria',
 335928,
 'Xanthobacteraceae']