Split the NCBI database to prepare it for Kraken.

Update NCBI database, original suggested command: </br>
wget manual : https://www.computerhope.com/unix/wget.htm

used: <br>
`python3.7 ~/Tools/ncbi-genome-download/ncbi-genome-download-runner.py viral -F fasta,gff -l complete,chromosome -o ~/Data/NCBI/20190704_genome-download-py -p 4 -r 10 -v` <br>
additional options: <br>
archaea,bacteria,invertebrate,plant,protozoa,vertebrate_mammalian,vertebrate_other <br>
--human-readable 

Suggested by PPT : <br>
`wget -c -N -v -r -t 45 -nd -A *.gff,*.fna "ftp://ftp.ncbi.nih.gov/genomes/refseq/bacteria/Escherichia_coli*" -P .`

Once all bins of the Database have been prepared, add genomes into the Kraken library with the following commands:

`kraken2-build --add-to-library taxo_*.fa --db /home/ubuntu/SSD/NCBI_bins/10_clusters_v1/DB_10_v1`


`find ~/Disks/SSD500/Segmentation/Kraken_10_clusters_V1/0/*/ -name '*.fa' -print0 | xargs -0 -I{} -n1 -P0 kraken2-build --add-to-library {} --db /home/ubuntu/Disks/SSD500/Segmentation/Kraken_10_clusters_V1/Kraken2_building/0/`

`find /home/ubuntu/Disks/SSD500/NCBI/Bacteria_2015/*/  -name '*.fna' -print0 | xargs -0 -I{} -n1 -P2 kraken2-build --add-to-library {} --db /home/ubuntu/Disks/SSD500/Segmentation/Kraken_10_clusters_V1/whole/`

Then build it:

`kraken2-build --build --db /home/ubuntu/SSD/NCBI_bins/10_clusters_v1/DB_10_v1`


In [51]:
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from joblib import Parallel, delayed
from multiprocessing import cpu_count
from itertools import islice

In [52]:
# from shutil import copyfile
import os

In [25]:
path_ncbi = "/home/ubuntu/Data/NCBI/Bacteria_2015"

In [26]:
path_DB_splits = "/home/ubuntu/Disks/SSD500/Segmentation/Kraken_10_clusters_V1"

10_kmeans_256_LenScale_2019-06-26_07-23.pkl <br>
genomes_split_10_LenScale.pd  <br>
genomes_splits_combined.pd

In [39]:
path_kmer_freq = "/home/ubuntu/Data/kmer_freq/"
kmer_folder = "4mer/V4"

In [27]:
window_size = 10000

In [13]:
n_cores = cpu_count()

combine the splits if same cluster

In [11]:
path_splits = '/home/ubuntu/Disks/SSD500/kmer_freq/4mer/V4/clustering/genomes_split_10_LenScale.pd'
genomes_split = pd.read_pickle(path_splits)

In [53]:
path_splits_combined = '/home/ubuntu/Disks/SSD500/kmer_freq/4mer/V4/clustering/genomes_splits_combined.pd'

In [12]:
genomes_split

Unnamed: 0,bacteria,fna,start,cluster
0,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,0,7
1,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,10000,7
2,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,20000,2
3,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,30000,7
4,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,40000,7
5,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,50000,7
6,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,60000,7
7,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,70000,7
8,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,80000,7
9,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,90000,7


In [None]:
split_file = []
split_fna = []
split_cluster = []
split_start = []
split_end = []
a = 0

for i, df in tqdm(genomes_split.groupby([(genomes_split.cluster != genomes_split.cluster.shift()).cumsum()])):
    split_file.append(df.bacteria.iloc[0])
    split_fna.append(df.fna.iloc[0])
    split_cluster.append(df.cluster.iloc[0])
    split_start.append(df.start.iloc[0])
    split_end.append(max(df.start.iloc[-1] + window_size, split_start[-1] + window_size))

    a += 1
    if a > 1000000: break

In [27]:
file_split = pd.DataFrame({
    "file": split_file, 
    "fna": split_fna, 
    "cluster": split_cluster, 
    "start": split_start, 
    "end": split_end, 
})

In [33]:
file_split.head()

Unnamed: 0,file,fna,cluster,start,end
0,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,7,0,20000
1,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,2,20000,30000
2,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,7,30000,110000
3,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,0,110000,130000
4,Propionibacterium_acnes_KPA171202_uid58101,NC_006085,7,130000,420000


In [30]:
file_split.to_pickle(path_splits_combined)

Need to copy parts of files into various folders for Kraken

In [54]:
file_split = pd.read_pickle(path_splits_combined)

In [55]:
file_split.tail(4)

Unnamed: 0,file,fna,cluster,start,end
255516,Mycoplasma_gallisepticum_CA06_2006_052_5_2P_uid172630,NC_018412,3,860000,870000
255517,Mycoplasma_gallisepticum_CA06_2006_052_5_2P_uid172630,NC_018412,1,870000,890000
255518,Mycoplasma_gallisepticum_CA06_2006_052_5_2P_uid172630,NC_018412,3,890000,950000
255519,Mycoplasma_gallisepticum_CA06_2006_052_5_2P_uid172630,NC_018412,6,950000,970000


In [30]:
def read_fna(file_path):
    with open(file_path) as f:
        rec = f.readlines()
        return "".join(rec[1:]).replace("\n", "")

In [34]:
def kmer_pkl_path(kmer_folder, fna_path, taxo_ext="gff"):
    """ Return a file name based on the taxonomy id instead of the file name.
        We retrieve the taxo id from the .gff file.
        To avoid re-reading file, taxo id is stored into <bac>.taxon
    """
    assert taxo_ext in ("gbk", "gff"), "Only extensions .gbk and .gff are implemented"
    
    bacteria_name = os.path.split(os.path.split(fna_path)[0])[1]
    fna_name      = os.path.split(os.path.splitext(fna_path)[0])[1]
    
    path_taxon = fna_path.replace(".fna", ".taxon")
    if os.path.isfile(path_taxon):
        with open(path_taxon) as f:
            taxo = f.read()
    
    else:
        path_gbk = fna_path.replace(".fna", f".{taxo_ext}")
        assert os.path.isfile(path_gbk), f"{fna_path} DOESN'T have a .{taxo_ext} file ??"

        with open(path_gbk) as gbk:
            description=gbk.read()  #.replace('\n', '')

        if taxo_ext == "gbk":
            identificator = 'db_xref="taxon:'
        elif taxo_ext == "gff":
            identificator = 'Taxonomy/Browser/wwwtax.cgi?id='
        taxo_start = description.find(identificator)
        taxo = description[taxo_start+len(identificator):
                           taxo_start+description[taxo_start:].find('\n')]
            
        assert len(taxo) < 10, f"The taxo id search failed, found an id of length {len(taxo)}, \n" \
                               f"for the file: {path_gbk} \n" \
                               f"found string : {taxo[:min(50, len(taxo))]}..."
        
        with open(path_taxon, "w") as f:
            f.write(taxo)
    
    out_path = os.path.join(path_kmer_freq, kmer_folder, f"{taxo}__{bacteria_name}.pd")
    return taxo, bacteria_name, fna_name, out_path

In [41]:
def copy_windows_bunch_to_cluster(pd_window_bunch, src=path_ncbi, dst=path_DB_splits):
    bacteria = pd_window_bunch.file.iloc[0]
    fna = pd_window_bunch.fna.iloc[0]
    path_fna = os.path.join(src, bacteria, fna) + ".fna"

    # Create the clusters
    bac_clusters = pd_window_bunch.cluster.unique().tolist()
    for cluster in bac_clusters:
        path_dir = f"{dst}/{cluster}/{bacteria}/"
        os.makedirs(path_dir, exist_ok=True)

#     try:
    # Find the taxo id
    taxo, bacteria_name, fna_name, _ = kmer_pkl_path(kmer_folder, path_fna, taxo_ext="gff")

    # Check if already done
    already_done = True
    for i, row in enumerate(pd_window_bunch.itertuples()):
        path_windows_bunch = f"taxo_{taxo}__{fna}__{i:05}.fa"
        path_out = f"{dst}/{row.cluster}/{bacteria}/{path_windows_bunch}"
        if not os.path.isfile(path_out):
            already_done = False
            break
    if already_done:
        return 0, bac_clusters, path_fna, 

    # Iterate over the whole file. First gather the input information, and convert into one line DNA
    with open(path_fna) as src_genome:
        first_line = src_genome.readline()
        last_token = first_line.find(" ")
        assert last_token > 0, f"The format of the .fna file is different, can't find a space: {first_line}"
        dna = src_genome.read().replace("\n", "")

    # For each bunch of windows, create a new file with the k windows
    for i, row in enumerate(pd_window_bunch.itertuples()):
        path_windows_bunch = f"taxo_{taxo}__{fna}__{i:05}.fa"
        path_out = f"{dst}/{row.cluster}/{bacteria}/{path_windows_bunch}"
        with open(path_out, "w") as dst_w_bunch:
            new_first_line = f"{first_line[:last_token]}kraken:taxid|{taxo}|{first_line[last_token:]}"
            dst_w_bunch.write(new_first_line)
            dst_w_bunch.write(dna[row.start:row.end] + "\n")

    return i+1, bac_clusters, path_fna, 
    
#     except Exception as e:
#         print(f"FAIL for {path_fna}")
#         print(e, flush=True)
#         return -1, bac_clusters, path_fna, 

In [None]:
from itertools import islice
a = 0
grouped_windows = file_split.groupby([(file_split.fna != file_split.fna.shift()).cumsum()])
results = Parallel(n_jobs=n_cores)(delayed(copy_windows_bunch_to_cluster)(df, ) \
            for i, df in tqdm(islice(grouped_windows, 500), total=len(grouped_windows)))
results

In [42]:
a = 0
grouped_windows = file_split.groupby([(file_split.fna != file_split.fna.shift()).cumsum()])
results = Parallel(n_jobs=n_cores-1)(delayed(copy_windows_bunch_to_cluster)(df, ) \
            for i, df in tqdm(grouped_windows))
results

HBox(children=(IntProgress(value=0, max=4338), HTML(value='')))

[(23,
  [7, 2, 0, 5],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Propionibacterium_acnes_KPA171202_uid58101/NC_006085.fna'),
 (51,
  [5, 2, 6],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Chamaesiphon_minutus_PCC_6605_uid183005/NC_019697.fna'),
 (1,
  [1],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Chamaesiphon_minutus_PCC_6605_uid183005/NC_019698.fna'),
 (7,
  [5, 1, 2, 6],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Chamaesiphon_minutus_PCC_6605_uid183005/NC_020053.fna'),
 (65,
  [7, 8, 0, 2, 4, 5, 6],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Geobacter_sulfurreducens_KN400_uid161977/NC_017454.fna'),
 (82,
  [1, 5, 6, 3],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Streptococcus_thermophilus_MN_ZLW_002_uid166827/NC_017927.fna'),
 (56,
  [7, 2, 5],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Thermoproteus_tenax_Kra_1_uid74443/NC_016070.fna'),
 (121,
  [0, 4, 8, 7],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Mycobacterium_bovis_BCG_Mexico_uid86889/NC_016804.fna'),
 (143,
  [4, 7, 0, 2],
  '/home/ubuntu/Data/NCBI

In [15]:
a = 0
grouped_windows = file_split.groupby([(file_split.fna != file_split.fna.shift()).cumsum()])
results = Parallel(n_jobs=n_cores)(delayed(copy_windows_bunch_to_cluster)(df, ) \
            for i, df in tqdm(grouped_windows))
results

HBox(children=(IntProgress(value=0, max=4338), HTML(value='')))




[(23,
  [7, 2, 0, 5],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Propionibacterium_acnes_KPA171202_uid58101/NC_006085.fna'),
 (51,
  [5, 2, 6],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Chamaesiphon_minutus_PCC_6605_uid183005/NC_019697.fna'),
 (1,
  [1],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Chamaesiphon_minutus_PCC_6605_uid183005/NC_019698.fna'),
 (7,
  [5, 1, 2, 6],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Chamaesiphon_minutus_PCC_6605_uid183005/NC_020053.fna'),
 (65,
  [7, 8, 0, 2, 4, 5, 6],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Geobacter_sulfurreducens_KN400_uid161977/NC_017454.fna'),
 (82,
  [1, 5, 6, 3],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Streptococcus_thermophilus_MN_ZLW_002_uid166827/NC_017927.fna'),
 (56,
  [7, 2, 5],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Thermoproteus_tenax_Kra_1_uid74443/NC_016070.fna'),
 (121,
  [0, 4, 8, 7],
  '/home/ubuntu/Data/NCBI/Bacteria_2015/Mycobacterium_bovis_BCG_Mexico_uid86889/NC_016804.fna'),
 (143,
  [4, 7, 0, 2],
  '/home/ubuntu/Data/NCBI

In [43]:
df = pd.DataFrame(results, columns=["n_splits", "clusters", "fna"])

In [None]:
df[df.n_splits == 0].fna.iloc[0]

In [47]:
pd.set_option('display.max_colwidth', 400)

In [45]:
df[df.n_splits != 0].shape

(4338, 3)

In [48]:
df

Unnamed: 0,n_splits,clusters,fna
0,23,"[7, 2, 0, 5]",/home/ubuntu/Data/NCBI/Bacteria_2015/Propionibacterium_acnes_KPA171202_uid58101/NC_006085.fna
1,51,"[5, 2, 6]",/home/ubuntu/Data/NCBI/Bacteria_2015/Chamaesiphon_minutus_PCC_6605_uid183005/NC_019697.fna
2,1,[1],/home/ubuntu/Data/NCBI/Bacteria_2015/Chamaesiphon_minutus_PCC_6605_uid183005/NC_019698.fna
3,7,"[5, 1, 2, 6]",/home/ubuntu/Data/NCBI/Bacteria_2015/Chamaesiphon_minutus_PCC_6605_uid183005/NC_020053.fna
4,65,"[7, 8, 0, 2, 4, 5, 6]",/home/ubuntu/Data/NCBI/Bacteria_2015/Geobacter_sulfurreducens_KN400_uid161977/NC_017454.fna
5,82,"[1, 5, 6, 3]",/home/ubuntu/Data/NCBI/Bacteria_2015/Streptococcus_thermophilus_MN_ZLW_002_uid166827/NC_017927.fna
6,56,"[7, 2, 5]",/home/ubuntu/Data/NCBI/Bacteria_2015/Thermoproteus_tenax_Kra_1_uid74443/NC_016070.fna
7,121,"[0, 4, 8, 7]",/home/ubuntu/Data/NCBI/Bacteria_2015/Mycobacterium_bovis_BCG_Mexico_uid86889/NC_016804.fna
8,143,"[4, 7, 0, 2]",/home/ubuntu/Data/NCBI/Bacteria_2015/Maricaulis_maris_MCS10_uid58689/NC_008347.fna
9,89,"[7, 4, 0, 2, 5]",/home/ubuntu/Data/NCBI/Bacteria_2015/Pseudomonas_aeruginosa_PAO581_uid219357/NC_022361.fna


In [49]:
def get_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # skip if it is symbolic link
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)
    return total_size

In [50]:
total = 0
for folder in df[df.n_splits == 0].fna.apply(lambda x: x[:x[37:].find("/") + 38]).tolist():
    subtotal = get_size(folder)
    print(subtotal/1000, folder[0:])
    total += subtotal
total

0

In [49]:
unsure_files = sorted(df[df.n_splits == 0].fna.apply(lambda x: x[37:]).tolist())

In [50]:
unsure_files

['_Cellvibrio__gilvus_ATCC_13127_uid68143/NC_015671.fna',
 '_Clostridium__sticklandii_uid59585/NC_014614.fna',
 '_Nostoc_azollae__0708_uid49725/NC_014248.fna',
 '_Nostoc_azollae__0708_uid49725/NC_014249.fna',
 '_Nostoc_azollae__0708_uid49725/NC_014250.fna',
 '_Ruminococcus__obeum_uid197165/NC_021022.fna',
 '_Ruminococcus__torques_uid197166/NC_021015.fna',
 'alpha_proteobacterium_HIMB59_uid175778/NC_018644.fna',
 'alpha_proteobacterium_HIMB5_uid175779/NC_018643.fna',
 'archaeon_Mx1201_uid196597/NC_020913.fna',
 'bacterium_BT_1_uid184079/NC_019907.fna',
 'beta_proteobacterium_CB_uid191340/NC_020417.fna',
 'butyrate_producing_bacterium_SM4_1_uid197180/NC_021024.fna',
 'butyrate_producing_bacterium_SS3_4_uid197159/NC_021035.fna',
 'butyrate_producing_bacterium_SSC_2_uid197181/NC_021016.fna',
 'candidate_division_SR1_bacterium_RAAC1_SR1_1_uid230714/NC_023002.fna',
 'candidate_division_WWE3_bacterium_RAAC2_WWE3_1_uid230713/NC_023001.fna',
 'cyanobacterium_UCYN_A_uid43697/NC_013771.fna',
 'de

In [51]:
import pickle

In [58]:
with open("/home/ubuntu/Data/NCBI/unsure_files_list.pkl", "wb") as f:
    pickle.dump(unsure_files, f)

Need to loop through each folder/bacteria, through each .fna file, copy only the fragment that is part of the cluster.
Need to check what does Kraken need to build the taxonomy ID

In [90]:
a = 0
for i in file_split.values:
    print(i)
    
    a += 1
    if a > 10: break

['Propionibacterium_acnes_KPA171202_uid58101' 'NC_006085' 7 0 20000]
['Propionibacterium_acnes_KPA171202_uid58101' 'NC_006085' 2 20000 30000]
['Propionibacterium_acnes_KPA171202_uid58101' 'NC_006085' 7 30000 110000]
['Propionibacterium_acnes_KPA171202_uid58101' 'NC_006085' 0 110000 130000]
['Propionibacterium_acnes_KPA171202_uid58101' 'NC_006085' 7 130000 420000]
['Propionibacterium_acnes_KPA171202_uid58101' 'NC_006085' 0 420000 430000]
['Propionibacterium_acnes_KPA171202_uid58101' 'NC_006085' 7 430000 460000]
['Propionibacterium_acnes_KPA171202_uid58101' 'NC_006085' 0 460000 470000]
['Propionibacterium_acnes_KPA171202_uid58101' 'NC_006085' 7 470000 870000]
['Propionibacterium_acnes_KPA171202_uid58101' 'NC_006085' 2 870000 880000]
['Propionibacterium_acnes_KPA171202_uid58101' 'NC_006085' 7 880000 920000]


In [59]:
pd.set_option('display.max_colwidth', 400)

In [62]:
pd.set_option('display.max_columns', 10)

In [60]:
grouped_windows = file_split.groupby([(file_split.fna != file_split.fna.shift()).cumsum()])

In [77]:
chunks = {"0": [], "1": [], "2": [], "3": [], "4": [], "5": [], "6": [], "7": [], "8": [], "9": [], }

for i, df in tqdm(grouped_windows):
#     print("file: ", i)
    for cluster in df.cluster.unique():
#         print(f"cluster number {cluster} has {df[df.cluster == cluster].shape[0]} chunks")
        chunks[str(cluster)].append(df[df.cluster == cluster].shape[0])
    a += 1
#     if a > 200:
#         break

HBox(children=(IntProgress(value=0, max=4338), HTML(value='')))

In [78]:
chunks_sums = [sum(chunks[cluster]) for cluster in chunks.keys()]

In [75]:
[chunks[cluster] for cluster in chunks.keys()]

[25556, 34255, 35690, 12210, 16989, 41955, 34192, 35872, 12897, 5904]

In [80]:
for i in chunks_sums:
    print(i)

25556
34255
35690
12210
16989
41955
34192
35872
12897
5904
