# Kraken analysis

Standard kraken databases are based upon refseq, which has only around 300 fungal members, and is missing key cereal pathogens, so we need to use a custom database derived from ensembl data, which includes >1000 fungi.

In [7]:
import pandas as pd
import glob
from os.path import basename
import os
import sys
from Bio import SeqIO

# Root directory of the custom kraken database. The taxonomy directory parsed
# later in the notebook is derived from it; the %%bash cells spell out the same
# path literally, so keep them in sync if this changes.
kraken_dir='kraken_db'

Now to build the database...

## 1: Create Ensembl-based kraken database

The `bin/dl_kraken_db.py` script downloads all bacteria and fungi sequences from ensembl, in a structure appropriate for building a kraken db. First we need to establish the base structure populated with taxonomy data using kraken-build.

In [1]:
%%bash
# Download the bacterial and fungal Ensembl sequences into kraken_db, laid out
# for building a kraken database (the layout is defined by bin/dl_kraken_db.py).
bin/dl_kraken_db.py --db kraken_db

In [None]:
%%bash
# Populate the database directory with taxonomy data using kraken2-build.
DB_NAME="kraken_db"
kraken2-build --download-taxonomy --db $DB_NAME

In [None]:
%%bash
# Add every downloaded Ensembl FASTA file to the kraken library.
# Work through one sub-directory at a time and stream the file names to xargs,
# so no single command line has to hold the full list of files.
mkdir -p logs
for dir in kraken_db/ensembl/*/; do
    # Skip the unexpanded pattern when there are no sub-directories.
    [ -d "$dir" ] || continue
    name=$(basename "$dir")
    # NUL-delimited names survive spaces/odd characters; -I{} already implies
    # one file per kraken2-build invocation, with up to 16 running in parallel.
    find "$dir" -maxdepth 1 -name '*.fa' -print0 \
        | xargs -0 -I{} -P16 kraken2-build --add-to-library {} --db kraken_db 2>> "logs/${name}.build.log"
done

Adding barley genome sequences to the database as well...

In [None]:
# Copy the barley genome into the database tree, appending the kraken taxid tag
# (taxid 4513) to every sequence id so kraken2-build can place the sequences in
# the taxonomy.
genome = 'barley.fasta'
barley_out = 'kraken_db/barley/{}'.format(basename(genome))

# The target directory is not created by any earlier cell.
os.makedirs(os.path.dirname(barley_out), exist_ok=True)

# Both handles are closed (and the output flushed) when the block exits, so the
# following kraken2-build cell always sees the complete file.
with open(genome, 'rt') as in_handle, open(barley_out, 'w') as out_handle:
    for record in SeqIO.parse(in_handle, 'fasta'):
        record.id = '{}|kraken:taxid|4513'.format(record.id)
        SeqIO.write(record, out_handle, 'fasta')

In [None]:
%%bash
# Add the taxid-tagged barley sequences written by the previous cell to the library.
kraken2-build --add-to-library kraken_db/barley/barley.fasta --db kraken_db

In [None]:
%%bash
# Build the kraken database from the library sequences added above.
# NOTE(review): -t 16 is presumably the thread count - confirm against kraken2-build's options.
kraken2-build --build -t 16 --db kraken_db

## 2: Database check

Let's get a dump of the database members using `kraken2-inspect` to ensure that what we expect to see is in there...

In [None]:
%%bash
# Dump the database contents to kraken_db_contents.txt, which the next cell
# loads with pandas.

kraken2-inspect --threads 24 --db kraken_db > kraken_db_contents.txt

The output is ordered with bacteria first, followed by fungi, then archaea. The taxid can be used to subset the dataframe into separate taxonomic chunks...

In [4]:
# Load the kraken2-inspect dump: column 3 holds the rank code, column 4 the
# NCBI taxid and column 5 the (indented) scientific name.
taxa = pd.read_csv('kraken_db_contents.txt', sep='\t', header=None)

# Row positions at which the fungal (taxid 4751) and archaeal (taxid 2157)
# blocks begin. The frame has the default RangeIndex, so labels == positions.
fungi_row = taxa.index[taxa[4] == 4751].tolist()[0]
archae_row = taxa.index[taxa[4] == 2157].tolist()[0]

# iloc slices already exclude their end position, so the block boundaries are
# used as-is; subtracting 1 (or slicing to -1) would drop the last row of each
# block.
# NOTE(review): this relies on the dump being ordered bacteria, fungi, archaea -
# confirm against the actual kraken2-inspect output.
bacteria = taxa.iloc[:fungi_row]
fungi = taxa.iloc[fungi_row:archae_row]
archae = taxa.iloc[archae_row:]

Now we can extract the species from each of these sets - note this is just the species, not sub-species etc.

In [5]:
def _species_table(clade):
    """
    Reduce a block of the kraken2-inspect dump to its species-rank rows.

    arguments:
        clade -- DataFrame slice of the dump (rank code in column 3, taxid in
                 column 4, indented name in column 5)

    returns:
        DataFrame with columns 'TaxID' and 'Species' (names stripped of indentation)
    """
    # .loc + .copy() gives an independent frame, so the column assignment below
    # never writes into a view of the parent dump.
    species = clade.loc[clade[3] == 'S', [4, 5]].copy()
    species.columns = ['TaxID', 'Species']
    species['Species'] = species['Species'].str.strip()
    return species


bacteria_species = _species_table(bacteria)
fungi_species = _species_table(fungi)
archae_species = _species_table(archae)

# One tab-separated species list per group.
bacteria_species.to_csv('kraken_bacteria_species.txt', sep='\t', index=False)
fungi_species.to_csv('kraken_fungi_species.txt', sep='\t', index=False)
archae_species.to_csv('kraken_archae_species.txt', sep='\t', index=False)

## 3: Run kraken

In [None]:
%%bash
# Submit bin/kraken.sh to the scheduler with task range 1-12.
# NOTE(review): presumably one array task per sample (12 samples are mapped
# later in this notebook) - confirm against bin/kraken.sh.
# The reports are later read from kraken_outputs/.

qsub -t 1-12 bin/kraken.sh -i fastq -o kraken_outputs -d kraken_db

# Output parsing

Kraken's output format now needs to be converted to give us read counts per taxon, including the superkingdom to allow separation of the results.

The following functions are used for this process.

In [1]:
def _dmp_fields(line):
    """Split one record of an NCBI taxonomy .dmp file into its fields."""
    line = line.rstrip("\r\n")
    # Each record is terminated by a trailing '\t|'; remove it so the last
    # field is clean before splitting on the '\t|\t' field delimiter.
    if line.endswith("\t|"):
        line = line[:-2]
    return line.split("\t|\t")


def parse_taxonomy(data_dir):

    """
    Parse the NCBI taxonomy names.dmp and nodes.dmp files to produce a dict containing
    the name, rank and parent node id of each node in the database, plus a table of
    scientific names.

    arguments:
    data_dir -- path to directory containing taxonomy files

    returns:
    (nodes, tax_names) -- tuple of
        nodes     -- dict keyed on integer taxid; each value is a dict with
                     keys 'parent' (int taxid), 'rank' (str) and 'name' (str)
        tax_names -- pandas DataFrame with columns 'taxid' and 'name', one row
                     per scientific name, in names.dmp order

    raises:
    KeyError -- if a node in nodes.dmp has no scientific name in names.dmp
    """

    # The names.dmp file is delimited by '\t|\t' and contains the following
    # fields (from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_readme.txt)

    #tax_id                 -- the id of node associated with this name
    #name_txt               -- name itself
    #unique name            -- the unique variant of this name if name not unique
    #name class             -- (synonym, common name, ...)

    # The species names are found in the 'scientific name' lines

    names = dict()
    nodes = dict()

    with open(data_dir + '/names.dmp', 'r') as names_handle:
        for line in names_handle:
            fields = _dmp_fields(line)
            if fields[3] == 'scientific name':
                names[int(fields[0])] = fields[1]

    # The nodes.dmp file is delimited by '\t|\t' and contains the following
    # fields (from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump_readme.txt)

    # tax_id                -- node id in GenBank taxonomy database
    # parent tax_id         -- parent node id in GenBank taxonomy database
    # rank                  -- rank of this node (superkingdom, kingdom, ...)
    # embl code             -- locus-name prefix; not unique
    # division id           -- see division.dmp file
    # ....

    with open(data_dir + '/nodes.dmp', 'r') as nodes_handle:
        for line in nodes_handle:
            fields = _dmp_fields(line)
            tax_id = int(fields[0])
            nodes[tax_id] = {
                'parent': int(fields[1]),
                'rank': fields[2],
                'name': names[tax_id]
            }

    # Build the name table from the names already collected instead of parsing
    # names.dmp a second time; this also keeps the first record of the file,
    # which a header row setting would otherwise swallow.
    tax_names = pd.DataFrame(list(names.items()), columns=['taxid', 'name'])

    return(nodes,tax_names)

def walk_tree(tax_id, rank='superkingdom'):

    """
    Walk up the taxonomy tree from the provided taxid until a node of the requested
    rank (superkingdom by default) is found, and return that node's name.
    This enables us to separate taxa into bacteria, fungi and archaea.

    Uses the module-level `nodes` dict produced by parse_taxonomy.

    arguments:
        tax_id -- taxid of the starting node (int, or string convertible to int)
        rank   -- rank name to search for (default 'superkingdom')

    returns:
        name of the first node of the requested rank on the path to the root
        (the starting node itself counts), or None if the taxid is unknown or
        no such node exists on that path
    """

    # NOTE(review): some NCBI taxonomy dumps may label the top-level groups with
    # a rank other than 'superkingdom' - confirm against the nodes.dmp in use.
    current = int(tax_id)
    while current in nodes:
        node_data = nodes[current]
        if node_data['rank'] == rank:
            return node_data['name']
        # The root node is its own parent: stop here rather than looping forever.
        if node_data['parent'] == current:
            return None
        current = node_data['parent']

    return None

def read_file(file):

    """
    Parses a kraken report file (see https://ccb.jhu.edu/software/kraken/MANUAL.html).
    This is a tab-delimited file containing the following fields:

    * Percentage of reads covered by the clade rooted at this taxon
    * Number of reads covered by the clade rooted at this taxon
    * Number of reads assigned directly to this taxon
    * A rank code, indicating (U)nclassified, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.
    * NCBI taxonomy ID
    * indented scientific name

    For each rank of Phylum, Class, Order and Family, the rows carrying exactly that
    rank code are selected and reduced to the scientific name, NCBI taxid and clade
    read count. The taxonomy tree is then walked using the taxid (via walk_tree) to
    obtain the superkingdom for each taxon, which is added to the dataframe.

    arguments:
        file(str): path of kraken report file to parse

    Returns:
        res(dict): Dict of abundance dataframes, keyed on rank name. Each dataframe
        has the columns <rank name>, 'Taxid', 'Count' and 'Superkingdom', and a
        fresh 0-based index.
    """

    report = pd.read_csv(file, sep='\t', header=None)

    res = dict()
    for level in ('Phylum', 'Class', 'Order', 'Family'):
        # Rank codes in the report are the first letter of the rank name.
        code = level[0]

        # Single .loc selection plus .copy(): an independent frame, so adding
        # and rewriting columns below cannot touch (or warn about) the report.
        df = report.loc[report[3] == code, [5, 4, 1]].copy()
        df.columns = [level, 'Taxid', 'Count']
        df = df.reset_index(drop=True)

        # Names are indented to show tree depth in the report.
        df[level] = df[level].str.strip()
        df['Superkingdom'] = df['Taxid'].apply(walk_tree)
        res[level] = df
    return(res)
    


Firstly, parse the taxonomy files to give us a pandas DataFrame of taxonomy id to name mappings (names), and a dictionary (nodes) containing each node keyed on the taxonomy id.

In [None]:
# names.dmp and nodes.dmp are read from the 'taxonomy' sub-directory of the
# kraken database. `nodes` must live at notebook level because walk_tree looks
# it up as a global.
data_dir = kraken_dir + '/taxonomy'
nodes, names = parse_taxonomy(data_dir)

Obtain a list of the output reports generated by kraken, create a directory for storing the parsed results and create a dictionary mapping sample IDs to names.

In [28]:
# Kraken report files to parse; sorted so the processing order is reproducible
# from run to run.
files = sorted(glob.glob('kraken_outputs/*.report.txt'))

# Directory receiving the parsed tables (no error if it already exists).
os.makedirs('parsed_outputs', exist_ok=True)

# Sample ID (the part of the report file name before the first '.') -> sample name
samples = {
    '2000': 'Elite_1',  '2001': 'Elite_2',  '2002': 'Elite_3',
    '2006': 'Desert_1', '2007': 'Desert_2', '2009': 'Desert_3',
    '2011': 'North_1',  '2012': 'North_2',  '2013': 'North_3',
    '2023': 'Bulk_1',   '2024': 'Bulk_2',   '2025': 'Bulk_3'
}

Iterate over the report files, reading each in turn and writing each of the resulting dataframes into separate text files, split on superkingdom.

In [34]:
# (label used in the output file name, superkingdom name to select on)
# NOTE(review): the 'fungi' files receive every row whose superkingdom is
# Eukaryota, which is broader than fungi alone - confirm this is intended.
kingdom_labels = (
    ('bacteria', 'Bacteria'),
    ('fungi', 'Eukaryota'),
    ('archaea', 'Archaea'),
)

for file in files:
    # The sample ID is the part of the report file name before the first '.'
    report_name = basename(file)
    sample = samples[report_name.split('.')[0]]
    results = read_file(file)

    # One tab-separated table per sample, superkingdom and rank.
    for rank_name, rank_df in results.items():
        for label, superkingdom in kingdom_labels:
            subset = rank_df[rank_df['Superkingdom'] == superkingdom]
            out_fn = 'parsed_outputs/{}_{}_{}.txt'.format(sample, label, rank_name.lower())
            subset.to_csv(out_fn, sep='\t', index=False)
        

Plots of the results are produced using the `kraken_plots.Rmd` R markdown document.