# 01: Pre-processing data with Qiime2

This notebook takes the combined data outputs from Qiita (metadata, taxonomy, and ASV BIOM files) and splits the feature table into genus-level BIOM tables for downstream analysis.

In [89]:
from biom import Table
from biom.util import biom_open
from skbio import DistanceMatrix
from os.path import abspath, join
from qiime2 import Artifact
from os import makedirs
from qiime2.plugins import diversity
from qiime2.plugins.feature_table.methods import filter_samples
from qiime2 import Metadata
import pandas as pd

### Data filepaths

In [10]:
# get biom qza

biom_fp = './data/primate_micro_filtered_rarefied_table.qza' 


# get taxonomy qza

tax_fp = './data/taxonomy_assignment_primate_micro_rarefied.qza'


# get metadata

md_fp = './data/primate_micro_filtered_metadata.txt'

### Read in data

#### Biom table

In [11]:
# read biom qza into qiime2 Artifact class

biom_art = Artifact.load(abspath(biom_fp))

# load the qiime2 artifact into biom Table class

biom = biom_art.view(Table)

#### Taxonomy table

In [12]:
# read biom tax into qiime2 Artifact class

tax_art = Artifact.load(abspath(tax_fp))

# read taxonomy artifact as Pandas DF

tax_df = tax_art.view(pd.DataFrame)

#### Metadata file

In [13]:
# read in metadata

metadata = Metadata.load(md_fp)

### Write separate BIOM tables per genus

In [50]:
# group all the code into a single method to facilitate rerunning

def split_otu_tables_by_tax(biom_t, tax_df, output_dir,
                            metadata,
                            threshold=5,
                            level=5,
                            tax_names=['Kingdom',
                                       'Phylum',
                                       'Class',
                                       'Order',
                                       'Family', 
                                       'Genus',
                                       'Species'],
                            sampling_depth=5,
                            export_viz=False):
    # fix the taxonomy
    tax_cols = tax_df['Taxon'].str.split('; ', expand=True)

    tax_cols.columns = tax_names
    
    # make concatenated tax string at appropriate level
    cat_cols = tax_names[:level+1]
    print(cat_cols)
    tax_str = tax_cols[cat_cols].fillna(' ').apply(lambda x: '; '.join(x), axis=1)
    
    # find taxa above threshold number of OTUs
    tax_thr = pd.Series(tax_str.value_counts()).where(lambda x : x >= threshold).dropna().index
    
    # make output dir
    makedirs(output_dir, exist_ok=True)
    
    # for each tax_thr value, filter the OTU table and write to file
    
    # also, make a dict of all filtered tables and keep in memory for downstream analysis
    tax_arts = {}
    
    for t in tax_thr:
        t_ids =  pd.Series(tax_str).where(lambda x : x == t).dropna().index
        tax_otu = biom_t.filter(t_ids, axis='observation', inplace=False)
        tax_otu.remove_empty(inplace=True)
        
        
        output_f = t.replace(';','_').replace(' ','')
        output_fn = '{0}.{1}.qza'.format(tax_names[level], output_f)
        output_fp = join(output_dir, output_fn)

        # export as q2 artifact
        tax_art = Artifact.import_data("FeatureTable[Frequency]", tax_otu)
        tax_art.save(output_fp)
        
        tax_arts[t] = tax_art
        
        if export_viz:
            # export the bc and jaccard emperor viz
            (rarefied_table,
             observed_otus_vector,
             shannon_vector,
             evenness_vector,
             jaccard_distance_matrix,
             bray_curtis_distance_matrix,
             jaccard_pcoa_results,
             bray_curtis_pcoa_results,
             jaccard_emperor,
             bray_curtis_emperor) = diversity.pipelines.core_metrics(table=tax_art, 
                                                                    sampling_depth=sampling_depth,
                                                                    metadata=metadata)

            jaccard_fp = join(output_dir, '{0}.{1}.emperor.jaccard.qzv'.format(tax_names[level], output_f))
            bc_fp = join(output_dir, '{0}.{1}.emperor.braycurtis.qzv'.format(tax_names[level], output_f))
            jaccard_emperor.save(jaccard_fp)
            bray_curtis_emperor.save(bc_fp)

    return(tax_arts)

In [51]:
output_dir = './output/genus_asv_tables'

makedirs(output_dir, exist_ok=True)

In [52]:
genus_tables = split_otu_tables_by_tax(biom,
                                       tax_df,
                                       output_dir,
                                       metadata,
                                       level=5,
                                       threshold=5)

['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus']


## Calculate Sorensen-Dice beta diversity and Host Specificity metrics

In [None]:
metric = 'dice'
distance_dir = 'output/distance'
metadata_col = 'species_geo_captivity'
makedirs(distance_dir, exist_ok=True)

for genus in genus_tables:
    tax_dir = join(distance_dir,
                   genus.replace(';','_').replace(' ',''))
    makedirs(tax_dir, exist_ok=True)
    
    # filter empty samples
    genus_filtered = filter_samples(genus_tables[genus],
                                    min_frequency=1)
    
    # get distance matrix
    dm = diversity.actions.beta(genus_filtered.filtered_table,
                            metric)
    
    # write distance matrix to file 
    dm.distance_matrix.view(DistanceMatrix).to_series().to_csv(join(tax_dir,
                                                                    'distance_list.dice.tsv'),
                                                               sep='\t')
    
    # calculate beta group significance
    bgs = diversity.actions.beta_group_significance(dm.distance_matrix,
                                                    metadata.get_column(metadata_col),
                                                    pairwise=True,
                                                    method='permanova')
    
    # write results to directory
    bgs.visualization.export_data(join(tax_dir,'permanova'))
    

