# Data generation: using Python to sweep over methods and parameters

In this notebook, we illustrate how to use Python to perform *parameter sweeps* for a taxonomic assigner and integrate the results into the TAX CREdiT framework.

## Environment preparation

In [1]:
from os.path import join, exists, split, sep, expandvars 
from os import makedirs, getpid
from glob import glob
from shutil import rmtree
import csv
import json

from qiime2.plugins import feature_classifier
from qiime2 import Artifact
from joblib import Parallel, delayed
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import CalibratedClassifierCV
from q2_feature_classifier.custom import LowMemoryMultinomialNB
from sklearn.naive_bayes import MultinomialNB
from q2_feature_classifier.classifier import spec_from_pipeline
from q2_types.feature_data import DNAIterator
from pandas import DataFrame

from tax_credit.framework_functions import gen_param_sweep

In [2]:
project_dir = expandvars('$HOME/Projects/short-read-tax-assignment-bk/')
analysis_name = 'mock-community'
data_dir = join(project_dir, 'data', analysis_name)

reference_database_dir = join(project_dir, 'data', 'ref_dbs')
results_dir = join(project_dir, 'sandpit', analysis_name)

## Preparing data set sweep

First, we're going to define the data sets that we'll sweep over. The following cell does not need to be modified unless if you wish to change the datasets or reference databases used in the sweep.

In [3]:
mock_dirs = ['mock-'+str(i) for i in range(1,11)]
dataset_reference_combinations = \
    zip(mock_dirs, ['gg_13_8_otus']*8 + ['unite-97-rep-set']*2)

reference_dbs = {'gg_13_8_otus' : (join(reference_database_dir, 'gg_13_8_otus/99_otus_clean.fasta'), 
                                   join(reference_database_dir, 'gg_13_8_otus/99_otu_taxonomy_clean.tsv')),
                 'unite-97-rep-set' : (join(reference_database_dir, 'unite_20.11.2016/sh_refs_qiime_ver7_99_20.11.2016_dev_clean.fasta'), 
                                       join(reference_database_dir, 'unite_20.11.2016/sh_taxonomy_qiime_ver7_99_20.11.2016_dev_clean.tsv'))}

## Preparing the method/parameter combinations and generating commands

Now we set the methods and method-specific parameters that we want to sweep. Modify to sweep other methods. Note how method_parameters_combinations feeds method/parameter combinations to parameter_sweep() in the cell below.

In [4]:
method_parameters_combinations = {
              'q2-multinomialNB': {'confidence': [0.0, 0.8, 1.0],
                                   'classify__alpha': [0.001, 0.01, 0.1]}
    }

## Preparing the pipelines
The below pipelines are used to specify the scikit-learn classifiers that are used for assignment.

In [5]:
hash_params = dict(analyzer='char_wb', n_features=8192,
                   ngram_range=[8, 8], non_negative=True)
nb_params = dict(alpha=0.01)
steps = [('feat_ext', HashingVectorizer(**hash_params)),
         ('classify', MultinomialNB(**nb_params))]
pipelines = {'q2-multinomialNB': Pipeline(steps=steps)}

### Utility Methods
The below methods are used to load the data, prepare the data, parse the classifier and classification parameters, and fit and run the classifier.

In [6]:
def load_primers(primer_file):
    with open(primer_file) as csvfile:
        data = next(csv.DictReader(csvfile, delimiter='\t'))
        return data['LinkerPrimerSequence'], data['ReversePrimer']
    
def guess_read_length(seqs):
    seqs = Artifact.load(seqs)
    lengths = [len(s) for s in seqs.view(DNAIterator)]
    lengths.sort()
    return lengths[len(lengths)//2]

def load_trimmed_ref_seqs(input_dir, ref_seqs):
    primer_file = join(input_dir, 'sample-metadata.tsv')
    fprimer, rprimer = load_primers(primer_file)
    rep_seqs = join(input_dir, 'rep_seqs.qza')
    length = guess_read_length(rep_seqs)
    ref_seqs = Artifact.import_data('FeatureData[Sequence]', ref_seqs)
    return feature_classifier.methods.extract_reads(
                sequences=ref_seqs, length=length,
                f_primer=fprimer, r_primer=rprimer).reads

def split_params(params):
    classifier_params = feature_classifier.methods.\
                        classify.signature.parameters.keys()
    pipeline_params = {k:v for k, v in params.items()
                        if k not in classifier_params}
    classifier_params = {k:v for k, v in params.items() 
                         if k in classifier_params}
    return classifier_params, pipeline_params

def train_and_run_classifier(output_dir, input_dir, ref_seqs, ref_taxa, method, params):    
    # Trim the reference seqs
    ref_reads = load_trimmed_ref_seqs(input_dir, ref_seqs)
    
    # Train the classifier
    ref_taxa = Artifact.import_data('FeatureData[Taxonomy]', ref_taxa)
    classifier_params, pipeline_params = split_params(params)
    pipeline = pipelines[method]
    pipeline.set_params(**pipeline_params)
    spec = json.dumps(spec_from_pipeline(pipeline))
    ref_reads.save('ref_reads.qza')
    ref_taxa.save('ref_taxa.qza')
    with open('spec.json', 'w') as spec_out:
        spec_out.write(spec)
    classifier = feature_classifier.methods.fit_classifier(ref_reads, ref_taxa, spec)
    classifier = classifier.classifier
    
    # Classify the sequences
    rep_seqs = Artifact.load(join(input_dir, 'rep_seqs.qza'))
    classification = feature_classifier.methods.classify(rep_seqs, classifier, **classifier_params)
    classification = classification.classification
    
    # Save the results
    makedirs(output_dir, exist_ok=True)
    output_file = join(output_dir, 'rep_set_tax_assignments.txt')
    dataframe = classification.view(DataFrame)
    dataframe.to_csv(output_file, sep='\t', header=False)

## Do the Sweep

In [8]:
sweep = gen_param_sweep(data_dir, results_dir, reference_dbs,
                        dataset_reference_combinations,
                        method_parameters_combinations)
Parallel(n_jobs=4)(delayed(train_and_run_classifier)(*p) for p in sweep);

## Generate per-method biom tables

Modify the taxonomy_glob below to point to the taxonomy assignments that were generated above. This may be necessary if filepaths were altered in the preceding cells.

In [8]:
taxonomy_glob = join(results_dir, '*', '*', '*', '*', 'rep_set_tax_assignments.txt')
generate_per_method_biom_tables(taxonomy_glob, data_dir)

NameError: name 'generate_per_method_biom_tables' is not defined

## Move result files to repository

Add results to the short-read-taxa-assignment directory (e.g., to push these results to the repository or compare with other precomputed results in downstream analysis steps). The precomputed_results_dir path and methods_dirs glob below should not need to be changed unless if substantial changes were made to filepaths in the preceding cells.

In [17]:
precomputed_results_dir = join(project_dir, "data", "precomputed-results", analysis_name)
method_dirs = glob(join(results_dir, '*', '*', '*', '*'))
move_results_to_repository(method_dirs, precomputed_results_dir)