## Environment preparation

This notebook relies on ``mothur`` being in the user's ``$PATH``. The QIIME 1.9.1 AWS has version 1.25.0 of mothur installed by default. To test with version 1.35.1 (the most recent at the time of this analysis) I installed that in my instance, and added it to my ``$PATH`` by adding the following line at the end of ``$HOME/.bashrc``:

```
export PATH=/home/ubuntu/data/mothur-1.35.1/source/:$PATH
```

In [1]:
!which mothur

/home/ubuntu/data/mothur-1.35.1/source//mothur


In [2]:
from os.path import join, exists, split, expandvars, sep
from os import makedirs
from tempfile import mkstemp
from glob import glob
from itertools import product
from random import shuffle

In [3]:
project_dir = expandvars("$HOME/data/short-read-tax-assignment")
data_dir = join(project_dir, "data")

reference_database_dir = expandvars("$HOME/data/")
results_dir = expandvars("$HOME/data/2015.06.24-tax-parameter-sweep-simulated")

## Preparing data set sweep

In [4]:
from IPython.parallel import Client
rc = Client()
lview = rc.load_balanced_view()

@lview.parallel()
def call_cmd(cmd):
    from qcli import qcli_system_call
    stdout, stderr, retval = qcli_system_call(cmd)
    # return stdout, stderr, the return value, and the command
    # the command is useful in case it needs to be re-run
    return stdout, stderr, retval, cmd

First, we're going to define the data sets that we'll sweep over.

In [5]:
num_iterations = 5
dataset_reference_combinations = []
for iteration in range(num_iterations):
    dataset_reference_combinations.append(('B1-iter%d' % iteration, 'gg_13_8_otus'))
    dataset_reference_combinations.append(('B2-iter%d' % iteration, 'gg_13_8_otus'))
    dataset_reference_combinations.append(('F1-iter%d' % iteration, 'unite-97-rep-set'))
    dataset_reference_combinations.append(('F2-iter%d' % iteration, 'unite-97-rep-set'))

reference_dbs = {'gg_13_8_otus' : (join(reference_database_dir, 'gg_13_8_otus/rep_set/97_otus.fasta'), 
                                   join(reference_database_dir, 'gg_13_8_otus/taxonomy/97_otu_taxonomy.txt')),
                 'unite-97-rep-set' : (join(reference_database_dir, 'unite-14.11/97_otus.fasta'), 
                                       join(reference_database_dir, 'unite-14.11/97_otu_taxonomy.txt'))}

## Uncompress simulated reference database files

In [6]:
commands = []
for e in dataset_reference_combinations:
    zipped_refseqs_fp = join(data_dir, 'simulated-community', e[0], 'ref.fna.gz')
    unzipped_refseqs_fp = join(data_dir, 'simulated-community', e[0], 'ref.fna')
    cmd = "gunzip -c %s > %s" % (zipped_refseqs_fp, unzipped_refseqs_fp)
    commands.append(cmd)

In [7]:
r = call_cmd.map(commands)

## Preparing the method/parameter combinations

In [8]:
method_parameters_combinations = { # probabalistic classifiers
              'mothur': {'confidence': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]},
              }

In [9]:
analyses = ['simulated-community']

In [14]:
commands = []
command_template = "mkdir -p %s ; assign_taxonomy.py -i %s -o %s -r %s -t %s -m %s %s"
for analysis in analyses:
    analysis_input_dir = join(data_dir, analysis)
    analysis_output_dir = join(results_dir, analysis)
    for dataset, reference in dataset_reference_combinations:
        dataset_input_dir = join(analysis_input_dir, dataset)
        dataset_input_seqs = join(dataset_input_dir, 'rep_set.fna')
        dataset_input_table = join(dataset_input_dir, 'table.biom')
        unzipped_reference_seqs = join(dataset_input_dir, 'ref.fna')
        full_reference_seqs = reference_dbs[reference][0]
        reference_tax = reference_dbs[reference][1]
        dataset_output_dir = join(analysis_output_dir, dataset, reference)
        for method, parameters in method_parameters_combinations.items():
            method_output_dir = join(dataset_output_dir, method)
            parameter_ids = parameters.keys()
            parameter_ids.sort()
            for parameter_combination in product(*[parameters[id_] for id_ in parameter_ids]):
                parameter_comb_id = ':'.join(map(str,parameter_combination))
                parameter_output_dir = join(method_output_dir, ''.join([parameter_comb_id, ':partial-ref']))
                parameter_str = ' '.join(['--%s %s' % e for e in zip(parameter_ids, parameter_combination)])
                command = command_template % (parameter_output_dir,
                                              dataset_input_seqs, parameter_output_dir, unzipped_reference_seqs, 
                                              reference_tax, method, parameter_str)
                if not exists(parameter_output_dir): commands.append(command)
                full_output_dir = join(method_output_dir, ''.join([parameter_comb_id, ':full-ref']))
                command = command_template % (full_output_dir,
                                              dataset_input_seqs, full_output_dir, full_reference_seqs, 
                                              reference_tax, method, parameter_str)
                if not exists(full_output_dir): commands.append(command)

In [15]:
shuffle(commands)
print commands[0]

IndexError: list index out of range

In [16]:
len(commands)

0

In [13]:
r = call_cmd.map(commands)

## Generate per-method biom tables

In [17]:
simulated_community_data_dir = join(project_dir, 'data', 'simulated-community')
biom_output_fps = []

glob_str = join(results_dir, 'simulated-community', '*', '*', '*', '*', 'rep_set_tax_assignments.txt')

In [18]:
commands = []

taxonomy_map_fps = glob(glob_str)
for taxonomy_map_fp in taxonomy_map_fps:
    dataset_id = taxonomy_map_fp.split(sep)[-5]
    biom_input_fp = join(simulated_community_data_dir, dataset_id, 'table-no-tax.biom')
    output_dir = split(taxonomy_map_fp)[0]
    biom_output_fp = join(output_dir,'table.biom')
    if exists(biom_output_fp):
        print "Output file already exists: %s" % biom_output_fp
    cmd = "biom add-metadata -i %s -o %s --observation-metadata-fp %s --observation-header otuid,taxonomy --sc-separated taxonomy --output-as-json" % (biom_input_fp, biom_output_fp, taxonomy_map_fp)
    commands.append(cmd)

In [19]:
print len(commands)
print commands[0]

440
biom add-metadata -i /home/ubuntu/data/short-read-tax-assignment/data/simulated-community/F1-iter2/table-no-tax.biom -o /home/ubuntu/data/2015.06.24-tax-parameter-sweep-simulated/simulated-community/F1-iter2/unite-97-rep-set/mothur/0.0:partial-ref/table.biom --observation-metadata-fp /home/ubuntu/data/2015.06.24-tax-parameter-sweep-simulated/simulated-community/F1-iter2/unite-97-rep-set/mothur/0.0:partial-ref/rep_set_tax_assignments.txt --observation-header otuid,taxonomy --sc-separated taxonomy --output-as-json


In [20]:
r = call_cmd.map(commands)

## Remove uncompressed reference databases

In [21]:
commands = []
for e in dataset_reference_combinations:
    unzipped_refseqs_fp = join(data_dir, 'simulated-community', e[0], 'ref.fna')
    cmd = "rm %s" % unzipped_refseqs_fp
    commands.append(cmd)

In [22]:
print len(commands)
print commands[0]

20
rm /home/ubuntu/data/short-read-tax-assignment/data/simulated-community/B1-iter0/ref.fna


In [23]:
r = call_cmd.map(commands)

## Move result files into repository

It's a good idea to back up your ``results_dir`` prior to running this step (e.g., by creating a ``tgz`` of it). 

In [42]:
precomputed_results_dir = join(project_dir, "data", "precomputed-results", "simulated-community")
method_dirs = glob(join(results_dir, '*', '*', '*', '*'))

In [43]:
commands = []
for method_dir in method_dirs:
    fields = method_dir.split(sep)
    dataset_id, database_id, method_id  = fields[-3], fields[-2], fields[-1] 

    new_location = join(precomputed_results_dir, dataset_id, database_id)
    if exists(join(new_location, method_id)):
        rmtree(join(new_location, method_id))
    cmd = "mv -f %s %s" % (method_dir, new_location)
    commands.append(cmd)

In [44]:
print len(commands)
print commands[0]

80
mv -f /home/ubuntu/data/2015.02.25-tax-parameter-sweep-simulated/simulated-community/F1-iter2/unite-97-rep-set/sortmerna /home/ubuntu/data/short-read-tax-assignment/data/precomputed-results/simulated-community/F1-iter2/unite-97-rep-set


In [45]:
r = call_cmd.map(commands)

## Remove directories for any failed runs

In [9]:
from os.path import getsize
dirs = glob(expandvars("$HOME/data/2015.02.25-tax-parameter-sweep-simulated/simulated-community/*/*/*/*/"))
bad_dirs = []
for d in dirs:
    fp = join(d,'rep_set_tax_assignments.txt')
    if not exists(fp) or getsize(fp) < 38:
        bad_dirs.append(d)
print len(dirs), len(bad_dirs)

2720 1


In [10]:
print bad_dirs

['/home/ubuntu/data/2015.02.25-tax-parameter-sweep-simulated/simulated-community/B1-iter1/gg_13_8_otus/sortmerna/0.76:0.9:5:0.8:1.0:partial-ref/']


In [11]:
for e in bad_dirs:
    !rm -r $e