Prepare the environment
-----------------------

First we'll import various functions that we'll need for generating the report.

In [1]:
%matplotlib inline

from os.path import join, exists, expandvars
import pandas as pd
from skbio.draw import boxplots

from taxcompare.eval_framework import (get_expected_tables_lookup, 
                                       find_and_process_result_tables,
                                       compute_mock_results,
                                       compute_mantel,
                                       generate_pr_scatter_plots,
                                       boxplot_from_data_frame,
                                       heatmap_from_data_frame,
                                       method_by_dataset_a1,
                                       method_by_dataset_a2,
                                       performance_rank_comparisons,
                                       parameter_comparisons)

Configure local environment-specific values
-------------------------------------------

**This is the only cell that you will need to edit to generate reports locally.** After editing this cell, you can run all cells in this notebook to generate your analysis report. Some of the analyses may take a few minutes to run, and analyses at more specific taxonomic levels (e.g., genus or species) will be slower than analyses at more general taxonomic levels (e.g., phylum, class).

**This cell will not run until you fill in a taxonomic level (``2`` through ``7``).**

In [2]:
## project_dir should be the directory where you've downloaded (or cloned) the 
## short-read-tax-assignment repository. 
project_dir = expandvars("$HOME/data/short-read-tax-assignment")

## The precomputed results are included with the project, so their location is
## derived from project_dir: editing project_dir is enough to relocate them.
## The trailing "" keeps the trailing path separator of the previous literal value.
precomputed_results_dir = join(project_dir, "data", "precomputed-results", "")
expected_results_dir = join(precomputed_results_dir, "simulated-community")

## results_dirs should contain the directory or directories where
## results can be found. By default, this is just the precomputed 
## results included with the project. If other results should be included, 
## absolute paths to those directories should be added to this list.
## Every entry must contain a "simulated-community" subdirectory.
results_dirs = [
    precomputed_results_dir,
    expandvars("$HOME/data/2015.06.24-tax-parameter-sweep-simulated"),
]

## Forwarded to compute_mock_results as its new_param_ids argument: maps a
## method name to a list of parameter names.
## NOTE(review): presumably the parameter IDs for methods the evaluation
## framework does not already know about -- confirm against compute_mock_results.
new_param_ids = {'mothur': ['confidence']}

## Taxonomic level at which analyses should be performed. Edit this to
## the desired taxonomic level. 
# 2: phylum, 3: class, 4: order, 5: family, 6: genus, 7: species
taxonomic_level = 2

## Reference choice (must be partial-ref or full-ref). Results are kept only
## when their 'Parameters' value ends with this string.
reference_choice = "partial-ref"

## Minimum number of times an OTU must be observed for it to be included in analyses. Edit this
## to analyze the effect of the minimum count on taxonomic results.
min_count = 1

# set to true if select tables should be written as Excel files (useful for publication)
write_xls_files = False

In [3]:
# Each results directory is expected to hold a "simulated-community"
# subdirectory; build those paths, then fail early if any of them is missing.
simulated_results_dirs = [join(d, "simulated-community") for d in results_dirs]

for simulated_results_dir in simulated_results_dirs:
    assert exists(simulated_results_dir), (
        "Simulated community result directory doesn't exist: %s" % simulated_results_dir
    )

Find pre-computed tables, expected tables, and "query" tables
-------------------------------------------------------------

Next we'll use the paths defined above to find all of the tables that will be compared. These include the *pre-computed result* tables (i.e., the ones that the new methods will be compared to), the *expected result* tables (i.e., the tables containing the known composition of the mock microbial communities), and the *query result* tables (i.e., the tables generated with the new method(s) that we want to compare to the *pre-computed result* tables).

In [4]:
# Gather the processed result tables from every simulated-community directory
# into one flat list. The list is rebuilt from scratch on each run of the cell.
results = []
for simulated_results_dir in simulated_results_dirs:
    results.extend(find_and_process_result_tables(simulated_results_dir))

In [None]:
# Uncomment for test runs (looks at a small subset of the data)

# from random import shuffle
# shuffle(results)
# results = results[:10]

In [None]:
# Per-level cache of the evaluation results. Only the file-name template is
# %-formatted, so a '%' elsewhere in the directory path cannot break it.
result_fp = join(precomputed_results_dir, 'simulated-community',
                 'level%d-results-w-mothur.csv' % taxonomic_level)

if exists(result_fp):
    # read_csv(index_col=0, parse_dates=True) is the documented equivalent of
    # DataFrame.from_csv, which has been removed from current pandas releases.
    simulated_results = pd.read_csv(result_fp, index_col=0, parse_dates=True)
else:
    expected_tables = get_expected_tables_lookup(expected_results_dir, level=taxonomic_level)
    simulated_results = compute_mock_results(results, expected_tables, taxonomy_level=taxonomic_level, min_count=min_count, new_param_ids=new_param_ids)
    # Write the results to the cache file. Calling to_csv() with no path only
    # returns the CSV text, so the cache was never created and every run recomputed.
    simulated_results.to_csv(result_fp)

In [None]:
# Boolean mask, one entry per row: True where the row's 'Parameters' string
# ends with the configured reference choice. Only those rows are kept.
refernece_choice_v = [
    params.endswith(reference_choice)
    for params in simulated_results['Parameters']
]
simulated_results = simulated_results[refernece_choice_v]

Evaluation 1: Compute and summarize precision, recall, and F-measure
-------------------------------------------------------------------

In this evaluation, we compute and summarize precision, recall, and F-measure of each result (pre-computed and query) based on the known composition of the simulated communities. We then summarize the results in two ways: first with boxplots, and second with a table of the top methods based on their F-measures.

In [None]:
# Boxplot of the Precision values in simulated_results, grouped by Method.
boxplot_from_data_frame(simulated_results, group_by="Method", metric="Precision")

In [None]:
# Boxplot of the Recall values in simulated_results, grouped by Method.
boxplot_from_data_frame(simulated_results, group_by="Method", metric="Recall")

In [None]:
# Boxplot of the F-measure values in simulated_results, grouped by Method.
boxplot_from_data_frame(simulated_results, group_by="Method", metric="F-measure")

In [None]:
# Boxplot of the Precision values in simulated_results, grouped by Dataset.
boxplot_from_data_frame(simulated_results, group_by="Dataset", metric="Precision")

In [None]:
# Boxplot of the Recall values in simulated_results, grouped by Dataset.
boxplot_from_data_frame(simulated_results, group_by="Dataset", metric="Recall")

In [None]:
# Boxplot of the F-measure values in simulated_results, grouped by Dataset.
boxplot_from_data_frame(simulated_results, group_by="Dataset", metric="F-measure")

In [None]:
# Heatmap of the Precision values in simulated_results.
# NOTE(review): the axes are chosen inside heatmap_from_data_frame
# (presumably method by dataset) -- confirm.
heatmap_from_data_frame(simulated_results, "Precision")

In [None]:
# Heatmap of the Recall values in simulated_results.
# NOTE(review): the axes are chosen inside heatmap_from_data_frame
# (presumably method by dataset) -- confirm.
heatmap_from_data_frame(simulated_results, "Recall")

In [None]:
# Heatmap of the F-measure values in simulated_results.
# NOTE(review): the axes are chosen inside heatmap_from_data_frame
# (presumably method by dataset) -- confirm.
heatmap_from_data_frame(simulated_results, "F-measure")

In [None]:
# Summary of simulated_results for 'B1-iter0'.
# NOTE(review): 'B1-iter0' is presumably a dataset ID and the output a
# per-method table -- confirm against method_by_dataset_a1.
method_by_dataset_a1(simulated_results, 'B1-iter0')

In [None]:
# Summary of simulated_results for 'B2-iter0'.
# NOTE(review): 'B2-iter0' is presumably a dataset ID and the output a
# per-method table -- confirm against method_by_dataset_a1.
method_by_dataset_a1(simulated_results, 'B2-iter0')

In [None]:
# Summary of simulated_results for 'F1-iter0'.
# NOTE(review): 'F1-iter0' is presumably a dataset ID and the output a
# per-method table -- confirm against method_by_dataset_a1.
method_by_dataset_a1(simulated_results, 'F1-iter0')

In [None]:
# Summary of simulated_results for 'F2-iter0'.
# NOTE(review): 'F2-iter0' is presumably a dataset ID and the output a
# per-method table -- confirm against method_by_dataset_a1.
method_by_dataset_a1(simulated_results, 'F2-iter0')

## Evaluation 2: Rank-based statistics comparing the performance of the optimal parameter setting run for each method on each data set.

The *Count best* column indicates the number of samples for which a given method achieved the best result or tied for the best result (which is why the counts sum to more than the total number of samples).

### Within-method comparisons of parameter performance

In [None]:
# Within-method comparison of the "rdp" parameter settings on precision,
# recall and F-measure. The first row's index is later used as the method's
# chosen parameter set; the first 15 rows are displayed here.
rdp_top_params = parameter_comparisons(
    simulated_results,
    "rdp",
    metrics=['Precision', 'Recall', 'F-measure'],
)
rdp_top_params[:15]

In [None]:
# Within-method comparison of the "uclust" parameter settings on precision,
# recall and F-measure. The first row's index is later used as the method's
# chosen parameter set; the first 15 rows are displayed here.
uclust_top_params = parameter_comparisons(
    simulated_results,
    "uclust",
    metrics=['Precision', 'Recall', 'F-measure'],
)
uclust_top_params[:15]

In [None]:
# Within-method comparison of the "sortmerna" parameter settings on precision,
# recall and F-measure. The first row's index is later used as the method's
# chosen parameter set; the first 15 rows are displayed here.
sortmerna_top_params = parameter_comparisons(
    simulated_results,
    "sortmerna",
    metrics=['Precision', 'Recall', 'F-measure'],
)
sortmerna_top_params[:15]

In [None]:
# Within-method comparison of the "blast" parameter settings on precision,
# recall and F-measure. The first row's index is later used as the method's
# chosen parameter set; the first 15 rows are displayed here.
blast_top_params = parameter_comparisons(
    simulated_results,
    "blast",
    metrics=['Precision', 'Recall', 'F-measure'],
)
blast_top_params[:15]

In [None]:
# Within-method comparison of the "mothur" parameter settings on precision,
# recall and F-measure. The first row's index is later used as the method's
# chosen parameter set; the first 15 rows are displayed here.
mothur_top_params = parameter_comparisons(
    simulated_results,
    "mothur",
    metrics=['Precision', 'Recall', 'F-measure'],
)
mothur_top_params[:15]

### Between-method performance comparisons based on best parameter set determined above

In [None]:
# Map each method to the index label of the first row of its parameter
# comparison table. Pairs are listed in the original key order so that
# iteration order over mp_combs is unchanged.
top_params_by_method = [
    ("rdp", rdp_top_params),
    ("blast", blast_top_params),
    ("sortmerna", sortmerna_top_params),
    ("uclust", uclust_top_params),
    ("mothur", mothur_top_params),
]
mp_combs = {method: top_params.index[0]
            for method, top_params in top_params_by_method}

In [None]:
# Between-method rank comparison on Precision, with each method restricted to
# the parameter set recorded in mp_combs. Optionally exported to Excel.
metric = 'Precision'
df = performance_rank_comparisons(simulated_results, metric, mp_combs)
if write_xls_files:
    xls_fp = 'tables/level%d_%s_rankstats.xlsx' % (taxonomic_level, metric)
    df.to_excel(xls_fp, na_rep='NA', float_format="%1.3f")
df

In [None]:
# Between-method rank comparison on Recall, with each method restricted to
# the parameter set recorded in mp_combs. Optionally exported to Excel.
metric = 'Recall'
df = performance_rank_comparisons(simulated_results, metric, mp_combs)
if write_xls_files:
    xls_fp = 'tables/level%d_%s_rankstats.xlsx' % (taxonomic_level, metric)
    df.to_excel(xls_fp, na_rep='NA', float_format="%1.3f")
df

In [None]:
# Between-method rank comparison on F-measure, with each method restricted to
# the parameter set recorded in mp_combs. Optionally exported to Excel.
metric = 'F-measure'
df = performance_rank_comparisons(simulated_results, metric, mp_combs)
if write_xls_files:
    xls_fp = 'tables/level%d_%s_rankstats.xlsx' % (taxonomic_level, metric)
    df.to_excel(xls_fp, na_rep='NA', float_format="%1.3f")
df