In [1]:
import os
from os.path import join
import csv
from collections import defaultdict, Counter
import json

import biom
import pandas
from qiime2 import Artifact
from qiime2.plugins.clawback.methods import generate_class_weights
from qiime2.plugins.feature_classifier.methods import (
    classify_sklearn, fit_classifier_naive_bayes)
from qiime2.plugins import feature_table
from qiime2.plugins import taxa

from paycheck.cross_validate import save_observed, extract_sample

In [2]:
# Root directory holding the HMP inputs (16s/, shotgun/, tmp/) and results/.
# Set the HMP_DATA_DIR environment variable to run on another machine; the
# original location is kept as the default.
data_dir = os.environ.get('HMP_DATA_DIR', '/Users/benkaehler/Data/HMP/')

### Define some dodgy functions

In [3]:
def _collapse_frequency_table_to_relative_frequency(table, taxonomy, level=7):
    for level in range(level, 0, -1):
        # to cater for truncated classifications
        try:
            table_l7 = taxa.methods.collapse(table=table, level=level, taxonomy=taxonomy)
            break
        except ValueError:
            pass
    rel_freq = feature_table.methods.relative_frequency(table_l7.collapsed_table)
    obs_table = rel_freq.relative_frequency_table.view(pandas.DataFrame)
    return obs_table

def sixteen_ess_2_shotgun(taxonomy, abundances, corrections):
    """Rescale one sample's collapsed 16S relative frequencies by ``corrections``.

    The abundances are collapsed with
    ``_collapse_frequency_table_to_relative_frequency``; every taxon column in
    the row labelled ``'Frequency'`` is multiplied by the value
    ``get_correction`` returns for it, and the table is then divided by the
    first row's total.

    Parameters
    ----------
    taxonomy
        Taxonomy artifact used to collapse ``abundances``.
    abundances
        Frequency table artifact whose row is labelled ``'Frequency'``.
    corrections
        Per-level correction dicts as produced by ``train_corrections``.

    Returns
    -------
    pandas.DataFrame
        The corrected, renormalised frequencies.
    """
    freqs = _collapse_frequency_table_to_relative_frequency(abundances, taxonomy)
    for taxon in freqs.columns:
        # Single .loc assignment: the chained form freqs[taxon]['Frequency'] = ...
        # writes to a temporary (and is a no-op) under pandas copy-on-write.
        freqs.loc['Frequency', taxon] *= get_correction(taxon, corrections)
    # .iloc[0] is explicitly positional; Series[0] on a label index relies on
    # the deprecated integer-position fallback.
    return freqs / freqs.sum(axis=1).iloc[0]

def shotgun_2_sixteen_ess(weights, corrections):
    """Divide shotgun class weights by ``corrections`` and renormalise.

    ``weights`` is viewed as a ``pandas.DataFrame``; every taxon column in the
    row labelled ``'Weight'`` is divided by the value ``get_correction``
    returns for it, the table is divided by the first row's total, and the
    result is imported as a ``FeatureTable[RelativeFrequency]`` artifact.

    Parameters
    ----------
    weights
        Artifact viewable as a DataFrame with a ``'Weight'`` row and one
        column per taxon.
    corrections
        Per-level correction dicts as produced by ``train_corrections``.

    Returns
    -------
    Artifact
        The corrected weights as ``FeatureTable[RelativeFrequency]``.
    """
    freqs = weights.view(pandas.DataFrame)
    for taxon in freqs.columns:
        # Single .loc assignment: the chained form freqs[taxon]['Weight'] = ...
        # writes to a temporary (and is a no-op) under pandas copy-on-write.
        freqs.loc['Weight', taxon] /= get_correction(taxon, corrections)
    # .iloc[0] is explicitly positional; Series[0] on a label index relies on
    # the deprecated integer-position fallback.
    freqs = freqs / freqs.sum(axis=1).iloc[0]
    return Artifact.import_data('FeatureTable[RelativeFrequency]', freqs)

def get_correction(taxon, corrections):
    """Return the correction for ``taxon``, backing off to coarser ranks.

    ``corrections[0]`` is searched for the full ``';'``-separated taxon
    string; on a miss the last rank is dropped and ``corrections[1]`` is
    searched, and so on until a single rank remains.

    Parameters
    ----------
    taxon : str
        ``';'``-separated taxonomy string.
    corrections : sequence of dict
        ``corrections[i]`` maps taxon strings with ``i`` trailing ranks
        removed to their correction value.

    Returns
    -------
    The correction stored for the most specific matching prefix.

    Raises
    ------
    ValueError
        If neither ``taxon`` nor any of its prefixes has a correction.
    """
    candidate = taxon
    for i in range(taxon.count(';') + 1):
        if candidate in corrections[i]:
            return corrections[i][candidate]
        # Drop the most specific remaining rank and try the next level.
        candidate = candidate.rpartition(';')[0]
    # Previously this surfaced as an opaque tuple-unpacking ValueError.
    raise ValueError('no correction found for taxon %r at any level' % (taxon,))
        
def train_corrections(weights_16s, weights_shotgun, min_weight=0.001):
    """Compute shotgun/16S weight ratios at every taxonomic level.

    Both inputs are viewed as DataFrames with a ``'Weight'`` row and one
    column per ``';'``-separated taxon. All taxa must have the same number of
    ranks and both inputs must cover the same taxa (checked with ``assert``).

    Returns a list with one dict per level, finest level first. A taxon gets
    ``shotgun / 16s`` when both weights exceed ``min_weight``; otherwise it is
    omitted, except at the coarsest level where it gets ``1.``. After each
    level the weights are summed onto the parent taxon.
    """
    def as_weight_map(artifact):
        # taxon -> weight, read from the transposed single-row table
        frame = artifact.view(pandas.DataFrame).T
        return dict(zip(frame.index, frame['Weight']))

    def roll_up(weight_map):
        # add each weight onto the taxon with its last rank removed
        parents = Counter()
        for name, value in weight_map.items():
            parent = name.rsplit(';', 1)[0]
            parents[parent] += value
        return parents

    by_taxon_16s = as_weight_map(weights_16s)
    by_taxon_shotgun = as_weight_map(weights_shotgun)

    # Every taxon, in both tables, must have the same number of ranks.
    depths = {name.count(';') + 1 for name in by_taxon_16s}
    assert len(depths) <= 1
    levels = depths.pop() if depths else None
    assert all(name.count(';') + 1 == levels for name in by_taxon_shotgun)
    assert set(by_taxon_16s) == set(by_taxon_shotgun)

    corrections = []
    for depth in range(levels):
        coarsest = depth == levels - 1
        ratios = {}
        for name, w16 in by_taxon_16s.items():
            wsg = by_taxon_shotgun[name]
            if wsg > min_weight and w16 > min_weight:
                ratios[name] = wsg / w16
            elif coarsest:
                ratios[name] = 1.
        corrections.append(ratios)
        by_taxon_16s = roll_up(by_taxon_16s)
        by_taxon_shotgun = roll_up(by_taxon_shotgun)
    return corrections

### Generate the non-shotgun weights
```bash
cd $data_dir
qiime tools import --type FeatureData[Sequence] --input-path narfgenes/refseq-formatted.fasta --output-path tmp/ref-seq.qza
qiime tools import --type FeatureData[Taxonomy] --input-path narfgenes/refseq-taxonomy.txt --output-path tmp/ref-tax.qza --source-format HeaderlessTSVTaxonomyFormat
qiime feature-classifier extract-reads --p-f-primer CCTACGGGAGGCAGCAG --p-r-primer CCGTCAATTCMTTTRAGT --i-sequences tmp/ref-seq.qza --o-reads tmp/ref-seq-v35.qza
qiime feature-classifier fit-classifier-naive-bayes --i-reference-reads tmp/ref-seq-v35.qza --i-reference-taxonomy tmp/ref-tax.qza --p-classify--alpha 0.001 --p-feat-ext--ngram-range '[7,7]' --o-classifier tmp/uniform-classifier.qza
qiime clawback sequence-variants-from-samples --i-samples ../empo_3/animal-distal-gut/sv.qza --o-sequences tmp/adg-sv.qza
qiime feature-classifier classify-sklearn --i-reads tmp/adg-sv.qza --i-classifier tmp/uniform-classifier.qza --p-n-jobs 8 --p-confidence -1 --o-classification tmp/adg-classification.qza
qiime clawback generate-class-weights --i-reference-taxonomy tmp/ref-tax.qza --i-reference-sequences tmp/ref-seq-v35.qza --i-samples ../empo_3/animal-distal-gut/sv.qza --i-taxonomy-classification tmp/adg-classification.qza --o-class-weight tmp/adg-weights.qza
qiime feature-classifier fit-classifier-naive-bayes --i-reference-reads tmp/ref-seq-v35.qza --i-reference-taxonomy tmp/ref-tax.qza --i-class-weight tmp/adg-weights.qza --p-classify--alpha 0.001 --p-feat-ext--ngram-range '[7,7]' --o-classifier tmp/adg-classifier.qza
```

### Load some data

In [4]:
# Reference taxonomy and extracted reference reads written to tmp/ by the
# shell commands in the previous section.
ref_tax = Artifact.load(join(data_dir, 'tmp', 'ref-tax.qza'))
ref_seq = Artifact.load(join(data_dir, 'tmp', 'ref-seq-v35.qza'))
# 16S representative sequences and the matching feature table.
seqs_16s = Artifact.load(join(data_dir, '16s', 'rep-seqs-340.qza'))
test_samples = Artifact.load(join(data_dir, '16s', 'table-340.qza'))
# From here on test_samples is a biom.Table, not an Artifact.
test_samples = test_samples.view(biom.Table)
shotgun_table = biom.load_table(join(data_dir, 'shotgun', 'shotgun_samples.biom'))
def kludge(_id, x):
    # Drop one trailing ';__' from an observation id; the second argument is unused.
    return _id[:-3] if _id.endswith(';__') else _id
# Re-key the shotgun observations by the stripped ids (no normalisation).
# NOTE(review): presumably observations that end up with the same id are summed
# by biom's default collapse function -- confirm.
shotgun_table = shotgun_table.collapse(kludge, norm=False, axis='observation')

### Partition samples for cross validation

In [5]:
# Read the metadata rows, reordering the columns to (col 2, col 0, col 1, col 3)
# so that sorting groups rows by the third column of the file first.
with open(join(data_dir, '16s', 'metadata.tsv')) as fh:
    metadata = [(r[2], r[0], r[1], r[3])
                for r in csv.reader(fh, delimiter='\t')]
k = 5
# Deal the sorted rows round-robin into k folds: fold j gets rows j, j+k, j+2k, ...
sorted_rows = sorted(metadata)
fold_rows = [sorted_rows[fold::k] for fold in range(k)]
# check that donors are stratified as well as possible
assert sum(len({r[1] for r in rows}) for rows in fold_rows) == len(metadata)
# Keep only the third tuple field (second column of the file) for each fold.
partitions = [[r[2] for r in rows] for rows in fold_rows]

### Generate the shotgun weights

In [6]:
# One set of class weights per cross-validation fold, built from the shotgun
# samples of every partition except the held-out partition i.
shotgun_weights = []
for i in range(k):
    # ids of all samples outside fold i
    sample_ids = [_id for j in range(k) if j != i for _id in partitions[j]]
    samples = shotgun_table.filter(sample_ids, inplace=False)
    samples = Artifact.import_data('FeatureTable[Frequency]', samples)
    # NOTE(review): unlike the generate_class_weights calls in the
    # "Classify under shotgun weights" cell, no taxonomy classification is
    # passed here and `upsample` is used -- confirm this matches the installed
    # q2-clawback signature.
    weights = generate_class_weights(ref_tax, ref_seq, samples, upsample=True)
    shotgun_weights.append(weights.class_weight)

### Save 16S abundances

In [7]:
# Write one single-sample FeatureTable[Frequency] artifact per 16S sample to
# results/abundance_16s/<sample_id>.qza.
abundance_dir = join(data_dir, 'results', 'abundance_16s')
# makedirs also creates results/ on a fresh run, where os.mkdir would fail.
os.makedirs(abundance_dir, exist_ok=True)
for sample_id in test_samples.ids():
    sample = extract_sample([sample_id], test_samples)
    ids = sample.ids(axis='observation')
    # One row labelled 'Frequency', one column per observation id.
    df = pandas.DataFrame(dict(zip(ids, sample.data(sample_id))),
                          index=['Frequency'], columns=ids)
    Artifact.import_data('FeatureTable[Frequency]', df).save(
        join(abundance_dir, sample_id + '.qza'))

### Save shotgun abundances

In [8]:
# For every partitioned sample, save the shotgun table's observation ids as a
# FeatureData[Taxonomy] artifact (results/expected) and its counts as a
# FeatureTable[Frequency] artifact (results/abundance).
expected_dir = join(data_dir, 'results', 'expected')
# makedirs also creates results/ on a fresh run, where os.mkdir would fail.
os.makedirs(expected_dir, exist_ok=True)
abundance_dir = join(data_dir, 'results', 'abundance')
os.makedirs(abundance_dir, exist_ok=True)
for sample_id in [_id for j in range(k) for _id in partitions[j]]:
    sample = extract_sample([sample_id], shotgun_table)
    ids = sample.ids(axis='observation')
    # The observation id serves as both the feature id and the taxon string.
    df = pandas.DataFrame({'Taxon': ids}, index=ids, columns=['Taxon'])
    df.index.name = 'Feature ID'
    Artifact.import_data('FeatureData[Taxonomy]', df).save(
        join(expected_dir, sample_id + '.qza'))
    # One row labelled 'Frequency', one column per observation id.
    df = pandas.DataFrame(dict(zip(ids, sample.data(sample_id))),
                          index=['Frequency'], columns=ids)
    Artifact.import_data('FeatureTable[Frequency]', df).save(
        join(abundance_dir, sample_id + '.qza'))

### Classify under uniform weights

In [9]:
# Classify the 16S representative sequences with the classifier trained
# without class weights (tmp/uniform-classifier.qza from the shell commands above).
classifier = Artifact.load(join(data_dir, 'tmp', 'uniform-classifier.qza'))
classification = classify_sklearn(seqs_16s, classifier, n_jobs=4).classification

In [10]:
# Later cells load per-sample classifications from results/uniform/<id>.qza,
# which this call is expected to produce.
save_observed(join(data_dir, 'results'), test_samples, classification, 'uniform')

In [11]:
# Collapse each sample's 16S abundances using its uniform-weights
# classification and pickle the relative frequencies to results/uniform-rel-freqs.
dodge_dir = join(data_dir, 'results', 'uniform-rel-freqs')
# makedirs tolerates an existing directory and creates missing parents.
os.makedirs(dodge_dir, exist_ok=True)
for _, _, id_, _ in metadata:
    classification = Artifact.load(join(data_dir, 'results', 'uniform', id_+'.qza'))
    abundances = Artifact.load(join(data_dir, 'results', 'abundance_16s', id_+'.qza'))
    freqs = _collapse_frequency_table_to_relative_frequency(abundances, classification)
    freqs.to_pickle(join(dodge_dir, id_+'.pkl'))

#uniform_corrections = []
#for i in range(k):
#    sample_ids = [_id for j in range(k) if j != i for _id in partitions[j]]
#    samples = test_samples.filter(sample_ids, inplace=False)
#    samples = Artifact.import_data('FeatureTable[Frequency]', samples)
#    weights = generate_class_weights(ref_tax, ref_seq, samples, taxonomy, normalise=True)
#    weights = weights.class_weight
#    corrections = train_corrections(weights, shotgun_weights[i], 100)
#    uniform_corrections.append(corrections)
#    for id_ in partitions[i]:
#        classification = Artifact.load(join(data_dir, 'results', 'uniform', id_+'.qza'))
#        abundances = Artifact.load(join(data_dir, 'results', 'abundance_16s', id_+'.qza'))
#        sg = sixteen_ess_2_shotgun(classification, abundances, corrections)
#        sg.to_pickle(join(dodge_dir, id_+'.pkl'))

### Classify under shotgun weights

In [12]:
# Cross-validated classification with bias-corrected shotgun class weights.
# For each fold: train corrections on the other folds, fit a classifier on the
# corrected shotgun weights, classify, save the held-out samples'
# classifications, then convert their 16S frequencies to the shotgun scale.
dodge_dir = join(data_dir, 'results', 'shotgun-rel-freqs')
os.makedirs(dodge_dir, exist_ok=True)
# Load the uniform classifier explicitly: the global `classifier` is rebound
# inside the loop below, so relying on it made a re-run of this cell compute
# `uniform_taxonomy` with the previous run's last-fold classifier.
uniform_classifier = Artifact.load(join(data_dir, 'tmp', 'uniform-classifier.qza'))
uniform_taxonomy = classify_sklearn(
    seqs_16s, uniform_classifier, confidence=-1, n_jobs=4).classification
for i in range(k):
    # Training samples are every partition except the held-out fold i.
    sample_ids = [_id for j in range(k) if j != i for _id in partitions[j]]
    train_samples = test_samples.filter(sample_ids, inplace=False)
    train_samples = Artifact.import_data('FeatureTable[Frequency]', train_samples)
    weights = generate_class_weights(ref_tax, ref_seq, train_samples, uniform_taxonomy, normalise=True)
    weights = weights.class_weight
    corrections = train_corrections(weights, shotgun_weights[i])
    weights = shotgun_2_sixteen_ess(shotgun_weights[i], corrections)

    classifier = fit_classifier_naive_bayes(
        ref_seq, ref_tax, weights,
        classify__alpha=0.001, feat_ext__ngram_range='[7,7]').classifier
    classification = classify_sklearn(seqs_16s, classifier, n_jobs=4).classification
    samples = test_samples.filter(partitions[i], inplace=False)
    save_observed(join(data_dir, 'results'), samples, classification, 'shotgun')

    # Retrain the corrections from the new classifier's own classifications.
    # NOTE(review): this call omits normalise=True, unlike the call above --
    # confirm that is intended.
    taxonomy = classify_sklearn(seqs_16s, classifier, confidence=-1, n_jobs=4).classification
    weights = generate_class_weights(ref_tax, ref_seq, train_samples, taxonomy)
    weights = weights.class_weight

    corrections = train_corrections(weights, shotgun_weights[i])
    for id_ in partitions[i]:
        cl_fp = join(data_dir, 'results', 'shotgun', id_+'.qza')
        print(cl_fp)
        classification = Artifact.load(cl_fp)
        ab_fp = join(data_dir, 'results', 'abundance_16s', id_+'.qza')
        print(ab_fp)
        abundances = Artifact.load(ab_fp)
        sg = sixteen_ess_2_shotgun(classification, abundances, corrections)
        sg_fp = join(dodge_dir, id_+'.pkl')
        print(sg_fp)
        sg.to_pickle(sg_fp)

        # Sanity check: the pickled columns, with ';__' padding removed, must
        # be exactly the taxa present in the saved classification.
        saved_taxa = set(classification.view(pandas.DataFrame)['Taxon'])
        pickled_taxa = set(
            taxon.replace(';__', '') for taxon in pandas.read_pickle(sg_fp).T.index)
        assert saved_taxa == pickled_taxa



/Users/benkaehler/Data/HMP/results/shotgun/700097859.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700097859.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700097859.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700032244.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700032244.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700032244.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700106170.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700106170.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700106170.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700023788.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700023788.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700023788.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700024866.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700024866.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700024866.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700016142.qza
/Users/b



/Users/benkaehler/Data/HMP/results/shotgun/700101243.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700101243.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700101243.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700034254.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700034254.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700034254.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700023845.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700023845.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700023845.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700114653.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700114653.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700114653.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700037738.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700037738.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700037738.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700016610.qza
/Users/b



/Users/benkaehler/Data/HMP/results/shotgun/700015981.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700015981.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700015981.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700034794.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700034794.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700034794.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700103710.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700103710.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700103710.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700114480.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700114480.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700114480.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700097196.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700097196.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700097196.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700016765.qza
/Users/b



/Users/benkaehler/Data/HMP/results/shotgun/700101534.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700101534.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700101534.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700106465.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700106465.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700106465.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700023267.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700023267.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700023267.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700106056.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700106056.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700106056.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700098561.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700098561.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700098561.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700100312.qza
/Users/b



/Users/benkaehler/Data/HMP/results/shotgun/700033153.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700033153.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700033153.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700038761.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700038761.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700038761.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700114125.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700114125.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700114125.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700024752.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700024752.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700024752.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700015250.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700015250.qza
/Users/benkaehler/Data/HMP/results/shotgun-rel-freqs/700015250.pkl
/Users/benkaehler/Data/HMP/results/shotgun/700016716.qza
/Users/b

### Save down the fold information

In [13]:
# Record which sample ids were held out in each fold, as
# results/folds/fold-<i>/sample_test.json.
# makedirs creates missing parents (including results/) and tolerates re-runs.
os.makedirs(join(data_dir, 'results', 'folds'), exist_ok=True)
for i in range(k):
    fold_dir = join(data_dir, 'results', 'folds', 'fold-' + str(i))
    os.makedirs(fold_dir, exist_ok=True)
    with open(join(fold_dir, 'sample_test.json'), 'w') as st_fh:
        json.dump(partitions[i], st_fh)

### Classify under shotgun weights without bias correction

In [14]:
# Cross-validated classification using the raw (uncorrected) shotgun class
# weights; relative frequencies go to results/nude-shotgun-rel-freqs.
dodge_dir = join(data_dir, 'results', 'nude-shotgun-rel-freqs')
os.makedirs(dodge_dir, exist_ok=True)
for i in range(k):
    classifier = fit_classifier_naive_bayes(
        ref_seq, ref_tax, shotgun_weights[i],
        classify__alpha=0.001, feat_ext__ngram_range='[7,7]').classifier
    classification = classify_sklearn(seqs_16s, classifier, n_jobs=4).classification
    samples = test_samples.filter(partitions[i], inplace=False)
    save_observed(join(data_dir, 'results'), samples, classification, 'nude-shotgun')

    for id_ in partitions[i]:
        cl_fp = join(data_dir, 'results', 'nude-shotgun', id_+'.qza')
        print(cl_fp)
        classification = Artifact.load(cl_fp)
        ab_fp = join(data_dir, 'results', 'abundance_16s', id_+'.qza')
        print(ab_fp)
        abundances = Artifact.load(ab_fp)
        freqs = _collapse_frequency_table_to_relative_frequency(abundances, classification)
        freqs_fp = join(dodge_dir, id_+'.pkl')
        print(freqs_fp)
        # Written once; the original pickled the same frame to this path twice.
        freqs.to_pickle(freqs_fp)

        # Sanity check: the pickled columns, with ';__' padding removed, must
        # be exactly the taxa present in the saved classification.
        saved_taxa = set(classification.view(pandas.DataFrame)['Taxon'])
        pickled_taxa = set(
            taxon.replace(';__', '') for taxon in pandas.read_pickle(freqs_fp).T.index)
        assert saved_taxa == pickled_taxa



/Users/benkaehler/Data/HMP/results/nude-shotgun/700097859.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700097859.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700097859.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700032244.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700032244.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700032244.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700106170.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700106170.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700106170.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700023788.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700023788.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700023788.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700024866.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700024866.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700024866.pkl
/Users/benkaehl



/Users/benkaehler/Data/HMP/results/nude-shotgun/700101243.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700101243.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700101243.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700034254.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700034254.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700034254.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700023845.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700023845.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700023845.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700114653.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700114653.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700114653.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700037738.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700037738.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700037738.pkl
/Users/benkaehl



/Users/benkaehler/Data/HMP/results/nude-shotgun/700015981.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700015981.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700015981.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700034794.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700034794.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700034794.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700103710.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700103710.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700103710.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700114480.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700114480.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700114480.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700097196.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700097196.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700097196.pkl
/Users/benkaehl



/Users/benkaehler/Data/HMP/results/nude-shotgun/700101534.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700101534.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700101534.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700106465.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700106465.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700106465.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700023267.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700023267.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700023267.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700106056.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700106056.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700106056.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700098561.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700098561.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700098561.pkl
/Users/benkaehl



/Users/benkaehler/Data/HMP/results/nude-shotgun/700033153.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700033153.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700033153.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700038761.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700038761.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700038761.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700114125.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700114125.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700114125.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700024752.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700024752.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700024752.pkl
/Users/benkaehler/Data/HMP/results/nude-shotgun/700015250.qza
/Users/benkaehler/Data/HMP/results/abundance_16s/700015250.qza
/Users/benkaehler/Data/HMP/results/nude-shotgun-rel-freqs/700015250.pkl
/Users/benkaehl

### Code graveyard follows

In [15]:
json.dump?

In [17]:
# Show the class weights above a small threshold, one row per taxon.
# `weights` is whatever the last executed cell left bound to that name.
# NOTE(review): 6.8e-11 presumably sits just above the weight given to
# unobserved taxa -- confirm.
wtf = weights.view(pandas.DataFrame).T
wtf.loc[wtf['Weight'] > 6.8e-11]

Unnamed: 0,Weight
Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Odoribacteraceae;Odoribacter;Odoribacter splanchnicus,0.004446
Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Blautia;Blautia stercoris,0.000085
Bacteria;Firmicutes;Erysipelotrichia;Erysipelotrichales;Erysipelotrichaceae;Erysipelatoclostridium;[Clostridium] innocuum,0.000079
Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Papillibacter;Papillibacter cinnamivorans,0.000040
Bacteria;Firmicutes;Clostridia;Clostridiales;Clostridiaceae;Clostridium;Clostridium tepidum,0.000976
Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Lachnoclostridium;[Clostridium] bolteae,0.000326
Bacteria;Firmicutes;Clostridia;Clostridiales;Clostridiales Family XIII. Incertae Sedis;;[Eubacterium] sulci,0.000199
Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Blautia;Blautia luti,0.004951
Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus;Streptococcus lactarius,0.000028
Bacteria;Actinobacteria;Coriobacteriia;Eggerthellales;Eggerthellaceae;Slackia;Slackia isoflavoniconvertens,0.000031


In [36]:
# Train corrections for fold 0 only.
# NOTE(review): `sixteen_ess_weights` is not defined in any cell shown here, so
# this cell fails on a fresh kernel -- restore its definition or drop the cell.
weights_16s = sixteen_ess_weights[0]
weights_shotgun = shotgun_weights[0]
corrections = train_corrections(weights_16s, weights_shotgun)

In [37]:
# `percentile` is not imported by the visible import cell; import it here so
# the cell does not depend on leftover kernel state.
from numpy import percentile

# (percentile, value) pairs summarising the finest-level corrections.
percentile_points = [0, 2, 25, 50, 75, 98, 100]
list(zip(percentile_points,
         percentile(list(corrections[0].values()), percentile_points)))

[(0, 0.069086977965707397),
 (2, 0.086647033353656894),
 (25, 0.93878120443938995),
 (50, 1.4474392098304012),
 (75, 2.2432180683795089),
 (98, 4.2411384409637325),
 (100, 28.291996508914472)]

In [38]:
# Print every taxon whose finest-level correction equals the minimum.
# The minimum is computed once instead of once per taxon.
smallest_correction = min(corrections[0].values())
for taxon, value in corrections[0].items():
    if value == smallest_correction:
        print(taxon)

Bacteria;Firmicutes;Clostridia;Clostridiales;Oscillospiraceae;Oscillibacter;Oscillibacter ruminantium


In [14]:
from glob import glob

# For every file of uniform relative frequencies, load the shotgun file with
# the same name and print the index labels that occur in only one of the two.
# Paths are derived from data_dir instead of a hard-coded absolute location.
for fn in glob(join(data_dir, 'results', 'uniform-rel-freqs', '*')):
    uniform = pandas.read_pickle(fn).T
    shotgun = pandas.read_pickle(
        join(data_dir, 'results', 'shotgun-rel-freqs', os.path.basename(fn))).T
    print(set(shotgun.index).symmetric_difference(set(uniform.index)))

{'Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Rikenellaceae;Alistipes;Alistipes inops', 'Bacteria;Proteobacteria;Alphaproteobacteria;__;__;__;__', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Eubacteriaceae;Eubacterium;Eubacterium uniforme', 'Bacteria;Tenericutes;Mollicutes;Anaeroplasmatales;Anaeroplasmataceae;Anaeroplasma;__', 'Bacteria;Firmicutes;Negativicutes;Acidaminococcales;Acidaminococcaceae;__;__', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Roseburia;Roseburia faecis', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Eubacteriaceae;Eubacterium;Eubacterium oxidoreducens', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Roseburia;__', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Anaerotruncus;Anaerotruncus colihominis', 'Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides rodentium', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Ruminococcus;Ruminococcus champanellensis', 'Bac

In [29]:
# Load the pickled uniform and shotgun relative-frequency tables for 700024509,
# locating them through data_dir rather than a hard-coded absolute path.
uniform = pandas.read_pickle(
    join(data_dir, 'results', 'uniform-rel-freqs', '700024509.pkl'))
shotgun = pandas.read_pickle(
    join(data_dir, 'results', 'shotgun-rel-freqs', '700024509.pkl'))

In [30]:
# Place the uniform and shotgun frequencies side by side (one row per taxon)
# and order the rows by the shotgun frequency.
(
    uniform.transpose()
    .join(shotgun.transpose(), lsuffix='-uniform', rsuffix='-shotgun')
    .sort_values(by='Frequency-shotgun')
)

Unnamed: 0,Frequency-uniform,Frequency-shotgun
Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Lactococcus;Lactococcus lactis,0.00021,0.000236
Bacteria;Firmicutes;Erysipelotrichia;Erysipelotrichales;Erysipelotrichaceae;Turicibacter;Turicibacter sanguinis,0.000336,0.000295
Bacteria;Firmicutes;Clostridia;Clostridiales;Eubacteriaceae;Eubacterium;Eubacterium coprostanoligenes,0.000323,0.000369
Bacteria;Firmicutes;Clostridia;Clostridiales;Oscillospiraceae;Oscillibacter;__,0.000388,0.000392
Bacteria;Firmicutes;Negativicutes;Acidaminococcales;Acidaminococcaceae;Phascolarctobacterium;Phascolarctobacterium faecium,0.000376,0.00043
Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Fusicatenibacter;Fusicatenibacter saccharivorans,0.000339,0.000435
Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Anaerotignum;Anaerotignum aminivorans,0.000406,0.000522
Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Anaerostipes;Anaerostipes hadrus,0.000474,0.000609
Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;;[Clostridium] leptum,0.000574,0.00063
Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Sporobacter;Sporobacter termitidis,0.000574,0.00063


In [15]:
# Load the uniform and shotgun result artifacts for 700015250 as DataFrames,
# locating them through data_dir rather than a hard-coded absolute path.
uniform = Artifact.load(
    join(data_dir, 'results', 'uniform', '700015250.qza')).view(pandas.DataFrame)
shotgun = Artifact.load(
    join(data_dir, 'results', 'shotgun', '700015250.qza')).view(pandas.DataFrame)

In [16]:
# Taxon labels found in the shotgun frame but missing from the uniform frame.
set(shotgun['Taxon']).difference(uniform['Taxon'])

{'Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides xylanolyticus',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Eubacteriaceae;Eubacterium',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Eubacteriaceae;Eubacterium;Eubacterium ruminantium',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Eubacteriaceae;Eubacterium;[Eubacterium] eligens',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Roseburia',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Roseburia;Roseburia intestinalis',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Agathobaculum;Agathobaculum butyriciproducens',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Anaerotruncus',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Ruminococcus',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Subdoligranulum;Subdoligranulum variabile',
 'Bacteria;Firmicutes;Erysipelotrichia;Erysipelotrichales;Erysipelotrich

In [19]:
# Line up the shotgun and uniform taxa for each feature ID.
shotgun.join(
    uniform,
    lsuffix='_shotgun',
    rsuffix='_uniform',
)

Unnamed: 0_level_0,Taxon_shotgun,Taxon_uniform
Feature ID,Unnamed: 1_level_1,Unnamed: 2_level_1
2defbbf4f65550cb4c6714ac95e17c6f,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides vulgatus,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides vulgatus
6d087510fd2f3cbe836ac2fa7335caca,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Rikenellaceae;Alistipes;Alistipes putredinis,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Rikenellaceae;Alistipes;Alistipes putredinis
c8926624eb0ca91bffab42ed0dd75f78,Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Faecalibacterium;Faecalibacterium prausnitzii,Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Faecalibacterium;Faecalibacterium prausnitzii
f9067962f2ce85468feb04b33036664e,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;Prevotella copri,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;Prevotella copri
ba37919f04c1f088c354c1e052a78346,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides vulgatus,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides vulgatus
0d2fbdeb2f74e55d50cbc797da0d3d85,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Tannerellaceae;Parabacteroides;Parabacteroides merdae,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Tannerellaceae;Parabacteroides;Parabacteroides merdae
f210b256c793f8c87a6f4438a314c35c,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides massiliensis,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides massiliensis
b895d93e83532c6f3cc7f168810710cd,Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Faecalibacterium;Faecalibacterium prausnitzii,Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Faecalibacterium;Faecalibacterium prausnitzii
464279c7a6ece6a4068d37f5a969205b,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides uniformis,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides uniformis
7cc89af8ff09833bf3c803ec2b1c4f20,Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;;[Eubacterium] rectale,Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;;[Eubacterium] rectale


In [24]:
# Widen the notebook display: up to 400 characters per column and 200 rows.
# Fully qualified option names are used because abbreviated keys such as
# 'max_rows' can match more than one pandas option and raise an OptionError.
pandas.set_option('display.max_colwidth', 400)
pandas.set_option('display.max_rows', 200)

In [105]:
# Convert the 16S abundances for 700015250 with sixteen_ess_2_shotgun, once
# using the shotgun classification and once using the uniform classification.
# The abundance artifact is the same file for both, so it is loaded only once;
# paths come from data_dir rather than a hard-coded absolute location.
abundances = Artifact.load(
    join(data_dir, 'results', 'abundance_16s', '700015250.qza'))
classification = Artifact.load(
    join(data_dir, 'results', 'shotgun', '700015250.qza'))
shotgun_sg = sixteen_ess_2_shotgun(classification, abundances, corrections).T
# `classification` is left bound to the uniform artifact, as in the original cell.
classification = Artifact.load(
    join(data_dir, 'results', 'uniform', '700015250.qza'))
uniform_sg = sixteen_ess_2_shotgun(classification, abundances, corrections).T

In [87]:
# Index labels present in uniform_sg but absent from shotgun_sg.
set(uniform_sg.index).difference(shotgun_sg.index)

{'Bacteria;Bacteroidetes;__;__;__;__;__',
 'Bacteria;Firmicutes;Bacilli;Lactobacillales;Lactobacillaceae;Lactobacillus;Lactobacillus rogosae',
 'Bacteria;Firmicutes;Bacilli;__;__;__;__',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Clostridiales Family XIII. Incertae Sedis;__;__',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Hungateiclostridiaceae;Hungateiclostridium;__',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Hungateiclostridiaceae;__;__',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Anaerostipes;Anaerostipes butyraticus',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Eisenbergiella;__',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Kineothrix;Kineothrix alysoides',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Lachnoclostridium;__',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Roseburia;Roseburia faecis',
 'Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Roseburia;Roseburia

In [88]:
# Compare the freshly computed *_sg tables with the pickled relative-frequency
# tables for 700015250: print the index labels that only the *_sg table has.
# Paths come from data_dir rather than a hard-coded absolute location.
uniform = pandas.read_pickle(
    join(data_dir, 'results', 'uniform-rel-freqs', '700015250.pkl')).T
print(set(uniform_sg.index) - set(uniform.index))
shotgun = pandas.read_pickle(
    join(data_dir, 'results', 'shotgun-rel-freqs', '700015250.pkl')).T
print(set(shotgun_sg.index) - set(shotgun.index))

set()
{'Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides xylanolyticus', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Eubacteriaceae;Eubacterium;__', 'Bacteria;Proteobacteria;Deltaproteobacteria;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;__', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Roseburia;Roseburia intestinalis', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Agathobaculum;Agathobaculum butyriciproducens', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Eubacteriaceae;Eubacterium;[Eubacterium] eligens', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Roseburia;__', 'Bacteria;Firmicutes;Erysipelotrichia;Erysipelotrichales;Erysipelotrichaceae;__;__', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Anaerotruncus;__', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Ruminococcus;__', 'Bacteria;Firmicutes;Clostridia;Clostridiales;Eubacteriaceae;Eubacterium;Eubacte

In [108]:
# Show the current `classification` artifact as a DataFrame (Feature ID -> Taxon).
classification.view(pandas.DataFrame)

Unnamed: 0_level_0,Taxon
Feature ID,Unnamed: 1_level_1
2defbbf4f65550cb4c6714ac95e17c6f,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides vulgatus
6d087510fd2f3cbe836ac2fa7335caca,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Rikenellaceae;Alistipes;Alistipes putredinis
c8926624eb0ca91bffab42ed0dd75f78,Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Faecalibacterium;Faecalibacterium prausnitzii
f9067962f2ce85468feb04b33036664e,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Prevotellaceae;Prevotella;Prevotella copri
ba37919f04c1f088c354c1e052a78346,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides vulgatus
0d2fbdeb2f74e55d50cbc797da0d3d85,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Tannerellaceae;Parabacteroides;Parabacteroides merdae
f210b256c793f8c87a6f4438a314c35c,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides massiliensis
b895d93e83532c6f3cc7f168810710cd,Bacteria;Firmicutes;Clostridia;Clostridiales;Ruminococcaceae;Faecalibacterium;Faecalibacterium prausnitzii
464279c7a6ece6a4068d37f5a969205b,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides;Bacteroides uniformis
7cc89af8ff09833bf3c803ec2b1c4f20,Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;;[Eubacterium] rectale


### Classify under Animal distal gut weights

In [18]:
# Load the classifier stored as adg-classifier.qza and classify seqs_16s with it.
adg_classifier_path = join(data_dir, 'tmp', 'adg-classifier.qza')
classifier = Artifact.load(adg_classifier_path)
classification = classify_sklearn(
    seqs_16s, classifier, n_jobs=4
).classification

In [19]:
# Pass the classification for test_samples to save_observed, together with the
# results directory and the 'animal-distal-gut' label.
save_observed(
    join(data_dir, 'results'),
    test_samples,
    classification,
    'animal-distal-gut',
)