# Benchmarks for the first million reads in mock-6

In [42]:
from __future__ import division
import csv
from pandas import Series
import numpy
from collections import Counter

In [65]:
def read_classifications(filename):
    """Return the taxonomy strings stored in a tab-delimited assignment file.

    The second column of every row is collected, in file order, with all
    space characters removed.  Rows whose second column is exactly
    ``'taxonomy'`` (a header line) are left out.

    Parameters
    ----------
    filename : str
        Path of the tab-separated file to read.

    Returns
    -------
    list of str
        One entry per retained row; duplicates are kept.
    """
    with open(filename) as handle:
        return [
            record[1].replace(' ', '')
            for record in csv.reader(handle, delimiter='\t')
            if record[1] != 'taxonomy'
        ]

def read_mock(filename):
    """Return the distinct taxa listed in a tab-delimited expected-taxonomy file.

    The first column of every row is collected unchanged.  Rows whose first
    column is exactly ``'#Taxonomy'`` (the header line) are left out.

    Parameters
    ----------
    filename : str
        Path of the tab-separated file to read.

    Returns
    -------
    set of str
        The unique first-column values.
    """
    with open(filename) as handle:
        return {
            record[0]
            for record in csv.reader(handle, delimiter='\t')
            if record[0] != '#Taxonomy'
        }

def get_stats(mock, results):
    """Score the set of observed taxa against the set of expected taxa.

    Both inputs are reduced to sets, so only the presence or absence of a
    taxon matters; how many times it occurs in ``results`` is ignored.

    Parameters
    ----------
    mock : iterable of str
        Taxa expected to be present.
    results : iterable of str
        Taxa reported by a classifier (duplicates allowed).

    Returns
    -------
    dict
        ``'precision'`` (TP / (TP + FP)), ``'recall'`` (TP / (TP + FN)) and
        ``'f'`` (harmonic mean of the two).  Any ratio whose denominator is
        zero -- empty input, or no taxon in common -- is reported as 0.0.
    """
    expected = set(mock)
    observed = set(results)

    true_pos = len(expected & observed)
    false_pos = len(observed - expected)
    false_neg = len(expected - observed)

    def ratio(numerator, denominator):
        # float() keeps this a true division regardless of whether
        # ``from __future__ import division`` is in effect.
        return float(numerator) / denominator if denominator else 0.0

    precision = ratio(true_pos, true_pos + false_pos)
    recall = ratio(true_pos, true_pos + false_neg)
    return {'precision': precision,
            'recall': recall,
            'f': ratio(2 * precision * recall, precision + recall)}

def get_correlation(mock, results):
    """Compare per-taxon observation counts with a uniform expectation.

    Every taxon in ``mock`` is given an expected weight of 1 and an observed
    weight equal to the number of times it occurs in ``results``.  The value
    returned is the dot product of those two vectors divided by the product
    of their Euclidean norms, i.e. their cosine similarity.  Entries of
    ``results`` that are not in ``mock`` do not contribute.

    Parameters
    ----------
    mock : collection of str
        Expected taxa; must support ``len`` and iteration.
    results : iterable of str
        Taxa reported by a classifier, one entry per observation.

    Returns
    -------
    numpy float
        The similarity between the uniform and the observed vectors.
    """
    tallies = Counter(results)
    observed = numpy.array([tallies[taxon] for taxon in mock])
    expected = numpy.ones(len(mock))

    numerator = expected.dot(observed)
    denominator = numpy.sqrt(expected.dot(expected) * observed.dot(observed))
    return numerator / denominator

In [37]:
# Expected taxa for mock-6, then the taxonomy assignments written by each of
# the two classifiers being compared.
mock = read_mock(
    '../data/mockrobiota/data/mock-6/greengenes/13_8/expected-taxonomy.tsv'
)
rdp = read_classifications(
    '../processed/mock-6/rdp_noc/mock_6_fewer_reads_tax_assignments.txt'
)
fc_nb = read_classifications(
    '../processed/mock-6/mock_6_fewer_classification/data/taxonomy.tsv'
)

#### q2-feature-classifier naive Bayes

In [66]:
# Precision/recall/F from get_stats plus the 'rho' value from
# get_correlation for the fc_nb assignments, shown as a one-column table.
stats = dict(get_stats(mock, fc_nb), rho=get_correlation(mock, fc_nb))
stats = Series(stats, name='statistic')
stats.to_frame()

Unnamed: 0,statistic
f,0.027817
precision,0.014125
recall,0.906977
rho,0.312815


#### RDP Classifier

In [67]:
# Precision/recall/F from get_stats plus the 'rho' value from
# get_correlation for the rdp assignments, shown as a one-column table.
stats = dict(get_stats(mock, rdp), rho=get_correlation(mock, rdp))
stats = Series(stats, name='statistic')
stats.to_frame()

Unnamed: 0,statistic
f,0.038787
precision,0.019817
recall,0.906977
rho,0.259367
