In [1]:
import pandas as pd

In [2]:
# Read the ECAM taxa table (header=1: the column names sit on the file's
# second line) and key the rows by the 'feature-id' column.
otu_data = pd.read_csv(
    "../data/ecam-table-taxa.tsv", sep='\t', header=1
).set_index('feature-id')

In [3]:
# Read the sample metadata, discard the first data row, and expose the sample
# identifier column under the name 'Sample.ID'.
# NOTE(review): the dropped row is presumably the QIIME 2 "#q2:types"
# directive line -- confirm against the file.
meta_data = (
    pd.read_csv("../data/ecam-sample-metadata.tsv", sep='\t')
    .iloc[1:]
    .rename(columns={'#SampleID': 'Sample.ID'})
)

In [4]:
# ANCOM imports
import numpy as np
from scipy.stats import norm

In [5]:
def _outlier_check(x, out_cut):
    # Fitting the mixture model using the algorithm of Peddada, S. Das, and JT Gene Hwang (2002)
    mu1, mu2 = np.nanquantile(x, (0.25, 0.75))
    sigma1 = mu2 - mu1
    sigma2 = sigma1
    pi = 0.75
    n = len(x)
    epsilon = 100
    tol = 1e-5
    score = pi * norm.pdf(x, mu1, sigma1) / \
        ((1 - pi) * norm.pdf(x, mu2, sigma2))
    while epsilon > tol:
        grp1_ind = score >= 1
        not_grp1_ind = np.logical_not(grp1_ind)
        mu1_new = x[grp1_ind].mean()
        mu2_new = x[not_grp1_ind].mean()
        sigma1_new = x[grp1_ind].std()
        if np.isnan(sigma1_new):
            sigma1_new = 0.
        sigma2_new = x[not_grp1_ind].std()
        if np.isnan(sigma2_new):
            sigma2_new = 0.
        pi_new = sum(grp1_ind) / n
        
        para = [mu1_new, mu2_new, sigma1_new, sigma2_new, pi_new]
        if np.any(np.isnan(para)):
            break
        
        if sigma1_new == 0.:
            pdf1 = np.zeros_like(x)
            pdf1[x == mu1_new] = np.inf
        else:
            pdf1 = norm.pdf(x, mu1_new, sigma1_new)
        if sigma2_new == 0.:
            pdf2 = np.zeros_like(x)
            pdf2[x == mu2_new] = np.inf
        else:
            pdf2 = norm.pdf(x, mu2_new, sigma2_new)
        score = pi_new * pdf1 / ((1 - pi_new) * pdf2)
        
        old = np.array([mu1, mu2, sigma1, sigma2, pi])
        epsilon = np.linalg.norm(old - para)
        mu1, mu2, sigma1, sigma2, pi = para

    if mu1 + 1.96 * sigma1 < mu2 - 1.96 * sigma2:
        if pi < out_cut:
            return pd.Series(grp1_ind)
        elif pi > 1 - out_cut:
            return pd.Series(not_grp1_ind)
    return pd.Series([False]*n)
    

def identify_outliers(feature_table, meta_data, group_var, out_cut):
    """Flag outlying observations of every feature within each sample group.

    Parameters
    ----------
    feature_table : pandas.DataFrame
        Features in rows, samples in columns.
    meta_data : pandas.DataFrame
        Sample metadata. Its rows are paired with the columns of
        ``feature_table`` by position, so both must be in the same order and
        of the same length.
    group_var : str
        Name of the metadata column holding each sample's group.
    out_cut : float
        Mixing-proportion threshold forwarded to ``_outlier_check``.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the per-feature results of running ``_outlier_check``
        on each group of samples.
        NOTE(review): the column order follows what ``groupby(...).apply``
        produces (group by group, samples with a missing group are skipped by
        ``groupby``), which is not necessarily the column order of
        ``feature_table`` -- confirm before using it as a positional mask.
    """
    z = feature_table + 1  # Add pseudo-count (1)
    # TODO(review): is the pseudo-count appropriate if the data are not counts?
    f = z.apply(np.log)
    f[f == 0] = np.nan  # entries whose log is exactly 0 are ignored in the mean
    f = f.mean(axis=0, skipna=True)

    # Copy before re-indexing: assigning ``.index`` directly on
    # ``meta_data[group_var]`` can re-index the column object held by the
    # caller's DataFrame.
    groups = meta_data[group_var].copy()
    groups.index = feature_table.columns

    # Per-sample deviation of the mean log abundance from its group mean;
    # samples with a missing group keep a deviation of 0.
    group_means = f.groupby(groups).mean()
    notna_groups = pd.notna(groups)
    group_means = group_means[groups[notna_groups].values]
    group_means.index = groups[notna_groups].index
    # Float zeros: an integer Series would have to be silently upcast when the
    # float deviations are assigned, which recent pandas versions deprecate.
    e = pd.Series(0.0, index=f.index)
    e[notna_groups] = f[notna_groups] - group_means
    y = z - e  # the Series aligns on columns, i.e. one offset per sample

    def row_outlier_check(row):
        # One feature at a time: run the mixture check inside each group.
        return row.groupby(groups).apply(
            lambda x: _outlier_check(x, out_cut))
    out_ind = y.apply(row_outlier_check, axis=1)
    return np.array(out_ind)


def feature_table_pre_process(
    feature_table, meta_data, sample_var, group_var=None,
    out_cut=0.05, zero_cut=0.9, lib_cut=1000, neg_lb=True
):
    """Align a feature table with its metadata and blank out outliers.

    Parameters
    ----------
    feature_table : pandas.DataFrame
        OTU table with each feature in rows and each sample in columns.
    meta_data : pandas.DataFrame
        Sample metadata containing the sample identifier column.
    sample_var : str
        Name of the metadata column holding the sample identifiers.
    group_var : str, optional
        Name of the metadata column holding the sample groups. When None,
        the outlier step is skipped.
    out_cut : float
        Mixing-proportion threshold forwarded to ``identify_outliers``.
    zero_cut, lib_cut, neg_lb
        Accepted for interface compatibility; not used by the steps
        implemented so far.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The feature table restricted to the samples listed in the metadata
        (in metadata order, with flagged outliers set to NaN) and the
        metadata restricted to the samples present in the feature table.
        The inputs themselves are not modified by this function.
    """
    # The R original drops unused factor levels from the metadata here; that
    # step is not ported (assumed unnecessary for pandas).

    # Match sample IDs between metadata and feature table
    sample_ID = meta_data[sample_var]
    sample_ID = sample_ID[sample_ID.isin(feature_table.columns)]
    # Work on an explicit copy so the in-place masking below cannot touch
    # (or warn about) the caller's frame.
    feature_table = feature_table[sample_ID].copy()
    meta_data = meta_data[meta_data[sample_var].isin(sample_ID)]

    # 1. Identify outliers within each taxon
    if group_var is not None:
        out_ind = identify_outliers(
            feature_table, meta_data, group_var, out_cut)
        feature_table[out_ind] = np.nan

    # Previously the processed objects were computed and then discarded.
    return feature_table, meta_data


In [6]:
# Run the pre-processing on the ECAM table, grouping samples by 'day_of_life'.
feature_table_pre_process(otu_data, meta_data, 'Sample.ID', 'day_of_life')

  x = np.asarray((x - loc)/scale, dtype=dtyp)
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]


In [7]:
def test_identify_outliers():
    """Regression test: compare ``identify_outliers`` with a stored answer.

    Loads the ECAM taxa table and sample metadata, restricts both to the
    samples they share, flags outliers grouped by the 'delivery' column with
    an ``out_cut`` of 0.05, and asserts that every flag equals the matrix
    read from 'outliers.tsv'.
    """
    taxa = pd.read_csv(
        "../data/ecam-table-taxa.tsv", sep='\t', header=1
    ).set_index('feature-id')
    samples = (
        pd.read_csv("../data/ecam-sample-metadata.tsv", sep='\t')
        .iloc[1:]
        .rename(columns={'#SampleID': 'Sample.ID'})
    )

    # Keep only the samples that appear in both inputs.
    all_ids = samples['Sample.ID']
    shared_ids = all_ids[all_ids.isin(taxa.columns)]
    taxa = taxa[shared_ids]
    samples = samples[samples['Sample.ID'].isin(shared_ids)]

    observed = identify_outliers(taxa, samples, 'delivery', 0.05)
    expected = pd.read_csv('outliers.tsv', sep=' ', header=None)
    assert np.all(observed == expected)

In [8]:
# Execute the regression test; a failed comparison raises AssertionError.
test_identify_outliers()

  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]
