#### This notebook applies Discrete-FDR (DS-FDR) to identify significant OTUs (reference: Jiang et al., mSystems, 2017)

In [1]:
import numpy as np
import pandas as pd
from biom import load_table
from gneiss.util import match
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from dsfdr import dsfdr
from dsfdr import simulation
from dsfdr import statistics
from dsfdr import transform

In [3]:
# Seed NumPy's global random number generator so that anything drawing from it
# later in the notebook is repeatable across Restart & Run All.
# NOTE(review): presumably the DS-FDR permutations use this global generator — confirm.
SEED = 2018
np.random.seed(SEED)

## Prepare biom table and mapping file

### load biom table

In [4]:
def convert_biom_to_pandas(table):
    """Return the contents of a BIOM-style table as a dense pandas DataFrame.

    Parameters
    ----------
    table : object
        Any object exposing ``matrix_data`` (with a ``todense()`` method) and
        ``ids(axis=...)`` for the ``'sample'`` and ``'observation'`` axes.

    Returns
    -------
    pd.DataFrame
        The densified matrix, transposed, with the sample IDs as the row
        index and the observation IDs as the column labels.
    """
    sample_ids = table.ids(axis='sample')
    observation_ids = table.ids(axis='observation')
    # Densify, then transpose so that rows line up with the sample IDs.
    dense_counts = np.array(table.matrix_data.todense()).T
    return pd.DataFrame(dense_counts, index=sample_ids, columns=observation_ids)

In [5]:
# Load the BIOM file and turn it into a DataFrame via convert_biom_to_pandas.
biom_path = './data/haddad_6weeks_deblur_otus_rare2k_matched.biom'
table = load_table(biom_path)
otu_table = convert_biom_to_pandas(table)

In [7]:
# Swap rows and columns of the OTU table.
# NOTE(review): this rebinds `otu_table` to its own transpose, so running the cell
# twice flips the orientation back. The shape recorded below, (182, 735), is the
# orientation the later cells use; re-run the loading cell first if in doubt.
otu_table = otu_table.transpose()

In [8]:
otu_table.shape

(182, 735)

### load mapping file

In [9]:
# Read the tab-separated sample metadata; the first row holds the column names
# and the first column becomes the row index.
mapping = pd.read_csv("./data/haddad_6weeks_metadata_matched.txt",
                      sep='\t', header=0, index_col=0)

In [10]:
mapping.shape

(182, 69)

In [11]:
mapping.head()

Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,center_name,experiment_design_description,extraction_robot,extractionkit_lot,instrument_model,library_construction_protocol,linker,mastermix_lot,...,physical_specimen_location,physical_specimen_remaining,sample_type,scientific_name,sex,title,weekly_cage_food_consumption,weight,weight_units,Description
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10422.17.F.10,GTTGTTCTGGGA,GTGTGCCAGCMGCCGCGGTAA,UCSDMI,Mouse cohort exposed to apnea and controls to ...,HOWE_KF1,PM16B24,Illumina HiSeq 2500,"EMP 16S V4 protocol 515fbc, 806r",GT,14663,...,UCSD LBR -80 freezer,True,stool,mouse gut metagenome,male,OSA,Missing: Not provided,25.6,g,feces mouse 17 collection 10 of 13
10422.17.F.11,TGTGCTTGTAGG,GTGTGCCAGCMGCCGCGGTAA,UCSDMI,Mouse cohort exposed to apnea and controls to ...,HOWE_KF2,PM16B24,Illumina HiSeq 2500,"EMP 16S V4 protocol 515fbc, 806r",GT,14663,...,UCSD LBR -80 freezer,True,stool,mouse gut metagenome,male,OSA,71.8,25.2,g,feces mouse 17 collection 11 of 13
10422.17.F.12,AGAATCCACCAC,GTGTGCCAGCMGCCGCGGTAA,UCSDMI,Mouse cohort exposed to apnea and controls to ...,HOWE_KF1,PM16B24,Illumina HiSeq 2500,"EMP 16S V4 protocol 515fbc, 806r",GT,14663,...,UCSD LBR -80 freezer,True,stool,mouse gut metagenome,male,OSA,Missing: Not provided,25.7,g,feces mouse 17 collection 12 of 13
10422.17.F.13,CTGTAAAGGTTG,GTGTGCCAGCMGCCGCGGTAA,UCSDMI,Mouse cohort exposed to apnea and controls to ...,HOWE_KF2,PM16B24,Illumina HiSeq 2500,"EMP 16S V4 protocol 515fbc, 806r",GT,14663,...,UCSD LBR -80 freezer,True,stool,mouse gut metagenome,male,OSA,71.7,26.3,g,final feces mouse 17 collection 13 of 13
10422.17.F.3,CTCCCGAGCTCC,GTGTGCCAGCMGCCGCGGTAA,UCSDMI,Mouse cohort exposed to apnea and controls to ...,HOWE_KF2,PM16B24,Illumina HiSeq 2500,"EMP 16S V4 protocol 515fbc, 806r",GT,14663,...,UCSD LBR -80 freezer,True,stool,mouse gut metagenome,male,OSA,105.7,24.9,g,feces mouse 17 collection 3 of 13


In [12]:
# Number of samples in each exposure group.
mapping['exposure_type'].value_counts()

IHH    92
Air    90
Name: exposure_type, dtype: int64

### match mapping file and biom table

In [13]:
# Restrict both frames to the samples they have in common (presumably matched on
# the row index — see the gneiss documentation).
# NOTE(review): gneiss documents this helper as match(table, metadata); here the
# metadata is passed first and the result is unpacked in that same order — confirm
# that this is intended.
mapping, otu_table = match(mapping, otu_table)

# `labels` is later derived from `mapping` and `dat` from `otu_table`, and the two
# are paired purely by position, so fail fast if the rows are not in the same order.
assert mapping.index.equals(otu_table.index), "mapping and otu_table rows are not aligned"

In [14]:
# Show both shapes, one per line, to confirm the row counts agree.
print(mapping.shape, otu_table.shape, sep='\n')

(182, 69)
(182, 735)


In [15]:
# Encode the exposure groups as integers: 1 where exposure_type is 'IHH', 0 otherwise.
is_ihh = mapping['exposure_type'].eq('IHH')
labels = is_ihh.astype(int).values

In [16]:
labels

array([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0])

In [17]:
# Dense array copy of the OTU table, flipped so that each row is an OTU
# and each column is a sample.
dat = np.array(otu_table).T

In [18]:
dat.shape

(735, 182)

## Apply DS-FDR

In [19]:
# Run DS-FDR on the OTU-by-sample matrix against the 0/1 exposure labels.
# The cells below index `result` positionally: result[0] holds the per-OTU
# rejection flags, result[1] the test statistics and result[2] the p-values.
dsfdr_options = dict(
    transform_type='rankdata',
    method='meandiff',
    alpha=0.01,
    numperm=10000,
    fdr_method='dsfdr',
)
result = dsfdr.dsfdr(dat, labels, **dsfdr_options)

In [20]:
# Per-OTU rejection flags; their sum is the total number of significant hypotheses.
rej = result[0]
np.sum(rej)

87

## Output result

In [21]:
# Collect the p-value and test statistic of every OTU that was rejected.
# A boolean mask replaces the element-by-element loop and its `== True` comparison;
# the results are kept as plain lists, in the original OTU order.
significant = np.asarray(result[0], dtype=bool)
pvals = np.asarray(result[2])[significant].tolist()
teststat = np.asarray(result[1])[significant].tolist()

In [22]:
# Wrap the rejection flags in a named Series; the next cell uses its `.values`
# as a row mask.
s = pd.Series(data=rej, name='bools')

In [23]:
# Keep only the rows (OTUs) of the transposed table whose rejection flag is set.
out = otu_table.T.loc[s.values]

In [24]:
out.shape

(87, 182)

In [25]:
# Append the per-OTU p-value and test statistic as two extra columns.
out = out.assign(pvalue=pvals, test_statistic=teststat)

In [26]:
# Write the selected OTUs, including the two statistic columns, as a tab-separated file.
output_path = './data/dblr_dsfdr.txt'
out.to_csv(output_path, sep='\t')