In [2]:
from collections import Counter

import biom
import numpy as np
import pandas as pd
import qiime2 as q2
from biom import load_table, Table
from scipy.stats import norm
pd.options.mode.chained_assignment = None  # default='warn'


In [4]:
def match_data(bt, mf, n_sample = 0, n_feature = 0):

    """
    Match a BIOM table to a metadata frame and filter by total counts.

    The input table is left untouched: filtering is done on a copy, so
    calling this function again on the same table (for example when a
    notebook cell is re-run) always starts from the same data.

    Parameters
    ----------
    bt : biom.Table
        Feature table (observations x samples).
    mf : pandas.DataFrame
        Sample metadata indexed by sample id.
    n_sample : int, optional
        Keep samples whose total count is >= n_sample.
    n_feature : int, optional
        Keep observations whose total count over the retained samples
        is strictly > n_feature (the default of 0 drops observations
        that sum to zero).

    Returns
    -------
    bt : biom.Table
        New, filtered table.
    btdf : pandas.DataFrame
        Dense version of the filtered table with observations as rows
        and samples as columns.
    mf : pandas.DataFrame
        Metadata reindexed to the columns of ``btdf``.
    """

    # samples present in both the metadata and the table
    shared = set(mf.index) & set(bt.ids('sample'))
    # inplace=False returns a filtered copy, so the caller's table is not
    # modified; the in-place filters below only touch that copy
    bt = bt.filter(list(shared), axis='sample', inplace=False)
    # drop samples below the total-count cutoff (every remaining sample is
    # already in the metadata, so no second intersection is needed)
    deep_enough = bt.ids('sample')[bt.sum('sample') >= n_sample]
    bt.filter(list(deep_enough), axis='sample', inplace=True)
    # drop observations at or below the cutoff; sums are taken over the
    # samples that survived the steps above
    keep_taxa = bt.ids('observation')[bt.sum('observation') > n_feature]
    bt.filter(keep_taxa, axis='observation', inplace=True)
    # dense observations-by-samples frame
    btdf = pd.DataFrame(bt.matrix_data.toarray(),
                        index=bt.ids('observation'),
                        columns=bt.ids('sample'))
    # put the metadata rows in the same order as the table's samples
    mf = mf.reindex(btdf.columns)

    return  bt, btdf, mf


# read the BIOM feature table
bt = load_table('../../data/Halfvarson-IBD-Qiita-1629/reference-hit.biom')
# read the tab-separated sample metadata; the first column becomes the index
mf = pd.read_csv(
    '../../data/Halfvarson-IBD-Qiita-1629/1629_20180101-113841.txt',
    sep='\t',
    index_col=0,
)
# report the dimensions of both inputs, one per line
print(bt.shape, mf.shape, sep='\n')

(7589, 681)
(683, 38)


In [5]:
# keep only Crohns and Control; .copy() gives an independent frame so the
# column assignments below do not write into a slice of the full metadata
mf = mf.loc[mf['ibd_subtype'].isin(['HC', 'CCD'])].copy()
# ensure timepoints are numeric (values that cannot be parsed become NaN)
mf['timepoint'] = pd.to_numeric(mf['timepoint'], errors='coerce')
# rename for readability
mf['IBD'] = mf['ibd_subtype'].replace({'CCD': 'Crohns',
                                       'HC': 'Control'})
mf.shape

(156, 39)

In [6]:
# very few samples have timepoints 9, 10
print('before')
print(mf['timepoint'].value_counts())
# so keep only the rows whose timepoint is below 9
mf = mf.loc[mf['timepoint'].lt(9)]
print('after')
print(mf['timepoint'].value_counts())


before
1    27
3    24
2    24
4    22
5    20
6    15
7    13
8     9
9     2
Name: timepoint, dtype: int64
after
1    27
3    24
2    24
4    22
5    20
6    15
7    13
8     9
Name: timepoint, dtype: int64


In [7]:
# next, keep only subjects that have at least three distinct timepoints
print(mf.shape)
# count distinct timepoints per subject and keep the ids with more than two
keep_sub = (mf.groupby('host_subject_id')['timepoint']
              .nunique()
              .loc[lambda counts: counts > 2]
              .index.tolist())
mf = mf[mf['host_subject_id'].isin(keep_sub)]
print(len(keep_sub))
print(mf.shape)

(154, 39)
23
(148, 39)


In [8]:
# align the table with the cleaned metadata, keeping samples whose
# total count is at least 500
bt, btdf, mf = match_data(bt, mf, n_sample=500)
# show the dimensions of the table, its dense frame and the metadata
print(bt.shape, btdf.shape, mf.shape, sep='\n')


(3210, 134)
(3210, 134)
(134, 39)


In [9]:
# write mapping file to use for downstream
# name the index so the first header field of the metadata TSV is '#SampleID'
# NOTE(review): mf comes from mf.reindex(btdf.columns) in match_data, so its
# index may be the same object as btdf.columns; naming it here could also
# name btdf's columns -- confirm this does not matter for the table TSV
mf.index.name = '#SampleID'
# tab-separated copies of the matched metadata and the dense table
mf.to_csv('../../data/Halfvarson-IBD-Qiita-1629/metadata-matched.tsv',sep='\t')
btdf.to_csv('../../data/Halfvarson-IBD-Qiita-1629/table-matched.tsv',sep='\t')
# the filtered table itself, written through biom_open with to_hdf5;
# "ibdlongmatch" is passed as the second argument of to_hdf5
with biom.util.biom_open('../../data/Halfvarson-IBD-Qiita-1629/table-matched.biom', 'w') as f:
    bt.to_hdf5(f, "ibdlongmatch")


In [10]:
# number of matched samples in each IBD group
mf['IBD'].value_counts()

Crohns     76
Control    58
Name: IBD, dtype: int64

In [11]:
# number of distinct subjects left after matching
mf['host_subject_id'].nunique()

23

In [12]:
# number of subjects per IBD group: take one IBD label per subject (the
# first element of the set of that subject's labels) and tally the labels
# (Counter is imported with the other imports at the top of the notebook)
Counter(next(iter(set(subject_rows['IBD'])))
        for _, subject_rows in mf.groupby('host_subject_id'))

Counter({'Crohns': 14, 'Control': 9})