In [1]:
import biom
import numpy as np
import pandas as pd
from biom import Table
from scipy.spatial import distance
from scripts.large_pickle import pickle_dump,pickle_load


In [2]:
"""
Load the feature table and the sample metadata,
then filter samples and features by their total counts

filter cutoffs
int, int
"""
n_sample = 0
n_feature = 0

# read the BIOM table and the tab-separated metadata (first column is the index)
biom_path = '../../data/DIABIMMUNE-Qiita-11884/72835_reference-hit.biom'
metadata_path = '../../data/DIABIMMUNE-Qiita-11884/11884_20190508-173103.txt'
bt = biom.load_table(biom_path)
mf = pd.read_csv(metadata_path, sep='\t', index_col=0)

# samples: keep those whose total is at least n_sample
# and that also have a row in the metadata
sample_ids = bt.ids('sample')
sample_total_ok = bt.sum('sample') >= n_sample
shared_samples = set(mf.index) & set(sample_ids[sample_total_ok])
keep_subject = list(shared_samples)
bt.filter(keep_subject, axis='sample', inplace=True)

# features: keep those whose total (after the sample filter) exceeds n_feature
feature_ids = bt.ids('observation')
feature_total_ok = bt.sum('observation') > n_feature
keep_taxa = feature_ids[feature_total_ok]
bt.filter(keep_taxa, axis='observation', inplace=True)

# dense frame: rows are observations, columns are samples
btdf = pd.DataFrame(bt.matrix_data.toarray(),
                    index=bt.ids('observation'),
                    columns=bt.ids('sample'))

# order the metadata rows like the table columns
mf = mf.reindex(btdf.columns)


"""
Bin the collection ages (days) into fixed-width intervals
"""

# width of one bin, in days
size_bin_days = 30
ages = mf.age_at_collection
# bin edges from the youngest to the oldest age at collection
bins = np.arange(min(ages), max(ages), size_bin_days)
# bin id -> [lower edge, upper edge); ids start at 1
bin_map = {}
for i in range(1, len(bins)):
    bin_map[i] = [bins[i - 1], bins[i]]

# assign every sample to a bin id
longbin = []
for day_ in ages.values:
    if day_ >= bins[-1]:
        # at or past the last edge: one extra bin after the mapped ones
        longbin.append(max(bin_map.keys()) + 1)
    else:
        for bin_id, (lower, upper) in bin_map.items():
            if lower <= day_ < upper:
                longbin.append(bin_id)
                break

# a sample that matched no bin leaves the list short
if len(longbin) != len(ages.values):
    raise RuntimeError('Mapping does not match!')
mf['age_at_collection_binned'] = longbin

# collect the ages that fell into each bin
bins_ = {bin_id: [] for bin_id in set(mf.age_at_collection_binned)}
for age_, bin_id in zip(mf.age_at_collection,
                        mf.age_at_collection_binned):
    bins_[bin_id].append(age_)

# youngest/oldest age per bin, expressed in units of size_bin_days and rounded
bin_minmax = {}
for bin_id, bin_ages in bins_.items():
    bin_minmax[bin_id] = [round(min(bin_ages) / size_bin_days),
                          round(max(bin_ages) / size_bin_days)]
# text label "<min>-<max>" for each bin
bin_minmax_labels = {bin_id: '-'.join(str(edge) for edge in edges)
                     for bin_id, edges in bin_minmax.items()}
mf['age_at_collection_binned_month'] = [bin_minmax_labels[bin_id]
                                        for bin_id in mf.age_at_collection_binned]

# labels '1-2', '2-3', ..., '36-37' map to 0, 1, ..., 35;
# any other label raises KeyError below
month_order = {'-'.join((str(m + 1), str(m + 2))): m for m in range(0, 36)}
mf['month'] = [month_order[label]
               for label in mf.age_at_collection_binned_month]

"""
Add antibiotic history
"""

# antibiotic records; the index is matched against mf.subjectid below
anidf = pd.read_csv('../../data/DIABIMMUNE-Qiita-11884/abx.csv',index_col=0)
# drop records without a usable age; .copy() gives an independent frame so the
# 'month' column assignment does not trigger SettingWithCopyWarning
anidf = anidf[~anidf['Age (months)'].isin(['Not_known'])].copy()
anidf['month'] = anidf['Age (months)'].astype(float).astype(int)
# NOTE(review): the age column is filtered on 'Not_known' and the duration
# column on 'Not known' -- confirm both spellings match the file.

# loop-invariant lookups, built once instead of once per sample
has_duration = ~anidf['Duration (days)'].isin(['Not known'])
known_duration = anidf.loc[has_duration, 'Duration (days)']
abx_subjects = set(anidf.index)
duration_subjects = set(known_duration.index)

anti_map = {}
anti_count = {}
for ind_ in mf.subjectid:
    if ind_ in anti_map:
        # subject already handled through an earlier sample
        continue
    if ind_ in abx_subjects:
        anti_map[ind_] = 'Antibiotics'
        if ind_ in duration_subjects:
            # total days over every record with a known duration
            anti_count[ind_] = known_duration.loc[[ind_]].values.astype(int).sum()
        else:
            # every duration for this subject is unknown: count zero known
            # days (previously this case raised KeyError)
            anti_count[ind_] = 0
    else:
        anti_map[ind_] = 'No Antibiotics'
        anti_count[ind_] = 0
mf['number_days_abx'] = [anti_count[ind_] for ind_ in mf.subjectid]
mf['abx_ever'] = [anti_map[ind_] for ind_ in mf.subjectid]

# (subject, month) pairs with a recorded course
# NOTE(review): mf.month is compared directly with the truncated
# 'Age (months)' -- confirm both follow the same month convention.
abx_time = {(k1,k2):True for k1,k2 in zip(anidf.index,anidf.month)}
mf['abx_month'] = ['Antibiotics' if (k1,k2) in abx_time else 'No Antibiotics'
                   for k1,k2 in zip(mf.subjectid,mf.month)]

"""
Write the annotated metadata and the table for downstream use
"""

metadata_out = '../../data/DIABIMMUNE-Qiita-11884/11884_20190508-173103-added-month-abx.txt'
table_out = '../../data/DIABIMMUNE-Qiita-11884/table-added-month-abx.txt'
biom_out = '../../data/DIABIMMUNE-Qiita-11884/DIABIMMUNE_month.biom'

# the index name becomes the header of the first column in the written file
mf.index.name = '#SampleID'
mf.to_csv(metadata_out, sep='\t')
# dense table as tab-separated text
btdf.to_csv(table_out, sep='\t')
# the same table as HDF5 BIOM
with biom.util.biom_open(biom_out, 'w') as biom_fh:
    Table(btdf.values,
          observation_ids=btdf.index,
          sample_ids=btdf.columns).to_hdf5(biom_fh, "DIABbinned")


In [3]:
from collections import Counter

# tally the csection values seen in each month
class_counted = {}
for month_, month_df in mf.groupby('month'):
    class_counted[month_] = Counter(month_df.csection)
# months with fewer than 3 samples where csection is True
time_points_filter = [month_ for month_ in class_counted
                      if class_counted[month_][True] < 3]
# drop every sample from those months and save the reduced metadata
in_kept_month = ~mf.month.isin(time_points_filter)
mf_clean = mf[in_kept_month]
mf_clean.to_csv('../../data/DIABIMMUNE-Qiita-11884/metadata-added-month-abx-lowtime-rm.txt',sep='\t')


In [4]:
from collections import Counter

# sanity check: reload the written metadata and tally csection per subject
mf_chk = pd.read_csv('../../data/DIABIMMUNE-Qiita-11884/11884_20190508-173103-added-month-abx.txt', sep='\t')
# a bare `mf_chk.shape` in the middle of a cell is never displayed, so print it
print(mf_chk.shape)

csection_by_subject = mf_chk.groupby('host_subject_id').csection
# the tally below takes one value per subject, so every subject must carry
# exactly one csection value; fail loudly instead of picking an arbitrary one
values_per_subject = csection_by_subject.nunique(dropna=False)
assert (values_per_subject == 1).all(), (
    'subjects with inconsistent csection values: %s'
    % list(values_per_subject[values_per_subject != 1].index))
Counter(csection_by_subject.first().tolist())


Counter({False: 36, True: 4})