In [1]:
import pandas as pd
from biom import Table
from biom.util import biom_open
import numpy as np
from sklearn.model_selection import train_test_split
np.random.seed(42)
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [9]:
# import metadata
metadata = pd.read_csv('../network/metadata_v0.csv', index_col=0)
tables = {}

# import/match metabolomics
metabolomics = pd.read_csv('../network/GNPS-raw-metabolomics-quant-table.csv', index_col=0)
metabolomics = metabolomics.drop(['row m/z', 'row retention time'], axis=1)
map_names = pd.read_csv('../network/metabolite_sample_names.txt', sep='\t', index_col=0).orig_sample_name.to_dict()
metabolomics.columns = [c.replace(' Peak area','') for c in metabolomics.columns]
metabolomics.index = ['metab_' + str(i) for i in metabolomics.index]
metabolomics = metabolomics[set(metabolomics.columns) & set(map_names.keys())]
metabolomics.columns = [map_names[c] for c in metabolomics.columns]
tables['metabolomics'] = metabolomics.copy()

# import the rest
all_tables = {'mag':'../network/MAG_bact_table.tsv',
            '18S':'../network/18S_ASV_table.tsv',
            '16S':'../network/16S_ASV_table.tsv',
            'gene':'../network/MAG_bact_genes_table.tsv',
            'gene_module':'../network/MAG_bact_gene_modules_table.tsv'}
for tbl_id, tbl_path in all_tables.items():
    tables[tbl_id] = pd.read_csv(tbl_path, sep='\t', index_col=0)
    
# match and filter the tables
def filter_table(df, use_ids):
    df = df[use_ids]
    df = df[df.sum(1) > 0]
    return df

all_shared_samples = set(metadata.index) & set.intersection(*[set(t.columns) for t in tables.values()])
tables = {tblid:filter_table(tbl, all_shared_samples) for tblid, tbl in tables.items()}
metadata = metadata.reindex(all_shared_samples)

# subset the tables to last active time point
#metadata_subset = pd.concat([df[df.add_0c_group == 'active'].sort_values('timepoint').iloc[[-1], :]
#                             for _, df in metadata.groupby('subjects') if 'active' in list(df.add_0c_group)])

metadata_subset = pd.concat([df
                             for _, df in metadata.groupby('subjects') if len(set(df.add_0c_group)) == 4])

tables_subset = {tblid:filter_table(tbl.copy(), metadata_subset.index) for tblid, tbl in tables.items()}
tables_subset = {tblid:Table(tbl.values, tbl.index, tbl.columns) for tblid, tbl in tables_subset.items()}

# train-tests
for i_ in range(10):
    train, test = train_test_split(metadata_subset, test_size=0.25, shuffle=True, random_state=42,
                                   stratify=metadata_subset[['facility']])
    metadata_subset.loc[:, 'traintest_%i' % i_] = 'train'
    metadata_subset.loc[test.index, 'traintest_%i' % i_] = 'test'
    
# save data 
metadata_subset.to_csv('../network/split-matched-data/metadata.tsv', sep='\t')
for tblid, tbl in tables_subset.items():
   with biom_open('../network/split-matched-data/%s.biom' % (tblid), 'w') as f:
       tbl.to_hdf5(f, "filtered-table-cm")

tables_subset

  metabolomics = metabolomics[set(metabolomics.columns) & set(map_names.keys())]
  df = df[use_ids]
  df = df[use_ids]
  df = df[use_ids]
  df = df[use_ids]
  df = df[use_ids]
  df = df[use_ids]


{'metabolomics': 2333 x 374 <class 'biom.table.Table'> with 104825 nonzero entries (12% dense),
 'mag': 257 x 374 <class 'biom.table.Table'> with 58498 nonzero entries (60% dense),
 '18S': 5473 x 374 <class 'biom.table.Table'> with 114755 nonzero entries (5% dense),
 '16S': 14237 x 374 <class 'biom.table.Table'> with 318875 nonzero entries (5% dense),
 'gene': 2457 x 374 <class 'biom.table.Table'> with 901890 nonzero entries (98% dense),
 'gene_module': 377 x 374 <class 'biom.table.Table'> with 117093 nonzero entries (83% dense)}

In [18]:
display(metabolomics.head())
print(metabolomics.shape)
display(metadata.head())
print(metadata.shape)
display(metadata_subset.head())
print(metadata_subset.shape)

Unnamed: 0,skin.hip.SHSU.2016.007.2016.04.19.day5,soil.hip.UTK.K016.12.2017.06.06.day13,skin.hip.SHSU.2016.076.2016.12.08.day17,soil.hip.CMU.16.05.2016.mo.day12,skin.hip.UTK.K016.12.2017.06.05.day12,skin.hip.UTK.K016.04.2016.10.02.day7,soil.hip.CMU.16.15.2016.12.day17,NaN,skin.hip.CMU.16.04.2016.mo.day16,skin.hip.CMU.16.04.2016.mo.day3,...,soil.hip.CMU.17.12.2017.10.27.day18,soil.hip.SHSU.2016.067.2016.09.29.day14,soil.hip.CMU.17.04.2017.03.day20,NaN.1,skin.hip.CMU16.15.2017.01.day21,skin.hip.UTK.K016.01.2016.04.10.day6,skin.hip.SHSU.2017.045.2017.12.23.day12,soil.hip.CMU.16.09.2016.07.30.day5,skin.hip.UTK.K016.01.2016.04.20.day16,soil.hip.SHSU.2016.064.2016.09.12.day19
metab_1,158871.795,10473.0375,179298.041,112037.2305,198376.116,74712.4035,66960.57,0.0,150675.174,274379.8865,...,35707.362,58849.1425,38239.685,116851.8925,151473.05,201226.74,349938.014,81220.396,217498.24,103126.119
metab_2,212073.4985,26614.984,211718.8375,150607.518,188884.356,225981.664,92532.0855,111684.386,259870.27,262189.7795,...,78978.7975,94656.5595,49992.5045,326404.9625,254827.1485,225625.574,206177.979,243058.195,217967.1735,134703.93
metab_3,72460.643,5657.9625,39945.163,27398.4035,50884.95,53861.367,19854.838,12014.1995,55685.133,68002.8485,...,11179.51,12259.56,12023.409,58299.4885,57950.2125,68184.78,82608.1645,29782.033,92682.2475,19853.5145
metab_4,32689.5045,6472.35,45096.044,37877.827,46728.022,16925.5065,28319.072,12958.6715,43710.732,62148.215,...,16777.384,28053.0825,17137.031,27312.7605,41674.4075,38273.745,87354.0015,28087.006,40261.725,34571.224
metab_5,29679.176,4428.1875,42852.068,24626.043,39605.188,14625.0615,16425.796,0.0,32397.3,36852.395,...,10184.756,17671.2275,10952.74,22897.659,31313.9375,33606.645,73460.597,18406.4715,36988.16,22749.0095


(4284, 1706)


Unnamed: 0_level_0,season,subjects,add_0c,add_0c_group,facility,timepoint
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
soil.hip.UTK.K016.08.2017.Jan.27.day21,winter,K016.08,215.9,advanced,ARF,21
soil.hip.CMU.17.12.2017.10.19.day10,fall,17-12,107.85,active,FIRS,10
soil.hip.CMU.16.09.2016.08.05.day11,summer,Mrs. 16-09,269.7,advanced,FIRS,11
soil.hip.CMU.17.12.2017.10.16.day7,fall,17-12,68.95,active,FIRS,7
soil.hip.SHSU.2016.011.2016.04.27.day13,spring,shsu.2016.011,271.111111,advanced,STAFS,13


(529, 6)


Unnamed: 0_level_0,season,subjects,add_0c,add_0c_group,facility,timepoint,traintest_0,traintest_1,traintest_2,traintest_3,traintest_4,traintest_5,traintest_6,traintest_7,traintest_8,traintest_9
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
soil.hip.CMU.17.10.2017.08.11.day3,summer,17-10,49.75,early,FIRS,3,train,train,train,train,train,train,train,train,train,train
soil.hip.CMU.17.10.2017.08.22.day14,summer,17-10,322.8,advanced,FIRS,14,test,test,test,test,test,test,test,test,test,test
soil.hip.CMU.17.10.2017.08.28.day20,summer,17-10,477.8,advanced,FIRS,20,test,test,test,test,test,test,test,test,test,test
soil.hip.CMU.17.10.2017.08.09.day1,summer,17-10,0.0,initial,FIRS,1,train,train,train,train,train,train,train,train,train,train
soil.hip.CMU.17.10.2017.08.23.day15,summer,17-10,348.05,advanced,FIRS,15,train,train,train,train,train,train,train,train,train,train


(374, 16)


In [13]:
print("# unique subjects:",len(metadata.subjects.unique()))
print(metadata.season.value_counts())
print(metadata.facility.value_counts())
#print(metadata.timepoint.value_counts())

# unique subjects: 36
spring    147
winter    130
summer    130
fall      122
Name: season, dtype: int64
ARF      211
STAFS    168
FIRS     150
Name: facility, dtype: int64
