In [1]:
import pandas as pd
from biom import Table
from biom.util import biom_open
import numpy as np
from sklearn.model_selection import train_test_split
np.random.seed(42)
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# import metadata
metadata = pd.read_csv('../network/metadata_v0.csv', index_col=0)
tables = {}

# import/match metabolomics
metabolomics = pd.read_csv('../network/GNPS-raw-metabolomics-quant-table.csv', index_col=0)
metabolomics = metabolomics.drop(['row m/z', 'row retention time'], axis=1)
map_names = pd.read_csv('../network/metabolite_sample_names.txt', sep='\t', index_col=0).orig_sample_name.to_dict()
metabolomics.columns = [c.replace(' Peak area','') for c in metabolomics.columns]
metabolomics.index = ['metab_' + str(i) for i in metabolomics.index]
metabolomics = metabolomics[list(set(metabolomics.columns) & set(map_names.keys()))]
metabolomics.columns = [map_names[c] for c in metabolomics.columns]
tables['metabolomics'] = metabolomics.copy()

# import the rest
all_tables = {'mag':'../network/MAG_bact_table.tsv',
            '18S':'../network/18S_ASV_table.tsv',
            '16S':'../network/16S_ASV_table.tsv',
            'gene':'../network/MAG_bact_genes_table.tsv',
            'gene_module':'../network/MAG_bact_gene_modules_table.tsv'}
for tbl_id, tbl_path in all_tables.items():
    tables[tbl_id] = pd.read_csv(tbl_path, sep='\t', index_col=0)
    
# match and filter the tables
def filter_table(df, use_ids):
    df = df[use_ids]
    df = df[df.sum(1) > 0]
    return df

all_shared_samples = set(metadata.index) & set.intersection(*[set(t.columns) for t in tables.values()])
tables = {tblid:filter_table(tbl, list(all_shared_samples)) for tblid, tbl in tables.items()}
metadata = metadata.reindex(all_shared_samples)

# subset the tables to last active time point
#metadata_subset = pd.concat([df[df.add_0c_group == 'active'].sort_values('timepoint').iloc[[-1], :]
#                             for _, df in metadata.groupby('subjects') if 'active' in list(df.add_0c_group)])

metadata_subset = pd.concat([df
                             for _, df in metadata.groupby('subjects') if len(set(df.add_0c_group)) == 4])

tables_subset = {tblid:filter_table(tbl.copy(), metadata_subset.index) for tblid, tbl in tables.items()}
tables_subset = {tblid:Table(tbl.values, tbl.index, tbl.columns) for tblid, tbl in tables_subset.items()}

# train-tests
for i_ in range(10):
    train, test = train_test_split(metadata_subset, test_size=0.25, shuffle=True, random_state=42,
                                   stratify=metadata_subset[['facility']])
    metadata_subset.loc[:, 'traintest_%i' % i_] = 'train'
    metadata_subset.loc[test.index, 'traintest_%i' % i_] = 'test'
    
# save data 
# metadata_subset.to_csv('../network/split-matched-data/metadata.tsv', sep='\t')
# for tblid, tbl in tables_subset.items():
#    with biom_open('../network/split-matched-data/%s.biom' % (tblid), 'w') as f:
#        tbl.to_hdf5(f, "filtered-table-cm")

tables_subset

{'metabolomics': 2333 x 374 <class 'biom.table.Table'> with 104825 nonzero entries (12% dense),
 'mag': 257 x 374 <class 'biom.table.Table'> with 58498 nonzero entries (60% dense),
 '18S': 5473 x 374 <class 'biom.table.Table'> with 114755 nonzero entries (5% dense),
 '16S': 14237 x 374 <class 'biom.table.Table'> with 318875 nonzero entries (5% dense),
 'gene': 2457 x 374 <class 'biom.table.Table'> with 901890 nonzero entries (98% dense),
 'gene_module': 377 x 374 <class 'biom.table.Table'> with 117093 nonzero entries (83% dense)}

In [4]:
for table_key in tables_subset.keys():
    print(table_key)
    table_df = tables_subset[table_key].to_dataframe()
    display(table_df.head())

metabolomics


Unnamed: 0,soil.hip.CMU.17.10.2017.08.19.day11,soil.hip.CMU.17.10.2017.08.23.day15,soil.hip.CMU.17.10.2017.08.17.day9,soil.hip.CMU.17.10.2017.08.26.day18,soil.hip.CMU.17.10.2017.08.18.day10,soil.hip.CMU.17.10.2017.08.14.day6,soil.hip.CMU.17.10.2017.08.25.day17,soil.hip.CMU.17.10.2017.08.11.day3,soil.hip.CMU.17.10.2017.08.10.day2,soil.hip.CMU.17.10.2017.08.16.day8,...,soil.hip.SHSU.2016.024.2016.04.29.day15,soil.hip.SHSU.2016.024.2016.04.27.day13,soil.hip.SHSU.2016.024.2016.04.18.day4,soil.hip.SHSU.2016.024.2016.04.30.day16,soil.hip.SHSU.2016.024.2016.04.21.day7,soil.hip.SHSU.2016.024.2016.04.25.day11,soil.hip.SHSU.2016.024.2016.04.23.day9,soil.hip.SHSU.2016.024.2016.05.04.day20,soil.hip.SHSU.2016.024.2016.04.20.day6,soil.hip.SHSU.2016.024.2016.04.19.day5
metab_1,119439.3585,115188.493,108044.079,0.0,0.0,72074.2785,95474.148,0.0,81871.996,40967.6795,...,0.0,47893.47,95399.895,62749.815,155731.597,68292.317,170358.896,0.0,129596.6595,41355.096
metab_2,121762.4375,84045.9385,138094.612,84238.526,66828.6075,141611.761,154412.157,85873.9075,107084.891,110861.7365,...,52621.4895,58239.162,217757.7515,141819.469,222589.8975,87977.103,128770.079,142298.562,178770.436,67319.016
metab_3,28370.7765,25681.555,29390.328,0.0,11919.6,22562.652,26922.9765,0.0,18704.764,13365.6975,...,0.0,14602.656,37552.53,20449.5215,66233.99,13945.1885,36985.492,0.0,32280.801,8171.624
metab_4,34302.1965,47238.544,33677.127,0.0,0.0,26490.996,34538.1155,0.0,25694.87,17938.7485,...,0.0,34256.496,26844.285,22647.5615,41570.893,34079.2225,55959.49,0.0,38722.047,21581.432
metab_5,24203.1735,30852.1835,21822.801,0.0,0.0,18300.1455,22323.1995,0.0,16922.82,11512.349,...,0.0,14900.175,19896.51,15420.096,30384.032,15093.819,27154.764,0.0,27462.5295,12371.712


mag


Unnamed: 0,soil.hip.CMU.17.10.2017.08.19.day11,soil.hip.CMU.17.10.2017.08.23.day15,soil.hip.CMU.17.10.2017.08.17.day9,soil.hip.CMU.17.10.2017.08.26.day18,soil.hip.CMU.17.10.2017.08.18.day10,soil.hip.CMU.17.10.2017.08.14.day6,soil.hip.CMU.17.10.2017.08.25.day17,soil.hip.CMU.17.10.2017.08.11.day3,soil.hip.CMU.17.10.2017.08.10.day2,soil.hip.CMU.17.10.2017.08.16.day8,...,soil.hip.SHSU.2016.024.2016.04.29.day15,soil.hip.SHSU.2016.024.2016.04.27.day13,soil.hip.SHSU.2016.024.2016.04.18.day4,soil.hip.SHSU.2016.024.2016.04.30.day16,soil.hip.SHSU.2016.024.2016.04.21.day7,soil.hip.SHSU.2016.024.2016.04.25.day11,soil.hip.SHSU.2016.024.2016.04.23.day9,soil.hip.SHSU.2016.024.2016.05.04.day20,soil.hip.SHSU.2016.024.2016.04.20.day6,soil.hip.SHSU.2016.024.2016.04.19.day5
UTK.bins.58,10.983619,0.0,0.0,0.0,0.0,0.0,0.0,65.55989,0.0,0.0,...,114.239845,112.13596,7.107021,0.0,268.12628,1298.4451,686.6708,19.652256,878.65063,150.72356
CMU.bins.99,33129.98,35615.68,28628.404,32022.086,19159.424,29323.494,50860.89,40582.75,43743.17,2199.4407,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.966068,2.969697,0.0
SHSU.bins.42,386.8783,4117.6772,0.0,1787.9869,281.62924,375.77414,4193.8228,159.257,0.0,0.0,...,1270.1399,470.8607,0.0,435.79318,774.5528,297.52332,638.8279,956.15186,96.14429,0.0
SHSU.bins.192,0.0,8.726143,57.518337,0.0,0.0,16.280674,0.0,0.0,0.0,0.0,...,69.920044,40.969368,383.71695,0.0,638.36505,103.8444,224.18906,28.350302,266.92642,103.83103
SHSU.bins.235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.894605,3.82396,0.0,0.0,0.0,0.0,0.0,20.759691,0.0,0.0


18S


Unnamed: 0,soil.hip.CMU.17.10.2017.08.19.day11,soil.hip.CMU.17.10.2017.08.23.day15,soil.hip.CMU.17.10.2017.08.17.day9,soil.hip.CMU.17.10.2017.08.26.day18,soil.hip.CMU.17.10.2017.08.18.day10,soil.hip.CMU.17.10.2017.08.14.day6,soil.hip.CMU.17.10.2017.08.25.day17,soil.hip.CMU.17.10.2017.08.11.day3,soil.hip.CMU.17.10.2017.08.10.day2,soil.hip.CMU.17.10.2017.08.16.day8,...,soil.hip.SHSU.2016.024.2016.04.29.day15,soil.hip.SHSU.2016.024.2016.04.27.day13,soil.hip.SHSU.2016.024.2016.04.18.day4,soil.hip.SHSU.2016.024.2016.04.30.day16,soil.hip.SHSU.2016.024.2016.04.21.day7,soil.hip.SHSU.2016.024.2016.04.25.day11,soil.hip.SHSU.2016.024.2016.04.23.day9,soil.hip.SHSU.2016.024.2016.05.04.day20,soil.hip.SHSU.2016.024.2016.04.20.day6,soil.hip.SHSU.2016.024.2016.04.19.day5
d86487a3dd8759d29390b83f38eaa04c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a8648c4dc705f1c2c400da1ceefd4508,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
eab6f2b14df9a74557a5c26b7308860d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
88a85e8eff8056094e4c50498166110e,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5456cbd13c8c79133eca4d170310c215,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


16S


Unnamed: 0,soil.hip.CMU.17.10.2017.08.19.day11,soil.hip.CMU.17.10.2017.08.23.day15,soil.hip.CMU.17.10.2017.08.17.day9,soil.hip.CMU.17.10.2017.08.26.day18,soil.hip.CMU.17.10.2017.08.18.day10,soil.hip.CMU.17.10.2017.08.14.day6,soil.hip.CMU.17.10.2017.08.25.day17,soil.hip.CMU.17.10.2017.08.11.day3,soil.hip.CMU.17.10.2017.08.10.day2,soil.hip.CMU.17.10.2017.08.16.day8,...,soil.hip.SHSU.2016.024.2016.04.29.day15,soil.hip.SHSU.2016.024.2016.04.27.day13,soil.hip.SHSU.2016.024.2016.04.18.day4,soil.hip.SHSU.2016.024.2016.04.30.day16,soil.hip.SHSU.2016.024.2016.04.21.day7,soil.hip.SHSU.2016.024.2016.04.25.day11,soil.hip.SHSU.2016.024.2016.04.23.day9,soil.hip.SHSU.2016.024.2016.05.04.day20,soil.hip.SHSU.2016.024.2016.04.20.day6,soil.hip.SHSU.2016.024.2016.04.19.day5
fd1796898145db8675c03b6950633f72,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
f84cdc0e4358b155e32a2bc031503bba,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
451a305073d6f00797e39d62a1435605,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
a20f6a222b37b038bff253e426d3b586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
b86e3e53c4997201202adc4480b2643c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


gene


Unnamed: 0,soil.hip.CMU.17.10.2017.08.19.day11,soil.hip.CMU.17.10.2017.08.23.day15,soil.hip.CMU.17.10.2017.08.17.day9,soil.hip.CMU.17.10.2017.08.26.day18,soil.hip.CMU.17.10.2017.08.18.day10,soil.hip.CMU.17.10.2017.08.14.day6,soil.hip.CMU.17.10.2017.08.25.day17,soil.hip.CMU.17.10.2017.08.11.day3,soil.hip.CMU.17.10.2017.08.10.day2,soil.hip.CMU.17.10.2017.08.16.day8,...,soil.hip.SHSU.2016.024.2016.04.29.day15,soil.hip.SHSU.2016.024.2016.04.27.day13,soil.hip.SHSU.2016.024.2016.04.18.day4,soil.hip.SHSU.2016.024.2016.04.30.day16,soil.hip.SHSU.2016.024.2016.04.21.day7,soil.hip.SHSU.2016.024.2016.04.25.day11,soil.hip.SHSU.2016.024.2016.04.23.day9,soil.hip.SHSU.2016.024.2016.05.04.day20,soil.hip.SHSU.2016.024.2016.04.20.day6,soil.hip.SHSU.2016.024.2016.04.19.day5
5-epimerase;_Xh,205.205194,245.607848,182.021783,278.46279,142.508572,170.504012,223.300341,190.294504,379.264076,135.392995,...,78.548953,117.749178,109.963684,71.8577,79.544285,97.7921,151.892285,66.850794,96.950857,187.6361
6,20.762837,84.374869,32.319568,34.186768,39.602774,13.141136,20.951288,21.295457,9.199818,3.977856,...,176.041818,93.971168,32.547968,49.60286,80.385601,135.504306,51.760857,78.770129,96.328424,39.18925
6-dehydratase_2;_Xh,2948.614035,3060.500612,3050.561566,3125.050442,2908.479649,3298.93366,3233.25917,3484.343439,3255.366485,2258.259123,...,1799.900337,1838.210188,2487.999779,813.886543,1609.344069,2005.085446,1897.749008,1643.155554,1577.678491,2056.631914
AA1,278.828308,283.629567,230.554486,270.54076,228.177489,217.813163,313.248817,248.466203,286.625134,91.294112,...,129.003463,194.76529,68.003341,38.676031,127.778332,139.590086,61.915949,119.783198,187.560451,74.518934
AA12,78.845731,61.006407,61.567205,53.955425,100.154621,74.19779,60.080811,55.91696,62.528618,49.368077,...,85.799706,63.492218,171.869268,25.886881,94.970067,103.209961,69.651114,19.708604,72.161615,178.879784


gene_module


Unnamed: 0,soil.hip.CMU.17.10.2017.08.19.day11,soil.hip.CMU.17.10.2017.08.23.day15,soil.hip.CMU.17.10.2017.08.17.day9,soil.hip.CMU.17.10.2017.08.26.day18,soil.hip.CMU.17.10.2017.08.18.day10,soil.hip.CMU.17.10.2017.08.14.day6,soil.hip.CMU.17.10.2017.08.25.day17,soil.hip.CMU.17.10.2017.08.11.day3,soil.hip.CMU.17.10.2017.08.10.day2,soil.hip.CMU.17.10.2017.08.16.day8,...,soil.hip.SHSU.2016.024.2016.04.29.day15,soil.hip.SHSU.2016.024.2016.04.27.day13,soil.hip.SHSU.2016.024.2016.04.18.day4,soil.hip.SHSU.2016.024.2016.04.30.day16,soil.hip.SHSU.2016.024.2016.04.21.day7,soil.hip.SHSU.2016.024.2016.04.25.day11,soil.hip.SHSU.2016.024.2016.04.23.day9,soil.hip.SHSU.2016.024.2016.05.04.day20,soil.hip.SHSU.2016.024.2016.04.20.day6,soil.hip.SHSU.2016.024.2016.04.19.day5
Alpha-galactans,362.662051,387.650589,327.285727,395.74336,499.262278,405.388168,496.179214,354.259667,433.550879,782.998889,...,495.082562,279.332383,485.262938,197.662532,272.968337,496.119154,421.639445,605.228279,266.70464,467.501885
Alpha-mannan,330.076567,418.12572,364.819128,305.072428,460.087141,435.990607,381.313102,420.882622,547.178378,1431.464824,...,821.275892,739.67051,420.996836,189.526414,251.673827,776.814393,398.733616,574.92254,266.799439,504.788099
Arabinan,2421.579288,2497.982283,2255.352159,2233.779562,2607.423559,1929.85507,1775.112034,1958.441509,2081.555404,2832.631584,...,1553.026643,875.469359,957.890073,370.085994,785.254838,1326.340395,1030.535128,1031.205241,668.485932,1455.022048
Arabinose_cleavage,521.040132,595.452542,515.794843,496.722771,550.523367,525.99721,507.302173,573.862551,537.172763,754.052743,...,533.77643,237.023883,406.359177,172.629305,233.197569,422.681683,384.027644,430.238412,203.606049,500.29174
Beta-galactan_(pectic_galactan),1126.457361,1214.744222,972.258322,1015.903971,1159.959149,1091.804346,1111.753895,1173.254109,1066.884734,1610.801135,...,863.308333,358.601633,539.032025,171.768099,499.287564,779.973846,575.626157,1341.643525,363.723345,920.565559


In [5]:
print("# unique subjects:",len(metadata.subjects.unique()))
print(metadata.season.value_counts())
print(metadata.facility.value_counts())

# unique subjects: 36
season
spring    147
winter    130
summer    130
fall      122
Name: count, dtype: int64
facility
ARF      211
STAFS    168
FIRS     150
Name: count, dtype: int64
