In [1]:
import glob
import pandas as pd

from biom import Table, load_table
from skbio.stats.composition import closure
from gemelli.rpca import rpca_table_processing
from sklearn.model_selection import train_test_split

#turn off warnings
import warnings
warnings.filterwarnings("ignore")

## Data pre-processing

These cells only need to be run once.

In [2]:
# output directory prefix used when writing the shared tables; the trailing
# slash is required because the prefix is joined by plain string formatting
save_path = (
    '/Users/bec5786/Desktop/Shenhav Lab/Joint-RPCA/'
    'Case studies/iHMP/Data/runtime/'
)

### iHMP cohort

In [2]:
# directory prefix of the iHMP inputs (trailing slash required: it is joined
# to file names by plain string formatting)
ihmp_data_path = '/Users/bec5786/Desktop/Shenhav Lab/Joint-RPCA/Case studies/iHMP/Data/'
# tab-separated sample metadata; the first column becomes the index
# (presumably the sample ids, since it is later intersected with table ids)
ihmp_metadata_fp = '{}ihmp_data/sample-metadata.txt'.format(ihmp_data_path)
ihmp_metadata = pd.read_csv(ihmp_metadata_fp, sep='\t', index_col=0)

In [4]:
# import and match the data the same as in ../../simulations-benchmarking/3.0-ihmp-benchmarks.ipynb
# we will also save each table with all the data for that table
# for exploration non-jointly based on the joint samples
ihmp_tables = {}
for omics_ in glob.glob('{}ihmp_data/*.biom'.format(ihmp_data_path)):
    # file name without its directory; the part before the first '.' is the key
    omics_fname_ = omics_.split('/')[-1]
    # skip files tagged with '_<N>' (presumably variants made for the
    # benchmarking notebook -- TODO confirm). Only the file name is tested so
    # that a '_<N>' in any parent directory cannot silently drop every table.
    if any('_' + str(d_) in omics_fname_ for d_ in [10, 8, 6, 4, 2, 1]):
        continue
    # all thresholds are 0, i.e. no minimum count/frequency is requested
    ihmp_tables[omics_fname_.split('.')[0]] = rpca_table_processing(load_table(omics_),
                                                                    min_sample_count=0,
                                                                    min_feature_count=0,
                                                                    min_feature_frequency=0)

# add pathways instead of ECs (easier to interpret)
tbl_ = pd.read_csv('{}ihmp_data/additional-data/pathabundances_3.tsv'.format(ihmp_data_path), sep='\t', index_col=0)
# drop the rows whose label contains UNINTEGRATED or UNMAPPED
tbl_ = tbl_[[('UNINTEGRATED' not in x) and ('UNMAPPED' not in x) for x in tbl_.index]]
# strip the '_pathabundance_cpm' suffix from the column names
tbl_.columns = [c.replace('_pathabundance_cpm','') for c in tbl_.columns]
# remove rows and columns that sum to zero
tbl_ = tbl_.loc[tbl_.sum(1) > 0, tbl_.sum(0) > 0]
tbl_ = Table(tbl_.values, tbl_.index, tbl_.columns)
# stored under the 'meta_t_ecs' key, overwriting any table loaded under that name
ihmp_tables['meta_t_ecs'] = tbl_.copy()
ihmp_tables

In [5]:
# make table/metadata pair for each dataset of all samples in the data
# each entry is [biom table, DataFrame of the table, metadata]
tables_metdata_unshared = {}
for omics_, table_ in ihmp_tables.items():
    print(omics_)
    # samples of this table that also have metadata
    shared_samps = set(table_.ids()) & set(ihmp_metadata.index)
    # filter a copy so the table kept in ihmp_tables is left untouched
    table_omic = table_.copy().filter(shared_samps)
    table_omic = rpca_table_processing(table_omic, min_sample_count=0,
                                       min_feature_count=0,
                                       min_feature_frequency=0)
    # order the metadata rows like the table's samples; reindexing on the
    # unordered set gave an arbitrary row order that did not match the table
    # (and would keep rows for any sample the processing step removed --
    # TODO confirm whether that can happen with these thresholds)
    metadata_omic = ihmp_metadata.reindex(table_omic.ids())
    # dense features x samples frame of the processed table
    table_omic_df = pd.DataFrame(table_omic.matrix_data.toarray(),
                                 index=table_omic.ids('observation'),
                                 columns=table_omic.ids())
    # these omics output in % abundance so have to use relative counts.
    # possibly try to re-scale back to counts?
    if omics_ in ['shared_meta_g_taxonomic_profiles','meta_t_ecs','HMP2_metabolomics']:
        # closure is applied column by column, i.e. per sample
        table_omic_df = table_omic_df.apply(closure)
    tables_metdata_unshared[omics_] = [table_omic, table_omic_df, metadata_omic]
tables_metdata_unshared.keys()

virome_virmap_analysis
meta_t_ecs
shared_meta_g_taxonomic_profiles
HMP2_proteomics_ecs
HMP2_metabolomics


dict_keys(['virome_virmap_analysis', 'meta_t_ecs', 'shared_meta_g_taxonomic_profiles', 'HMP2_proteomics_ecs', 'HMP2_metabolomics'])

In [6]:
# samples present in every omics table and in the metadata; sorted so the
# metadata row order (and with it the seeded split below) is identical on
# every run instead of following set iteration order
shared_samps = sorted(set.intersection(*[set(t_.ids()) for t_ in ihmp_tables.values()])
                      & set(ihmp_metadata.index))
metadata_shared = ihmp_metadata.reindex(shared_samps)
# collapse UC and CD into one IBD label; other values (e.g. nonIBD) are unchanged
metadata_shared['diagnosis_binned'] = [x.replace('UC','IBD').replace('CD','IBD')
                                       for x in metadata_shared.diagnosis]
# restrict a copy of every table to the shared samples
tables_shared = {t_k:t_.copy().filter(shared_samps) for t_k, t_ in ihmp_tables.items()}
# re-close metaG/T data
for t_ in ['shared_meta_g_taxonomic_profiles','meta_t_ecs','HMP2_metabolomics']:
    tbl_tmp = tables_shared[t_].to_dataframe().copy()
    # closure is applied column by column, i.e. per sample
    tbl_tmp = tbl_tmp.apply(closure)
    tables_shared[t_] = Table(tbl_tmp.values, tbl_tmp.index, tbl_tmp.columns)

# stratified 75/25 split on the un-binned diagnosis; the fixed seed makes the
# train/test assignment reproducible across kernel restarts
train_, test_ = train_test_split(metadata_shared, shuffle=True,
                                 stratify=metadata_shared['diagnosis'],
                                 test_size=0.25,
                                 random_state=42)
metadata_shared['train_test'] = 'train'
metadata_shared.loc[test_.index, 'train_test'] = 'test'
#metadata_shared.to_csv('../data/ihmp-sample-metadata-plus-train-tests.csv')
metadata_shared.diagnosis.value_counts()

diagnosis
CD        61
nonIBD    44
UC        30
Name: count, dtype: int64

In [7]:
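# (disabled) save all iHMP tables in biom format
# NOTE: `tables_shared` is reassigned by the UC section below, so run this before those cells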
# for k_, t_ in tables_shared.items():
#    with open('{}iHMP/{}.biom'.format(save_path,k_), 'w') as f:
#        f.write(t_.to_json(generated_by='BIOM-Format'))

### UC cohort two

In [3]:
# directory prefix of the UC cohort-two inputs (trailing slash required: it is
# joined to file names by plain string formatting)
uc_data_path = '/Users/bec5786/Desktop/Shenhav Lab/Joint-RPCA/Case studies/iHMP/Data/uc-severity-multiomics/Cohort_Two/'
# tab-separated sample metadata; the first column becomes the index
uc_metadata_fp = '{}metadata_cohort_two_revised.txt'.format(uc_data_path)
uc_metadata = pd.read_csv(uc_metadata_fp, sep='\t', index_col=0)

In [4]:
# path template of the matched BIOM file for each omics layer; the '{}' slot
# takes the cohort directory prefix
uc_table_fps = {'metaproteomics': '{}metaproteomics_cohort_two_matched.biom',
                'metagenomics': '{}metagenomics_cohort_two_matched.biom',
                'metabolomics': '{}metabolomics_cohort_two_matched.biom'}
# load every table, keyed by omics name (same order as listed above)
uc_tables = {omic_: load_table(fp_.format(uc_data_path))
             for omic_, fp_ in uc_table_fps.items()}
uc_tables

{'metaproteomics': 108080 x 174 <class 'biom.table.Table'> with 5091405 nonzero entries (27% dense),
 'metagenomics': 3568 x 174 <class 'biom.table.Table'> with 163422 nonzero entries (26% dense),
 'metabolomics': 1928 x 174 <class 'biom.table.Table'> with 58524 nonzero entries (17% dense)}

In [5]:
# make table/metadata pair for each dataset of all samples in the data
# each entry is [biom table, DataFrame of the table, metadata]
uc_tables_metdata_unshared = {}
for omics_, table_ in uc_tables.items():
    print(omics_)
    # samples of this table that also have metadata
    shared_samps = set(table_.ids()) & set(uc_metadata.index)
    # filter a copy so the table kept in uc_tables is left untouched
    table_omic = table_.copy().filter(shared_samps)
    table_omic = rpca_table_processing(table_omic, min_sample_count=0,
                                       min_feature_count=0,
                                       min_feature_frequency=0)
    # order the metadata rows like the table's samples; reindexing on the
    # unordered set gave an arbitrary row order that did not match the table
    # (and would keep rows for any sample the processing step removed --
    # TODO confirm whether that can happen with these thresholds)
    metadata_omic = uc_metadata.reindex(table_omic.ids())
    # dense features x samples frame of the processed table
    table_omic_df = pd.DataFrame(table_omic.matrix_data.toarray(),
                                 index=table_omic.ids('observation'),
                                 columns=table_omic.ids())
    # these omics output in % abundance so have to use relative counts.
    # possibly try to re-scale back to counts?
    # (the list names all three keys of uc_tables, so every table is closed)
    if omics_ in ['metaproteomics','metagenomics','metabolomics']:
        # closure is applied column by column, i.e. per sample
        table_omic_df = table_omic_df.apply(closure)
    uc_tables_metdata_unshared[omics_] = [table_omic, table_omic_df, metadata_omic]
uc_tables_metdata_unshared.keys()

metaproteomics
metagenomics
metabolomics


dict_keys(['metaproteomics', 'metagenomics', 'metabolomics'])

In [6]:
# samples present in every omics table and in the metadata; sorted so the
# metadata row order (and with it the seeded split below) is identical on
# every run instead of following set iteration order
shared_samps = sorted(set.intersection(*[set(t_.ids()) for t_ in uc_tables.values()])
                      & set(uc_metadata.index))
uc_metadata_shared = uc_metadata.reindex(shared_samps)
# collapse UC and CD into one IBD label. NOTE: this overwrites 'Diagnosis'
# itself, whereas the iHMP section writes a separate 'diagnosis_binned' column
uc_metadata_shared['Diagnosis'] = [x.replace('UC','IBD').replace('CD','IBD')
                                for x in uc_metadata_shared.Diagnosis]
# restrict a copy of every table to the shared samples
# NOTE: reuses the name `tables_shared` from the iHMP section, replacing it
tables_shared = {t_k:t_.copy().filter(shared_samps) for t_k, t_ in uc_tables.items()}

# stratified 75/25 split on the binned diagnosis; the fixed seed makes the
# train/test assignment written to disk reproducible across kernel restarts
train_, test_ = train_test_split(uc_metadata_shared, shuffle=True,
                                 stratify=uc_metadata_shared['Diagnosis'],
                                 test_size=0.25,
                                 random_state=42)
uc_metadata_shared['train_test'] = 'train'
uc_metadata_shared.loc[test_.index, 'train_test'] = 'test'
uc_metadata_shared.to_csv('../data/uc-metadata-plus-train-tests.csv')
uc_metadata_shared.Diagnosis.value_counts()

Diagnosis
IBD                157
Healthy_control     16
Name: count, dtype: int64

In [7]:
# write every shared table to <save_path>UC/<name>.biom as BIOM-format JSON
for k_, t_ in tables_shared.items():
    out_fp_ = '{}UC/{}.biom'.format(save_path, k_)
    with open(out_fp_, 'w') as f:
        f.write(t_.to_json(generated_by='BIOM-Format'))