In [1]:
#standard library
import glob
import itertools
import json
import os
import time
import warnings

#turn off tensorflow warnings
#NOTE(review): this has to run before the mmvec plugin import below; the plugin
#presumably loads TensorFlow, and the log level is read when TensorFlow is
#first loaded, so setting it after the import would have no effect - confirm
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

#third party
import pandas as pd
import qiime2 as q2
from biom import load_table
from qiime2.plugins.mmvec.actions import paired_omics

#turn off warnings
warnings.filterwarnings("ignore")

In [2]:
#load the data
#NOTE(review): data_path is an absolute path on the original author's machine;
#point it at the local copy of the .biom tables before running
data_path = '/Users/bec5786/Desktop/Shenhav Lab/Joint-RPCA/Case studies/iHMP/Data/runtime/iHMP/'

#one table per .biom file, keyed by the file name without its extension.
#the key is derived with os.path (not by splitting on '/' and '.') so that names
#containing extra dots are kept whole, and each table is loaded from the exact
#path glob returned instead of a path rebuilt from the key
biom_paths = glob.glob(os.path.join(data_path, '*.biom'))
omic_keys = [os.path.splitext(os.path.basename(path_))[0] for path_ in biom_paths]
tables_shared = {k_: load_table(path_) for k_, path_ in zip(omic_keys, biom_paths)}

#sample metadata, indexed by the first column of the csv
metadata_shared = pd.read_csv('../data/ihmp-sample-metadata-plus-train-tests.csv', index_col=0)

In [3]:
#check dimensions of the loaded tables
#(displays the dict; each table's repr reports its shape and number of nonzero entries)
tables_shared

{'virome_virmap_analysis': 260 x 135 <class 'biom.table.Table'> with 505 nonzero entries (1% dense),
 'meta_t_ecs': 8291 x 135 <class 'biom.table.Table'> with 59744 nonzero entries (5% dense),
 'shared_meta_g_taxonomic_profiles': 181 x 135 <class 'biom.table.Table'> with 4465 nonzero entries (18% dense),
 'HMP2_proteomics_ecs': 830 x 135 <class 'biom.table.Table'> with 15760 nonzero entries (14% dense),
 'HMP2_metabolomics': 81867 x 135 <class 'biom.table.Table'> with 6886507 nonzero entries (62% dense)}

In [4]:
##prepare the sample metadata for running MMvec
#the index is renamed to 'sample id'
metadata_shared.index.name = 'sample id'

#capitalised copy of the train/test split, later passed to MMvec as training column:
#'train' becomes 'Train', every other value becomes 'Test'
metadata_shared['train_test_mmvec'] = [
    'Train' if split_label == 'train' else 'Test'
    for split_label in metadata_shared['train_test']
]

In [5]:
#create list with all possible ordered pairs of omics:
#each table paired with itself first, then every ordered pair of two distinct tables.
#the self-pairs are built from omic_keys rather than typed out by hand, so the
#list always matches the tables that were actually loaded
pairs = [(k_, k_) for k_ in omic_keys] + list(itertools.permutations(omic_keys, 2))
pairs

[('virome_virmap_analysis', 'virome_virmap_analysis'),
 ('meta_t_ecs', 'meta_t_ecs'),
 ('shared_meta_g_taxonomic_profiles', 'shared_meta_g_taxonomic_profiles'),
 ('HMP2_proteomics_ecs', 'HMP2_proteomics_ecs'),
 ('HMP2_metabolomics', 'HMP2_metabolomics'),
 ('virome_virmap_analysis', 'meta_t_ecs'),
 ('virome_virmap_analysis', 'shared_meta_g_taxonomic_profiles'),
 ('virome_virmap_analysis', 'HMP2_proteomics_ecs'),
 ('virome_virmap_analysis', 'HMP2_metabolomics'),
 ('meta_t_ecs', 'virome_virmap_analysis'),
 ('meta_t_ecs', 'shared_meta_g_taxonomic_profiles'),
 ('meta_t_ecs', 'HMP2_proteomics_ecs'),
 ('meta_t_ecs', 'HMP2_metabolomics'),
 ('shared_meta_g_taxonomic_profiles', 'virome_virmap_analysis'),
 ('shared_meta_g_taxonomic_profiles', 'meta_t_ecs'),
 ('shared_meta_g_taxonomic_profiles', 'HMP2_proteomics_ecs'),
 ('shared_meta_g_taxonomic_profiles', 'HMP2_metabolomics'),
 ('HMP2_proteomics_ecs', 'virome_virmap_analysis'),
 ('HMP2_proteomics_ecs', 'meta_t_ecs'),
 ('HMP2_proteomics_ecs', 'sha

In [6]:
#split the pairs into those without and those with the metabolomics table
pairs_no_metabolomics = [(first_, second_) for first_, second_ in pairs
                         if first_ != 'HMP2_metabolomics' and second_ != 'HMP2_metabolomics']
pairs_metabolomics = [(first_, second_) for first_, second_ in pairs
                      if 'HMP2_metabolomics' in (first_, second_)]

In [7]:
#manual override of the metabolomics pair list
#(meta_t_ecs takes too long, so its pair is run last)
#pairs that are taking too long are omitted:
#   ('HMP2_metabolomics', 'HMP2_metabolomics')
#   ('HMP2_metabolomics', 'meta_t_ecs')
pairs_metabolomics = (
    #every faster table against metabolomics ...
    [(omic_, 'HMP2_metabolomics') for omic_ in ('virome_virmap_analysis',
                                                 'shared_meta_g_taxonomic_profiles',
                                                 'HMP2_proteomics_ecs')]
    #... then metabolomics against every faster table ...
    + [('HMP2_metabolomics', omic_) for omic_ in ('virome_virmap_analysis',
                                                   'shared_meta_g_taxonomic_profiles',
                                                   'HMP2_proteomics_ecs')]
    #... and the slow meta_t_ecs pair at the very end
    + [('meta_t_ecs', 'HMP2_metabolomics')]
)

In [8]:
#generate subsets of metabolomics data
metabolites_all = tables_shared['HMP2_metabolomics'].copy()

#maps each fraction of features to a table holding only that leading share of features
metabolites_sub = {}
for factor in [0.01, 0.03, 0.05, 0.1, 0.15]:
    #start from a fresh copy of the full metabolomics table for every fraction;
    #the filter below is applied to this copy, never to the entry in tables_shared
    subset_table = tables_shared['HMP2_metabolomics'].copy()
    #number of features to keep, rounded down
    n_keep = int(subset_table.shape[0]*factor)
    #the first n_keep feature ids, in the table's own order
    keep_ids = list(subset_table.ids(axis='observation'))[:n_keep]
    #keep only those features
    metabolites_sub[factor] = subset_table.filter(keep_ids, axis='observation')
    #sanity check: report how many features the subset holds
    print('Metabolomics: {}% of features: {}'.format(factor*100, metabolites_sub[factor].shape[0]))

Metabolomics: 1.0% of features: 818
Metabolomics: 3.0% of features: 2456
Metabolomics: 5.0% of features: 4093
Metabolomics: 10.0% of features: 8186
Metabolomics: 15.0% of features: 12280


In [13]:
#runtime = {}
#runtime_total = {}
#CV_summaries = {}
#results = {}
#ranks = {}

In [17]:
# #run mmvec for each pair of omics without metabolomics - only once
# factor = 0
# time_factor = 0
# for pair in pairs_no_metabolomics:  
#     print(pair)
#     #get tables and set to correct format
#     t1 = tables_shared[pair[0]]
#     t2 = tables_shared[pair[1]]
#     t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
#     t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

#     #run and time mmvec
#     time_start = time.perf_counter()
#     mmvec_res = paired_omics(t1_q2, t2_q2, 
#                             summary_interval=1,
#                             metadata=q2.Metadata(metadata_shared),
#                             training_column='train_test_mmvec',
#                             min_feature_count=10)
#     time_elapsed = (time.perf_counter() - time_start)
#     time_factor += time_elapsed
#     print('Time elapsed: ', round(time_elapsed,4))
#     print()

#     #save output
#     runtime["{}-{}".format(pair[0], pair[1])] = time_elapsed
#     #CV_summaries["{}-{}".format(pair[0], pair[1])] = mmvec_res.model_stats.view(q2.Metadata).to_dataframe()
#     #results["{}-{}".format(pair[0], pair[1])] = mmvec_res
#     #ranks["{}-{}".format(pair[0], pair[1])] = mmvec_res.conditionals.view(pd.DataFrame)

# runtime_total[factor] = time_factor
# print('Total time elapsed: ', round(time_factor,4))

In [None]:
# #sanity check
# print(runtime_total)

# #save mmvec results as json/pickle
# with open("../results/mmvec-runtime-total.json", "w") as outfile: 
#    json.dump(runtime_total, outfile)

# with open("../results/mmvec-runtime.json", "w") as outfile: 
#    json.dump(runtime, outfile)

In [8]:
#reload the saved mmvec timings from disk
#total runtime per metabolomics factor
with open("../results/mmvec-runtime-total.json") as totals_file:
    runtime_total = json.load(totals_file)

#runtime of each individual omics pair
with open("../results/mmvec-runtime.json") as per_pair_file:
    runtime = json.load(per_pair_file)

In [None]:
#ref: https://github.com/biocore/mmvec/blob/88ca33b408a85b6bf90fae06982936247b860272/mmvec/q2/_method.py#L14

#time MMvec on every metabolomics pair, once per metabolomics subset size.
#the runtime of each pair (secs) is stored in `runtime`, and the summed runtime
#of each subset size in `runtime_total`
for factor in [0.01, 0.03, 0.05, 0.1, 0.15]:
    print('\n### Metab Features: {}% ###'.format(factor*100))
    #shallow copy of the table dict in which the full metabolomics table is
    #swapped for the current subset
    table_to_use = tables_shared.copy()
    table_to_use['HMP2_metabolomics'] = metabolites_sub[factor]
    print('Metabolomics n features: {}'.format(table_to_use['HMP2_metabolomics'].shape[0]))

    #summed runtime of all pairs for this subset size
    time_factor = 0
    for pair in pairs_metabolomics:
        print(pair)

        #get tables and set to correct format
        t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', table_to_use[pair[0]])
        t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', table_to_use[pair[1]])

        #run and time mmvec (only the paired_omics call is inside the timed region)
        time_start = time.perf_counter()
        mmvec_res = paired_omics(t1_q2, t2_q2,
                                 summary_interval=1,
                                 metadata=q2.Metadata(metadata_shared),
                                 training_column='train_test_mmvec',
                                 min_feature_count=10)
        time_elapsed = (time.perf_counter() - time_start)
        time_factor += time_elapsed
        print('Time elapsed: ', round(time_elapsed,2))
        print()

        #tag the metabolomics member of the pair with the subset size so that runs
        #with different factors get distinct keys; built in a separate variable so
        #the loop variable `pair` is not reassigned
        labelled_pair = tuple('HMP2_metabolomics_{}'.format(factor) if omic_ == 'HMP2_metabolomics' else omic_
                              for omic_ in pair)
        #save output
        runtime["{}-{}".format(*labelled_pair)] = time_elapsed

    #key by str(factor): runtime_total is loaded from json, where keys are strings
    #('0', '0.01', ...). a float key would sit next to the string key of the same
    #factor and turn into a duplicate key when the dict is written back to json
    runtime_total[str(factor)] = time_factor
    print('Total time elapsed: {} secs ({} mins)'.format(round(time_factor, 2),
                                                         round(time_factor/60, 2)))
    #if time exceeds 1.5 hours, stop the loop
    # if time_factor > 5400:
    #     print('Last runtime {} exceeds 1.5 hours'.format(time_factor))
    #     print('Stopping loop')
    #     break

### Add runtimes

In [9]:
#reload the saved mmvec timings from disk
#total runtime per metabolomics factor
with open("../results/mmvec-runtime-total.json") as totals_file:
    runtime_total = json.load(totals_file)

#runtime of each individual omics pair
with open("../results/mmvec-runtime.json") as per_pair_file:
    runtime = json.load(per_pair_file)

In [30]:
##add an estimated total runtime for factor = 0.15
#first, add an estimate for the pair that did not finish running: 22:54:02 = 22*3600 + 54*60 + 2 = 82442 secs
#runtime['HMP2_metabolomics_0.15-HMP2_proteomics_ecs'] = 82442

#next, collect the runtimes of all pairs that used the 0.15 metabolomics subset
#runtime_metab = {k: v for k, v in runtime.items() if 'HMP2_metabolomics_0.15' in k}

#add to total runtime
#runtime_total['0.15'] = sum(runtime_metab.values())

In [32]:
#add runtime_total['0'] to the total runtime of every metabolomics factor
#runtime_total['0'] is the time that all pairs excluding metabolomics took to run
#for key in ['0.01', '0.03', '0.05', '0.1', '0.15']:
#    runtime_total[key] += runtime_total['0']

In [42]:
# save final set of mmvec results
#with open("../results/mmvec-runtime-total.json", "w") as outfile: 
#   json.dump(runtime_total, outfile)

#with open("../results/mmvec-runtime.json", "w") as outfile: 
#  json.dump(runtime, outfile)

In [10]:
#convert every total runtime from seconds to minutes and display the result
runtime_total_minutes = dict(
    (factor_key, seconds / 60) for factor_key, seconds in runtime_total.items()
)
runtime_total_minutes

{'0.01': 33.79756434013381,
 '0': 21.83175702846711,
 '0.03': 82.18809817113376,
 '0.05': 177.59619504748377,
 '0.1': 819.7942999900505,
 '0.15': 2146.9860534352}