In [1]:
# Silence TensorFlow's C++ logging. The variable has to be in the environment
# before TensorFlow is first imported, so it is set ahead of every other import
# (presumably the mmvec plugin import below is what loads TensorFlow).
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# standard library
import glob
import itertools
import json
import time
import warnings

# third party
import numpy as np
import pandas as pd
import qiime2 as q2
from biom import load_table
from qiime2.plugins.mmvec.actions import paired_omics

#turn off warnings
warnings.filterwarnings("ignore")

In [2]:
#load the data
# One BIOM table per file in ../data, keyed by the file name without extension.
# basename/splitext strip only the directory and the final '.biom', so a name
# that itself contains dots is kept whole, and each table is loaded from the
# exact path that produced its key.
biom_paths = glob.glob('../data/*.biom')
omic_keys = [os.path.splitext(os.path.basename(path_))[0] for path_ in biom_paths]
tables_shared = {key_: load_table(path_) for key_, path_ in zip(omic_keys, biom_paths)}
# Sample metadata; the first CSV column is used as the index.
metadata_shared = pd.read_csv('../data/ihmp-sample-metadata-plus-train-tests.csv', index_col=0)

In [3]:
## Prepare the metadata for MMvec
# Rename the index to 'sample id'.
metadata_shared.index.name = 'sample id'

# Build the training column: 'train' becomes 'Train', any other value
# (missing values included) becomes 'Test'.
is_train = metadata_shared['train_test'] == 'train'
metadata_shared['train_test_mmvec'] = is_train.map({True: 'Train', False: 'Test'})

In [4]:
#create list with all possible pairs of omics
# Each layer paired with itself comes first, followed by every ordered pair of
# two distinct layers. The self-pairs are derived from omic_keys instead of
# being typed out, so the list always matches the tables that were loaded.
self_pairs = [(key_, key_) for key_ in omic_keys]
pairs = self_pairs + list(itertools.permutations(omic_keys, 2))
pairs

[('virome_virmap_analysis', 'virome_virmap_analysis'),
 ('meta_t_ecs', 'meta_t_ecs'),
 ('shared_meta_g_taxonomic_profiles', 'shared_meta_g_taxonomic_profiles'),
 ('HMP2_proteomics_ecs', 'HMP2_proteomics_ecs'),
 ('HMP2_metabolomics', 'HMP2_metabolomics'),
 ('virome_virmap_analysis', 'meta_t_ecs'),
 ('virome_virmap_analysis', 'shared_meta_g_taxonomic_profiles'),
 ('virome_virmap_analysis', 'HMP2_proteomics_ecs'),
 ('virome_virmap_analysis', 'HMP2_metabolomics'),
 ('meta_t_ecs', 'virome_virmap_analysis'),
 ('meta_t_ecs', 'shared_meta_g_taxonomic_profiles'),
 ('meta_t_ecs', 'HMP2_proteomics_ecs'),
 ('meta_t_ecs', 'HMP2_metabolomics'),
 ('shared_meta_g_taxonomic_profiles', 'virome_virmap_analysis'),
 ('shared_meta_g_taxonomic_profiles', 'meta_t_ecs'),
 ('shared_meta_g_taxonomic_profiles', 'HMP2_proteomics_ecs'),
 ('shared_meta_g_taxonomic_profiles', 'HMP2_metabolomics'),
 ('HMP2_proteomics_ecs', 'virome_virmap_analysis'),
 ('HMP2_proteomics_ecs', 'meta_t_ecs'),
 ('HMP2_proteomics_ecs', 'sha

In [5]:
# Split the pairs by whether the metabolomics table takes part in them.
def _involves_metabolomics(pair_):
    """Return True when either member of the pair is 'HMP2_metabolomics'."""
    return 'HMP2_metabolomics' in pair_

pairs_no_metabolomics = list(itertools.filterfalse(_involves_metabolomics, pairs))
pairs_metabolomics = list(filter(_involves_metabolomics, pairs))

In [6]:
# Hand-ordered list of metabolomics pairs; it replaces the filtered list.
# meta_t_ecs takes too long, so its pair is placed last. Two pairs are left out:
# ('HMP2_metabolomics', 'HMP2_metabolomics') and ('HMP2_metabolomics', 'meta_t_ecs').
_metab_key = 'HMP2_metabolomics'
_other_omics = ('virome_virmap_analysis',
                'shared_meta_g_taxonomic_profiles',
                'HMP2_proteomics_ecs')
pairs_metabolomics = (
    [(omic_, _metab_key) for omic_ in _other_omics]      # metabolomics as second table
    + [(_metab_key, omic_) for omic_ in _other_omics]    # metabolomics as first table
    + [('meta_t_ecs', _metab_key)]                       # slowest pair goes last
)

In [7]:
# Build subsets of the metabolomics table, one per fraction of its features.
metabolites_all = tables_shared['HMP2_metabolomics'].copy()

metabolites_sub = {}
for factor in [0.01, 0.03, 0.05, 0.1, 0.15, 0.2, 0.25]:
    # start from a fresh copy so the table in tables_shared is never filtered
    # (NOTE(review): biom's Table.filter is understood to work in place by default)
    metab = tables_shared['HMP2_metabolomics'].copy()
    # keep the first int(n_observations * factor) observation ids, in table order
    n_features = int(metab.shape[0]*factor)
    feature_ids = list(metab.ids(axis='observation'))[:n_features]
    subset_table = metab.filter(feature_ids, axis='observation')
    metabolites_sub[factor] = subset_table
    # report how many features the subset ended up with
    print('Metabolomics: {}% of features: {}'.format(factor*100, subset_table.shape[0]))

Metabolomics: 1.0% of features: 818
Metabolomics: 3.0% of features: 2456
Metabolomics: 5.0% of features: 4093
Metabolomics: 10.0% of features: 8186
Metabolomics: 15.0% of features: 12280
Metabolomics: 20.0% of features: 16373
Metabolomics: 25.0% of features: 20466


In [13]:
#runtime = {}
#runtime_total = {}
#CV_summaries = {}
#results = {}
#ranks = {}

In [17]:
# #run mmvec for each pair of omics without metabolomics
# factor = 0
# time_factor = 0
# for pair in pairs_no_metabolomics:  
#     print(pair)
#     #get tables and set to correct format
#     t1 = tables_shared[pair[0]]
#     t2 = tables_shared[pair[1]]
#     t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
#     t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

#     #run and time mmvec
#     time_start = time.perf_counter()
#     mmvec_res = paired_omics(t1_q2, t2_q2, 
#                             summary_interval=1,
#                             metadata=q2.Metadata(metadata_shared),
#                             training_column='train_test_mmvec',
#                             min_feature_count=10)
#     time_elapsed = (time.perf_counter() - time_start)
#     time_factor += time_elapsed
#     print('Time elapsed: ', round(time_elapsed,4))
#     print()

#     #save output
#     runtime["{}-{}".format(pair[0], pair[1])] = time_elapsed
#     #CV_summaries["{}-{}".format(pair[0], pair[1])] = mmvec_res.model_stats.view(q2.Metadata).to_dataframe()
#     #results["{}-{}".format(pair[0], pair[1])] = mmvec_res
#     #ranks["{}-{}".format(pair[0], pair[1])] = mmvec_res.conditionals.view(pd.DataFrame)

# runtime_total[factor] = time_factor
# print('Total time elapsed: ', round(time_factor,4))

In [37]:
# #sanity check
# print(runtime_total)

# #save mmvec results as json/pickle
# with open("../results/mmvec-runtime-total.json", "w") as outfile: 
#    json.dump(runtime_total, outfile)

# with open("../results/mmvec-runtime.json", "w") as outfile: 
#    json.dump(runtime, outfile)

{0.01: 2895.78936765901, 0: 1309.9054217080266}

In [8]:
# Read back the saved timings.
# runtime_total: total seconds per metabolomics factor; runtime: seconds per pair.
# JSON object keys are strings, so the factors come back as e.g. '0.01'.
with open("../results/mmvec-runtime-total.json", "r") as infile:
    runtime_total = json.loads(infile.read())

with open("../results/mmvec-runtime.json", "r") as infile:
    runtime = json.loads(infile.read())

In [24]:
#ref: https://github.com/biocore/mmvec/blob/88ca33b408a85b6bf90fae06982936247b860272/mmvec/q2/_method.py#L14
# Time MMvec on every metabolomics pair, once per metabolomics subset size.
# Per-pair timings go into `runtime`; the per-factor sum goes into `runtime_total`.

# The sample metadata is identical for every run: wrap it once, outside the
# timed region, so the timings measure paired_omics only.
mmvec_metadata = q2.Metadata(metadata_shared)

for factor in [0.01, 0.03, 0.05, 0.1, 0.15, 0.2, 0.25]:
    print('\n### Metab Features: {}% ###'.format(factor*100))
    # Swap the full metabolomics table for this factor's subset. The dict copy is
    # shallow: the remaining tables are shared with tables_shared.
    metab_table = metabolites_sub[factor]
    table_to_use = tables_shared.copy()
    table_to_use['HMP2_metabolomics'] = metab_table
    print('Metabolomics n features: {}'.format(table_to_use['HMP2_metabolomics'].shape[0]))

    time_factor = 0
    for pair in pairs_metabolomics:
        print(pair)

        #get tables and set to correct format
        t1 = table_to_use[pair[0]]
        t2 = table_to_use[pair[1]]
        t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
        t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

        #run and time mmvec
        time_start = time.perf_counter()
        mmvec_res = paired_omics(t1_q2, t2_q2,
                                summary_interval=1,
                                metadata=mmvec_metadata,
                                training_column='train_test_mmvec',
                                min_feature_count=10)
        time_elapsed = (time.perf_counter() - time_start)
        time_factor += time_elapsed
        print('Time elapsed: ', round(time_elapsed,2))
        print()

        # tag the metabolomics member with the factor so runs of different
        # subset sizes get distinct keys in `runtime`
        pair = tuple('HMP2_metabolomics_{}'.format(factor) if name_ == 'HMP2_metabolomics' else name_
                     for name_ in pair)
        #save output
        runtime["{}-{}".format(pair[0], pair[1])] = time_elapsed

    # Key by str(factor): runtime_total is loaded from JSON, whose keys are
    # strings. A float key would sit next to the string one and be written out
    # as a duplicate key by json.dump.
    runtime_total[str(factor)] = time_factor
    print('Total time elapsed: {} secs ({} mins)'.format(round(time_factor, 2),
                                                         round(time_factor/60, 2)))
    #if time exceeds 1.5 hours, stop the loop
    # if time_factor > 5400:
    #     print('Last runtime {} exceeds 1.5 hours'.format(time_factor))
    #     print('Stopping loop')
    #     break


### Metab Features: 1.0% ###
Metabolomics n features: 818
('virome_virmap_analysis', 'HMP2_metabolomics')


100%|██████████| 442/442 [00:01<00:00, 436.98it/s]


Time elapsed:  5.33

('shared_meta_g_taxonomic_profiles', 'HMP2_metabolomics')


100%|██████████| 6278/6278 [00:08<00:00, 754.25it/s]


Time elapsed:  12.37

('HMP2_proteomics_ecs', 'HMP2_metabolomics')


100%|██████████| 23250/23250 [00:26<00:00, 868.41it/s]


Time elapsed:  32.51

('HMP2_metabolomics', 'virome_virmap_analysis')


100%|██████████| 143430/143430 [02:40<00:00, 893.59it/s] 


Time elapsed:  166.41

('HMP2_metabolomics', 'shared_meta_g_taxonomic_profiles')


100%|██████████| 143430/143430 [02:34<00:00, 929.58it/s] 


Time elapsed:  160.26

('HMP2_metabolomics', 'HMP2_proteomics_ecs')


100%|██████████| 143430/143430 [03:35<00:00, 664.10it/s]


Time elapsed:  223.17

('meta_t_ecs', 'HMP2_metabolomics')


100%|██████████| 76150/76150 [01:34<00:00, 809.68it/s]


Time elapsed:  117.91

Total time elapsed: 717.95 secs (11.97 mins)

### Metab Features: 3.0% ###
Metabolomics n features: 2456
('virome_virmap_analysis', 'HMP2_metabolomics')


100%|██████████| 442/442 [00:01<00:00, 387.75it/s]


Time elapsed:  7.95

('shared_meta_g_taxonomic_profiles', 'HMP2_metabolomics')


100%|██████████| 6278/6278 [00:16<00:00, 379.91it/s]


Time elapsed:  23.89

('HMP2_proteomics_ecs', 'HMP2_metabolomics')


100%|██████████| 23250/23250 [00:46<00:00, 495.73it/s]


Time elapsed:  56.32

('HMP2_metabolomics', 'virome_virmap_analysis')


100%|██████████| 411158/411158 [14:37<00:00, 468.62it/s]


Time elapsed:  889.87

('HMP2_metabolomics', 'shared_meta_g_taxonomic_profiles')


100%|██████████| 411158/411158 [13:19<00:00, 514.42it/s]


Time elapsed:  813.41

('HMP2_metabolomics', 'HMP2_proteomics_ecs')


100%|██████████| 411158/411158 [25:35<00:00, 267.76it/s]


Time elapsed:  1551.62

('meta_t_ecs', 'HMP2_metabolomics')


100%|██████████| 76150/76150 [04:07<00:00, 308.05it/s]


Time elapsed:  278.33

Total time elapsed: 3621.38 secs (60.36 mins)

### Metab Features: 5.0% ###
Metabolomics n features: 4093
('virome_virmap_analysis', 'HMP2_metabolomics')


100%|██████████| 442/442 [00:01<00:00, 225.52it/s]


Time elapsed:  11.53

('shared_meta_g_taxonomic_profiles', 'HMP2_metabolomics')


100%|██████████| 6278/6278 [00:23<00:00, 264.95it/s]


Time elapsed:  34.29

('HMP2_proteomics_ecs', 'HMP2_metabolomics')


100%|██████████| 23250/23250 [01:27<00:00, 264.30it/s]


Time elapsed:  101.19

('HMP2_metabolomics', 'virome_virmap_analysis')


100%|██████████| 690780/690780 [43:47<00:00, 262.94it/s]  


Time elapsed:  2647.38

('HMP2_metabolomics', 'shared_meta_g_taxonomic_profiles')


100%|██████████| 690780/690780 [39:01<00:00, 295.05it/s]  


Time elapsed:  2361.82

('HMP2_metabolomics', 'HMP2_proteomics_ecs')


100%|██████████| 690780/690780 [1:03:29<00:00, 181.35it/s]


Time elapsed:  3837.1

('meta_t_ecs', 'HMP2_metabolomics')


100%|██████████| 76150/76150 [05:17<00:00, 239.74it/s]


Time elapsed:  352.56

Total time elapsed: 9345.87 secs (155.76 mins)

### Metab Features: 10.0% ###
Metabolomics n features: 8186
('virome_virmap_analysis', 'HMP2_metabolomics')


100%|██████████| 442/442 [00:02<00:00, 183.31it/s]


Time elapsed:  17.63

('shared_meta_g_taxonomic_profiles', 'HMP2_metabolomics')


100%|██████████| 6278/6278 [00:28<00:00, 221.90it/s]


Time elapsed:  45.25

('HMP2_proteomics_ecs', 'HMP2_metabolomics')


100%|██████████| 23250/23250 [01:59<00:00, 194.26it/s]


Time elapsed:  141.38

('HMP2_metabolomics', 'virome_virmap_analysis')


100%|██████████| 1401530/1401530 [2:48:10<00:00, 138.89it/s]  


Time elapsed:  10127.89

('HMP2_metabolomics', 'shared_meta_g_taxonomic_profiles')


100%|██████████| 1401530/1401530 [2:34:05<00:00, 151.59it/s] 


Time elapsed:  9283.01

('HMP2_metabolomics', 'HMP2_proteomics_ecs')


 61%|██████    | 853619/1401530 [5:00:05<1:55:45, 78.89it/s] 

Instructions for updating:
Use standard file APIs to delete files with this prefix.


100%|██████████| 1401530/1401530 [7:40:21<00:00, 50.74it/s]  


Time elapsed:  27665.57

('meta_t_ecs', 'HMP2_metabolomics')


100%|██████████| 76150/76150 [09:09<00:00, 138.64it/s]


Time elapsed:  597.03

Total time elapsed: 47877.75 secs (797.96 mins)

### Metab Features: 15.0% ###
Metabolomics n features: 12280
('virome_virmap_analysis', 'HMP2_metabolomics')


100%|██████████| 442/442 [00:03<00:00, 141.97it/s]


Time elapsed:  22.21

('shared_meta_g_taxonomic_profiles', 'HMP2_metabolomics')


100%|██████████| 6278/6278 [00:44<00:00, 140.26it/s]


Time elapsed:  66.47

('HMP2_proteomics_ecs', 'HMP2_metabolomics')


100%|██████████| 23250/23250 [02:58<00:00, 130.17it/s]


Time elapsed:  205.0

('HMP2_metabolomics', 'virome_virmap_analysis')


100%|██████████| 2120592/2120592 [6:29:46<00:00, 90.68it/s]   


Time elapsed:  23435.04

('HMP2_metabolomics', 'shared_meta_g_taxonomic_profiles')


 89%|████████▉ | 1884358/2120592 [5:21:19<40:16, 97.74it/s]   


KeyboardInterrupt: 

In [14]:
#ref: https://github.com/biocore/mmvec/blob/88ca33b408a85b6bf90fae06982936247b860272/mmvec/q2/_method.py#L14
# Re-run the timing for the 0.15 metabolomics subset only, writing the results
# to disk as they are produced.

# The sample metadata is identical for every run: wrap it once, outside the
# timed region, so the timings measure paired_omics only.
mmvec_metadata = q2.Metadata(metadata_shared)

for factor in [0.15]:
    print('\n### Metab Features: {}% ###'.format(factor*100))
    # Swap the full metabolomics table for this factor's subset. The dict copy is
    # shallow: the remaining tables are shared with tables_shared.
    metab_table = metabolites_sub[factor]
    table_to_use = tables_shared.copy()
    table_to_use['HMP2_metabolomics'] = metab_table
    print('Metabolomics n features: {}'.format(table_to_use['HMP2_metabolomics'].shape[0]))

    time_factor = 0
    for pair in pairs_metabolomics:
        print(pair)

        #get tables and set to correct format
        t1 = table_to_use[pair[0]]
        t2 = table_to_use[pair[1]]
        t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
        t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

        #run and time mmvec
        time_start = time.perf_counter()
        mmvec_res = paired_omics(t1_q2, t2_q2,
                                summary_interval=1,
                                metadata=mmvec_metadata,
                                training_column='train_test_mmvec',
                                min_feature_count=10)
        time_elapsed = (time.perf_counter() - time_start)
        time_factor += time_elapsed
        print('Time elapsed: ', round(time_elapsed,2))
        print()

        # tag the metabolomics member with the factor so runs of different
        # subset sizes get distinct keys in `runtime`
        pair = tuple('HMP2_metabolomics_{}'.format(factor) if name_ == 'HMP2_metabolomics' else name_
                     for name_ in pair)
        #save output
        runtime["{}-{}".format(pair[0], pair[1])] = time_elapsed
        # persist to disk after every pair so an interrupted run keeps what finished
        with open("../results/mmvec-runtime.json", "w") as outfile:
            json.dump(runtime, outfile)

    # Key by str(factor): runtime_total is loaded from JSON, whose keys are
    # strings. A float key would sit next to the string one and be written out
    # as a duplicate key by json.dump.
    runtime_total[str(factor)] = time_factor
    print('Total time elapsed: {} secs ({} mins)'.format(round(time_factor, 2),
                                                         round(time_factor/60, 2)))
    #save the per-factor totals as json
    with open("../results/mmvec-runtime-total.json", "w") as outfile:
        json.dump(runtime_total, outfile)


### Metab Features: 15.0% ###
Metabolomics n features: 12280
('virome_virmap_analysis', 'HMP2_metabolomics')


100%|██████████| 442/442 [00:03<00:00, 134.59it/s]


Time elapsed:  25.23

('shared_meta_g_taxonomic_profiles', 'HMP2_metabolomics')


100%|██████████| 6278/6278 [00:46<00:00, 135.19it/s]


Time elapsed:  70.29

('HMP2_proteomics_ecs', 'HMP2_metabolomics')


100%|██████████| 23250/23250 [03:07<00:00, 124.17it/s]


Time elapsed:  216.97

('HMP2_metabolomics', 'virome_virmap_analysis')


 78%|███████▊  | 1662104/2120592 [5:00:03<57:46, 132.25it/s]  

Instructions for updating:
Use standard file APIs to delete files with this prefix.


100%|██████████| 2120592/2120592 [6:23:06<00:00, 92.25it/s]   


Time elapsed:  23039.25

('HMP2_metabolomics', 'shared_meta_g_taxonomic_profiles')


100%|██████████| 2120592/2120592 [5:31:40<00:00, 106.56it/s]  


Time elapsed:  19948.17

('HMP2_metabolomics', 'HMP2_proteomics_ecs')


  4%|▎         | 76444/2120592 [22:54:02<612:22:23,  1.08s/it] 


KeyboardInterrupt: 

In [11]:
# Time the one remaining pair for the 0.15 metabolomics subset and store it.
factor = 0.15
print('\n### Metab Features: {}% ###'.format(factor*100))
# replace the full metabolomics table with the 0.15 subset (shallow dict copy)
metab_table = metabolites_sub[factor]
table_to_use = tables_shared.copy()
table_to_use['HMP2_metabolomics'] = metab_table
print('Metabolomics n features: {}'.format(table_to_use['HMP2_metabolomics'].shape[0]))

pair = ('meta_t_ecs', 'HMP2_metabolomics')
print(pair)

# import both tables as QIIME 2 artifacts
t1 = table_to_use[pair[0]]
t2 = table_to_use[pair[1]]
t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

# run mmvec and measure the wall-clock time of the call
time_start = time.perf_counter()
mmvec_res = paired_omics(t1_q2, t2_q2,
                        summary_interval=1,
                        metadata=q2.Metadata(metadata_shared),
                        training_column='train_test_mmvec',
                        min_feature_count=10)
time_elapsed = (time.perf_counter() - time_start)
print('Time elapsed: ', round(time_elapsed,2))
print()

# tag the metabolomics member of the pair with the factor before saving
first_name, second_name = pair
if first_name == 'HMP2_metabolomics':
    first_name = 'HMP2_metabolomics_{}'.format(factor)
if second_name == 'HMP2_metabolomics':
    second_name = 'HMP2_metabolomics_{}'.format(factor)
pair = (first_name, second_name)

# record the timing and write the per-pair runtimes to disk
runtime["{}-{}".format(pair[0], pair[1])] = time_elapsed
with open("../results/mmvec-runtime.json", "w") as outfile:
    json.dump(runtime, outfile)


### Metab Features: 15.0% ###
Metabolomics n features: 12280
('meta_t_ecs', 'HMP2_metabolomics')


100%|██████████| 76150/76150 [28:17<00:00, 44.85it/s]  


NameError: name 'time_factor' is not defined

### Add runtimes

In [38]:
# Reload the timings written by the runs above.
# Keys of runtime_total come back as strings ('0', '0.01', ...).
with open("../results/mmvec-runtime-total.json", "r") as infile:
    runtime_total = json.loads(infile.read())

with open("../results/mmvec-runtime.json", "r") as infile:
    runtime = json.loads(infile.read())

In [30]:
## Fill in runtime_total for factor = 0.15
# The metabolomics/proteomics pair never finished. Use the elapsed time shown
# when it was interrupted, 22:54:02 (= 82442 s), as its estimate; the run was
# incomplete at that point, so this is a lower bound.
runtime['HMP2_metabolomics_0.15-HMP2_proteomics_ecs'] = 22*3600 + 54*60 + 2
# every per-pair entry that belongs to the 0.15 subset
metab_015_keys = [key_ for key_ in runtime if 'HMP2_metabolomics_0.15' in key_]
runtime_metab = {key_: runtime[key_] for key_ in metab_015_keys}
# their sum is the total for this factor
runtime_total['0.15'] = sum(runtime[key_] for key_ in metab_015_keys)

In [32]:
# runtime_total['0'] is the time taken by all pairs that exclude metabolomics.
# Add that baseline to the total of every metabolomics factor.
# NOTE: this updates runtime_total in place, so running the cell twice (or
# running it on totals that already include the baseline) adds it again.
baseline_seconds = runtime_total['0']
for key in ('0.01', '0.03', '0.05', '0.1', '0.15'):
    runtime_total[key] = runtime_total[key] + baseline_seconds

In [42]:
# Write the final per-factor totals back to disk (overwrites the existing file).
with open("../results/mmvec-runtime-total.json", "w") as outfile:
    outfile.write(json.dumps(runtime_total))

# the per-pair runtimes are not rewritten here
#with open("../results/mmvec-runtime.json", "w") as outfile: 
#  json.dump(runtime, outfile)

In [43]:
# Show the per-factor totals converted from seconds to minutes.
runtime_total_minutes = {factor_key: runtime_total[factor_key]/60
                         for factor_key in runtime_total}
runtime_total_minutes

#runtime_minutes = {k: v/60 for k, v in runtime.items()}

{'0.01': 33.79756434013381,
 '0': 21.83175702846711,
 '0.03': 82.18809817113376,
 '0.05': 177.59619504748377,
 '0.1': 819.7942999900505,
 '0.15': 2146.9860534352}