In [1]:
# Standard library
import glob
import itertools
import json
import os
import time
import warnings

# Both noise-suppression settings are applied BEFORE the third-party imports:
# TF_CPP_MIN_LOG_LEVEL is only honoured when it is set before TensorFlow is first
# imported (the mmvec plugin import below presumably pulls TensorFlow in -- TODO
# confirm), and the warnings filter can only hide import-time warnings if it is
# installed first.
#turn off tensorflow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
#turn off warnings
warnings.filterwarnings("ignore")

# Third-party
import pandas as pd
import qiime2 as q2
from biom import load_table
from qiime2.plugins.mmvec.actions import paired_omics

In [2]:
##calculate # of features needed to match percentages
def get_n_features(p_features, total_features, subtotal_features):
    """Convert percentages of the total feature count into absolute counts.

    Parameters
    ----------
    p_features : iterable of numbers
        Percentages (e.g. ``5`` for 5%) of ``total_features``.
    total_features : int
        Number of features that corresponds to 100%.
    subtotal_features : int
        Count subtracted from every total to obtain the second list.

    Returns
    -------
    (list of int, list of int)
        ``total_fts`` -- ``p`` percent of ``total_features`` for each ``p``,
        truncated to an integer; ``prot_fts`` -- the same counts minus
        ``subtotal_features``.
    """
    # Multiply first and divide by 100 last: scaling by the inexact float 0.01
    # can land just below a whole number (0.01 * 29 * 100 == 28.999999999999996),
    # which int() would then truncate to one feature too few.
    total_fts = [int(p * total_features / 100) for p in p_features]
    prot_fts = [n_total - subtotal_features for n_total in total_fts]
    return total_fts, prot_fts

In [3]:
#load the data
# NOTE(review): absolute, machine-specific location -- adjust to your local copy
# of the runtime data before running.
data_path = '/Users/bec5786/Desktop/Shenhav Lab/Joint-RPCA/Case studies/iHMP/Data/runtime/'

# Map each omics name (the .biom file name without its extension) to the file it
# came from. os.path copes with the platform's path separator and strips only the
# final extension, and each table is loaded from the globbed path itself, so a key
# can never point at a path that does not exist.
uc_biom_paths = {os.path.splitext(os.path.basename(path_))[0]: path_
                 for path_ in glob.glob('{}UC/*.biom'.format(data_path))}
if not uc_biom_paths:
    # fail here with a clear message instead of a KeyError in a later cell
    raise FileNotFoundError('No .biom tables found in {}UC/'.format(data_path))
uc_omic_keys = list(uc_biom_paths)

uc_metadata_shared = pd.read_csv('../data/uc-metadata-plus-train-tests.csv', index_col=0)
uc_tables_shared = {k_: load_table(path_) for k_, path_ in uc_biom_paths.items()}
uc_tables_shared

{'metagenomics': 3568 x 173 <class 'biom.table.Table'> with 162126 nonzero entries (26% dense),
 'metabolomics': 1928 x 173 <class 'biom.table.Table'> with 58132 nonzero entries (17% dense),
 'metaproteomics': 108080 x 173 <class 'biom.table.Table'> with 5065165 nonzero entries (27% dense)}

In [39]:
#add number of features across tables
# shape[0] is the feature (observation) count of each table
uc_n_feat = sum(table_.shape[0] for table_ in uc_tables_shared.values())

# feature count once the metaproteomics table is left out
n_prot_feat = uc_tables_shared['metaproteomics'].shape[0]
uc_n_noprot = uc_n_feat - n_prot_feat
print(uc_n_feat, uc_n_noprot)

113576 5496


In [7]:
##formatting for running MMvec
#change index name to 'sample id'
uc_metadata_shared.index.name = 'sample id'

#change train to Train and test to Test
# (every value other than 'train' is labelled 'Test')
uc_metadata_shared['train_test_mmvec'] = [
    'Train' if split_ == 'train' else 'Test'
    for split_ in uc_metadata_shared['train_test']
]

In [8]:
#create list with all possible pairs of omics
omic_keys = list(uc_tables_shared)
# each omic paired with itself (fixed order), followed by every ordered pair of
# two different omics
self_pairs = [(omic_, omic_) for omic_ in ('metagenomics', 'metabolomics', 'metaproteomics')]
cross_pairs = list(itertools.permutations(omic_keys, 2))
pairs = self_pairs + cross_pairs
pairs

[('metagenomics', 'metagenomics'),
 ('metabolomics', 'metabolomics'),
 ('metaproteomics', 'metaproteomics'),
 ('metagenomics', 'metabolomics'),
 ('metagenomics', 'metaproteomics'),
 ('metabolomics', 'metagenomics'),
 ('metabolomics', 'metaproteomics'),
 ('metaproteomics', 'metagenomics'),
 ('metaproteomics', 'metabolomics')]

In [9]:
#create list with and without proteomics
# one pass over `pairs`: a pair goes to pairs_prot as soon as either side is
# metaproteomics, otherwise to pairs_no_prot (original order is kept in both)
pairs_no_prot, pairs_prot = [], []
for pair_ in pairs:
    (pairs_prot if 'metaproteomics' in pair_ else pairs_no_prot).append(pair_)

In [54]:
#generate subsets of proteomics data
proteomics_all = uc_tables_shared['metaproteomics'].copy()

proteomics_sub = {}
proteomics_sub_nfeats = {}
for factor in [0.01, 0.03, 0.04, 0.05, 0.06]:
    #work on a copy of the full proteomics table
    prot = uc_tables_shared['metaproteomics'].copy()
    #number of features to keep: that fraction of all proteomics features, truncated
    n_features = int(factor * prot.shape[0])
    proteomics_sub_nfeats[factor] = n_features
    #the first n_features observation ids, in table order
    feature_ids = list(prot.ids(axis='observation')[:n_features])
    #keep only those features
    proteomics_sub[factor] = prot.filter(feature_ids, axis='observation')
    #sanity check
    print('Proteomics: {}% of features: {}'.format(factor*100, proteomics_sub[factor].shape[0]))

Proteomics: 1.0% of features: 1080
Proteomics: 3.0% of features: 3242
Proteomics: 4.0% of features: 4323
Proteomics: 5.0% of features: 5404
Proteomics: 6.0% of features: 6484


In [13]:
# independent one-column frame holding only the train/test labels
uc_metadata_shared_short = uc_metadata_shared[['train_test_mmvec']].copy()

In [20]:
# runtime = {}
# runtime_total = {}

In [None]:
# #run mmvec for each pair of omics without proteomics
# factor = 0
# time_factor = 0
# for pair in pairs_no_prot:  
#     print(pair)
#     #get tables and set to correct format
#     t1 = uc_tables_shared[pair[0]]
#     t2 = uc_tables_shared[pair[1]]
#     t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
#     t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

#     #run and time mmvec
#     time_start = time.perf_counter()
#     mmvec_res = paired_omics(t1_q2, t2_q2, 
#                             summary_interval=1,
#                             metadata=q2.Metadata(uc_metadata_shared_short),
#                             training_column='train_test_mmvec',
#                             min_feature_count=10)
#     time_elapsed = (time.perf_counter() - time_start)
#     time_factor += time_elapsed
#     print('Time elapsed: ', round(time_elapsed,4))
#     print()
#     #save output
#     runtime["{}-{}".format(pair[0], pair[1])] = time_elapsed
#     with open("../results/mmvec-runtime-uc.json", "w") as outfile: 
#         json.dump(runtime, outfile)

# runtime_total[factor] = time_factor
# print('Total time elapsed: ', round(time_factor,4))

# #save mmvec results as json/pickle
# with open("../results/mmvec-runtime-total-uc.json", "w") as outfile: 
#    json.dump(runtime_total, outfile)

In [None]:
# #sanity check
# print(runtime)
# print(runtime_total)

In [14]:
# #load mmvec results
# with open("../results/mmvec-runtime-total-uc.json", "r") as infile:
#     runtime_total = json.load(infile)

# with open("../results/mmvec-runtime-uc.json", "r") as infile:
#     runtime = json.load(infile)

In [None]:
# for factor in [0.01, 0.03]: #0.04 onwards is too slow
#     print('\n### Proteomic Features: {}% ###'.format(factor*100))
#     prot_table = proteomics_sub[factor]
#     table_to_use = uc_tables_shared.copy()
#     table_to_use['metaproteomics'] = prot_table
#     print('Proteomics n features: {}'.format(table_to_use['metaproteomics'].shape[0]))

#     time_factor = 0
#     for pair in pairs_prot:
#         print(pair)
        
#         #get tables and set to correct format
#         t1 = table_to_use[pair[0]]
#         t2 = table_to_use[pair[1]]
#         t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
#         t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

#         #run and time mmvec
#         time_start = time.perf_counter()
#         mmvec_res = paired_omics(t1_q2, t2_q2, 
#                                 summary_interval=1,
#                                 metadata=q2.Metadata(uc_metadata_shared_short),
#                                 training_column='train_test_mmvec',
#                                 min_feature_count=10)
#         time_elapsed = (time.perf_counter() - time_start)
#         time_factor += time_elapsed
#         print('Time elapsed: ', round(time_elapsed,2))
#         print()

#         # update pair name before saving
#         if pair[0] == 'metaproteomics':
#             pair = ('metaproteomics{}'.format(factor), pair[1])
#         if pair[1] == 'metaproteomics':
#             pair = (pair[0], 'metaproteomics{}'.format(factor))
#         #save output
#         runtime["{}-{}".format(pair[0], pair[1])] = time_elapsed
#         with open("../results/mmvec-runtime-uc.json", "w") as outfile: 
#             json.dump(runtime, outfile)
    
#     runtime_total[factor] = time_factor
#     print('Total time elapsed: {} secs ({} mins)'.format(round(time_factor, 2),
#                                                          round(time_factor/60, 2)))
    
#     #save mmvec results as json/pickle
#     with open("../results/mmvec-runtime-total-uc.json", "w") as outfile: 
#         json.dump(runtime_total, outfile)
    

In [14]:
# pairs_prot_sub = [#('metaproteomics', 'metaproteomics'),
#                     ('metagenomics', 'metaproteomics'),
#                     ('metabolomics', 'metaproteomics'),
#                     ('metaproteomics', 'metagenomics'),
#                     ('metaproteomics', 'metabolomics')]

# for factor in [0.04]:
#     print('\n### Proteomic Features: {}% ###'.format(factor*100))
#     prot_table = proteomics_sub[factor]
#     table_to_use = uc_tables_shared.copy()
#     table_to_use['metaproteomics'] = prot_table
#     print('Proteomics n features: {}'.format(table_to_use['metaproteomics'].shape[0]))

#     time_factor = 0
#     for pair in pairs_prot_sub:
#         print(pair)
        
#         #get tables and set to correct format
#         t1 = table_to_use[pair[0]]
#         t2 = table_to_use[pair[1]]
#         t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
#         t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

#         #run and time mmvec
#         time_start = time.perf_counter()
#         mmvec_res = paired_omics(t1_q2, t2_q2, 
#                                 summary_interval=1,
#                                 metadata=q2.Metadata(uc_metadata_shared_short),
#                                 training_column='train_test_mmvec',
#                                 min_feature_count=10)
#         time_elapsed = (time.perf_counter() - time_start)
#         time_factor += time_elapsed
#         print('Time elapsed: ', round(time_elapsed,2))
#         print()

#         # update pair name before saving
#         if pair[0] == 'metaproteomics':
#             pair = ('metaproteomics{}'.format(factor), pair[1])
#         if pair[1] == 'metaproteomics':
#             pair = (pair[0], 'metaproteomics{}'.format(factor))
#         #save output
#         runtime["{}-{}".format(pair[0], pair[1])] = time_elapsed
#         with open("../results/mmvec-runtime-uc.json", "w") as outfile: 
#             json.dump(runtime, outfile)
    
#     runtime_total[factor] = time_factor
#     print('Total time elapsed: {} secs ({} mins)'.format(round(time_factor, 2),
#                                                          round(time_factor/60, 2)))
    
#     #save mmvec results as json/pickle
#     with open("../results/mmvec-runtime-total-uc.json", "w") as outfile: 
#         json.dump(runtime_total, outfile)
    

In [18]:
# runtimes in seconds: one entry per omics pair, and one total per proteomics fraction
runtime_small, runtime_total_small = {}, {}

In [20]:
# omics pairs timed for every proteomics subset (commented-out pairs are skipped)
pairs_prot_sub = [#('metaproteomics', 'metaproteomics'),
                    ('metagenomics', 'metaproteomics'),
                    ('metabolomics', 'metaproteomics'),
                    #('metaproteomics', 'metagenomics'),
                    ('metaproteomics', 'metabolomics')]

for factor in [0.01, 0.03, 0.04, 0.05, 0.06]:
    print('\n### Proteomic Features: {}% ###'.format(factor*100))
    # same tables as uc_tables_shared, with metaproteomics swapped for the subset
    prot_table = proteomics_sub[factor]
    table_to_use = {**uc_tables_shared, 'metaproteomics': prot_table}
    print('Proteomics n features: {}'.format(table_to_use['metaproteomics'].shape[0]))

    # seconds accumulated over all pairs of this subset
    time_factor = 0
    for pair in pairs_prot_sub:
        print(pair)

        #look up both tables of the pair and import them as QIIME 2 artifacts
        t1, t2 = table_to_use[pair[0]], table_to_use[pair[1]]
        t1_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t1)
        t2_q2 = q2.Artifact.import_data('FeatureTable[Frequency]', t2)

        #run and time mmvec (the artifact imports above are not part of the timing)
        time_start = time.perf_counter()
        mmvec_res = paired_omics(
            t1_q2,
            t2_q2,
            summary_interval=1,
            metadata=q2.Metadata(uc_metadata_shared_short),
            training_column='train_test_mmvec',
            min_feature_count=10,
        )
        time_elapsed = time.perf_counter() - time_start
        time_factor += time_elapsed
        print('Time elapsed: {} secs ({} mins)'.format(round(time_elapsed,2),
                                                       round(time_elapsed/60, 2)))
        print()

        # tag the proteomics side of the pair with the subset fraction before saving
        prot_label = 'metaproteomics{}'.format(factor)
        pair = tuple(prot_label if omic_ == 'metaproteomics' else omic_ for omic_ in pair)
        #save output; the file is rewritten after every pair
        runtime_small['-'.join(pair)] = time_elapsed
        with open("../results/mmvec-runtime-uc-small.json", "w") as outfile:
            json.dump(runtime_small, outfile)

    runtime_total_small[factor] = time_factor
    print('Total time elapsed: {} secs ({} mins)'.format(round(time_factor, 2),
                                                         round(time_factor/60, 2)))

    #save the per-subset totals as json
    with open("../results/mmvec-runtime-total-uc-small.json", "w") as outfile:
        json.dump(runtime_total_small, outfile)
    


### Proteomic Features: 1.0% ###
Proteomics n features: 1080
('metagenomics', 'metaproteomics')


100%|██████████| 236508/236508 [09:10<00:00, 429.62it/s]


Time elapsed: 565.58 secs (9.43 mins)

('metabolomics', 'metaproteomics')


100%|██████████| 80740/80740 [01:51<00:00, 725.94it/s]


Time elapsed: 120.35 secs (2.01 mins)

('metaproteomics', 'metabolomics')


100%|██████████| 143664/143664 [05:43<00:00, 418.72it/s]


Time elapsed: 352.77 secs (5.88 mins)

Total time elapsed: 1038.7 secs (17.31 mins)

### Proteomic Features: 3.0% ###
Proteomics n features: 3242
('metagenomics', 'metaproteomics')


100%|██████████| 236508/236508 [23:46<00:00, 165.80it/s]


Time elapsed: 1450.95 secs (24.18 mins)

('metabolomics', 'metaproteomics')


100%|██████████| 80740/80740 [04:07<00:00, 325.67it/s]


Time elapsed: 263.8 secs (4.4 mins)

('metaproteomics', 'metabolomics')


100%|██████████| 311720/311720 [19:39<00:00, 264.28it/s]


Time elapsed: 1199.8 secs (20.0 mins)

Total time elapsed: 2914.55 secs (48.58 mins)

### Proteomic Features: 4.0% ###
Proteomics n features: 4323
('metagenomics', 'metaproteomics')


100%|██████████| 236508/236508 [38:35<00:00, 102.14it/s]


Time elapsed: 2343.91 secs (39.07 mins)

('metabolomics', 'metaproteomics')


100%|██████████| 80740/80740 [05:37<00:00, 239.48it/s]


Time elapsed: 356.42 secs (5.94 mins)

('metaproteomics', 'metabolomics')


100%|██████████| 379518/379518 [29:00<00:00, 218.05it/s] 


Time elapsed: 1764.06 secs (29.4 mins)

Total time elapsed: 4464.39 secs (74.41 mins)

### Proteomic Features: 5.0% ###
Proteomics n features: 5404
('metagenomics', 'metaproteomics')


100%|██████████| 236508/236508 [1:11:40<00:00, 55.00it/s]


Time elapsed: 4332.17 secs (72.2 mins)

('metabolomics', 'metaproteomics')


100%|██████████| 80740/80740 [06:32<00:00, 205.88it/s]


Time elapsed: 414.98 secs (6.92 mins)

('metaproteomics', 'metabolomics')


100%|██████████| 447748/447748 [41:24<00:00, 180.22it/s] 


Time elapsed: 2512.89 secs (41.88 mins)

Total time elapsed: 7260.05 secs (121.0 mins)

### Proteomic Features: 6.0% ###
Proteomics n features: 6484
('metagenomics', 'metaproteomics')


100%|██████████| 236508/236508 [3:53:52<00:00, 16.85it/s]   


Time elapsed: 14068.98 secs (234.48 mins)

('metabolomics', 'metaproteomics')


100%|██████████| 80740/80740 [07:59<00:00, 168.33it/s]


Time elapsed: 504.55 secs (8.41 mins)

('metaproteomics', 'metabolomics')


100%|██████████| 516396/516396 [1:02:26<00:00, 137.82it/s]


Time elapsed: 3780.14 secs (63.0 mins)

Total time elapsed: 18353.66 secs (305.89 mins)


### Add runtimes

In [25]:
#load mmvec results
with open("../results/mmvec-runtime-total-uc.json", "r") as infile:
    runtime_total = json.loads(infile.read())
# JSON object keys are strings, hence the lookup under '0' rather than 0
runtime_total_noprot = runtime_total['0']
runtime_total_noprot

# with open("../results/mmvec-runtime-uc.json", "r") as infile:
#     runtime = json.load(infile)

In [None]:
## add an estimated time for runtime_total with factor = 0.05 for ('metaproteomics', 'metaproteomics')
#100hrs = 100*60*60 = 360000
#runtime['metaproteomics0.05-metaproteomics0.05'] = 360000
#runtime

# #now, update runtime_total to include a new pair
# runtime_prot_004 = {k: v for k, v in runtime.items() if 'metaproteomics0.04' in k}
# sum(runtime_prot_004.values())
# #need to add runtime_total[0] to this sum 
# runtime_total['0.04'] += runtime_total['0']
# runtime_total
# #add to total runtime
# #runtime_total['0.05'] = sum(runtime_prot_004.values())

#now, update runtime_total to include another new pair
# runtime_prot = {k: v for k, v in runtime.items() if 'metaproteomics0.05' in k}
# #add to total runtime
# runtime_total['0.05'] = sum(runtime_prot.values())

In [31]:
#need to add runtime_total[0] to each individual runtime 
#runtime_total[0] is the time that all pairs excluding metaproteomics took to run
#print('runtime_total[0]: ', runtime_total['0'])

# Store the no-proteomics baseline under factor 0. This entry has to be created
# here: without it the loop below fails with KeyError: 0 on a fresh kernel.
runtime_total_small[0] = runtime_total_noprot

# NOTE: this cell adds the baseline in place, so run it exactly once after the
# timing loop -- running it again would add the baseline a second time.
for factor in [0.01, 0.03, 0.04, 0.05, 0.06]:
    print('runtime_total_small[{}]: '.format(factor), runtime_total_small[factor])
    runtime_total_small[factor] += runtime_total_small[0]
    print('Updated runtime_total_small[{}]: '.format(factor), runtime_total_small[factor])
    print()

runtime_total_small[0.01]:  1038.7009053920629
Updated runtime_total_small[0.01]:  4074.4403780760767

runtime_total_small[0.03]:  2914.5526906710584
Updated runtime_total_small[0.03]:  5950.292163355072

runtime_total_small[0.04]:  4464.388501600013
Updated runtime_total_small[0.04]:  7500.127974284027

runtime_total_small[0.05]:  7260.049163474003
Updated runtime_total_small[0.05]:  10295.788636158017

runtime_total_small[0.06]:  18353.66015205899
Updated runtime_total_small[0.06]:  21389.399624743004



In [33]:
# save final set of mmvec results
# (this overwrites the totals file written during the timing loop)
with open("../results/mmvec-runtime-total-uc-small.json", "w") as outfile:
    outfile.write(json.dumps(runtime_total_small))

# with open("../results/mmvec-runtime-uc.json", "w") as outfile: 
#   json.dump(runtime, outfile)

In [34]:
#print time in minutes
runtime_total_minutes = {}
for factor_, seconds_ in runtime_total_small.items():
    # stored totals are in seconds
    runtime_total_minutes[factor_] = seconds_/60
runtime_total_minutes
#runtime_minutes = {k: v/60 for k, v in runtime.items()}

{0.01: 67.90733963460129,
 0.03: 99.17153605591787,
 0.04: 125.00213290473378,
 0.05: 171.59647726930027,
 0.06: 356.48999374571673,
 0: 50.595657878066895}

In [47]:
#prepare dataframe
# one row per factor, ordered by factor; the original row labels are kept
uc_runtime_df = (
    pd.DataFrame(list(runtime_total_small.items()), columns=['factor', 'time (s)'])
    .sort_values(by='factor')
)
uc_runtime_df

Unnamed: 0,factor,time (s)
5,0.0,3035.739473
0,0.01,4074.440378
1,0.03,5950.292163
2,0.04,7500.127974
3,0.05,10295.788636
4,0.06,21389.399625


In [56]:
#add number of features
# Total features per run = non-proteomics features + proteomics features kept in
# that subset. The proteomics count is read from the filtered table itself rather
# than from the value already stored in proteomics_sub_nfeats, so re-running this
# cell gives the same numbers instead of adding uc_n_noprot again.
proteomics_sub_nfeats[0] = uc_n_noprot

for factor in [0.01, 0.03, 0.04, 0.05, 0.06]:
    proteomics_sub_nfeats[factor] = proteomics_sub[factor].shape[0] + uc_n_noprot

In [61]:
# feature count for each factor, and its share of all features in percent
n_features_by_factor = uc_runtime_df['factor'].map(proteomics_sub_nfeats)
uc_runtime_df['n features'] = n_features_by_factor
uc_runtime_df['% features'] = (uc_runtime_df['n features'] / uc_n_feat).mul(100).round(2)
uc_runtime_df['method'] = 'MMvec'
uc_runtime_df

Unnamed: 0,factor,time (s),n features,% features,method
5,0.0,3035.739473,5496,4.84,MMvec
0,0.01,4074.440378,6576,5.79,MMvec
1,0.03,5950.292163,8738,7.69,MMvec
2,0.04,7500.127974,9819,8.65,MMvec
3,0.05,10295.788636,10900,9.6,MMvec
4,0.06,21389.399625,11980,10.55,MMvec


In [62]:
#save to csv (the row index is not written)
uc_runtime_csv = '../results/uc-runtime-small.csv'
uc_runtime_df.to_csv(uc_runtime_csv, index=False)