In [1]:
import glob
import os
import time
import warnings

import pandas as pd
from biom import load_table
from gemelli.rpca import joint_rpca

#turn off warnings
warnings.filterwarnings("ignore")

In [2]:
#root folder of the runtime benchmark tables (contains the iHMP/ and UC/ folders).
#Set the JOINT_RPCA_RUNTIME_DATA environment variable to use another location;
#the original location is kept as the default so existing runs are unchanged.
#os.path.join(..., '') guarantees the trailing separator that the
#'{}iHMP/...'.format(data_path) patterns in the cells below rely on.
data_path = os.path.join(
    os.environ.get('JOINT_RPCA_RUNTIME_DATA',
                   '/Users/bec5786/Desktop/Shenhav Lab/Joint-RPCA/Case studies/iHMP/Data/runtime/'),
    '')

##calculate # of features needed to match percentages
def get_n_features(p_features, total_features, subtotal_features):
    """Convert percentages of the total feature count into feature counts.

    Parameters
    ----------
    p_features : iterable of numbers
        Percentages (e.g. 30 for 30%) of ``total_features`` to reach.
    total_features : int
        Number of features across all tables.
    subtotal_features : int
        Number of features in the tables that are always kept whole.

    Returns
    -------
    tuple of (list of int, list of int)
        First list: ``int(0.01 * p * total_features)`` for every percentage
        (truncated towards zero). Second list: the same counts minus
        ``subtotal_features``, i.e. what is left for the table being subset.
    """
    total_fts, remaining_fts = [], []
    for pct in p_features:
        n_total = int(0.01 * pct * total_features)
        total_fts.append(n_total)
        remaining_fts.append(n_total - subtotal_features)
    return total_fts, remaining_fts

### iHMP Cohort

In [3]:
#load the data: one table per .biom file found in the iHMP folder,
#keyed by the file name up to its first '.'
omic_keys = [biom_path_.rsplit('/', 1)[-1].partition('.')[0]
             for biom_path_ in glob.glob('{}iHMP/*.biom'.format(data_path))]

ihmp_tables_shared = dict((key_, load_table('{}iHMP/{}.biom'.format(data_path, key_)))
                          for key_ in omic_keys)
ihmp_metadata_shared = pd.read_csv('../data/ihmp-sample-metadata-plus-train-tests.csv', index_col=0)
ihmp_tables_shared

{'virome_virmap_analysis': 260 x 135 <class 'biom.table.Table'> with 505 nonzero entries (1% dense),
 'meta_t_ecs': 8291 x 135 <class 'biom.table.Table'> with 59744 nonzero entries (5% dense),
 'shared_meta_g_taxonomic_profiles': 181 x 135 <class 'biom.table.Table'> with 4465 nonzero entries (18% dense),
 'HMP2_proteomics_ecs': 830 x 135 <class 'biom.table.Table'> with 15760 nonzero entries (14% dense),
 'HMP2_metabolomics': 81867 x 135 <class 'biom.table.Table'> with 6886507 nonzero entries (62% dense)}

In [4]:
#metabolomics is the omic being subset, so count the features across
#every omic and across the non-metabolomics omics only
total_features = sum(ihmp_tables_shared[omic].shape[0] for omic in omic_keys)
subtotal_features = total_features - ihmp_tables_shared['HMP2_metabolomics'].shape[0]

print(total_features, subtotal_features)

91429 9562


In [5]:
#generate subsets of metabolomics data
#metab_all_fts collects the total number of features (all omics) of each run;
#its first entry is the count without any metabolomics features
metab_all_fts = [subtotal_features]
#metabolites_sub maps each fraction of metabolomics features to its subset table
metabolites_sub = {}
for factor in [0.01, 0.03, 0.05, 0.1, 0.15]: 
    #get metabolomics table (work on a copy of the full table)
    metab = ihmp_tables_shared['HMP2_metabolomics'].copy()
    #number of features to keep: factor is a fraction of the metabolomics
    #features, truncated to an integer
    n_features = int(metab.shape[0]*factor)
    metab_all_fts.append(subtotal_features+n_features)
    #select first n features
    feature_ids = metab.ids(axis='observation')[:n_features]
    #filter table
    metabolites_sub[factor] = metab.filter(feature_ids, axis='observation')
    #sanity check
    print('Metabolomics: {}% of features: {}'.format(factor*100, metabolites_sub[factor].shape[0]))
    print('Total features: {}'.format(subtotal_features+n_features))

Metabolomics: 1.0% of features: 818
Total features: 10380
Metabolomics: 3.0% of features: 2456
Total features: 12018
Metabolomics: 5.0% of features: 4093
Total features: 13655
Metabolomics: 10.0% of features: 8186
Total features: 17748
Metabolomics: 15.0% of features: 12280
Total features: 21842


In [7]:
#percentages of the total feature count (all omics combined) to benchmark;
#get_n_features turns each into the total number of features and the number
#of metabolomics features needed on top of the other omics
p_features = [30, 35, 40, 100]
metab_total_n, metab_n_lst = get_n_features(p_features, total_features, subtotal_features)
metab_n_lst

[17866, 22438, 27009, 81867]

In [8]:
#save list with all features in each run
#NOTE(review): metab_all_fts is rebuilt from its own previous value, so running
#this cell more than once appends metab_total_n again; re-run the cell that
#initialises metab_all_fts before re-running this one
metab_all_fts = metab_all_fts + metab_total_n
print(metab_all_fts)

[9562, 10380, 12018, 13655, 17748, 21842, 27428, 32000, 36571, 91429]


In [9]:
#add metabolomics subsets sized so that each run reaches the requested
#percentage of the total features across all omics
for factor, metab_n in zip(p_features, metab_n_lst): 
    #get metabolomics table
    metab = ihmp_tables_shared['HMP2_metabolomics'].copy()
    #select first n features
    feature_ids = metab.ids(axis='observation')[:metab_n]
    #filter table
    #the percentage is turned into a fraction before being used as the key,
    #matching the fractional keys already stored in metabolites_sub
    factor = factor/100
    metabolites_sub[factor] = metab.filter(feature_ids, axis='observation')
    #sanity check
    #NOTE(review): the percentage printed here is relative to the total features
    #of all omics, while the count printed is the number of metabolomics features
    print('Metabolomics: {}% of features: {}'.format(factor*100, metabolites_sub[factor].shape[0]))

Metabolomics: 30.0% of features: 17866
Metabolomics: 35.0% of features: 22438
Metabolomics: 40.0% of features: 27009
Metabolomics: 100.0% of features: 81867


In [10]:
#run times measured for Joint-RPCA, filled in by the timing loop
runtime_joint_rpca = dict()

#every omic except metabolomics: the tables common to all runs
#(the same table objects as in ihmp_tables_shared, not copies)
tables_no_metab = dict(ihmp_tables_shared)
tables_no_metab.pop('HMP2_metabolomics', None)
tables_no_metab

{'virome_virmap_analysis': 260 x 135 <class 'biom.table.Table'> with 505 nonzero entries (1% dense),
 'meta_t_ecs': 8291 x 135 <class 'biom.table.Table'> with 59744 nonzero entries (5% dense),
 'shared_meta_g_taxonomic_profiles': 181 x 135 <class 'biom.table.Table'> with 4465 nonzero entries (18% dense),
 'HMP2_proteomics_ecs': 830 x 135 <class 'biom.table.Table'> with 15760 nonzero entries (14% dense)}

In [11]:
#time Joint-RPCA without metabolomics (factor 0) and with every metabolomics
#subset stored in metabolites_sub; elapsed seconds are stored in
#runtime_joint_rpca under the key 'Joint-RPCA-<factor>'
for factor in [0, 0.01, 0.03, 0.05, 0.1, 
               0.15, 0.3, 0.35, 0.4, 1]:

    if factor == 0:
        print("No metabolomics")
        tables_use = tables_no_metab.copy()
    else:
        tables_use = ihmp_tables_shared.copy()
        tables_use['HMP2_metabolomics'] = metabolites_sub[factor].copy()
        print("{} factor with {} metabolites".format(factor, tables_use['HMP2_metabolomics'].shape[0]))

    #copy the input tables BEFORE starting the clock, so the measured time
    #covers only the joint_rpca call and not the preparation of its inputs
    tables_input = [t.copy() for t in tables_use.values()]

    time_start = time.perf_counter()
    ord_, dist_, cv_plt = joint_rpca(tables_input,
                                      sample_metadata=ihmp_metadata_shared,
                                      train_test_column='train_test',
                                      min_feature_frequency=0,
                                      min_sample_count=0,
                                      min_feature_count=0,
                                      n_test_samples=0,
                                      max_iterations=5)
    time_elapsed = (time.perf_counter() - time_start)
    print('Time elapsed: {} secs'.format(round(time_elapsed,2)))
    print()
    #only the run time is kept; ord_, dist_ and cv_plt are not written to disk
    runtime_joint_rpca['Joint-RPCA-{}'.format(factor)] = time_elapsed

No metabolomics
Time elapsed: 1.22 secs

0.01 factor with 818 metabolites
Time elapsed: 1.28 secs

0.03 factor with 2456 metabolites
Time elapsed: 1.84 secs

0.05 factor with 4093 metabolites
Time elapsed: 1.93 secs

0.1 factor with 8186 metabolites
Time elapsed: 3.46 secs

0.15 factor with 12280 metabolites
Time elapsed: 4.32 secs

0.3 factor with 17866 metabolites
Time elapsed: 6.03 secs

0.35 factor with 22438 metabolites
Time elapsed: 7.7 secs

0.4 factor with 27009 metabolites
Time elapsed: 9.1 secs

1 factor with 81867 metabolites
Time elapsed: 29.14 secs



In [12]:
#create df to store the run time: one row per run, with the factor parsed
#back from the 'Joint-RPCA-<factor>' label and the total features of the run
#(metab_all_fts must be in the same order as the runs)
runtime_df = (
    pd.DataFrame.from_dict(runtime_joint_rpca, orient='index', columns=['time (s)'])
    .assign(**{'factor': lambda frame_: [float(label_.rsplit('-', 1)[-1])
                                         for label_ in frame_.index],
               'method': 'Joint-RPCA',
               'n features': metab_all_fts})
    .reset_index(drop=True)
)
runtime_df.to_csv('../results/ihmp-joint-rpca-runtime.csv')
runtime_df

Unnamed: 0,time (s),factor,method,n features
0,1.223706,0.0,Joint-RPCA,9562
1,1.28367,0.01,Joint-RPCA,10380
2,1.837933,0.03,Joint-RPCA,12018
3,1.932425,0.05,Joint-RPCA,13655
4,3.457007,0.1,Joint-RPCA,17748
5,4.316535,0.15,Joint-RPCA,21842
6,6.031864,0.3,Joint-RPCA,27428
7,7.704982,0.35,Joint-RPCA,32000
8,9.098438,0.4,Joint-RPCA,36571
9,29.136795,1.0,Joint-RPCA,91429


### UC Cohort

In [35]:
#load the data: one table per .biom file found in the UC folder,
#keyed by the file name up to its first '.'
uc_omic_keys = [biom_path_.rsplit('/', 1)[-1].partition('.')[0]
                for biom_path_ in glob.glob('{}UC/*.biom'.format(data_path))]

uc_metadata_shared = pd.read_csv('../data/uc-metadata-plus-train-tests.csv', index_col=0)
uc_tables_shared = dict((key_, load_table('{}UC/{}.biom'.format(data_path, key_)))
                        for key_ in uc_omic_keys)
uc_tables_shared

{'metagenomics': 3568 x 173 <class 'biom.table.Table'> with 162126 nonzero entries (26% dense),
 'metabolomics': 1928 x 173 <class 'biom.table.Table'> with 58132 nonzero entries (17% dense),
 'metaproteomics': 108080 x 173 <class 'biom.table.Table'> with 5065165 nonzero entries (27% dense)}

In [36]:
#number of features across all UC tables, and across all but metaproteomics
uc_n_feat = sum(table_.shape[0] for table_ in uc_tables_shared.values())
uc_subtotal = uc_n_feat - uc_tables_shared['metaproteomics'].shape[0]

print(uc_n_feat, uc_subtotal)

113576 5496


In [37]:
#load mmvec results to get % features
mmvec_ihmp = pd.read_csv('../results/ihmp-runtime.csv', index_col=0)
#keep only the rows whose index label contains 'MMvec'
mmvec_ihmp = mmvec_ihmp[mmvec_ihmp.index.str.contains('MMvec')]
#values of the '% features' column for those rows, as an array
perct = mmvec_ihmp['% features'].values
perct

array([10.46, 11.35, 13.14, 14.94, 19.41, 23.89])

In [38]:
#percentages benchmarked for UC: the ones read from the MMvec results
#followed by four larger ones
all_factors = [*perct, 30, 35, 40, 100]

#per run: total feature count and the part left for metaproteomics
uc_total_n, uc_prot_n = get_n_features(all_factors, uc_n_feat, uc_subtotal)
print(uc_total_n, uc_prot_n, sep='\n')

[11880, 12890, 14923, 16968, 22045, 27133, 34072, 39751, 45430, 113576]
[6384, 7394, 9427, 11472, 16549, 21637, 28576, 34255, 39934, 108080]


In [39]:
#generate subsets of the metaproteomics data
#protein_sub maps each requested number of features to its subset table
protein_sub = {}
for n in uc_prot_n: 
    #get a copy of the metaproteomics table
    prot = uc_tables_shared['metaproteomics'].copy()
    #select first n features
    feature_ids = list(prot.ids(axis='observation'))[:n]
    #filter table
    protein_sub[n] = prot.filter(feature_ids, axis='observation')
    #sanity check
    print('Proteomics: {}'.format(protein_sub[n].shape[0]))

Proteomics: 6384
Proteomics: 7394
Proteomics: 9427
Proteomics: 11472
Proteomics: 16549
Proteomics: 21637
Proteomics: 28576
Proteomics: 34255
Proteomics: 39934
Proteomics: 108080


In [40]:
#run times measured for Joint-RPCA on the UC cohort, keyed by % of features
runtime_uc_joint_rpca = {}
table_to_use = uc_tables_shared.copy()
#copy of the full metaproteomics table (not used further in this cell)
prot_table_all = uc_tables_shared['metaproteomics'].copy()

#run joint-rpca
#the loop variable is deliberately NOT called `perct`: that name holds the
#array of MMvec percentages used to build all_factors, and overwriting it
#here would break a re-run of the cell that builds all_factors
for n, pct_features in zip(uc_prot_n, all_factors):

    print("{}% of features".format(pct_features))
    table_to_use['metaproteomics'] = protein_sub[n]
    #sanity check
    print('{} Metaproteomics features used'.format(table_to_use['metaproteomics'].shape[0]))

    #copy the input tables BEFORE starting the clock, so the measured time
    #covers only the joint_rpca call and not the preparation of its inputs
    tables_input = [t.copy() for t in table_to_use.values()]

    time_start = time.perf_counter()
    ord_, dist_, cv_plt = joint_rpca(tables_input,
                                    sample_metadata=uc_metadata_shared,
                                    train_test_column=None,
                                    min_feature_frequency=0,
                                    min_sample_count=0,
                                    min_feature_count=0,
                                    n_test_samples=0,
                                    max_iterations=5)
    time_elapsed = (time.perf_counter() - time_start)
    print('Time elapsed: {} secs'.format(round(time_elapsed,2)))
    print()
    runtime_uc_joint_rpca[pct_features] = time_elapsed

runtime_uc_joint_rpca

10.46% of features
6384 Metaproteomics features used
Time elapsed: 4.6 secs

11.35% of features
7394 Metaproteomics features used
Time elapsed: 4.73 secs

13.14% of features
9427 Metaproteomics features used
Time elapsed: 5.83 secs

14.94% of features
11472 Metaproteomics features used
Time elapsed: 6.15 secs

19.41% of features
16549 Metaproteomics features used
Time elapsed: 8.3 secs

23.89% of features
21637 Metaproteomics features used
Time elapsed: 9.97 secs

30% of features
28576 Metaproteomics features used
Time elapsed: 13.2 secs

35% of features
34255 Metaproteomics features used
Time elapsed: 16.92 secs

40% of features
39934 Metaproteomics features used
Time elapsed: 17.2 secs

100% of features
108080 Metaproteomics features used
Time elapsed: 67.66 secs



{10.46: 4.598135105999972,
 11.35: 4.7300303259999055,
 13.14: 5.828596809000146,
 14.94: 6.154360853000071,
 19.41: 8.300841738000145,
 23.89: 9.971473732999812,
 30: 13.201322094000034,
 35: 16.923151209000025,
 40: 17.19847063300017,
 100: 67.66099839999993}

In [42]:
#create df to store the run time: one row per run, with the % of features
#taken from the dictionary keys and the total features of the run
#(uc_total_n must be in the same order as the runs)
uc_runtime_df = (
    pd.DataFrame.from_dict(runtime_uc_joint_rpca, orient='index', columns=['time (s)'])
    .assign(**{'% features': lambda frame_: list(map(float, frame_.index)),
               'n features': uc_total_n,
               'method': 'Joint-RPCA'})
    .reset_index(drop=True)
)
uc_runtime_df.to_csv('../results/uc-joint-rpca-runtime.csv')
uc_runtime_df

Unnamed: 0,time (s),% features,n features,method
0,4.598135,10.46,11880,Joint-RPCA
1,4.73003,11.35,12890,Joint-RPCA
2,5.828597,13.14,14923,Joint-RPCA
3,6.154361,14.94,16968,Joint-RPCA
4,8.300842,19.41,22045,Joint-RPCA
5,9.971474,23.89,27133,Joint-RPCA
6,13.201322,30.0,34072,Joint-RPCA
7,16.923151,35.0,39751,Joint-RPCA
8,17.198471,40.0,45430,Joint-RPCA
9,67.660998,100.0,113576,Joint-RPCA
