In [1]:
import pandas as pd
import json 
import glob
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from biom import load_table

#turn off warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Discover the .biom tables under ../data; each key is the file name up to its first '.'.
biom_paths = glob.glob('../data/*.biom')
omic_keys = []
for biom_path in biom_paths:
    file_name = biom_path.split('/')[-1]
    omic_keys.append(file_name.split('.')[0])

# Load every table under its key, then the metadata CSV (first column becomes the index).
tables_shared = {}
for omic_key in omic_keys:
    tables_shared[omic_key] = load_table('../data/{}.biom'.format(omic_key))
metadata_shared = pd.read_csv('../data/sample-metadata-plus-train-tests-case-study.csv', index_col=0)

In [None]:
# Both runtime tables end up with the same two column labels.
runtime_columns = ['method', 'time (s)']

# Joint-RPCA runtimes: read the CSV and overwrite its column names.
joint_rpca_runtime = pd.read_csv('../results/joint-rpca-runtime.csv')
joint_rpca_runtime.columns = runtime_columns

# MMvec runtimes: the JSON is loaded as a mapping, and each (key, value) pair
# becomes one row of the frame.
with open('../results/mmvec-runtime-total.json') as runtime_file:
    mmvec_runtime = json.load(runtime_file)
mmvec_runtime_df = pd.DataFrame(list(mmvec_runtime.items()), columns=runtime_columns)

In [None]:
# Move the second row to the front (row order 1, 0, 2), then renumber rows from 0.
# NOTE: this rebinds mmvec_runtime_df from itself, so running the cell a second
# time swaps the first two rows back again.
row_order = [1, 0, 2]
mmvec_runtime_df = mmvec_runtime_df.reindex(row_order).reset_index(drop=True)
mmvec_runtime_df

In [None]:
# Stack the MMvec runtimes underneath the Joint-RPCA runtimes.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index=True renumbers the rows 0..n-1 (the job reset_index did before).
runtime_df = pd.concat([joint_rpca_runtime, mmvec_runtime_df], ignore_index=True)

# Overwrite the per-run labels with one method name per source table; the counts
# come from the tables themselves so the labels always line up with their rows.
runtime_df['method'] = (['Joint-RPCA'] * len(joint_rpca_runtime)
                        + ['MMvec'] * len(mmvec_runtime_df))
# Metabolomics setting of each run, in row order. This list is hard-coded for
# 4 Joint-RPCA rows followed by 3 MMvec rows; pandas raises ValueError if the
# number of rows differs.
# NOTE(review): presumably the percentage of metabolite features kept - confirm.
runtime_df['metabolomics'] = ['0%', '1%', '2%', '100%', '0%', '1%', '2%']
runtime_df

In [None]:
# Keep every table except the one stored under 'HMP2_metabolomics'.
tables_nometab = {}
for name, table in tables_shared.items():
    if name == 'HMP2_metabolomics':
        continue
    tables_nometab[name] = table

# Total of shape[0] (the first table dimension) across the remaining tables.
nometab_nfeats = sum(table.shape[0] for table in tables_nometab.values())

# Totals keyed by metabolomics fraction; key 0 is the total without metabolomics.
metab_nfeats = {0: nometab_nfeats}

In [None]:
# Same total as above, but with the metabolomics table swapped for a subset.
# NOTE(review): `metabolites_sub` is not defined in any cell visible in this
# notebook; on a fresh kernel this loop raises NameError. It is presumably a
# mapping from fraction (0.01, 0.02) to a subsampled metabolomics table - confirm
# and add the cell that builds it.
for factor in [0.01, 0.02]:
    table_touse = {**tables_shared, 'HMP2_metabolomics': metabolites_sub[factor]}
    metab_nfeats[factor] = sum(table.shape[0] for table in table_touse.values())

# Key 1: every table in tables_shared, untouched.
metab_nfeats[1] = sum(table.shape[0] for table in tables_shared.values())

metab_nfeats

In [None]:
# Feature count for each row of runtime_df, in row order: four values for the
# Joint-RPCA rows, then the first three again for the MMvec rows.
# NOTE(review): these literals look like a copy of the metab_nfeats totals
# computed earlier - confirm they are still in sync with the data.
joint_rpca_nfeats = [9562, 10380, 11199, 91429]
mmvec_nfeats = joint_rpca_nfeats[:3]
runtime_df['n_features'] = joint_rpca_nfeats + mmvec_nfeats
runtime_df

In [None]:
# Work on a copy so runtime_df itself is not modified.
runtime_test = runtime_df.copy()
# Add an MMvec row for the 100% metabolomics setting with no measured runtime.
# - The missing runtime is float('nan') rather than None: None is a non-numeric
#   object and can turn the 'time (s)' column into object dtype.
# - The new row label is len(runtime_test), i.e. the next position after the
#   existing 0..n-1 labels, instead of a hard-coded 7.
runtime_test.loc[len(runtime_test)] = ['MMvec', float('nan'), '100%', 91429]
runtime_test

In [None]:
# Two-line tick label per row: feature count on top, metabolomics setting below.
label_template = '{}\n{} Metabs'
runtime_test['x-label'] = list(map(label_template.format,
                                   runtime_test['n_features'],
                                   runtime_test['metabolomics']))
runtime_test

In [None]:
# Plot runtime against feature count, one line per method.

# x position of each row = its position within its own method (0, 1, 2, ...),
# so both methods share the same equally spaced x positions.
runtime_test['index'] = runtime_test.groupby('method').cumcount()

fig, ax = plt.subplots(figsize=(9, 5))
sns.lineplot(x="index", y="time (s)", hue="method",
             style="method", markers=True, data=runtime_test, ax=ax)

# One tick and one label per distinct x position. Passing the whole 'index'
# column would hand set_xticks duplicated positions, and set_xticklabels raises
# ValueError when the number of labels differs from the number of fixed ticks.
tick_table = runtime_test.drop_duplicates(subset='index').sort_values('index')
ax.set_xticks(tick_table['index'].tolist())
ax.set_xticklabels(tick_table['x-label'].tolist())
# Log-scale y axis.
ax.set_yscale('log')

ax.set_title('Runtime vs Number of Features', fontsize=18)
ax.tick_params(axis='both', labelsize=14)
ax.set_ylabel('Time (s)', fontsize=16)
ax.set_xlabel('\nNumber of Features', fontsize=16)
# Legend anchored outside the axes, at the upper-right corner.
ax.legend(loc='upper left', bbox_to_anchor=(1, 1), fontsize=14)
ax.grid(False)
fig.tight_layout()
plt.show()

In [None]:
# nicer_labels = {'HMP2_proteomics_ecs': 'Proteomics',
#                 'shared_meta_g_taxonomic_profiles': 'MetaG(taxonomy)',
#                 'virome_virmap_analysis': 'Virome',
#                 'meta_t_ecs': 'MetaT(pathway)',
#                 'joint-rpca': 'Joint-RPCA'}

# #replace index with nicer labels
# for label in n_features_df.index:
#     if label != 'joint-rpca':
#         omic1, omic2 = label.split('-')
#         n_features_df = n_features_df.rename(index={label: "{} + \n{}".format(nicer_labels[omic1], nicer_labels[omic2])})
#     else:
#         n_features_df = n_features_df.rename(index={label: nicer_labels[label]})