In [1]:
import pandas as pd
import glob

from skbio import OrdinationResults
from scipy.stats import kruskal
from gemelli.rpca import joint_rpca
from biom import load_table

#plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the shared sample metadata, then every other file in the matched-data
# folder as a table keyed by its file name up to the first dot.
metadata = pd.read_csv('../data/split-matched-data/metadata.tsv', sep='\t', index_col=0)

tables = {}
for table_path in glob.glob('../data/split-matched-data/*'):
    # anything with 'metadata' in its path is not an omics table
    if 'metadata' in table_path:
        continue
    file_name = table_path.split('/')[-1]
    table_id = file_name.split('.')[0]
    tables[table_id] = load_table(table_path)
tables

{'metabolite': 2333 x 374 <class 'biom.table.Table'> with 104825 nonzero entries (12% dense),
 '16S': 14237 x 374 <class 'biom.table.Table'> with 318875 nonzero entries (5% dense),
 'mag': 257 x 374 <class 'biom.table.Table'> with 58498 nonzero entries (60% dense),
 '18S': 5473 x 374 <class 'biom.table.Table'> with 114755 nonzero entries (5% dense),
 'gene_module': 377 x 374 <class 'biom.table.Table'> with 117093 nonzero entries (83% dense),
 'gene': 2457 x 374 <class 'biom.table.Table'> with 901890 nonzero entries (98% dense),
 'metabolomics': 2333 x 374 <class 'biom.table.Table'> with 104825 nonzero entries (12% dense)}

In [3]:
# Sanity check: report the metadata dimensions, then preview the first rows.
metadata_shape = metadata.shape
print(metadata_shape)
display(metadata.head())

(374, 16)


Unnamed: 0_level_0,season,subjects,add_0c,add_0c_group,facility,timepoint,traintest_0,traintest_1,traintest_2,traintest_3,traintest_4,traintest_5,traintest_6,traintest_7,traintest_8,traintest_9
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
soil.hip.CMU.17.10.2017.08.11.day3,summer,17-10,49.75,early,FIRS,3,train,train,train,train,train,train,train,train,train,train
soil.hip.CMU.17.10.2017.08.22.day14,summer,17-10,322.8,advanced,FIRS,14,test,test,test,test,test,test,test,test,test,test
soil.hip.CMU.17.10.2017.08.28.day20,summer,17-10,477.8,advanced,FIRS,20,test,test,test,test,test,test,test,test,test,test
soil.hip.CMU.17.10.2017.08.09.day1,summer,17-10,0.0,initial,FIRS,1,train,train,train,train,train,train,train,train,train,train
soil.hip.CMU.17.10.2017.08.23.day15,summer,17-10,348.05,advanced,FIRS,15,train,train,train,train,train,train,train,train,train,train


## independent RPCA

In [4]:
# Number of principal components; the Kruskal-Wallis cells loop over range(n_PCS).
# NOTE(review): this value is reassigned further down (to 3 when the independent
# results are loaded and to 4 when the joint results are loaded), so the effective
# value depends on which cell ran last.
n_PCS = 4

In [6]:
# # ONLY RUN ON FIRST ITERATION
# # check independent analysis
# fold = 0
# rpca_independent = {tblid:joint_rpca([tbl], n_components=n_PCS, max_iterations=15,
#                                      min_feature_count=10, sample_metadata=metadata,
#                                      train_test_column='traintest_%i' % fold)
#                     for tblid, tbl in tables.items()}

# cvs_ind = {(fold, datatype_):cv_tmp_dt for datatype_, (_, _, cv_tmp_dt) in rpca_independent.items()}
# cvs_ind_all = pd.concat(cvs_ind).reset_index().rename({'level_0':'fold', 'level_1':'modality'}, axis=1)
# sns.pointplot(x='iteration', y='mean_CV', hue='modality', data=cvs_ind_all)
# plt.show()

In [7]:
# # save results
# for tblid, (ord_, dist_, _) in rpca_independent.items():    
#    ord_.write('../results/joint-rpca/Ordinations/{}_ord_{}PCs.txt'.format(tblid, n_PCS))
#    dist_.write('../results/joint-rpca/Ordinations/{}_dist{}PCs.txt'.format(tblid, n_PCS))

In [9]:
# Load the per-omic ordinations written by a prior run.
omic_ids = ['metabolite', '16S', 'mag', '18S', 'gene_module', 'gene', 'metabolomics']
n_PCS = 3

rpca_independent = {}
for omic in omic_ids:
    ord_path = '../results/joint-rpca/Original/{}_ord.txt'.format(omic)
    # alternative input: '../results/joint-rpca/Ordinations/{}_ord_4PCs.txt'.format(omic)
    # Each entry is a 3-tuple with the ordination first; the distance-matrix and
    # CV slots are left as None because the analysis here does not need them.
    rpca_independent[omic] = (OrdinationResults.read(ord_path), None, None)

### Kruskal Wallis

In [10]:
# Kruskal-Wallis test of the first n_PCS components of every single-omic
# ordination against each metadata category of interest.
# Result: kw_df_all, rows "<table id>_PC<k>", columns (category, H-statistic / p-value).
cats_of_interest = ['facility', 'season', 'add_0c_group']
kw_df_all = []

for cat_of_interest in cats_of_interest:
    # Per-table results are collected in a list and concatenated once, rather
    # than re-copying a growing DataFrame on every iteration.
    kw_tables_cat = []

    for tblid, (ord_, _, _) in rpca_independent.items():
        # Name components by position (PC1, PC2, ...) for however many the
        # ordination holds, so n_PCS is not capped by a fixed 4-entry mapping.
        pc_names = {col: "PC%i" % (pos + 1)
                    for pos, col in enumerate(ord_.samples.columns)}
        ord_samples = ord_.samples.rename(columns=pc_names)
        ord_plt = pd.concat([ord_samples, metadata], axis=1, sort=True)
        #drop rows with nan values
        ord_plt = ord_plt.dropna()

        kw_h = []
        kw_p = []
        df_index = []
        for i in range(n_PCS):
            pc_label = "PC%i" % (i + 1)
            # one Series of PC scores per level of the category
            groups = [group for _, group in ord_plt.groupby(cat_of_interest)[pc_label]]
            # Perform the Kruskal-Wallis test
            h, p = kruskal(*groups)
            kw_h.append(h)
            kw_p.append(p)
            df_index.append("{}_PC{}".format(tblid, i + 1))

        kw_df_i = pd.DataFrame({'H-statistic': kw_h, 'p-value': kw_p},
                               index=df_index)
        kw_tables_cat.append(kw_df_i)

    kw_df_cat = pd.concat(kw_tables_cat)
    kw_df_all.append(kw_df_cat)

# side-by-side blocks, one per category, under a two-level column header
kw_df_all = pd.concat(kw_df_all, axis=1, keys=cats_of_interest)

In [11]:
#save results
# The output path is a fixed filename with no '{}' placeholders (it is not
# parameterized by n_PCS), so it is passed as a plain string.
kw_df_all.to_csv('../results/joint-rpca/Tables/individual_rpca_kw_original.csv')
kw_df_all

Unnamed: 0_level_0,facility,facility,season,season,add_0c_group,add_0c_group
Unnamed: 0_level_1,H-statistic,p-value,H-statistic,p-value,H-statistic,p-value
metabolite_PC1,15.581948,0.0004134499,72.80238,1.071578e-15,11.382883,0.009825812
metabolite_PC2,26.909432,1.434468e-06,20.862341,0.0001124383,5.808234,0.1213221
metabolite_PC3,14.244524,0.0008069395,56.185442,3.835053e-12,3.135028,0.3712715
16S_PC1,162.725928,4.618635e-36,45.709887,6.536971e-10,6.824725,0.07769924
16S_PC2,81.750833,1.77024e-18,113.979531,1.526772e-24,61.835421,2.382474e-13
16S_PC3,138.333824,9.14517e-31,38.418103,2.305239e-08,37.798484,3.118213e-08
mag_PC1,219.41848,2.2588239999999997e-48,30.852523,9.130475e-07,41.911506,4.189415e-09
mag_PC2,274.409683,2.5863930000000003e-60,44.087142,1.446207e-09,17.860424,0.0004699862
mag_PC3,126.523778,3.3551410000000003e-28,21.859008,6.978906e-05,64.774044,5.606498e-14
18S_PC1,91.983107,1.0619939999999999e-20,138.466259,8.093738e-30,0.371468,0.9460715


# joint-rpca

In [12]:
# # ONLY RUN ON FIRST ITERATION
# cv_all_joint = {}
# fold = 0
# ord_jnt, dist_jnt, cv_jnt = joint_rpca([v for k, v in tables.items()
#                                         if k in ['mag', '18S', 'gene_module', 'gene', 'metabolomics']],
#                                         n_components=n_PCS,
#                                         max_iterations=15,
#                                         min_feature_count=10,
#                                         sample_metadata=metadata,
#                                         train_test_column='traintest_%i' % fold)
# cv_all_joint[0] = cv_jnt
# joint_rpca_results = {'joint': (ord_jnt, None, None)}

# # check CV error
# cv_all_joint_df = pd.concat(cv_all_joint).reset_index().rename({'level_0':'fold'}, axis=1)
# for f, df_ in cv_all_joint_df.groupby('fold'):
#     plt.errorbar(df_.iteration, df_['mean_CV'], yerr=df_['std_CV'], label='fold %i' % (f + 1))
# plt.legend()
# plt.show()

In [12]:
# #save results
# ord_jnt.write('../results/joint-rpca/Ordinations/joint_ord_{}PCs.txt'.format(n_PCS))
# dist_jnt.write('../results/joint-rpca/Ordinations/joint_dist_{}PCs.txt'.format(n_PCS))
# cv_jnt.to_csv('../results/joint-rpca/Ordinations/joint_cv_{}PCs.txt'.format(n_PCS))

In [15]:
# load results from prior run
joint_ord = OrdinationResults.read('../results/joint-rpca/Original/ord.txt')
# same 3-tuple layout as the independent results; only the ordination slot is filled
joint_rpca_results = {'joint': (joint_ord, None, None)}
n_PCS = 4
#joint_rpca_results = {'joint': (OrdinationResults.read('../results/joint-rpca/Ordinations/joint_ord_4PCs.txt'), None, None) }

### Kruskal Wallis

In [14]:
# Attach the metadata to the joint-RPCA sample loadings.
ord_jnt = joint_rpca_results['joint'][0]

# Move the sample ids out of the index into a 'sample_name' column so they can
# be used as the merge key; rename_axis/reset_index return new frames, leaving
# ord_jnt.samples untouched.
ord_joint_samps = ord_jnt.samples.rename_axis('sample_name').reset_index()

merged_table = pd.merge(ord_joint_samps, metadata, on='sample_name')
merged_table.head()

Unnamed: 0,sample_name,0,1,2,3,season,subjects,add_0c,add_0c_group,facility,...,traintest_0,traintest_1,traintest_2,traintest_3,traintest_4,traintest_5,traintest_6,traintest_7,traintest_8,traintest_9
0,soil.hip.SHSU.2016.024.2016.05.03.day19,-0.098698,-0.10951,0.005658,-0.010724,spring,shsu.2016.024,409.166667,advanced,STAFS,...,train,train,train,train,train,train,train,train,train,train
1,soil.hip.SHSU.2016.076.2016.11.26.day5,0.107789,-0.012081,-0.069731,-0.07509,fall,2016.076,63.6,active,STAFS,...,train,train,train,train,train,train,train,train,train,train
2,soil.hip.CMU.17.10.2017.08.28.day20,-0.003416,-0.064283,-0.025962,0.11218,summer,17-10,477.8,advanced,FIRS,...,test,test,test,test,test,test,test,test,test,test
3,soil.hip.SHSU.2016.024.2016.04.20.day6,-0.097067,-0.094667,-0.027954,-0.055932,spring,shsu.2016.024,111.666667,active,STAFS,...,train,train,train,train,train,train,train,train,train,train
4,soil.hip.UTK.K016.01.2016.04.16.day12,-0.09108,-0.072832,0.071708,0.001341,spring,UTK.K016.01,141.666667,active,ARF,...,train,train,train,train,train,train,train,train,train,train


In [17]:
# Kruskal-Wallis test of each joint-RPCA component against each metadata
# category of interest.
# Result: kw_df_joint, rows PC1..PC<n_PCS>, columns (category, H-statistic / p-value).
cats_of_interest = ['facility', 'season', 'add_0c_group']
pc_labels = [f'PC{i+1}' for i in range(n_PCS)]
kw_df_joint = []

for cat_of_interest in cats_of_interest:
    kw_h = []
    kw_p = []

    for i in range(n_PCS):
        # merged_table keeps the ordination's integer column labels (0, 1, ...),
        # so the component is selected by its integer label.
        grouped = merged_table.groupby(cat_of_interest)[i]
        # one Series of PC scores per level of the category
        groups = [group for _, group in grouped]

        # Perform the Kruskal-Wallis test
        h, p = kruskal(*groups)
        kw_h.append(h)
        kw_p.append(p)

    kw_joint_cat_df = pd.DataFrame({'H-statistic': kw_h, 'p-value': kw_p},
                                   index=pc_labels)
    kw_df_joint.append(kw_joint_cat_df)

# side-by-side blocks, one per category, under a two-level column header
kw_df_joint = pd.concat(kw_df_joint, axis=1, keys=cats_of_interest)

In [18]:
#save results
# The output path is a fixed filename with no '{}' placeholders (it is not
# parameterized by n_PCS), so it is passed as a plain string.
kw_df_joint.to_csv('../results/joint-rpca/Tables/joint_rpca_kw_original.csv')
kw_df_joint

Unnamed: 0_level_0,facility,facility,season,season,add_0c_group,add_0c_group
Unnamed: 0_level_1,H-statistic,p-value,H-statistic,p-value,H-statistic,p-value
PC1,14.146955,0.0008472816,218.48692,4.263785e-47,8.577836,0.03546368
PC2,63.879984,1.344738e-14,83.797975,4.70057e-18,111.845888,4.395985e-24
PC3,280.164284,1.45579e-61,6.056139,0.1089107,13.292691,0.004044559
PC4,267.377664,8.703182e-59,12.179555,0.006792722,9.09039,0.02811293
