In [3]:
import numpy as np
import pandas as pd
# import tensorflow as tf

import cudf
import cupy as cp
from cuml.preprocessing import MinMaxScaler
from cuml.metrics import pairwise_distances
from skbio.stats.ordination import pcoa

In [9]:
metab_1_path = 'metadata/metab_df_1_processed.csv'
metab_2_path = 'metadata/metab_df_2_processed.csv'
df_1 = cudf.read_csv(metab_1_path, delimiter='\t')
df_2 = cudf.read_csv(metab_2_path, delimiter='\t')

# print(f'df_1 size: {df_1.shape[0]} * {df_1.shape[1]}')
# print(f'Range: [{df_1.min().min()}, {df_1.max().max()}]')
# print(f'df_2 size: {df_2.shape[0]} * {df_2.shape[1]}')
# print(f'Range: [{df_2.min().min()}, {df_2.max().max()}]')

In [10]:
scaler1 = MinMaxScaler()
scaler2 = MinMaxScaler()
scaled_1 = scaler1.fit_transform(df_1)
scaled_2 = scaler2.fit_transform(df_2)

In [11]:
# df_1 size: 637 * 20628
# Range: [0.0, 327369100.00000006]
# df_2 size: 744 * 17354
# Range: [0.0, 774804400.0]

In [13]:
np_1 = cp.array(scaled_1.to_cupy())
np_2 = cp.array(scaled_2.to_cupy())

In [17]:
dm_1 = pairwise_distances(np_1, metric='euclidean') # distance matrix
dm_2 = pairwise_distances(np_2, metric='euclidean')
dmc_1 = cp.asnumpy(dm_1)
dmc_2 = cp.asnumpy(dm_2)

In [56]:
dim = 16

pcoa_results_1 = pcoa(dmc_1)
pcoa_results_2 = pcoa(dmc_2)

reduced_1 = pcoa_results_1.samples.iloc[:, :dim]
reduced_2 = pcoa_results_2.samples.iloc[:, :dim]

print(reduced_1)
print(reduced_2)

          PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0   -3.853827  2.807799 -0.663436  0.177001  1.961296  0.062495 -2.881509   
1   -2.574948 -2.269183  0.436856  0.116021  1.353675  0.385819 -0.209899   
2   -3.127759 -0.464098  0.136176 -0.259064  0.566079  0.354377 -0.963455   
3   -2.049509 -3.185004  0.589837 -0.058671 -0.268711  0.570410 -0.211987   
4   -2.703816 -1.802039  0.494830 -0.190866 -0.241457 -0.145574 -1.014361   
..        ...       ...       ...       ...       ...       ...       ...   
632  5.126023 -0.506022 -3.526404 -4.172315  4.388537  0.814276 -0.092549   
633  5.366403 -2.652822 -2.574141 -3.724876  0.195018  0.671236  0.466915   
634  4.483825  2.847343 -4.017650 -3.875359 -0.469894  1.778612  4.192519   
635  5.833965 -2.283733 -3.426170 -5.134695 -0.134710  0.706618  0.469487   
636  2.166664  3.284238 -1.992699  1.977782  2.264224  0.579382  1.438475   

          PC8       PC9      PC10      PC11      PC12      PC13      PC14  

In [54]:
explained_variance_1 = pcoa_results_1.eigvals / pcoa_results_1.eigvals.sum()
explained_variance_2 = pcoa_results_2.eigvals / pcoa_results_2.eigvals.sum()

cumulative_explained_variance_1 = explained_variance_1.cumsum()
cumulative_explained_variance_2 = explained_variance_2.cumsum()
# print(explained_variance_1)
# print(cumulative_explained_variance_1)


In [55]:
# Top Largest EV
top = 16

top_components_1 = explained_variance_1.nlargest(top)
print(f'For {metab_1_path}, The Top {top} Principal Components by Explained Variance:')
print(top_components_1)

top_components_2 = explained_variance_2.nlargest(top)
print(f'For {metab_2_path}, The Top {top} Principal Components by Explained Variance:')
print(top_components_2)

# Greater than threshold
threshold = 0.01

important_components_1 = explained_variance_1[explained_variance_1 > threshold]
print(f'For {metab_1_path}, {important_components_1.shape[0]} Principal Components with Explained Variance > {threshold}:')
print(important_components_1)

important_components_2 = explained_variance_2[explained_variance_2 > threshold]
print(f'For {metab_2_path}, {important_components_2.shape[0]} Principal Components with Explained Variance > {threshold}:')
print(important_components_2)

For metadata/metab_df_1_processed.csv, The Top 16 Principal Components by Explained Variance:
PC1     0.082894
PC2     0.056882
PC3     0.049513
PC4     0.035418
PC5     0.023145
PC6     0.018391
PC7     0.017324
PC8     0.015534
PC9     0.013682
PC10    0.013342
PC11    0.011573
PC12    0.009650
PC13    0.009154
PC14    0.008743
PC15    0.008684
PC16    0.008234
dtype: float64
For metadata/metab_df_2_processed.csv, The Top 16 Principal Components by Explained Variance:
PC1     0.079195
PC2     0.060589
PC3     0.048925
PC4     0.032643
PC5     0.018776
PC6     0.016173
PC7     0.014081
PC8     0.012265
PC9     0.011748
PC10    0.010753
PC11    0.009385
PC12    0.008538
PC13    0.008033
PC14    0.007885
PC15    0.007328
PC16    0.006940
dtype: float64
For metadata/metab_df_1_processed.csv, 11 Principal Components with Explained Variance > 0.01:
PC1     0.082894
PC2     0.056882
PC3     0.049513
PC4     0.035418
PC5     0.023145
PC6     0.018391
PC7     0.017324
PC8     0.015534
PC9    

In [None]:

# dim = 5
# For metadata/metab_df_1_processed.csv, The Top 5 Principal Components by Explained Variance:
# PC1    0.082894
# PC2    0.056882
# PC3    0.049513
# PC4    0.035418
# PC5    0.023145
# dtype: float64
# For metadata/metab_df_2_processed.csv, The Top 5 Principal Components by Explained Variance:
# PC1    0.079195
# PC2    0.060589
# PC3    0.048925
# PC4    0.032643
# PC5    0.018776
# dtype: float64
# For metadata/metab_df_1_processed.csv, 11 Principal Components with Explained Variance > 0.01:
# PC1     0.082894
# PC2     0.056882
# PC3     0.049513
# PC4     0.035418
# PC5     0.023145
# PC6     0.018391
# PC7     0.017324
# PC8     0.015534
# PC9     0.013682
# PC10    0.013342
# PC11    0.011573
# dtype: float64
# For metadata/metab_df_2_processed.csv, 10 Principal Components with Explained Variance > 0.01:
# PC1     0.079195
# PC2     0.060589
# PC3     0.048925
# PC4     0.032643
# PC5     0.018776
# PC6     0.016173
# PC7     0.014081
# PC8     0.012265
# PC9     0.011748
# PC10    0.010753
# dtype: float64

In [57]:
from scipy.spatial.distance import pdist, squareform
from scipy.stats import spearmanr

reduced_distance_matrix_1 = squareform(pdist(reduced_1, metric='euclidean'))
reduced_distance_matrix_2 = squareform(pdist(reduced_2, metric='euclidean'))

correlation_1, _1 = spearmanr(dmc_1.ravel(), reduced_distance_matrix_1.ravel())
correlation_2, _2 = spearmanr(dmc_2.ravel(), reduced_distance_matrix_2.ravel())

print("For Dataset1: Spearman correlation between original and reduced distances:", correlation_1)
print("For Dataset2: Spearman correlation between original and reduced distances:", correlation_2)


For Dataset1: Spearman correlation between original and reduced distances: 0.8336962736491317
For Dataset2: Spearman correlation between original and reduced distances: 0.7175488983720534


In [48]:
# dim = 5
# For Dataset1: Spearman correlation between original and reduced distances: 0.7871718717197851
# For Dataset2: Spearman correlation between original and reduced distances: 0.6362350053099822

# dim = 16
# For Dataset1: Spearman correlation between original and reduced distances: 0.8336962736491317
# For Dataset2: Spearman correlation between original and reduced distances: 0.7175488983720534