In [3]:
import numpy as np
import pandas as pd
# import tensorflow as tf

import cudf
import cuml
from cuml.preprocessing import StandardScaler, MinMaxScaler

from cuml.decomposition import PCA
from cuml.metrics import mean_squared_error

In [3]:
metab_1_path = 'metadata/metab_df_1_processed.csv'
metab_2_path = 'metadata/metab_df_2_processed.csv'
df_1 = cudf.read_csv(metab_1_path, delimiter='\t')
df_2 = cudf.read_csv(metab_2_path, delimiter='\t')

print(f'df_1 size: {df_1.shape[0]} * {df_1.shape[1]}')
print(f'Range: [{df_1.min().min()}, {df_1.max().max()}]')
print(f'df_2 size: {df_2.shape[0]} * {df_2.shape[1]}')
print(f'Range: [{df_2.min().min()}, {df_2.max().max()}]')

df_1 size: 637 * 20628
Range: [0.0, 327369100.00000006]
df_2 size: 744 * 17354
Range: [0.0, 774804400.0]


In [4]:
# df_1 size: 637 * 20628
# Range: [0.0, 327369100.00000006]
# df_2 size: 744 * 17354
# Range: [0.0, 774804400.0]

In [5]:
# Unscaled dataset

In [6]:
reduced_dim = 128
pca_model1 = PCA(n_components=reduced_dim, svd_solver='full')
pca_model2 = PCA(n_components=reduced_dim, svd_solver='full')
df1_pca = pca_model1.fit_transform(df_1)
df2_pca = pca_model2.fit_transform(df_2)

df1_reconstructed = pca_model1.inverse_transform(df1_pca)
df2_reconstructed = pca_model2.inverse_transform(df2_pca)

In [7]:
print(f'df1_pca size: {df1_pca.shape[0]} * {df1_pca.shape[1]}')
print(f'df2_pca size: {df2_pca.shape[0]} * {df2_pca.shape[1]}')
print(f'df1_reconstructed size: {df1_reconstructed.shape[0]} * {df1_reconstructed.shape[1]}')
print(f'df2_reconstructed size: {df2_reconstructed.shape[0]} * {df2_reconstructed.shape[1]}')

df1_pca size: 637 * 128
df2_pca size: 744 * 128
df1_reconstructed size: 637 * 20628
df2_reconstructed size: 744 * 17354


In [8]:
mse_1 = mean_squared_error(df_1, df1_reconstructed)
mse_2 = mean_squared_error(df_2, df2_reconstructed)
print(f'----For {metab_1_path}:----')
print(f'Original size: {df_1.shape[0]} * {df_1.shape[1]}')
print(f'Reduced size: {df1_pca.shape[0]} * {df1_pca.shape[1]}')
print(f'Reconstructed mse: {mse_1}')

print(f'----For {metab_2_path}:----')
print(f'Original size: {df_2.shape[0]} * {df_2.shape[1]}')
print(f'Reduced size: {df2_pca.shape[0]} * {df2_pca.shape[1]}')
print(f'Reconstructed mse: {mse_2}')


----For metadata/metab_df_1_processed.csv:----
Original size: 637 * 20628
Reduced size: 637 * 128
Reconstructed mse: 224157915.67511678
----For metadata/metab_df_2_processed.csv:----
Original size: 744 * 17354
Reduced size: 744 * 128
Reconstructed mse: 5773051384.7409315


In [9]:
# MSE of Unscaled datasets with PCA
# ----For unscaled metadata/metab_df_1_processed.csv:----
# Original size: 637 * 20628
# Reduced size: 637 * 512
# Reconstructed mse: 2161562.807968613
# ----For unscaled metadata/metab_df_2_processed.csv:----
# Original size: 744 * 17354
# Reduced size: 744 * 512
# Reconstructed mse: 198845405.87344676


# ----For unscaled metadata/metab_df_1_processed.csv:----
# Original size: 637 * 20628
# Reduced size: 637 * 128
# Reconstructed mse: 224157915.67511678
# ----For unscaled metadata/metab_df_2_processed.csv:----
# Original size: 744 * 17354
# Reduced size: 744 * 128
# Reconstructed mse: 5773051384.7409315

In [10]:
# Scaled Dataset

In [11]:
scaler1 = StandardScaler()
scaler2 = StandardScaler()

scaled_1 = scaler1.fit_transform(df_1)
scaled_2 = scaler2.fit_transform(df_2)


In [12]:
print(f'scaled_1 size: {scaled_1.shape[0]} * {scaled_1.shape[1]}')
print(f'Range: [{scaled_1.min().min()}, {scaled_1.max().max()}]')
print(f'scaled_2 size: {scaled_2.shape[0]} * {scaled_2.shape[1]}')
print(f'Range: [{scaled_2.min().min()}, {scaled_2.max().max()}]')


scaled_1 size: 637 * 20628
Range: [-6.3458035046688135, 25.219040425837033]
scaled_2 size: 744 * 17354
Range: [-4.502650986590355, 27.258026340878086]


In [13]:
# scaled_1 size: 637 * 20628
# Range: [-6.3458035046688135, 25.219040425837033]
# scaled_2 size: 744 * 17354
# Range: [-4.502650986590355, 27.258026340878086]

In [14]:
reduced_dim_s = 1024
pca_scaled1 = PCA(n_components=reduced_dim_s, svd_solver='full')
pca_scaled2 = PCA(n_components=reduced_dim_s, svd_solver='full')
scaled1_pca = pca_scaled1.fit_transform(scaled_1)
scaled2_pca = pca_scaled2.fit_transform(scaled_2)

scaled1_reconstructed = pca_scaled1.inverse_transform(scaled1_pca)
scaled2_reconstructed = pca_scaled2.inverse_transform(scaled2_pca)

In [15]:
print(f'scaled1_pca size: {scaled1_pca.shape[0]} * {scaled1_pca.shape[1]}')
print(f'scaled2_pca size: {scaled2_pca.shape[0]} * {scaled2_pca.shape[1]}')
print(f'scaled1_reconstructed size: {scaled1_reconstructed.shape[0]} * {scaled1_reconstructed.shape[1]}')
print(f'scaled2_reconstructed size: {scaled2_reconstructed.shape[0]} * {scaled2_reconstructed.shape[1]}')


scaled1_pca size: 637 * 1024
scaled2_pca size: 744 * 1024
scaled1_reconstructed size: 637 * 20628
scaled2_reconstructed size: 744 * 17354


In [16]:
scaled_mse_1 = mean_squared_error(scaled_1, scaled1_reconstructed)
scaled_mse_2 = mean_squared_error(scaled_2, scaled2_reconstructed)
print(f'----For scaled {metab_1_path}:----')
print(f'Original size: {scaled_1.shape[0]} * {scaled_1.shape[1]}')
print(f'Reduced size: {scaled1_pca.shape[0]} * {scaled1_pca.shape[1]}')
print(f'Reconstructed mse: {scaled_mse_1}')

print(f'----For scaled {metab_2_path}:----')
print(f'Original size: {scaled_2.shape[0]} * {scaled_2.shape[1]}')
print(f'Reduced size: {scaled2_pca.shape[0]} * {scaled2_pca.shape[1]}')
print(f'Reconstructed mse: {scaled_mse_2}')


----For scaled metadata/metab_df_1_processed.csv:----
Original size: 637 * 20628
Reduced size: 637 * 1024
Reconstructed mse: 5.159855415010912e-29
----For scaled metadata/metab_df_2_processed.csv:----
Original size: 744 * 17354
Reduced size: 744 * 1024
Reconstructed mse: 1.012961906688683e-28


In [17]:
# ----For scaled metadata/metab_df_1_processed.csv:----
# Original size: 637 * 20628
# Reduced size: 637 * 128
# Reconstructed mse: 0.31091092994883607
# ----For scaled metadata/metab_df_2_processed.csv:----
# Original size: 744 * 17354
# Reduced size: 744 * 128
# Reconstructed mse: 0.451898048789962

# ----For scaled metadata/metab_df_1_processed.csv:----
# Original size: 637 * 20628
# Reduced size: 637 * 256
# Reconstructed mse: 0.14692968198374712
# ----For scaled metadata/metab_df_2_processed.csv:----
# Original size: 744 * 17354
# Reduced size: 744 * 256
# Reconstructed mse: 0.2537748722853918

# ----For scaled metadata/metab_df_1_processed.csv:----
# Original size: 637 * 20628
# Reduced size: 637 * 512
# Reconstructed mse: 0.01765819702708299
# ----For scaled metadata/metab_df_2_processed.csv:----
# Original size: 744 * 17354
# Reduced size: 744 * 512
# Reconstructed mse: 0.05859996044394567

In [None]:
# minmaxscaler

In [18]:
minmax1 = MinMaxScaler()
minmax2 = MinMaxScaler()

minmax_1 = minmax1.fit_transform(df_1)
minmax_2 = minmax1.fit_transform(df_2)


In [19]:
print(f'scaled_1 size: {scaled_1.shape[0]} * {scaled_1.shape[1]}')
print(f'Range: [{scaled_1.min().min()}, {scaled_1.max().max()}]')
print(f'scaled_2 size: {scaled_2.shape[0]} * {scaled_2.shape[1]}')
print(f'Range: [{scaled_2.min().min()}, {scaled_2.max().max()}]')


scaled_1 size: 637 * 20628
Range: [-6.3458035046688135, 25.219040425837033]
scaled_2 size: 744 * 17354
Range: [-4.502650986590355, 27.258026340878086]


In [20]:
reduced_dim_s = 512
pca_scaled1 = PCA(n_components=reduced_dim_s, svd_solver='full')
pca_scaled2 = PCA(n_components=reduced_dim_s, svd_solver='full')
scaled1_pca = pca_scaled1.fit_transform(scaled_1)
scaled2_pca = pca_scaled2.fit_transform(scaled_2)

scaled1_reconstructed = pca_scaled1.inverse_transform(scaled1_pca)
scaled2_reconstructed = pca_scaled2.inverse_transform(scaled2_pca)

In [21]:
print(f'scaled1_pca size: {scaled1_pca.shape[0]} * {scaled1_pca.shape[1]}')
print(f'scaled2_pca size: {scaled2_pca.shape[0]} * {scaled2_pca.shape[1]}')
print(f'scaled1_reconstructed size: {scaled1_reconstructed.shape[0]} * {scaled1_reconstructed.shape[1]}')
print(f'scaled2_reconstructed size: {scaled2_reconstructed.shape[0]} * {scaled2_reconstructed.shape[1]}')


scaled1_pca size: 637 * 512
scaled2_pca size: 744 * 512
scaled1_reconstructed size: 637 * 20628
scaled2_reconstructed size: 744 * 17354


In [22]:
scaled_mse_1 = mean_squared_error(scaled_1, scaled1_reconstructed)
scaled_mse_2 = mean_squared_error(scaled_2, scaled2_reconstructed)
print(f'----For scaled {metab_1_path}:----')
print(f'Original size: {scaled_1.shape[0]} * {scaled_1.shape[1]}')
print(f'Reduced size: {scaled1_pca.shape[0]} * {scaled1_pca.shape[1]}')
print(f'Reconstructed mse: {scaled_mse_1}')

print(f'----For scaled {metab_2_path}:----')
print(f'Original size: {scaled_2.shape[0]} * {scaled_2.shape[1]}')
print(f'Reduced size: {scaled2_pca.shape[0]} * {scaled2_pca.shape[1]}')
print(f'Reconstructed mse: {scaled_mse_2}')


----For scaled metadata/metab_df_1_processed.csv:----
Original size: 637 * 20628
Reduced size: 637 * 512
Reconstructed mse: 0.01765819702708299
----For scaled metadata/metab_df_2_processed.csv:----
Original size: 744 * 17354
Reduced size: 744 * 512
Reconstructed mse: 0.05859996044394567
