In [None]:
import numpy as np
import pandas as pd
import biom
import zipfile
import os
from biom import Table
from gemelli.rpca import rpca
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [5]:
meta_path = "./redbiom_adrc_wolr2_fecal_v2.tsv"
meta = pd.read_csv(meta_path, sep='\t')

print(meta.shape)
print(meta.head(5))


(13436, 7)
                  #SampleID  qiita_study_id  host_age host_age_units  \
0  12675.FIT076E3CAL.170122           12675     40.00          years   
1  12142.820081881.2.159050           12142     77.05          years   
2  12142.820036321.4.158997           12142     60.80          years   
3  12142.820023241.6.158977           12142     70.49          years   
4       11666.G0281R.164064           11666     61.00          years   

  host_body_site diagnosis apoe  
0   UBERON:feces       NaN  NaN  
1   UBERON:feces       NaN  NaN  
2   UBERON:feces       NaN  NaN  
3   UBERON:feces       NaN  NaN  
4   UBERON:feces       NaN  NaN  


In [6]:
zip_file_path = './redbiom_adrc_wolr2_fecal_v2.biom.zip'

extract_to_path = './' 
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)

extracted_files = os.listdir(extract_to_path)
print(f"Extracted files: {extracted_files}")

biom_file = None
for file in extracted_files:
    if file.endswith('.biom'):
        biom_file = os.path.join(extract_to_path, file)
        break

if biom_file:
    print(f"Loading .biom file: {biom_file}")
    
    table = biom.load_table(biom_file)
    
    biom_df = pd.DataFrame(table.matrix_data.toarray(), 
                      index=table.ids(axis='observation'), 
                      columns=table.ids(axis='sample'))
    
    print(biom_df.head())
else:
    print("No .biom file found in the extracted files.")


Extracted files: ['redbiom_adrc_wolr2_fecal_v2.tsv', '.ipynb_checkpoints', 'redbiom.ipynb', 'redbiom_adrc_wolr2_fecal_v2.biom', 'redbiom_adrc_wolr2_fecal_v2.biom.zip', '__MACOSX', 'redbiom_pca_cuml.ipynb']
Loading .biom file: ./redbiom_adrc_wolr2_fecal_v2.biom
            10283.JS.1.14.2015.160409  10283.LS.2.23.2015.159711  \
G000005825                        1.0                        0.0   
G000006175                        0.0                        0.0   
G000006605                        0.0                        0.0   
G000006725                        0.0                        0.0   
G000006745                        0.0                        2.0   

            11129.NH001.JJK.St.160942  11129.NH002.JJK.St.160942  \
G000005825                        0.0                        0.0   
G000006175                        0.0                        0.0   
G000006605                        8.0                        1.0   
G000006725                        0.0                     

In [7]:
# biom_df.shape # features, samples(15363, 13436)

In [8]:
def classify_feature(row):
    unique_values = row.unique()
    if len(unique_values) == 2 and set(unique_values).issubset({0, 1}):
        return 'binary'
    else:
        return 'numerical'

biom_df['feature_type'] = biom_df.apply(classify_feature, axis=1)

binary_features = biom_df[biom_df['feature_type'] == 'binary']
numerical_features = biom_df[biom_df['feature_type'] == 'numerical']

binary_features = binary_features.drop(columns=['feature_type'])
numerical_features = numerical_features.drop(columns=['feature_type'])

print(f"Number of binary features: {binary_features.shape[0]}")
print(f"Number of numerical features: {numerical_features.shape[0]}")


Number of binary features: 1568
Number of numerical features: 13795


In [9]:
scaled_d = numerical_features.copy()
scaler = MinMaxScaler()

scaled_d[numerical_features.columns] = scaler.fit_transform(numerical_features)
scaled = scaler.fit_transform(numerical_features.T).T

# print("Scaled Numerical Dataset:")
# print(scaled.head())
# print(scaled.shape) # (13795, 13436)

In [10]:
import time

def rpca_fr(scaled, numerical_features, dim):
    start_time = time.time()
    print(start_time)
    
    print("Building the table...")
    scaled += 1e-10
    sample_ids = numerical_features.columns.tolist()  # Sample IDs (columns)
    feature_ids = numerical_features.index.tolist()   # Feature IDs (rows)
    table_scaled = Table(scaled, feature_ids, sample_ids)

    print("Running rpca...")
    rpca_results = rpca(table_scaled, n_components=dim)

    print("Generating results...")
    ordination, distance = rpca_results
    sample_scores = ordination.samples  # Scores for samples
    feature_scores = ordination.features  # Scores for features
    
    X_reconstructed = np.dot(sample_scores, feature_scores.T)
    
    mse = mean_squared_error(scaled.T, X_reconstructed)
    
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time: {execution_time} seconds")

    print(f"Reconstruction MSE: {mse}")
    
    print(f"Sample scores shape: {sample_scores.shape}")
    print(f"Feature scores shape: {feature_scores.shape}")
    print(f"Original scaled data shape: {scaled.T.shape}")
    print(f"Reconstructed data shape: {X_reconstructed.shape}")

    return sample_scores

In [None]:
reduced_data = rpca_fr(scaled, numerical_features, 64)

1727551371.1404054
Building the table...
Running rpca...


In [None]:
def get_X_y(reduced, y):
    combined_df = combine_df(reduced, y)
    cleaned_df = combined_df.dropna(subset=['host_age'])
    X = cleaned_df.drop(columns=['host_age'])
    y = cleaned_df['host_age']
    return X, y

def training(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Linear Regression
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    
    y_pred = lr_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Linear Regression MSE: {mse}")

    # Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    y_pred_rf = rf_model.predict(X_test)
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    print(f"Random Forest MSE: {mse_rf}")

    # MLP
    mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=5000, random_state=42)
    mlp_model.fit(X_train, y_train)
    
    y_pred_mlp = mlp_model.predict(X_test)
    mse_mlp = mean_squared_error(y_test, y_pred_mlp)
    print(f"MLP Regressor MSE: {mse_mlp}")

    # XGBoost
    xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    xgb_model.fit(X_train, y_train)
    
    y_pred_xgb = xgb_model.predict(X_test)
    mse_xgb = mean_squared_error(y_test, y_pred_xgb)
    print(f"XGBoost MSE: {mse_xgb}")

In [None]:
cat = 'host_age'
num_data = numerical_features.T
reduced_df = combine_df(reduced_data, binary_features.T)
reduced_df.index = num_data.index
reduced_df.index.name = '#SampleID'
meta_df = meta.set_index('#SampleID')

merged_df = pd.merge(reduced_df, meta_df[cat], left_index=True, right_index=True, how='inner')

# print(merged_df.head())
# print(merged_df.shape)

cleaned_df = merged_df.dropna(subset=['host_age'])
f_cleaned_df = merged_df.dropna()
# print(f"Original dataset shape: {merged_df.shape}")
# print(f"Cleaned dataset shape: {cleaned_df.shape}")
# Original dataset shape: (1312, 4)
# Cleaned dataset shape: (1301, 4)

print(cleaned_df.shape)
print(f_cleaned_df.shape)
X = cleaned_df.drop(columns=['host_age'])
Y = cleaned_df['host_age']
training(X, Y)

In [None]:
def classify_feature(row):
    unique_values = row.unique()
    if len(unique_values) == 2 and set(unique_values).issubset({0, 1}):
        return 'binary'
    else:
        return 'numerical'
        
        
def rpca_fr(path, dim):
    df = pd.read_csv(path, delimiter='\t')

    df['feature_type'] = df.apply(classify_feature, axis=1)
    binary_features = df[df['feature_type'] == 'binary']
    numerical_features = df[df['feature_type'] == 'numerical']
    binary_features = binary_features.drop(columns=['feature_type'])
    numerical_features = numerical_features.drop(columns=['feature_type'])
    print(f'Number of binary features: {binary_features.shape[0]}')
    print(f'Number of numerical features: {numerical_features.shape[0]}')

    scaled = numerical_features.copy()
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(numerical_features)

    scaled += 1e-10
    sample_ids = numerical_features.index.tolist()  # Sample IDs (rows)
    feature_ids = numerical_features.columns.tolist()   # Feature IDs (columns)
    table_scaled = Table(scaled.T, feature_ids, sample_ids)

    rpca_results = rpca(table_scaled, n_components=dim)

    ordination, distance = rpca_results
    sample_scores = ordination.samples  # Scores for samples
    feature_scores = ordination.features  # Scores for features
    
    X_reconstructed = np.dot(sample_scores, feature_scores.T)
    
    mse = mean_squared_error(scaled, X_reconstructed)
    print(f'Reconstruction MSE: {mse}')
    
    print(f'Sample scores shape: {sample_scores.shape}')
    print(f'Feature scores shape: {feature_scores.shape}')
    print(f'Original scaled data shape: {scaled.T.shape}')
    print(f'Reconstructed data shape: {X_reconstructed.shape}\n')

    reduced_df = pd.DataFrame(sample_scores, index=sample_ids)
    return reduced_df

In [None]:
def pca_fr(path, dim, svd='full'):
    df = pd.read_csv(path, delimiter='\t')

    df['feature_type'] = df.apply(classify_feature, axis=1)
    binary_features = df[df['feature_type'] == 'binary']
    numerical_features = df[df['feature_type'] == 'numerical']
    binary_features = binary_features.drop(columns=['feature_type'])
    numerical_features = numerical_features.drop(columns=['feature_type'])
    print(f'Number of binary features: {binary_features.shape[0]}')
    print(f'Number of numerical features: {numerical_features.shape[0]}')

    scaled = numerical_features.copy()
    scaler = StandardScaler()
    scaled = scaler.fit_transform(numerical_features)

    pca = PCA(n_components=dim, svd_solver=svd)
    reduced_df = pca.fit_transform(scaled)
    X_reconstructed = pca.inverse_transform(reduced_df)
    
    mse = mean_squared_error(scaled, X_reconstructed)
    print(f'Reconstruction MSE: {mse}')
    
    print(f'Reduced data shape: {reduced_df.shape}')    
    print(f'Original scaled data shape: {scaled.shape}')
    print(f'Reconstructed data shape: {X_reconstructed.shape}\n')

    return reduced_df

In [None]:
def combine_df(df1, df2):
    df1 = pd.DataFrame(df1)
    df1 = df1.reset_index(drop=True)
    df2 = df2.reset_index(drop=True)
    return pd.concat([df1, df2], axis=1)

In [None]:
def get_X_y(reduced, y):
    combined_df = combine_df(reduced, y)
    cleaned_df = combined_df.dropna(subset=['host_age'])
    X = cleaned_df.drop(columns=['host_age'])
    y = cleaned_df['host_age']
    return X, y

In [None]:
def training(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Linear Regression
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    
    y_pred = lr_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Linear Regression MSE: {mse}")

    # Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    y_pred_rf = rf_model.predict(X_test)
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    print(f"Random Forest MSE: {mse_rf}")

    # MLP
    '''/opt/conda/lib/python3.11/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: 
    ConvergenceWarning: Stochastic Optimizer: 
    Maximum iterations (3000) reached and the optimization hasn't converged yet.
    warnings.warn(
    '''
    mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=4000, random_state=42)
    mlp_model.fit(X_train, y_train)
    
    y_pred_mlp = mlp_model.predict(X_test)
    mse_mlp = mean_squared_error(y_test, y_pred_mlp)
    print(f"MLP Regressor MSE: {mse_mlp}")

    # XGBoost
    xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    xgb_model.fit(X_train, y_train)
    
    y_pred_xgb = xgb_model.predict(X_test)
    mse_xgb = mean_squared_error(y_test, y_pred_xgb)
    print(f"XGBoost MSE: {mse_xgb}")

In [None]:
host_age_1 = meta['host_age'][:611]
host_age_2 = meta['host_age'][611:]

In [None]:
def get_result(path, bi_path, rpca_dim, pca_dim, y, svd='full'):
    print(f'--------RPCA Results--------')
    rpca_df = rpca_fr(path, rpca_dim)

    print(f'For Numerical Dataset Only')
    X_rpca_nu, Y_rpca_nu = get_X_y(rpca_df, y)
    training(X_rpca_nu, Y_rpca_nu)
    
    print(f'For Both Numerical And Binary Dataset')
    df = pd.read_csv(bi_path, delimiter='\t')
    bi = df.astype(int)
    combined_rpca = combine_df(rpca_df, bi)
    X_rpca, Y_rpca = get_X_y(combined_rpca, y)
    training(X_rpca, Y_rpca)

    print(f'--------PCA Results--------')
    pca_df = pca_fr(path, pca_dim, svd)
    
    print(f'For Numerical Dataset Only')
    X_pca_nu, Y_pca_nu = get_X_y(pca_df, y)
    training(X_pca_nu, Y_pca_nu)
    
    print(f'For Both Numerical And Binary Dataset')
    df = pd.read_csv(bi_path, delimiter='\t')
    bi = df.astype(int)
    # pca_df = pd.DataFrame(pca_df)
    combined_pca = combine_df(pca_df, bi)
    X_pca, Y_pca = get_X_y(combined_pca, y)
    X_pca.columns = X_pca.columns.astype(str)
    training(X_pca, Y_pca)

In [None]:
# Data 1:
# RPCA Dim = 3
# PCA Dim = 128
get_result(metab1_path, metab3_path, 3, 128, host_age_1)

In [None]:
# Data 1:
# RPCA Dim = 10
# PCA Dim = 256
get_result(metab1_path, metab3_path, 10, 256, host_age_1)

In [None]:
# Data 1:
# RPCA Dim = 16
# PCA Dim = 512
get_result(metab1_path, metab3_path, 16, 512, host_age_1)

In [None]:
# Data 2:
# RPCA Dim = 3
# PCA Dim = 128
get_result(metab2_path, metab4_path, 3, 128, host_age_2)

In [None]:
# Data 2:
# RPCA Dim = 10
# PCA Dim = 256
get_result(metab2_path, metab4_path, 10, 256, host_age_2)

In [None]:
# Data 2:
# RPCA Dim = 16
# PCA Dim = 512
get_result(metab2_path, metab4_path, 16, 512, host_age_2)

In [None]:
# Data 1: RPCA Dim = 128
get_result(metab1_path, metab3_path, 128, 1, host_age_1)

In [None]:
# Data 2: RPCA Dim = 128
get_result(metab2_path, metab4_path, 128, 1, host_age_2)