In [1]:
import numpy as np
import pandas as pd
import biom
from biom import Table
from gemelli.rpca import rpca
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [3]:
meta_path = './adrc_full_metadata.csv'
meta = pd.read_csv(meta_path)

metab1_path = './../v2_dataset/metab_df_1_processed.csv'
metab2_path = './../v2_dataset/metab_df_2_processed.csv'
metab3_path = './../v2_dataset/metab_drugs_df_1_processed.csv'
metab4_path = './../v2_dataset/metab_drugs_df_2_processed.csv'


In [4]:
# print(df.shape)
# print(df.head(5))

# print(df2.shape)
# print(df2.head(5))

# print(df3.shape)
# print(df3.head(5))

# print(df4.shape)
# print(df4.head(5))

In [5]:
def classify_feature(row):
    unique_values = row.unique()
    if len(unique_values) == 2 and set(unique_values).issubset({0, 1}):
        return 'binary'
    else:
        return 'numerical'
        
        
def rpca_fr(path, dim):
    df = pd.read_csv(path, delimiter='\t')

    df['feature_type'] = df.apply(classify_feature, axis=1)
    binary_features = df[df['feature_type'] == 'binary']
    numerical_features = df[df['feature_type'] == 'numerical']
    binary_features = binary_features.drop(columns=['feature_type'])
    numerical_features = numerical_features.drop(columns=['feature_type'])
    print(f'Number of binary features: {binary_features.shape[0]}')
    print(f'Number of numerical features: {numerical_features.shape[0]}')

    scaled = numerical_features.copy()
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(numerical_features)

    scaled += 1e-10
    sample_ids = numerical_features.index.tolist()  # Sample IDs (rows)
    feature_ids = numerical_features.columns.tolist()   # Feature IDs (columns)
    table_scaled = Table(scaled.T, feature_ids, sample_ids)

    rpca_results = rpca(table_scaled, n_components=dim)

    ordination, distance = rpca_results
    sample_scores = ordination.samples  # Scores for samples
    feature_scores = ordination.features  # Scores for features
    
    X_reconstructed = np.dot(sample_scores, feature_scores.T)
    
    mse = mean_squared_error(scaled, X_reconstructed)
    print(f'Reconstruction MSE: {mse}')
    
    print(f'Sample scores shape: {sample_scores.shape}')
    print(f'Feature scores shape: {feature_scores.shape}')
    print(f'Original scaled data shape: {scaled.T.shape}')
    print(f'Reconstructed data shape: {X_reconstructed.shape}\n')

    reduced_df = pd.DataFrame(sample_scores, index=sample_ids)
    return reduced_df

In [6]:
def pca_fr(path, dim, svd='full'):
    df = pd.read_csv(path, delimiter='\t')

    df['feature_type'] = df.apply(classify_feature, axis=1)
    binary_features = df[df['feature_type'] == 'binary']
    numerical_features = df[df['feature_type'] == 'numerical']
    binary_features = binary_features.drop(columns=['feature_type'])
    numerical_features = numerical_features.drop(columns=['feature_type'])
    print(f'Number of binary features: {binary_features.shape[0]}')
    print(f'Number of numerical features: {numerical_features.shape[0]}')

    scaled = numerical_features.copy()
    scaler = StandardScaler()
    scaled = scaler.fit_transform(numerical_features)

    pca = PCA(n_components=dim, svd_solver=svd)
    reduced_df = pca.fit_transform(scaled)
    X_reconstructed = pca.inverse_transform(reduced_df)
    
    mse = mean_squared_error(scaled, X_reconstructed)
    print(f'Reconstruction MSE: {mse}')
    
    print(f'Reduced data shape: {reduced_df.shape}')    
    print(f'Original scaled data shape: {scaled.shape}')
    print(f'Reconstructed data shape: {X_reconstructed.shape}\n')

    return reduced_df

In [7]:
def combine_df(df1, df2):
    df1 = pd.DataFrame(df1)
    df1 = df1.reset_index(drop=True)
    df2 = df2.reset_index(drop=True)
    return pd.concat([df1, df2], axis=1)

In [8]:
def get_X_y(reduced, y):
    combined_df = combine_df(reduced, y)
    cleaned_df = combined_df.dropna(subset=['host_age'])
    X = cleaned_df.drop(columns=['host_age'])
    y = cleaned_df['host_age']
    return X, y

In [9]:
def training(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Linear Regression
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    
    y_pred = lr_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Linear Regression MSE: {mse}")

    # Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    y_pred_rf = rf_model.predict(X_test)
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    print(f"Random Forest MSE: {mse_rf}")

    # MLP
    '''/opt/conda/lib/python3.11/site-packages/sklearn/neural_network/_multilayer_perceptron.py:691: 
    ConvergenceWarning: Stochastic Optimizer: 
    Maximum iterations (3000) reached and the optimization hasn't converged yet.
    warnings.warn(
    '''
    mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=4000, random_state=42)
    mlp_model.fit(X_train, y_train)
    
    y_pred_mlp = mlp_model.predict(X_test)
    mse_mlp = mean_squared_error(y_test, y_pred_mlp)
    print(f"MLP Regressor MSE: {mse_mlp}")

    # XGBoost
    xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    xgb_model.fit(X_train, y_train)
    
    y_pred_xgb = xgb_model.predict(X_test)
    mse_xgb = mean_squared_error(y_test, y_pred_xgb)
    print(f"XGBoost MSE: {mse_xgb}")

In [10]:
host_age_1 = meta['host_age'][:611]
host_age_2 = meta['host_age'][611:]

In [13]:
def get_result(path, bi_path, rpca_dim, pca_dim, y, svd='full'):
    print(f'--------RPCA Results--------')
    rpca_df = rpca_fr(path, rpca_dim)

    print(f'For Numerical Dataset Only')
    X_rpca_nu, Y_rpca_nu = get_X_y(rpca_df, y)
    training(X_rpca_nu, Y_rpca_nu)
    
    print(f'For Both Numerical And Binary Dataset')
    df = pd.read_csv(bi_path, delimiter='\t')
    bi = df.astype(int)
    combined_rpca = combine_df(rpca_df, bi)
    X_rpca, Y_rpca = get_X_y(combined_rpca, y)
    training(X_rpca, Y_rpca)

    print(f'--------PCA Results--------')
    pca_df = pca_fr(path, pca_dim, svd)
    
    print(f'For Numerical Dataset Only')
    X_pca_nu, Y_pca_nu = get_X_y(pca_df, y)
    training(X_pca_nu, Y_pca_nu)
    
    print(f'For Both Numerical And Binary Dataset')
    df = pd.read_csv(bi_path, delimiter='\t')
    bi = df.astype(int)
    # pca_df = pd.DataFrame(pca_df)
    combined_pca = combine_df(pca_df, bi)
    X_pca, Y_pca = get_X_y(combined_pca, y)
    X_pca.columns = X_pca.columns.astype(str)
    training(X_pca, Y_pca)

In [14]:
# Data 1:
# RPCA Dim = 3
# PCA Dim = 128
get_result(metab1_path, metab3_path, 3, 128, host_age_1)

--------RPCA Results--------
Number of binary features: 0
Number of numerical features: 611
Reconstruction MSE: 0.012501460682039531
Sample scores shape: (611, 3)
Feature scores shape: (20628, 3)
Original scaled data shape: (20628, 611)
Reconstructed data shape: (611, 20628)

For Numerical Dataset Only
Linear Regression MSE: 46.51356591171915
Random Forest MSE: 53.10572601626015
MLP Regressor MSE: 46.57046178347206
XGBoost MSE: 55.23687578651714
For Both Numerical And Binary Dataset
Linear Regression MSE: 2.0670389960185925e+23
Random Forest MSE: 50.723239837398395
MLP Regressor MSE: 161.16620473368565
XGBoost MSE: 58.60295667583456
--------PCA Results--------
Number of binary features: 0
Number of numerical features: 611
Reconstruction MSE: 0.30418474524385936
Reduced data shape: (611, 128)
Original scaled data shape: (611, 20628)
Reconstructed data shape: (611, 20628)

For Numerical Dataset Only
Linear Regression MSE: 310.81368949552785
Random Forest MSE: 48.94675447154472
MLP Regres

In [15]:
# Data 1:
# RPCA Dim = 10
# PCA Dim = 256
get_result(metab1_path, metab3_path, 10, 256, host_age_1)

--------RPCA Results--------
Number of binary features: 0
Number of numerical features: 611
Reconstruction MSE: 0.012475447633089843
Sample scores shape: (611, 10)
Feature scores shape: (20628, 10)
Original scaled data shape: (20628, 611)
Reconstructed data shape: (611, 20628)

For Numerical Dataset Only
Linear Regression MSE: 48.36572072065762
Random Forest MSE: 53.34101219512195
MLP Regressor MSE: 48.26453371342069
XGBoost MSE: 63.55266072613479
For Both Numerical And Binary Dataset
Linear Regression MSE: 3.3465879416889438e+25
Random Forest MSE: 50.49903414634147
MLP Regressor MSE: 143.40681692537234
XGBoost MSE: 60.129498314400266
--------PCA Results--------
Number of binary features: 0
Number of numerical features: 611
Reconstruction MSE: 0.13926215572094966
Reduced data shape: (611, 256)
Original scaled data shape: (611, 20628)
Reconstructed data shape: (611, 20628)

For Numerical Dataset Only
Linear Regression MSE: 4215.367316451802
Random Forest MSE: 48.52167723577236
MLP Regre

In [16]:
# Data 1:
# RPCA Dim = 16
# PCA Dim = 512
get_result(metab1_path, metab3_path, 16, 512, host_age_1)

--------RPCA Results--------
Number of binary features: 0
Number of numerical features: 611
Reconstruction MSE: 0.01246277297743487
Sample scores shape: (611, 16)
Feature scores shape: (20628, 16)
Original scaled data shape: (20628, 611)
Reconstructed data shape: (611, 20628)

For Numerical Dataset Only
Linear Regression MSE: 49.21721527726441
Random Forest MSE: 50.49815934959351
MLP Regressor MSE: 49.099668863659716
XGBoost MSE: 54.07871717560423
For Both Numerical And Binary Dataset
Linear Regression MSE: 6.843520123376235e+23
Random Forest MSE: 49.439541463414635
MLP Regressor MSE: 157.97884534963035
XGBoost MSE: 52.505496468165475
--------PCA Results--------
Number of binary features: 0
Number of numerical features: 611
Reconstruction MSE: 0.012952282020246434
Reduced data shape: (611, 512)
Original scaled data shape: (611, 20628)
Reconstructed data shape: (611, 20628)

For Numerical Dataset Only
Linear Regression MSE: 72224.23663194376
Random Forest MSE: 49.20252845528456
MLP Regr

In [17]:
# Data 2:
# RPCA Dim = 3
# PCA Dim = 128
get_result(metab2_path, metab4_path, 3, 128, host_age_2)

--------RPCA Results--------
Number of binary features: 0
Number of numerical features: 701
Reconstruction MSE: 0.009809758083317488
Sample scores shape: (701, 3)
Feature scores shape: (17354, 3)
Original scaled data shape: (17354, 701)
Reconstructed data shape: (701, 17354)

For Numerical Dataset Only
Linear Regression MSE: 64.89636150347302
Random Forest MSE: 78.51629130434783
MLP Regressor MSE: 64.94867495905055
XGBoost MSE: 94.56859406491328
For Both Numerical And Binary Dataset
Linear Regression MSE: 1.7489068384534355e+17
Random Forest MSE: 73.03219782608694




MLP Regressor MSE: 172.96033931483916
XGBoost MSE: 88.75558652196365
--------PCA Results--------
Number of binary features: 0
Number of numerical features: 701
Reconstruction MSE: 0.44225927060364
Reduced data shape: (701, 128)
Original scaled data shape: (701, 17354)
Reconstructed data shape: (701, 17354)

For Numerical Dataset Only
Linear Regression MSE: 205.25543013630707
Random Forest MSE: 65.63490652173913
MLP Regressor MSE: 547.2678923052799
XGBoost MSE: 72.75895342517701
For Both Numerical And Binary Dataset
Linear Regression MSE: 433.9419910032091
Random Forest MSE: 65.59200724637681
MLP Regressor MSE: 637.7967797632032
XGBoost MSE: 71.9908366353758


In [18]:
# Data 2:
# RPCA Dim = 10
# PCA Dim = 256
get_result(metab2_path, metab4_path, 10, 256, host_age_2)

--------RPCA Results--------
Number of binary features: 0
Number of numerical features: 701
Reconstruction MSE: 0.009785697839528871
Sample scores shape: (701, 10)
Feature scores shape: (17354, 10)
Original scaled data shape: (17354, 701)
Reconstructed data shape: (701, 17354)

For Numerical Dataset Only
Linear Regression MSE: 65.79234530061986
Random Forest MSE: 75.99756086956522
MLP Regressor MSE: 65.736632213642
XGBoost MSE: 81.5469668116932
For Both Numerical And Binary Dataset
Linear Regression MSE: 4.793693716762208e+18
Random Forest MSE: 76.53347246376812




MLP Regressor MSE: 186.19761168736875
XGBoost MSE: 80.5837383961988
--------PCA Results--------
Number of binary features: 0
Number of numerical features: 701
Reconstruction MSE: 0.2413041825594591
Reduced data shape: (701, 256)
Original scaled data shape: (701, 17354)
Reconstructed data shape: (701, 17354)

For Numerical Dataset Only
Linear Regression MSE: 887.6770323598265
Random Forest MSE: 71.02130942028985
MLP Regressor MSE: 858.3208990703735
XGBoost MSE: 68.89333748259578
For Both Numerical And Binary Dataset
Linear Regression MSE: 2166.332639372921
Random Forest MSE: 72.68692173913044
MLP Regressor MSE: 814.2007368065916
XGBoost MSE: 71.49099277441164


In [19]:
# Data 2:
# RPCA Dim = 16
# PCA Dim = 512
get_result(metab2_path, metab4_path, 16, 512, host_age_2)

--------RPCA Results--------
Number of binary features: 0
Number of numerical features: 701
Reconstruction MSE: 0.009772766867380706
Sample scores shape: (701, 16)
Feature scores shape: (17354, 16)
Original scaled data shape: (17354, 701)
Reconstructed data shape: (701, 17354)

For Numerical Dataset Only
Linear Regression MSE: 66.63682231242703
Random Forest MSE: 67.70672971014491
MLP Regressor MSE: 66.61807889589264
XGBoost MSE: 75.4900028398696
For Both Numerical And Binary Dataset
Linear Regression MSE: 5.350338509513611e+18
Random Forest MSE: 68.65868840579711




MLP Regressor MSE: 226.3978073720886
XGBoost MSE: 74.63492938161242
--------PCA Results--------
Number of binary features: 0
Number of numerical features: 701
Reconstruction MSE: 0.04765471512575212
Reduced data shape: (701, 512)
Original scaled data shape: (701, 17354)
Reconstructed data shape: (701, 17354)

For Numerical Dataset Only
Linear Regression MSE: 79469.05484510306
Random Forest MSE: 66.28607971014492
MLP Regressor MSE: 2910.9152381204904
XGBoost MSE: 70.55328645360609
For Both Numerical And Binary Dataset
Linear Regression MSE: 422395.53964730556
Random Forest MSE: 66.23017536231883
MLP Regressor MSE: 2329.5608811064244
XGBoost MSE: 73.99400255921765


In [None]:
# Data 1: RPCA Dim = 128
get_result(metab1_path, metab3_path, 128, 1, host_age_1)

--------RPCA Results--------
Number of binary features: 0
Number of numerical features: 611


In [None]:
# Data 2: RPCA Dim = 128
get_result(metab2_path, metab4_path, 128, 1, host_age_2)