In [2]:
import numpy as np
import pandas as pd
import biom
from sklearn.preprocessing import MinMaxScaler
from biom import Table
from gemelli.rpca import rpca
from sklearn.metrics import mean_squared_error

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [4]:
table_path = "./redbiom_adrc_wolr2_fecal_v2.biom"
meta_path = "./redbiom_adrc_wolr2_fecal_v2.tsv"

table = biom.load_table(table_path)

meta = pd.read_csv(meta_path, sep='\t')

# print(meta.shape)
# print(meta.head(5))
# print(f"Table shape: {table.shape}")
# print(table.head(5))


In [70]:
# Convert the BIOM table to a pandas DataFrame (OTUs as rows, samples as columns)
biom_df = pd.DataFrame(table.matrix_data.toarray(), index=table.ids(axis='observation'), columns=table.ids(axis='sample'))

In [71]:
biom_df_T = biom_df.T
print(biom_df.shape)
print(biom_df_T.shape)

(15363, 13436)
(13436, 15363)


In [85]:
feature_frequency = 20
columns_to_drop = biom_df_T.columns[biom_df_T.sum() < feature_frequency] #drop columns with low prev
df1 = biom_df_T.drop(columns = columns_to_drop)
# df1 = utils.clr_transformation(df1)

In [86]:
print
print(df1.shape)

(13436, 8308)


In [87]:
def classify_feature(row):
    unique_values = row.unique()
    if len(unique_values) == 2 and set(unique_values).issubset({0, 1}):
        return 'binary'
    else:
        return 'numerical'

def split_features(df):
    df['feature_type'] = df.apply(classify_feature, axis=1)

    binary_features = df[df['feature_type'] == 'binary']
    numerical_features = df[df['feature_type'] == 'numerical']
    
    binary_features = binary_features.drop(columns=['feature_type'])
    numerical_features = numerical_features.drop(columns=['feature_type'])
    
    print(f"Number of binary features: {binary_features.shape[0]}")
    print(f"Number of numerical features: {numerical_features.shape[0]}")

    return binary_features, numerical_features


In [88]:
def clr_transformation(df):
    # Replace zeros with a very small number to avoid issues with log(0)
    df_replaced_zeros = df.replace(0, 1e-6)
    
    # Calculate the geometric mean for each row
    geometric_mean = df_replaced_zeros.apply(lambda x: np.exp(np.mean(np.log(x))), axis=1)
    
    # Apply the CLR transformation
    clr_transformed = df_replaced_zeros.apply(lambda x: np.log(x / geometric_mean[x.name]), axis=1)
    
    return clr_transformed


In [89]:
df1_clr = clr_transformation(df1)

print(f'--------Original Dataset Feature Info---')
biom_bin, biom_num = split_features(biom_df)
print(f'--------Dataset Feature Info With Feature Frequency = {feature_frequency}--------')
df1_bin, df1_num = split_features(df1.T)
# print(f'--------Dataset Feature Info After Clr Transformation--------')
# df1_clr_bin, df1_clr_num = split_features(df1.T)


--------Original Dataset Feature Info---
Number of binary features: 0
Number of numerical features: 15363
--------Dataset Feature Info With Feature Frequency = 20--------
Number of binary features: 51
Number of numerical features: 8257


In [90]:
'''
Original Binary: 1568/(1568+13795) = 10.2%

----FF=5----
After Dropping Binary: 3.5%
----FF=6----
After Dropping Binary: 270/(270+9294) = 2.8%
----FF=10----
After Dropping Binary: 130/(130+8919) = 1.4%
----FF=15----
After Dropping Binary: 83/(83+8555) = 0.96%
----FF=20----
After Dropping Binary: 51/(51+8257) = 0.61%

'''


'\nOriginal Binary: 1568/(1568+13795) = 10.2%\n\n----FF=5----\nAfter Dropping Binary: 3.5%\n----FF=6----\nAfter Dropping Binary: 270/(270+9294) = 2.8%\n----FF=10----\nAfter Dropping Binary: 130/(130+8919) = 1.4%\n----FF=15----\nAfter Dropping Binary: 83/(83+8555) = 0.96%\n----FF=20----\nAfter Dropping Binary: 51/(51+8257) = 0.61%\n\n'

In [91]:
scaled_d = df1_num.copy()
scaler = MinMaxScaler()

scaled_d[df1_num.columns] = scaler.fit_transform(df1_num)
scaled = scaler.fit_transform(df1_num)

# print("Scaled Numerical Dataset:")
# print(scaled.head())
# print(scaled.shape) # (13795, 13436)

In [92]:
# # print(df1_num.head(5))
# # print(df1.T.head(5))
# print(df1.shape)
# print(scaled.shape)

In [93]:
def rpca_fr(scaled, numerical_features, dim):
    scaled += 1e-10
    sample_ids = numerical_features.columns.tolist()  # Sample IDs (columns)
    feature_ids = numerical_features.index.tolist()   # Feature IDs (rows)
    table_scaled = Table(scaled, feature_ids, sample_ids)

    rpca_results = rpca(table_scaled, n_components=dim)

    ordination, distance = rpca_results
    sample_scores = ordination.samples  # Scores for samples
    feature_scores = ordination.features  # Scores for features
    
    X_reconstructed = np.dot(sample_scores, feature_scores.T)
    
    mse = mean_squared_error(scaled.T, X_reconstructed)
    print(f"Reconstruction MSE: {mse}")
    
    print(f"Sample scores shape: {sample_scores.shape}")
    print(f"Feature scores shape: {feature_scores.shape}")
    print(f"Original scaled data shape: {scaled.T.shape}")
    print(f"Reconstructed data shape: {X_reconstructed.shape}")

    return sample_scores

In [94]:
import time
start_time = time.time()
print(start_time)

reduced_data = rpca_fr(scaled, df1_num, 4)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

1727725965.8551793
Reconstruction MSE: 0.0003722700606232105
Sample scores shape: (13436, 4)
Feature scores shape: (8257, 4)
Original scaled data shape: (13436, 8257)
Reconstructed data shape: (13436, 8257)
Execution time: 323.2673773765564 seconds


In [95]:
def combine_df(df1, df2):
    df1 = pd.DataFrame(df1)
    df1 = df1.reset_index(drop=True)
    df2 = df2.reset_index(drop=True)
    return pd.concat([df1, df2], axis=1)
    
def get_X_y(reduced, y, cat):
    combined_df = combine_df(reduced, y)
    cleaned_df = combined_df.dropna()
    X = cleaned_df.drop(columns=[cat])
    y = cleaned_df[cat]
    return X, y

def training(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Linear Regression
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    
    y_pred = lr_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Linear Regression MSE: {mse}")

    # Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    y_pred_rf = rf_model.predict(X_test)
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    print(f"Random Forest MSE: {mse_rf}")

    # # MLP
    # mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=5000, random_state=42)
    # mlp_model.fit(X_train, y_train)
    
    # y_pred_mlp = mlp_model.predict(X_test)
    # mse_mlp = mean_squared_error(y_test, y_pred_mlp)
    # print(f"MLP Regressor MSE: {mse_mlp}")

    # XGBoost
    xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    xgb_model.fit(X_train, y_train)
    
    y_pred_xgb = xgb_model.predict(X_test)
    mse_xgb = mean_squared_error(y_test, y_pred_xgb)
    print(f"XGBoost MSE: {mse_xgb}")

In [96]:
cat = 'host_age'
cat_y = meta[cat]
cat_X = combine_df(reduced_data, df1_bin)

In [97]:
X, y = get_X_y(cat_X, cat_y, cat)
training(X, y)

Linear Regression MSE: 194.21174991381397
Random Forest MSE: 158.43241928386362
XGBoost MSE: 225.44448023642371


In [98]:
'''
Dim = 3
Linear Regression MSE: 3.995264211009359e+28
Random Forest MSE: 332.3066680347287
/opt/conda/lib/python3.11/site-packages/sklearn/neural_network/_multilayer_perceptron.py:697: UserWarning: Training interrupted by user.
  warnings.warn("Training interrupted by user.")
MLP Regressor MSE: 685.7311446346571
XGBoost MSE: 299.14464191813653

Dim = 16
Reconstruction MSE: 0.00022048968863458537
Sample scores shape: (13436, 16)
Feature scores shape: (13795, 16)
Original scaled data shape: (13436, 13795)
Reconstructed data shape: (13436, 13795)

Linear Regression MSE: 1.1185584052698574e+29
Random Forest MSE: 297.4191591508802
XGBoost MSE: 306.4656906982071

--------Drop Low Feature Frequency--------

----Dim = 4----

--Feature Freq = 5--
1727723260.1230855
Reconstruction MSE: 0.0003263223654129197
Sample scores shape: (13436, 4)
Feature scores shape: (9419, 4)
Original scaled data shape: (13436, 9419)
Reconstructed data shape: (13436, 9419)
Execution time: 364.3079755306244 seconds

Linear Regression MSE: 449.6641499171495
Random Forest MSE: 285.2861205474667
XGBoost MSE: 284.12249716704207

--FF=6--
1727724124.2568882
Reconstruction MSE: 0.000330731990772629
Sample scores shape: (13436, 4)
Feature scores shape: (9294, 4)
Original scaled data shape: (13436, 9294)
Reconstructed data shape: (13436, 9294)
Execution time: 368.4696533679962 seconds

Linear Regression MSE: 837.1834712312027
Random Forest MSE: 254.49559979720556
XGBoost MSE: 311.0350083620908

--FF=10--
1727724685.7488904
Reconstruction MSE: 0.00034463803042504126
Sample scores shape: (13436, 4)
Feature scores shape: (8919, 4)
Original scaled data shape: (13436, 8919)
Reconstructed data shape: (13436, 8919)
Execution time: 337.9884605407715 seconds

Linear Regression MSE: 251.11581757858377
Random Forest MSE: 263.78265246685015
XGBoost MSE: 239.09977714115107

--FF=15--
1727725337.0230546
Reconstruction MSE: 0.00035930221102317895
Sample scores shape: (13436, 4)
Feature scores shape: (8555, 4)
Original scaled data shape: (13436, 8555)
Reconstructed data shape: (13436, 8555)
Execution time: 332.6713442802429 seconds

Linear Regression MSE: 157.05134799361545
Random Forest MSE: 214.4763377781649
XGBoost MSE: 276.81906855632036

--FF=20--
1727725965.8551793
Reconstruction MSE: 0.0003722700606232105
Sample scores shape: (13436, 4)
Feature scores shape: (8257, 4)
Original scaled data shape: (13436, 8257)
Reconstructed data shape: (13436, 8257)
Execution time: 323.2673773765564 seconds

Linear Regression MSE: 194.21174991381397
Random Forest MSE: 158.43241928386362
XGBoost MSE: 225.44448023642371

'''


print(y.max())
print(y.min())

80.76
8.38


In [None]:
scaled_d = numerical_features.copy()
scaler = MinMaxScaler()

scaled_d[numerical_features.columns] = scaler.fit_transform(numerical_features)
scaled = pd.DataFrame(scaler.fit_transform(numerical_features.T).T, index=numerical_features.index, columns=numerical_features.columns)

# print("Scaled Numerical Dataset:")
# print(scaled.head())
# print(scaled)

In [None]:
def classify_feature(row):
    unique_values = row.unique()
    if len(unique_values) == 2 and set(unique_values).issubset({0, 1}):
        return 'binary'
    else:
        return 'numerical'

def split_feature(biom_df):
    biom_df['feature_type'] = biom_df.apply(classify_feature, axis=1)

    binary_features = biom_df[biom_df['feature_type'] == 'binary']
    numerical_features = biom_df[biom_df['feature_type'] == 'numerical']
    
    binary_features = binary_features.drop(columns=['feature_type'])
    numerical_features = numerical_features.drop(columns=['feature_type'])
    
    print(f"Number of binary features: {binary_features.shape[0]}")
    print(f"Number of numerical features: {numerical_features.shape[0]}")

    return binary_features, numerical_features

def scale_data(numerical_features):
    scaled_d = numerical_features.copy()
    scaler = MinMaxScaler()
    scaled_d[numerical_features.columns] = scaler.fit_transform(numerical_features)
    scaled = scaler.fit_transform(numerical_features.T).T
    return scaled

def rpca_fr(scaled, numerical_features, dim):
    scaled += 1e-10
    sample_ids = numerical_features.columns.tolist()  # Sample IDs (columns)
    feature_ids = numerical_features.index.tolist()   # Feature IDs (rows)
    table_scaled = Table(scaled, feature_ids, sample_ids)

    rpca_results = rpca(table_scaled, n_components=dim)

    ordination, distance = rpca_results
    sample_scores = ordination.samples  # Scores for samples
    feature_scores = ordination.features  # Scores for features
    
    X_reconstructed = np.dot(sample_scores, feature_scores.T)
    
    mse = mean_squared_error(scaled.T, X_reconstructed)
    print(f"Reconstruction MSE: {mse}")
    
    print(f"Sample scores shape: {sample_scores.shape}")
    print(f"Feature scores shape: {feature_scores.shape}")
    print(f"Original scaled data shape: {scaled.T.shape}")
    print(f"Reconstructed data shape: {X_reconstructed.shape}")

    return sample_scores

In [None]:
def rpca(df, dim):
    bin_data, num_data = split_feature(df)
    scaled_data = scale_data(num_data)
    reduced_data = rpca_fr(scaled_data, num_data, dim)
    return reduced_data

In [None]:
rpca(biom_df, 3)

In [None]:
scaled += 1e-10
sample_ids = numerical_features.columns.tolist()  # Sample IDs (columns)
feature_ids = numerical_features.index.tolist()   # Feature IDs (rows)
table_scaled = Table(scaled, feature_ids, sample_ids)


In [None]:
dim = 3  # Number of components to reduce to
rpca_results = rpca(table_scaled, n_components=dim)

ordination, distance = rpca_results
sample_scores = ordination.samples  # Scores for samples
feature_scores = ordination.features  # Scores for features

X_reconstructed = np.dot(sample_scores, feature_scores.T)

mse = mean_squared_error(scaled.T, X_reconstructed)
print(f"Reconstruction MSE: {mse}")

print(f"Sample scores shape: {sample_scores.shape}")
print(f"Feature scores shape: {feature_scores.shape}")
print(f"Original scaled data shape: {scaled.T.shape}")
print(f"Reconstructed data shape: {X_reconstructed.shape}")

In [None]:
# Reconstruction MSE: 0.003958728646643008
# Sample scores shape: (1312, 3)
# Feature scores shape: (1102, 3)
# Original scaled data shape: (1312, 1102)
# Reconstructed data shape: (1312, 1102)


# Reconstruction MSE: 0.0033221211643852722
# Sample scores shape: (1312, 10)
# Feature scores shape: (8448, 10)
# Original scaled data shape: (1312, 8448)
# Reconstructed data shape: (1312, 8448)

In [None]:
print(ordination)
print(distance)



In [None]:
# Only using reduced scaled numerical dataset
reduced_df = pd.DataFrame(sample_scores, index=sample_ids)
reduced_df.index = reduced_df.index.astype(float)

meta_df = meta.set_index('sample_name')

merged_df = reduced_df.join(metadata_df[['host_age']], how='inner')

# print(merged_df.head())
# print(merged_df.shape)

cleaned_df = merged_df.dropna(subset=['host_age'])

# print(f"Original dataset shape: {merged_df.shape}")
# print(f"Cleaned dataset shape: {cleaned_df.shape}")
# Original dataset shape: (1312, 4)
# Cleaned dataset shape: (1301, 4)

X = cleaned_df.drop(columns=['host_age'])
Y = cleaned_df['host_age']

In [None]:
# # Using both binary and reduced numerical dataset
# reduced_df = pd.DataFrame(sample_scores, index=sample_ids)

# binary_transposed = binary_features.T 
# # reduced_df.index = reduced_df.index.astype(float)
# # print(reduced_df.index)
# # print(binary_transposed.index)
# # print("Do sample names match?", (reduced_df.index == binary_transposed.index).all())
# combined_df = pd.concat([reduced_df, binary_transposed], axis=1)
# combined_df.index = reduced_df.index.astype(float)

# meta_df = meta.set_index('sample_name')

# merged_df = combined_df.join(metadata_df[['host_age']], how='inner')
# cleaned_df = merged_df.dropna(subset=['host_age'])

# # print(f"Original dataset shape: {merged_df.shape}")
# # print(f"Cleaned dataset shape: {cleaned_df.shape}")
# # # Original dataset shape: (1312, 1749)
# # # Cleaned dataset shape: (1301, 1749)

# X = cleaned_df.drop(columns=['host_age'])
# Y = cleaned_df['host_age']

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize and fit the model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Linear Regression MSE: {mse}")

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and fit the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest MSE: {mse_rf}")

In [None]:
from sklearn.neural_network import MLPRegressor

# Initialize and fit the model
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
mlp_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_mlp = mlp_model.predict(X_test)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
print(f"MLP Regressor MSE: {mse_mlp}")

In [None]:
from xgboost import XGBRegressor

# Initialize and fit the model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost MSE: {mse_xgb}")

In [None]:
'''
--------RPCA Num ONLY--------
Dim = 3
Scaler: MinMax Scaler
Reconstruction MSE: 0.003958728646643008
Cleaned dataset shape: (1301, 4)
Linear Regression MSE: 68.19986181183467
Random Forest MSE: 75.95139961685823
MLP Regressor MSE: 68.22414791217268
XGBoost MSE: 76.82236484129864

Dim = 10
Scaler: MinMax Scaler
Reconstruction MSE: 0.003946154448124425
Linear Regression MSE: 59.808153281084984
Random Forest MSE: 62.939483908045986
MLP Regressor MSE: 59.78929765458566
XGBoost MSE: 64.94144790232647

'''

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

data_transposed = numerical_features.T

scaler = StandardScaler()
standardized_data = scaler.fit_transform(data_transposed)

reduced_dim = 512
pca = PCA(n_components=reduced_dim, svd_solver='auto')
pca_reduced = pca.fit_transform(standardized_data)
pca_reconstructed = pca.inverse_transform(pca_reduced)

mse_pca = mean_squared_error(standardized_data, pca_reconstructed)

print(mse_pca)

In [None]:
reduced_df = pd.DataFrame(pca_reduced, index=sample_ids)
reduced_df.index = reduced_df.index.astype(float)

meta_df = meta.set_index('sample_name')

merged_df = reduced_df.join(metadata_df[['host_age']], how='inner')

# print(merged_df.head())
# print(merged_df.shape)

cleaned_df = merged_df.dropna(subset=['host_age'])

# print(f"Original dataset shape: {merged_df.shape}")
# print(f"Cleaned dataset shape: {cleaned_df.shape}")
# Original dataset shape: (1312, 4)
# Cleaned dataset shape: (1301, 4)

X = cleaned_df.drop(columns=['host_age'])
Y = cleaned_df['host_age']

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train

# Initialize and fit the model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Linear Regression MSE: {mse}")

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and fit the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest MSE: {mse_rf}")

In [None]:
from sklearn.neural_network import MLPRegressor

# Initialize and fit the model
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
mlp_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_mlp = mlp_model.predict(X_test)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
print(f"MLP Regressor MSE: {mse_mlp}")

In [None]:
from xgboost import XGBRegressor

# Initialize and fit the model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost MSE: {mse_xgb}")

In [None]:
'''
--------PCA--------
Dim = 128
Reconstruction MSE(full): 0.32280275488908244
Cleaned dataset shape: (1301, 129)
Linear Regression MSE: 2841.559584303109
Random Forest MSE: 66.89663639846744
MLP Regressor MSE: 895.1086733724248
XGBoost MSE: 70.36381670902817

Reconstruction MSE(auto): 0.32696752866827633
Linear Regression MSE: 97.59366339211996
Random Forest MSE: 64.57761494252874
MLP Regressor MSE: 970.4559274788406
XGBoost MSE: 64.4298729789169

Dim = 512
Reconstruction MSE(full): 0.01719558259971444
Linear Regression MSE: 551.90068430994
Random Forest MSE: 66.89663639846744
MLP Regressor MSE: 811.2290273839908
XGBoost MSE: 69.61207270397696

Reconstruction MSE(auto): 0.01743150302309283
Linear Regression MSE: 580.8346541828049
Random Forest MSE: 63.36988850574713
MLP Regressor MSE: 911.1563581922785
XGBoost MSE: 67.18452645782199
'''

In [None]:
print(cleaned_df['host_age'].max())
print(cleaned_df['host_age'].min())