In [1]:
import numpy as np
import pandas as pd
import biom
from sklearn.preprocessing import MinMaxScaler
from biom import Table
from gemelli.rpca import rpca
from sklearn.metrics import mean_squared_error

In [2]:
# Input locations, relative to the notebook's working directory.
meta_path = "./adrc_full_metadata.csv"
# Alternative feature table that can be swapped in instead:
#   table_path = "./feature_reduction/metadata/adrc_coverage_filtered_table.biom"
table_path = "./adrc_full_table.biom"

# Read the sample metadata (CSV) and the feature table (BIOM).
meta = pd.read_csv(meta_path)
table = biom.load_table(table_path)

# Sanity check of what was loaded: metadata shape, its first five rows,
# and the shape of the BIOM table.
print(meta.shape, meta.head(5), sep="\n")
print(f"Table shape: {table.shape}")

(1312, 355)
    sample_name      NACCID Kit.Number  Specimen.Bar.Code Visit  FORMVER  \
0  15448.582342  NACC849121     376222            5823422    Y1      3.2   
1  15448.582709  NACC918252     376177            5827089    Y1      3.2   
2  15448.583516  NACC420113     376232            5835163    Y1      3.2   
3  15448.594676  NACC454008     404184            5946764    Y1      3.2   
4  15448.421452  NACC329271     420091           42145155    Y2      3.0   

   NACCVNUM  SEX  HISPANIC  RACE  ...    tube_id  turbidity_grade  uom  visit  \
0         3    1         0     1  ...  363236903              NaN  NaN     Y1   
1         4    1         0     1  ...  363195322              NaN  NaN     Y1   
2         9    2         0     1  ...  363183648              NaN  NaN     Y1   
3         4    2         0     1  ...  363236757              NaN  NaN     Y1   
4         3    2         0     2  ...  363236670              NaN  NaN     Y2   

   well_id  year_of_collection     yob  amyl

In [3]:
# Densify the BIOM table into a pandas DataFrame.
# Rows are labelled with the observation (OTU) IDs, columns with the sample IDs.
biom_df = pd.DataFrame(
    data=table.matrix_data.toarray(),
    index=table.ids(axis='observation'),
    columns=table.ids(axis='sample'),
)


In [4]:
# Preview the first five observations (head() defaults to five rows).
print(biom_df.head())

            15448.0005635586  15448.0005643690  15448.0005671650  \
G000005825               0.0               0.0               0.0   
G000006605               0.0               0.0               0.0   
G000006725               0.0               0.0               0.0   
G000006745               0.0               0.0               0.0   
G000006785               0.0               0.0               1.0   

            15448.0005700235  15448.0005704913  15448.0005704938  \
G000005825               1.0               0.0               0.0   
G000006605               0.0               0.0               0.0   
G000006725               0.0               0.0               0.0   
G000006745               0.0               0.0               0.0   
G000006785               3.0               1.0               0.0   

            15448.0005707913  15448.0005714515  15448.0005728264  \
G000005825               0.0               0.0               0.0   
G000006605               1.0               0.0

In [25]:
# # Check general information about the table
# print("Table dimensions (rows, columns):", biom_df.shape)
# print("Maximum value in the table:", biom_df.values.max())
# print("Minimum value in the table:", biom_df.values.min())
# print("Mean value in the table:", biom_df.values.mean())
# print("Standard deviation in the table:", biom_df.values.std())

# # Optionally, you can get descriptive statistics (min, max, mean, etc. for each column)
# print("Descriptive statistics:\n", biom_df.describe())

# Table dimensions (rows, columns): (10193, 1312)
# Maximum value in the table: 8599868.0
# Minimum value in the table: 0.0
# Mean value in the table: 683.7085010067885
# Standard deviation in the table: 19481.296853288957
# Descriptive statistics:
#         15448.0005635586  15448.0005643690  15448.0005671650  15448.0005700235  \
# count      10193.000000      1.019300e+04      10193.000000      10193.000000   
# mean         120.188070      3.704773e+02        376.374080        123.804670   
# std         2432.148599      1.271535e+04       6914.365337       2666.197215   
# min            0.000000      0.000000e+00          0.000000          0.000000   
# 25%            0.000000      0.000000e+00          0.000000          0.000000   
# 50%            0.000000      0.000000e+00          0.000000          0.000000   
# 75%            0.000000      0.000000e+00          0.000000          0.000000   
# max       175514.000000      1.107260e+06     310734.000000     230160.000000   

#        15448.0005704913  15448.0005704938  15448.0005707913  15448.0005714515  \
# count      10193.000000      10193.000000      10193.000000      10193.000000   
# mean         240.527813        258.992838        158.010203        225.403316   
# std         6288.504698       6798.884678       4678.156618       6603.885954   
# min            0.000000          0.000000          0.000000          0.000000   
# 25%            0.000000          0.000000          0.000000          0.000000   
# 50%            0.000000          0.000000          0.000000          0.000000   
# 75%            0.000000          0.000000          0.000000          0.000000   
# max       573808.000000     473105.000000     401999.000000     605058.000000   

#        15448.0005728264  15448.0005728304  ...  15448.43049648  \
# count      10193.000000      10193.000000  ...    1.019300e+04   
# mean         286.159031        349.248406  ...    1.553244e+03   
# std         7258.375894       8228.733100  ...    2.695542e+04   
# min            0.000000          0.000000  ...    0.000000e+00   
# 25%            0.000000          0.000000  ...    0.000000e+00   
# 50%            0.000000          0.000000  ...    0.000000e+00   
# 75%            0.000000          0.000000  ...    1.000000e+00   
# max       560804.000000     574964.000000  ...    1.916519e+06   

#        15448.43051236  15448.43051259  15448.43051286  15448.43051306  \
# count    10193.000000    1.019300e+04    10193.000000    1.019300e+04   
# mean       574.905425    1.196409e+03      611.245463    1.361047e+03   
# std      11065.953035    3.049383e+04     8212.387482    2.235665e+04   
# min          0.000000    0.000000e+00        0.000000    0.000000e+00   
# 25%          0.000000    0.000000e+00        0.000000    0.000000e+00   
# 50%          0.000000    0.000000e+00        0.000000    0.000000e+00   
# 75%          0.000000    0.000000e+00        1.000000    1.000000e+00   
# max     587838.000000    2.875161e+06   464469.000000    1.753825e+06   

#        15448.43105150  15448.5823422  15448.5827089  15448.5835163  \
# count    10193.000000   10193.000000   1.019300e+04   10193.000000   
# mean       607.535858     967.961640   9.991206e+02     966.035024   
# std       8956.249757   12099.219172   1.833273e+04   17898.826230   
# min          0.000000       0.000000   0.000000e+00       0.000000   
# 25%          0.000000       0.000000   0.000000e+00       0.000000   
# 50%          0.000000       0.000000   0.000000e+00       0.000000   
# 75%          0.000000       1.000000   0.000000e+00       0.000000   
# max     501124.000000  394460.000000   1.320731e+06  943580.000000   

#        15448.5946764  
# count   1.019300e+04  
# mean    1.315904e+03  
# std     2.524127e+04  
# min     0.000000e+00  
# 25%     0.000000e+00  
# 50%     0.000000e+00  
# 75%     1.000000e+00  
# max     1.520083e+06  

# [8 rows x 1312 columns]

In [29]:
def classify_feature(row):
    """Label one feature as 'binary' or 'numerical'.

    Parameters
    ----------
    row : pandas.Series
        The values of a single feature.

    Returns
    -------
    str
        'binary' when the row holds exactly two distinct values and both of
        them belong to {0, 1}; 'numerical' in every other case. A constant
        row (for example all zeros) is therefore 'numerical'.
    """
    distinct = row.unique()
    is_binary = len(distinct) == 2 and set(distinct) <= {0, 1}
    return 'binary' if is_binary else 'numerical'

# Label every feature (row of biom_df) as 'binary' or 'numerical'.
# A 'feature_type' column left behind by an earlier run of this cell is
# excluded before classifying: otherwise its string label would be one of the
# values in each row and every feature would come out as 'numerical' on a
# re-run.
count_values = biom_df.drop(columns=['feature_type'], errors='ignore')
biom_df['feature_type'] = count_values.apply(classify_feature, axis=1)

# Split the table by label. The helper column is removed again so both frames
# contain only the per-sample values.
binary_features = biom_df[biom_df['feature_type'] == 'binary'].drop(columns=['feature_type'])
numerical_features = biom_df[biom_df['feature_type'] == 'numerical'].drop(columns=['feature_type'])

print(f"Number of binary features: {binary_features.shape[0]}")
print(f"Number of numerical features: {numerical_features.shape[0]}")


Number of binary features: 1745
Number of numerical features: 8448


In [67]:
# scaled_d = numerical_features.copy()
# scaler = MinMaxScaler()

# scaled_d[numerical_features.columns] = scaler.fit_transform(numerical_features)
# scaled = pd.DataFrame(scaler.fit_transform(numerical_features.T).T, index=numerical_features.index, columns=numerical_features.columns)

# # print("Scaled Numerical Dataset:")
# # print(scaled.head())
# print(scaled)

In [68]:
# One scaler object is reused for both fits below; after this cell it holds
# the parameters of the second fit.
scaler = MinMaxScaler()

# First fit: applied to numerical_features as-is, whose columns are samples.
# The result is written into a labelled copy of the frame.
scaled_d = numerical_features.copy()
scaled_d[numerical_features.columns] = scaler.fit_transform(numerical_features)

# Second fit: transpose so that features become the columns, scale, then
# transpose back to the original features x samples orientation.
# `scaled` is whatever fit_transform returns (not a labelled DataFrame copy).
scaled = scaler.fit_transform(numerical_features.T).T

# print("Scaled Numerical Dataset:")
# print(scaled.head())
# print(scaled)

In [69]:
# Axis labels for the BIOM table, taken from the unscaled frame.
sample_ids = numerical_features.columns.tolist()  # Sample IDs (columns)
feature_ids = numerical_features.index.tolist()   # Feature IDs (rows)

# A small constant offset (1e-10) is added to every entry before building the
# table. It is applied to a new array instead of `scaled += 1e-10` so that
# `scaled` is not modified in place: re-running this cell no longer stacks the
# offset, and downstream cells see the unshifted scaled values.
# NOTE(review): presumably the offset is there to avoid exact zeros inside
# rpca - confirm that this is the intended preprocessing.
table_scaled = Table(scaled + 1e-10, feature_ids, sample_ids)


In [171]:
# Number of RPCA components to keep.
dim = 10
rpca_results = rpca(table_scaled, n_components=dim)

# rpca returns a pair: the ordination and a sample-by-sample distance matrix.
ordination, distance = rpca_results
sample_scores, feature_scores = ordination.samples, ordination.features

# Product of the sample and feature scores, compared below against the
# scaled data in samples x features orientation.
# NOTE(review): this assumes the ordination rows/columns are in the same order
# as `scaled` and that the raw product of the scores is comparable to the
# min-max-scaled values - confirm before interpreting the number.
X_reconstructed = np.dot(sample_scores, feature_scores.T)

mse = mean_squared_error(y_true=scaled.T, y_pred=X_reconstructed)
print(f"Reconstruction MSE: {mse}")

print(f"Sample scores shape: {sample_scores.shape}")
print(f"Feature scores shape: {feature_scores.shape}")
print(f"Original scaled data shape: {scaled.T.shape}")
print(f"Reconstructed data shape: {X_reconstructed.shape}")

Reconstruction MSE: 0.003322121164396924
Sample scores shape: (1312, 10)
Feature scores shape: (8448, 10)
Original scaled data shape: (1312, 8448)
Reconstructed data shape: (1312, 8448)


In [160]:
# Reconstruction MSE: 0.003335061983338765
# Sample scores shape: (1312, 3)
# Feature scores shape: (8448, 3)
# Original scaled data shape: (1312, 8448)
# Reconstructed data shape: (1312, 8448)

# Reconstruction MSE: 0.0033221211643852722
# Sample scores shape: (1312, 10)
# Feature scores shape: (8448, 10)
# Original scaled data shape: (1312, 8448)
# Reconstructed data shape: (1312, 8448)

In [161]:
# print(ordination)
# print(distance)

# # Ordination results:
# # 	Method: (Robust Aitchison) RPCA Biplot (rpca_biplot)
# # 	Eigvals: 3
# # 	Proportion explained: 3
# # 	Features: 8448x3
# # 	Samples: 1312x3
# # 	Biplot Scores: N/A
# # 	Sample constraints: N/A
# # 	Feature IDs: 'G000005825', 'G000006605', 'G000006725', 'G000006745', 'G000006785', ...
# # 	Sample IDs: '15448.0005635586', '15448.0005643690', '15448.0005671650', '15448.0005700235', ...
# # 1312x1312 distance matrix
# # IDs:
# # '15448.0005635586', '15448.0005643690', '15448.0005671650', '15448.0005700235', ...
# # Data:
# # [[0.         0.78449788 0.64521211 ... 1.71529196 2.67520481 2.77895356]
# #  [0.78449788 0.         1.02136413 ... 1.8786006  2.6711468  2.79996043]
# #  [0.64521211 1.02136413 0.         ... 1.07032847 2.04138511 2.14052415]
# #  ...
# #  [1.71529196 1.8786006  1.07032847 ... 0.         1.00700367 1.08827712]
# #  [2.67520481 2.6711468  2.04138511 ... 1.00700367 0.         0.14763164]
# #  [2.77895356 2.79996043 2.14052415 ... 1.08827712 0.14763164 0.        ]]

In [114]:
# Only using reduced scaled numerical dataset
# RPCA sample scores, one row per sample.
reduced_df = pd.DataFrame(sample_scores, index=sample_ids)
# The sample IDs are cast to float so they can be matched against
# `sample_name` in the metadata.
# NOTE(review): assumes read_csv parsed `sample_name` as float and that no two
# IDs collapse to the same float - confirm.
reduced_df.index = reduced_df.index.astype(float)

meta_df = meta.set_index('sample_name')

# Attach the regression target. The join uses `meta_df`, the frame defined
# just above; an inner join keeps only samples present in both frames.
merged_df = reduced_df.join(meta_df[['host_age']], how='inner')

# Drop samples without a recorded age.
# Shapes recorded from an earlier run: merged (1312, 4), cleaned (1301, 4).
cleaned_df = merged_df.dropna(subset=['host_age'])

# Feature matrix and target for the models in the following cells.
X = cleaned_df.drop(columns=['host_age'])
Y = cleaned_df['host_age']

In [200]:
# Using both binary and reduced numerical dataset
# RPCA sample scores, one row per sample.
reduced_df = pd.DataFrame(sample_scores, index=sample_ids)

# Binary features transposed to samples x features so they can sit next to
# the scores.
binary_transposed = binary_features.T

# Column-wise concatenation aligns the two frames on their sample IDs.
combined_df = pd.concat([reduced_df, binary_transposed], axis=1)
# Cast the IDs to float for matching against `sample_name`. The index is taken
# from combined_df itself so labels cannot be shifted if concat ever returns
# rows in a different order than reduced_df.
# NOTE(review): assumes read_csv parsed `sample_name` as float - confirm.
combined_df.index = combined_df.index.astype(float)

meta_df = meta.set_index('sample_name')

# Attach the regression target. The join uses `meta_df`, the frame defined
# just above; an inner join keeps only samples present in both frames.
merged_df = combined_df.join(meta_df[['host_age']], how='inner')
# Drop samples without a recorded age.
# Shapes recorded from an earlier run: merged (1312, 1749), cleaned (1301, 1749).
cleaned_df = merged_df.dropna(subset=['host_age'])

# Feature matrix and target for the models in the following cells.
X = cleaned_df.drop(columns=['host_age'])
Y = cleaned_df['host_age']

In [173]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Linear regression baseline (fit returns the fitted estimator).
# NOTE(review): if X has more columns than training rows, an unregularised
# least-squares fit can give extremely large test errors - check X.shape
# before trusting this number.
lr_model = LinearRegression().fit(X_train, y_train)

# Score on the held-out samples.
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Linear Regression MSE: {mse}")

Linear Regression MSE: 3.435636078812604e+27


In [174]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed, trained on the split
# created in the linear-regression cell.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score on the held-out samples.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest MSE: {mse_rf}")

Random Forest MSE: 68.0639318007663


In [175]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron with one hidden layer of 100 units, at most 1000
# training iterations and a fixed seed.
mlp_model = MLPRegressor(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Score on the held-out samples.
y_pred_mlp = mlp_model.predict(X_test)
mse_mlp = mean_squared_error(y_true=y_test, y_pred=y_pred_mlp)
print(f"MLP Regressor MSE: {mse_mlp}")

MLP Regressor MSE: 170.48459862383126


In [176]:
from xgboost import XGBRegressor

# Gradient-boosted trees: 100 estimators, learning rate 0.1, fixed seed.
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Score on the held-out samples.
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_true=y_test, y_pred=y_pred_xgb)
print(f"XGBoost MSE: {mse_xgb}")

XGBoost MSE: 66.91970884783616


In [None]:
'''
--------RPCA Num ONLY--------
Dim = 3
Scaler: MinMax Scaler
Reconstruction MSE: 0.003335061983338765
Cleaned dataset shape: (1301, 4)
Linear Regression MSE: 68.21386741357641
Random Forest MSE: 74.40174406130268
MLP Regressor MSE: 68.23618351926648
XGBoost MSE: 73.97999390869334

Dim = 10
Scaler: MinMax Scaler
Reconstruction MSE: 0.0033221211643852722
Linear Regression MSE: 64.54681775188067
Random Forest MSE: 69.77633448275861
MLP Regressor MSE: 64.54510624008212
XGBoost MSE: 73.98760806174032

--------RPCA Num & Bin--------
Dim = 3
Scaler: MinMax Scaler
Reconstruction MSE: 0.003335061983338765
Cleaned dataset shape: (1301, 1749)
Linear Regression MSE: 1.5921342189492617e+28
Random Forest MSE: 76.84507739463601
MLP Regressor MSE: 220.2787336807211
XGBoost MSE: 74.42477144102078

Dim = 10
Scaler: MinMax Scaler
Reconstruction MSE: 0.003322121164396924
Cleaned dataset shape: (1301, 1749)
Linear Regression MSE: 3.435636078812604e+27
Random Forest MSE: 68.0639318007663
MLP Regressor MSE: 170.48459862383126
XGBoost MSE: 66.91970884783616
'''

In [192]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Number of principal components to keep.
reduced_dim = 512

# Samples as rows, numerical features as columns.
data_transposed = numerical_features.T

# Standardise the features. Note that this rebinds the notebook-level name
# `scaler` to a StandardScaler.
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data_transposed)

# Project onto `reduced_dim` components, then map back to feature space to
# measure how much of the standardised data the projection retains.
pca = PCA(n_components=reduced_dim, svd_solver='full')
pca_reduced = pca.fit_transform(standardized_data)
pca_reconstructed = pca.inverse_transform(pca_reduced)

mse_pca = mean_squared_error(y_true=standardized_data, y_pred=pca_reconstructed)

print(mse_pca)

0.09005376802270713


In [208]:
# PCA scores, one row per sample.
reduced_df = pd.DataFrame(pca_reduced, index=sample_ids)
# The sample IDs are cast to float so they can be matched against
# `sample_name` in the metadata.
# NOTE(review): assumes read_csv parsed `sample_name` as float and that no two
# IDs collapse to the same float - confirm.
reduced_df.index = reduced_df.index.astype(float)

meta_df = meta.set_index('sample_name')

# Attach the regression target. The join uses `meta_df`, the frame defined
# just above; an inner join keeps only samples present in both frames.
merged_df = reduced_df.join(meta_df[['host_age']], how='inner')

# Drop samples without a recorded age.
cleaned_df = merged_df.dropna(subset=['host_age'])

# Feature matrix and target for the models in the following cells.
X = cleaned_df.drop(columns=['host_age'])
Y = cleaned_df['host_age']

In [206]:
# Using both binary and reduced numerical dataset
# PCA scores, one row per sample. The components get string names ("PC1",
# "PC2", ...) so that, once concatenated with the string-named binary
# features, every column label has the same type; scikit-learn estimators
# reject frames whose column names mix integers and strings.
reduced_df = pd.DataFrame(
    pca_reduced,
    index=sample_ids,
    columns=[f"PC{i + 1}" for i in range(pca_reduced.shape[1])],
)

# Binary features transposed to samples x features so they can sit next to
# the scores.
binary_transposed = binary_features.T

# Column-wise concatenation aligns the two frames on their sample IDs.
combined_df = pd.concat([reduced_df, binary_transposed], axis=1)
# Cast the IDs to float for matching against `sample_name`. The index is taken
# from combined_df itself so labels cannot be shifted if concat ever returns
# rows in a different order than reduced_df.
# NOTE(review): assumes read_csv parsed `sample_name` as float - confirm.
combined_df.index = combined_df.index.astype(float)

meta_df = meta.set_index('sample_name')

# Attach the regression target. The join uses `meta_df`, the frame defined
# just above; an inner join keeps only samples present in both frames.
merged_df = combined_df.join(meta_df[['host_age']], how='inner')
# Drop samples without a recorded age.
cleaned_df = merged_df.dropna(subset=['host_age'])

# Feature matrix and target for the models in the following cells.
X = cleaned_df.drop(columns=['host_age'])
Y = cleaned_df['host_age']

# print(X)

In [212]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split the data into train and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize and fit the model.
# NOTE(review): if X has many columns relative to the number of training
# rows, an unregularised least-squares fit can give very large test errors -
# check X.shape before trusting this number.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions on the held-out samples and evaluate.
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Linear Regression MSE: {mse}")

Linear Regression MSE: 2841.559584303109


In [213]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed, trained on the split
# created in the linear-regression cell.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score on the held-out samples.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest MSE: {mse_rf}")

Random Forest MSE: 66.89663639846744


In [147]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron with one hidden layer of 100 units, at most 1000
# training iterations and a fixed seed.
mlp_model = MLPRegressor(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Score on the held-out samples.
y_pred_mlp = mlp_model.predict(X_test)
mse_mlp = mean_squared_error(y_true=y_test, y_pred=y_pred_mlp)
print(f"MLP Regressor MSE: {mse_mlp}")

MLP Regressor MSE: 895.1086733724248


In [148]:
from xgboost import XGBRegressor

# Gradient-boosted trees: 100 estimators, learning rate 0.1, fixed seed.
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Score on the held-out samples.
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_true=y_test, y_pred=y_pred_xgb)
print(f"XGBoost MSE: {mse_xgb}")

XGBoost MSE: 70.36381670902817


In [None]:
'''
--------RPCA Num ONLY--------
Dim = 3
Scaler: MinMax Scaler
Reconstruction MSE: 0.003335061983338765
Cleaned dataset shape: (1301, 4)
Linear Regression MSE: 68.21386741357641
Random Forest MSE: 74.40174406130268
MLP Regressor MSE: 68.23618351926648
XGBoost MSE: 73.97999390869334

Dim = 10
Scaler: MinMax Scaler
Reconstruction MSE: 0.0033221211643852722
Linear Regression MSE: 64.54681775188067
Random Forest MSE: 69.77633448275861
MLP Regressor MSE: 64.54510624008212
XGBoost MSE: 73.98760806174032

--------RPCA Num & Bin--------
Dim = 3
Scaler: MinMax Scaler
Reconstruction MSE: 0.003335061983338765
Cleaned dataset shape: (1301, 1749)
Linear Regression MSE: 1.5921342189492617e+28
Random Forest MSE: 76.84507739463601
MLP Regressor MSE: 220.2787336807211
XGBoost MSE: 74.42477144102078

Dim = 10
Scaler: MinMax Scaler
Reconstruction MSE: 0.003322121164396924
Cleaned dataset shape: (1301, 1749)
Linear Regression MSE: 3.435636078812604e+27
Random Forest MSE: 68.0639318007663
MLP Regressor MSE: 170.48459862383126
XGBoost MSE: 66.91970884783616
'''

'''
--------PCA--------
Dim = 128
Reconstruction MSE(full): 0.3550807607177242
Cleaned dataset shape: (1301, 129)
Linear Regression MSE: 331.4252737672184
Random Forest MSE: 65.77763601532565
MLP Regressor MSE: 1105.2147394644705
XGBoost MSE: 71.20844263899042
Reconstruction MSE(auto): 0.355807160249656
Linear Regression MSE: 285.86977878343504
Random Forest MSE: 67.39131762452108
MLP Regressor MSE: 1057.784862529724
XGBoost MSE: 72.47198869159442

Dim = 512
Reconstruction MSE(full): 0.09005376802270713
Linear Regression MSE: 2841.559584303109
Random Forest MSE: 66.89663639846744
MLP Regressor MSE: 895.1086733724248
XGBoost MSE: 70.36381670902817




--------PCA With Both--------
Dim = 512
PCA MSE: 0.09005376802270713
Cleaned dataset shape: (1301, 2258)

'''

In [191]:
# Range of the regression target among the retained samples:
# maximum on the first line, minimum on the second.
print(cleaned_df['host_age'].max(), cleaned_df['host_age'].min(), sep="\n")

100.0
21.0
