In [1]:
import numpy as np
import pandas as pd

import biom
from biom import Table
from gemelli.rpca import rpca
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Input locations. Only the coverage-filtered table is analysed: an earlier
# assignment of table_path to "./adrc_full_table.biom" was overwritten on the
# very next line (dead store) and has been removed to avoid confusion.
meta_path = "./adrc_full_metadata.csv"
table_path = "./../metadata/adrc_coverage_filtered_table.biom"

# Load the sample metadata (CSV) and the feature table (BIOM).
meta = pd.read_csv(meta_path)
table = biom.load_table(table_path)

# Quick look at what was loaded: metadata dimensions and first rows, then the
# table dimensions and its first entries.
print(meta.shape)
print(meta.head(5))
print(f"Table shape: {table.shape}")
print(table.head(5))


(1312, 355)
    sample_name      NACCID Kit.Number  Specimen.Bar.Code Visit  FORMVER  \
0  15448.582342  NACC849121     376222            5823422    Y1      3.2   
1  15448.582709  NACC918252     376177            5827089    Y1      3.2   
2  15448.583516  NACC420113     376232            5835163    Y1      3.2   
3  15448.594676  NACC454008     404184            5946764    Y1      3.2   
4  15448.421452  NACC329271     420091           42145155    Y2      3.0   

   NACCVNUM  SEX  HISPANIC  RACE  ...    tube_id  turbidity_grade  uom  visit  \
0         3    1         0     1  ...  363236903              NaN  NaN     Y1   
1         4    1         0     1  ...  363195322              NaN  NaN     Y1   
2         9    2         0     1  ...  363183648              NaN  NaN     Y1   
3         4    2         0     1  ...  363236757              NaN  NaN     Y1   
4         3    2         0     2  ...  363236670              NaN  NaN     Y2   

   well_id  year_of_collection     yob  amyl

In [3]:
# Densify the BIOM table into a DataFrame: one row per observation ID and one
# column per sample ID.
biom_df = pd.DataFrame(
    data=table.matrix_data.toarray(),
    index=table.ids(axis='observation'),
    columns=table.ids(axis='sample'),
)


In [4]:
# # # Check general information about the table
# print("Table dimensions (rows, columns):", biom_df.shape)
# print("Maximum value in the table:", biom_df.values.max())
# print("Minimum value in the table:", biom_df.values.min())
# print("Mean value in the table:", biom_df.values.mean())
# print("Standard deviation in the table:", biom_df.values.std())


In [5]:
def classify_feature(row):
    """Label a feature row as 'binary' or 'numerical'.

    A row is 'binary' only when it holds exactly two distinct values and both
    of them are in {0, 1}. Everything else, including a constant row of all
    zeros or all ones, is 'numerical'.

    Parameters
    ----------
    row : pandas.Series
        The values of one feature across all samples.

    Returns
    -------
    str
        Either 'binary' or 'numerical'.
    """
    distinct = row.unique()
    # Guard clause: anything other than exactly two distinct values cannot be binary.
    if len(distinct) != 2:
        return 'numerical'
    return 'binary' if set(distinct).issubset({0, 1}) else 'numerical'

# Classify every feature (row) from its abundance values only.
# A 'feature_type' column left behind by a previous run of this cell is
# excluded first; otherwise the label string would be treated as one of the
# row's values on re-run and could change the classification.
feature_values = biom_df.drop(columns=['feature_type'], errors='ignore')
feature_type = feature_values.apply(classify_feature, axis=1)

# The label column is still written to biom_df, as before.
biom_df['feature_type'] = feature_type

# Split into the two groups; neither frame carries the label column.
binary_features = feature_values[feature_type == 'binary']
numerical_features = feature_values[feature_type == 'numerical']

print(f"Number of binary features: {binary_features.shape[0]}")
print(f"Number of numerical features: {numerical_features.shape[0]}")


Number of binary features: 0
Number of numerical features: 1102


In [6]:
# One MinMaxScaler instance is fitted twice below; after this cell it holds
# the second (transposed) fit.
scaler = MinMaxScaler()

# First fit: scaling applied to the frame as-is (features as rows, samples as
# columns), so each sample column is scaled independently.
# NOTE(review): `scaled_d` is not read by any later cell visible here.
scaled_d = numerical_features.copy()
scaled_d[numerical_features.columns] = scaler.fit_transform(numerical_features)

# Second fit: transpose so the scaler works per feature across samples, then
# transpose back to the original features-by-samples layout.
scaled = pd.DataFrame(
    data=scaler.fit_transform(numerical_features.T).T,
    index=numerical_features.index,
    columns=numerical_features.columns,
)

print(scaled)

            15448.0005635586  15448.0005643690  15448.0005671650  \
G000006865          0.000829          0.000319      1.274616e-03   
G000006925          0.000012          0.000120      9.487189e-07   
G000007325          0.000000          0.000535      0.000000e+00   
G000007465          0.000000          0.000000      0.000000e+00   
G000007525          0.000000          0.000000      2.506277e-05   
...                      ...               ...               ...   
G902377395          0.013391          0.004304      1.960784e-02   
G902385285          0.018129          0.057304      1.898153e-02   
G902387315          0.005682          0.002308      3.866189e-02   
G902406265          0.002646          0.000075      1.117818e-04   
G902765685          0.000000          0.000007      3.529416e-06   

            15448.0005700235  15448.0005704913  15448.0005704938  \
G000006865          0.000637          0.000255          0.000000   
G000006925          0.000162          0.000874 

In [7]:
# Same scaling as the previous cell, except that `scaled` is kept as the raw
# array returned by the scaler rather than wrapped in a DataFrame.
scaler = MinMaxScaler()

# Scaling applied to the frame as-is (each sample column scaled independently).
# NOTE(review): `scaled_d` is not read by any later cell visible here.
scaled_d = numerical_features.copy()
scaled_d[numerical_features.columns] = scaler.fit_transform(numerical_features)

# Per-feature scaling: fit on the transpose, then transpose back so rows are
# features and columns are samples again.
per_sample_rows = numerical_features.T
scaled = scaler.fit_transform(per_sample_rows).T

# print("Scaled Numerical Dataset:")
# print(scaled.head())
# print(scaled)

In [8]:
# Row/column labels for the scaled matrix (rows = features, columns = samples).
feature_ids = numerical_features.index.tolist()   # Feature IDs (rows)
sample_ids = numerical_features.columns.tolist()  # Sample IDs (columns)

# Add a tiny constant to every entry, in place.
# NOTE(review): this mutates `scaled`, so re-running the cell adds the constant
# again; a later cell reads the mutated `scaled`, so it is left as-is here.
scaled += 1e-10

# Wrap the scaled matrix in a BIOM Table for the RPCA step.
table_scaled = Table(scaled, feature_ids, sample_ids)


In [None]:
# Number of components requested from RPCA.
dim = 128

# rpca returns a pair, unpacked here as (ordination, distance).
rpca_results = rpca(table_scaled, n_components=dim)
ordination, distance = rpca_results

# Score matrices taken from the ordination result.
sample_scores = ordination.samples    # Scores for samples
feature_scores = ordination.features  # Scores for features

# Product of the sample scores with the transposed feature scores, compared
# against the samples-by-features view of the scaled matrix.
# NOTE(review): the ordination's eigenvalues are not applied here and rpca
# presumably transforms its input internally -- confirm that this product is
# a meaningful reconstruction of `scaled` before relying on the MSE.
X_reconstructed = np.dot(sample_scores, feature_scores.T)
mse = mean_squared_error(scaled.T, X_reconstructed)
print(f"Reconstruction MSE: {mse}")

# Report the shapes of everything involved, in a fixed order.
for _label, _shape in (
    ("Sample scores", sample_scores.shape),
    ("Feature scores", feature_scores.shape),
    ("Original scaled data", scaled.T.shape),
    ("Reconstructed data", X_reconstructed.shape),
):
    print(f"{_label} shape: {_shape}")

In [243]:
# Reconstruction MSE: 0.003958728646643008
# Sample scores shape: (1312, 3)
# Feature scores shape: (1102, 3)
# Original scaled data shape: (1312, 1102)
# Reconstructed data shape: (1312, 1102)


# Reconstruction MSE: 0.0033221211643852722
# Sample scores shape: (1312, 10)
# Feature scores shape: (8448, 10)
# Original scaled data shape: (1312, 8448)
# Reconstructed data shape: (1312, 8448)

In [244]:
# print(ordination)
# print(distance)

# Ordination results:
# 	Method: (Robust Aitchison) RPCA Biplot (rpca_biplot)
# 	Eigvals: 3
# 	Proportion explained: 3
# 	Features: 1102x3
# 	Samples: 1312x3
# 	Biplot Scores: N/A
# 	Sample constraints: N/A
# 	Feature IDs: 'G000006865', 'G000006925', 'G000007325', 'G000007465', 'G000007525', ...
# 	Sample IDs: '15448.0005635586', '15448.0005643690', '15448.0005671650', '15448.0005700235', ...
# 1312x1312 distance matrix
# IDs:
# '15448.0005635586', '15448.0005643690', '15448.0005671650', '15448.0005700235', ...
# Data:
# [[0.         2.08392107 0.56944807 ... 2.39861693 1.28328955 2.31825922]
#  [2.08392107 0.         1.57075643 ... 0.9076768  1.28762466 0.98842086]
#  [0.56944807 1.57075643 0.         ... 1.82917398 1.08186416 1.75242724]
#  ...
#  [2.39861693 0.9076768  1.82917398 ... 0.         2.0035645  0.24021381]
#  [1.28328955 1.28762466 1.08186416 ... 2.0035645  0.         1.97177064]
#  [2.31825922 0.98842086 1.75242724 ... 0.24021381 1.97177064 0.        ]]


In [245]:
# Only using reduced scaled numerical dataset
reduced_df = pd.DataFrame(sample_scores, index=sample_ids)
# Cast the sample IDs to float so they can be matched against the metadata index.
# NOTE(review): assumes meta['sample_name'] holds float values -- confirm.
reduced_df.index = reduced_df.index.astype(float)

# Metadata keyed by sample name.
meta_df = meta.set_index('sample_name')

# Attach the target column; the inner join keeps only samples present in both.
# Fix: this previously referenced `metadata_df`, which is never defined in the
# notebook (the frame built above is `meta_df`) and raised NameError on a
# fresh kernel.
merged_df = reduced_df.join(meta_df[['host_age']], how='inner')

# Drop samples without a recorded host_age.
# Shapes recorded from an earlier run: merged (1312, 4), cleaned (1301, 4).
cleaned_df = merged_df.dropna(subset=['host_age'])

# Model inputs (all reduced components) and target.
X = cleaned_df.drop(columns=['host_age'])
Y = cleaned_df['host_age']

In [246]:
# # Using both binary and reduced numerical dataset
# reduced_df = pd.DataFrame(sample_scores, index=sample_ids)

# binary_transposed = binary_features.T 
# # reduced_df.index = reduced_df.index.astype(float)
# # print(reduced_df.index)
# # print(binary_transposed.index)
# # print("Do sample names match?", (reduced_df.index == binary_transposed.index).all())
# combined_df = pd.concat([reduced_df, binary_transposed], axis=1)
# combined_df.index = reduced_df.index.astype(float)

# meta_df = meta.set_index('sample_name')

# merged_df = combined_df.join(metadata_df[['host_age']], how='inner')
# cleaned_df = merged_df.dropna(subset=['host_age'])

# # print(f"Original dataset shape: {merged_df.shape}")
# # print(f"Cleaned dataset shape: {cleaned_df.shape}")
# # # Original dataset shape: (1312, 1749)
# # # Cleaned dataset shape: (1301, 1749)

# X = cleaned_df.drop(columns=['host_age'])
# Y = cleaned_df['host_age']

In [247]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the samples for testing; the seed is fixed so the split is
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Ordinary least-squares baseline fitted on the training split.
lr_model = LinearRegression().fit(X_train, y_train)

# Mean squared error on the held-out split.
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Linear Regression MSE: {mse}")

Linear Regression MSE: 59.808153281084984


In [248]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed, fitted on the training split.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Mean squared error on the held-out split.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest MSE: {mse_rf}")

Random Forest MSE: 62.939483908045986


In [249]:
from sklearn.neural_network import MLPRegressor

# MLP with a single hidden layer of 100 units, up to 1000 iterations and a
# fixed seed, fitted on the training split.
mlp_model = MLPRegressor(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Mean squared error on the held-out split.
y_pred_mlp = mlp_model.predict(X_test)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
print(f"MLP Regressor MSE: {mse_mlp}")

MLP Regressor MSE: 59.78929765458566


In [250]:
from xgboost import XGBRegressor

# Gradient-boosted trees: 100 estimators, learning rate 0.1, fixed seed,
# fitted on the training split.
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Mean squared error on the held-out split.
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost MSE: {mse_xgb}")

XGBoost MSE: 64.94144790232647


In [None]:
'''
--------RPCA Num ONLY--------
Dim = 3
Scaler: MinMax Scaler
Reconstruction MSE: 0.003958728646643008
Cleaned dataset shape: (1301, 4)
Linear Regression MSE: 68.19986181183467
Random Forest MSE: 75.95139961685823
MLP Regressor MSE: 68.22414791217268
XGBoost MSE: 76.82236484129864

Dim = 10
Scaler: MinMax Scaler
Reconstruction MSE: 0.003946154448124425
Linear Regression MSE: 59.808153281084984
Random Forest MSE: 62.939483908045986
MLP Regressor MSE: 59.78929765458566
XGBoost MSE: 64.94144790232647

'''

In [276]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Put samples in rows and features in columns.
data_transposed = numerical_features.T

# Standardise each feature before PCA.
scaler = StandardScaler()
standardized_data = scaler.fit_transform(data_transposed)

# Project onto `reduced_dim` principal components and map back to measure how
# much of the standardised data is retained.
# Fix: random_state is now set. With svd_solver='auto' scikit-learn may choose
# the randomized solver, which made the scores (and every downstream model
# metric) vary from run to run. 42 matches the seed used by the other
# estimators in this notebook. Metrics recorded from earlier unseeded runs
# may therefore differ slightly.
reduced_dim = 512
pca = PCA(n_components=reduced_dim, svd_solver='auto', random_state=42)
pca_reduced = pca.fit_transform(standardized_data)
pca_reconstructed = pca.inverse_transform(pca_reduced)

# Reconstruction error in the standardised space.
mse_pca = mean_squared_error(standardized_data, pca_reconstructed)

print(mse_pca)

0.01743150302309283


In [277]:
# PCA components, one row per sample.
reduced_df = pd.DataFrame(pca_reduced, index=sample_ids)
# Cast the sample IDs to float so they can be matched against the metadata index.
# NOTE(review): assumes meta['sample_name'] holds float values -- confirm.
reduced_df.index = reduced_df.index.astype(float)

# Metadata keyed by sample name.
meta_df = meta.set_index('sample_name')

# Attach the target column; the inner join keeps only samples present in both.
# Fix: this previously referenced `metadata_df`, which is never defined in the
# notebook (the frame built above is `meta_df`) and raised NameError on a
# fresh kernel.
merged_df = reduced_df.join(meta_df[['host_age']], how='inner')

# Drop samples without a recorded host_age.
cleaned_df = merged_df.dropna(subset=['host_age'])

# Model inputs (all PCA components) and target.
X = cleaned_df.drop(columns=['host_age'])
Y = cleaned_df['host_age']

In [278]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Hold out 20% of the samples for testing; the seed is fixed so the split is
# the same on every run.
# (A stray bare `X_train` expression that sat mid-cell was removed: it was not
# the last statement of the cell, so it displayed nothing and had no effect.)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Ordinary least-squares baseline fitted on the training split.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Mean squared error on the held-out split.
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Linear Regression MSE: {mse}")

Linear Regression MSE: 580.8346541828049


In [279]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed, fitted on the training split.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Mean squared error on the held-out split.
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest MSE: {mse_rf}")

Random Forest MSE: 63.36988850574713


In [280]:
from sklearn.neural_network import MLPRegressor

# MLP with a single hidden layer of 100 units, up to 1000 iterations and a
# fixed seed, fitted on the training split.
mlp_model = MLPRegressor(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Mean squared error on the held-out split.
y_pred_mlp = mlp_model.predict(X_test)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
print(f"MLP Regressor MSE: {mse_mlp}")

MLP Regressor MSE: 911.1563581922785


In [281]:
from xgboost import XGBRegressor

# Gradient-boosted trees: 100 estimators, learning rate 0.1, fixed seed,
# fitted on the training split.
xgb_model = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Mean squared error on the held-out split.
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost MSE: {mse_xgb}")

XGBoost MSE: 67.18452645782199


In [269]:
'''
--------PCA--------
Dim = 128
Reconstruction MSE(full): 0.32280275488908244
Cleaned dataset shape: (1301, 129)
Linear Regression MSE: 2841.559584303109
Random Forest MSE: 66.89663639846744
MLP Regressor MSE: 895.1086733724248
XGBoost MSE: 70.36381670902817

Reconstruction MSE(auto): 0.32696752866827633
Linear Regression MSE: 97.59366339211996
Random Forest MSE: 64.57761494252874
MLP Regressor MSE: 970.4559274788406
XGBoost MSE: 64.4298729789169

Dim = 512
Reconstruction MSE(full): 0.01719558259971444
Linear Regression MSE: 551.90068430994
Random Forest MSE: 66.89663639846744
MLP Regressor MSE: 811.2290273839908
XGBoost MSE: 69.61207270397696

Reconstruction MSE(auto): 0.01743150302309283
Linear Regression MSE: 580.8346541828049
Random Forest MSE: 63.36988850574713
MLP Regressor MSE: 911.1563581922785
XGBoost MSE: 67.18452645782199
'''

'\n--------PCA--------\nDim = 128\nReconstruction MSE(full): 0.32280275488908244\nCleaned dataset shape: (1301, 129)\nLinear Regression MSE: 2841.559584303109\nRandom Forest MSE: 66.89663639846744\nMLP Regressor MSE: 895.1086733724248\nXGBoost MSE: 70.36381670902817\n\nReconstruction MSE(auto): 0.32696752866827633\nLinear Regression MSE: 97.59366339211996\nRandom Forest MSE: 65.1517490421456\nMLP Regressor MSE: 970.4559274788406\nXGBoost MSE: 64.4298729789169\n\nDim = 512\nReconstruction MSE(full): 0.09005376802270713\nLinear Regression MSE: 2841.559584303109\nRandom Forest MSE: 66.89663639846744\nMLP Regressor MSE: 895.1086733724248\nXGBoost MSE: 70.36381670902817\n\n\n\n'

In [191]:
# Range of the regression target after cleaning: largest value first, then
# the smallest.
host_age = cleaned_df['host_age']
print(host_age.max())
print(host_age.min())

100.0
21.0
