In [1]:
import numpy as np
import pandas as pd
import biom
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from biom import Table
from sklearn.metrics import pairwise_distances
from skbio.stats.ordination import pcoa
from sklearn.metrics import mean_squared_error

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
table_path = "./redbiom_adrc_wolr2_fecal_v2.biom"
meta_path = "./redbiom_adrc_wolr2_fecal_v2.tsv"

table = biom.load_table(table_path)

meta = pd.read_csv(meta_path, sep='\t')

# print(meta.shape)
# print(meta.head(5))
# print(f"Table shape: {table.shape}")
# print(table.head(5))


In [4]:
# Convert the BIOM table to a pandas DataFrame (OTUs as rows, samples as columns)
biom_df = pd.DataFrame(table.matrix_data.toarray(), index=table.ids(axis='observation'), columns=table.ids(axis='sample'))


In [5]:
def classify_feature(row):
    unique_values = row.unique()
    if len(unique_values) == 2 and set(unique_values).issubset({0, 1}):
        return 'binary'
    else:
        return 'numerical'

biom_df['feature_type'] = biom_df.apply(classify_feature, axis=1)

binary_features = biom_df[biom_df['feature_type'] == 'binary']
numerical_features = biom_df[biom_df['feature_type'] == 'numerical']

binary_features = binary_features.drop(columns=['feature_type'])
numerical_features = numerical_features.drop(columns=['feature_type'])

print(f"Number of binary features: {binary_features.shape[0]}")
print(f"Number of numerical features: {numerical_features.shape[0]}")


Number of binary features: 1568
Number of numerical features: 13795


In [6]:
# # # Check general information about the table
# print("Table dimensions (rows, columns):", biom_df.shape)
# print("Maximum value in the table:", biom_df.values.max())
# print("Minimum value in the table:", biom_df.values.min())
# print("Mean value in the table:", biom_df.values.mean())
# print("Standard deviation in the table:", biom_df.values.std())


In [7]:
num_data = numerical_features.T

In [8]:
scaled_d = numerical_features.copy()
scaler = MinMaxScaler()

scaled_d[numerical_features.columns] = scaler.fit_transform(numerical_features)
scaled = scaler.fit_transform(num_data)

# print("Scaled Numerical Dataset:")
# print(scaled.head())
# print(scaled.shape) # (13795, 13436)

In [9]:
scaler_2 = MinMaxScaler()
scaled_2 = scaler_2.fit_transform(num_data)

In [10]:
scaler_1 = StandardScaler()
scaled_1 = scaler_1.fit_transform(num_data)

In [11]:
scaled_2.shape

(13436, 13795)

In [12]:
clean = scaled[~np.isnan(scaled).any(axis=1)]

In [13]:
print(scaled.shape)
print(clean.shape)

(13436, 13795)
(13436, 13795)


In [14]:
import time
start_time = time.time()
print(start_time)

df_scaled = pd.DataFrame(scaled)

dm = pairwise_distances(num_data, metric='euclidean')
# dm = pairwise_distances(scaled_2, metric='euclidean')
# dm = (dm + dm.T) / 2

# dm = pairwise_distances(scaled_2, metric='euclidean')


dim = 256

pcoa_results = pcoa(dm)
reduced = pcoa_results.samples.iloc[:, :dim]
print(reduced)


# View the PCoA results
print(pcoa_results)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")
'''
Note: runtime has no connection with output dimension. Dim only affects how many rows we want from the result.
Dim = 16, Execution time: 149.83883666992188 seconds
Dim = 64, Execution time: 226.11388063430786 seconds
Dim = 128, Execution time: 147.53481197357178 seconds
Dim = 128, Execution time: 150.34863781929016 seconds
'''

1727712897.6928103


  warn(


                PC1            PC2            PC3            PC4  \
0     -3.248695e+06 -576371.445652 -607291.079868  511376.074966   
1     -3.115778e+02   39123.159677   60486.508880  -70207.490780   
2      1.436833e+05  -48037.821287   13673.230721   -1633.312265   
3      1.127946e+05   76440.615929   -6093.942982  -16852.402048   
4      1.707834e+05  157426.647613  -92674.275936   50247.424616   
...             ...            ...            ...            ...   
13431 -3.791729e+05  -91210.138215 -123400.219649  180843.622730   
13432 -2.704470e+05  -54108.470951 -126675.877762   83746.482084   
13433 -7.597396e+05   89641.608099   35056.912698  -53501.198762   
13434 -9.806089e+05 -119857.658340 -247686.524320  184542.014154   
13435 -7.683849e+05  100479.157638  -92696.145454  -36011.426261   

                 PC5           PC6           PC7            PC8  \
0      -74201.107154  2.679797e+05  3.506847e+05  234849.169595   
1      -93666.208706  2.378098e+05 -2.841562e+04 

'\nNote: runtime has no connection with output dimension. Dim only affects how many rows we want from the result.\nDim = 16, Execution time: 149.83883666992188 seconds\nDim = 64, Execution time: 226.11388063430786 seconds\nDim = 128, Execution time: 147.53481197357178 seconds\nDim = 128, Execution time: 150.34863781929016 seconds\n'

In [15]:
explained_variance_1 = pcoa_results.eigvals / pcoa_results.eigvals.sum()

cumulative_explained_variance_1 = explained_variance_1.sum()
print(explained_variance_1)
print(cumulative_explained_variance_1)
print(explained_variance_1.head(16))

PC1        0.325234
PC2        0.085104
PC3        0.077676
PC4        0.068997
PC5        0.044542
             ...   
PC13432    0.000000
PC13433    0.000000
PC13434    0.000000
PC13435    0.000000
PC13436    0.000000
Length: 13436, dtype: float64
0.9999999999999999
PC1     0.325234
PC2     0.085104
PC3     0.077676
PC4     0.068997
PC5     0.044542
PC6     0.033578
PC7     0.024592
PC8     0.016536
PC9     0.015819
PC10    0.015359
PC11    0.013926
PC12    0.012939
PC13    0.009901
PC14    0.009748
PC15    0.009349
PC16    0.009036
dtype: float64


In [16]:
# from skbio.stats.distance import mantel

# reduced_distances = pairwise_distances(reduced, metric='euclidean')
# original_distances = pairwise_distances(num_data, metric='euclidean')
# correlation, p_value, _ = mantel(original_distances, reduced_distances)
# print(f"Mantel test correlation: {correlation}, p-value: {p_value}")


In [17]:
# df_scaled = pd.DataFrame(scaled)

# # dm = pairwise_distances(num_data, metric='euclidean')
# # dm = pairwise_distances(scaled_1, metric='euclidean')
# # dm = (dm + dm.T) / 2

# dm = pairwise_distances(scaled_2, metric='euclidean')


# dim = 16

# pcoa_results = pcoa(dm)
# reduced = pcoa_results.samples.iloc[:, :dim]
# print(reduced)


# # View the PCoA results
# print(pcoa_results)

In [18]:
def combine_df(df1, df2):
    df1 = pd.DataFrame(df1)
    df1 = df1.reset_index(drop=True)
    df2 = df2.reset_index(drop=True)
    return pd.concat([df1, df2], axis=1)
    
def get_X_y(reduced, y, cat):
    combined_df = combine_df(reduced, y)
    cleaned_df = combined_df.dropna()
    X = cleaned_df.drop(columns=[cat])
    y = cleaned_df[cat]
    return X, y

def training(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Linear Regression
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    
    y_pred = lr_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Linear Regression MSE: {mse}")

    # Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    y_pred_rf = rf_model.predict(X_test)
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    print(f"Random Forest MSE: {mse_rf}")

    # # MLP
    # mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=5000, random_state=42)
    # mlp_model.fit(X_train, y_train)
    
    # y_pred_mlp = mlp_model.predict(X_test)
    # mse_mlp = mean_squared_error(y_test, y_pred_mlp)
    # print(f"MLP Regressor MSE: {mse_mlp}")

    # XGBoost
    xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    xgb_model.fit(X_train, y_train)
    
    y_pred_xgb = xgb_model.predict(X_test)
    mse_xgb = mean_squared_error(y_test, y_pred_xgb)
    print(f"XGBoost MSE: {mse_xgb}")

In [19]:
# cat = 'host_age'
# cat_y = meta[cat]
# cat_X = combine_df(reduced, binary_features)

In [20]:
# X, y = get_X_y(cat_X, cat_y, cat)
# training(X, y)

In [21]:
cat = 'host_age'

reduced_df = combine_df(reduced, binary_features.T)
reduced_df.index = num_data.index
reduced_df.index.name = '#SampleID'
meta_df = meta.set_index('#SampleID')

merged_df = pd.merge(reduced_df, meta_df[cat], left_index=True, right_index=True, how='inner')

# print(merged_df.head())
# print(merged_df.shape)

cleaned_df = merged_df.dropna(subset=['host_age'])
f_cleaned_df = merged_df.dropna()
# print(f"Original dataset shape: {merged_df.shape}")
# print(f"Cleaned dataset shape: {cleaned_df.shape}")
# Original dataset shape: (1312, 4)
# Cleaned dataset shape: (1301, 4)

print(cleaned_df.shape)
print(f_cleaned_df.shape)
X = cleaned_df.drop(columns=['host_age'])
Y = cleaned_df['host_age']
training(X, Y)

(13436, 1825)
(13436, 1825)
Linear Regression MSE: 247037.21177537154
Random Forest MSE: 215.7651744320405
XGBoost MSE: 204.33327737775818


In [22]:
# print(binary_features)

In [None]:
# # PCoA, Dim = 16

# # Linear Regression MSE: 80720753047910.94
# # Random Forest MSE: 290.40436212971275
# # XGBoost MSE: 300.0148567682974

# # PC1     0.325234
# # PC2     0.085104
# # PC3     0.077676
# # PC4     0.068997
# # PC5     0.044542
# # PC6     0.033578
# # PC7     0.024592
# # PC8     0.016536
# # PC9     0.015819
# # PC10    0.015359
# # PC11    0.013926
# # PC12    0.012939
# # PC13    0.009901
# # PC14    0.009748
# # PC15    0.009349
# # PC16    0.009036
# # dtype: float64

# dim = 128
# (13436, 1697)
# (13436, 1697)
# Linear Regression MSE: 51820.4883765476
# Random Forest MSE: 218.34668141373584
# XGBoost MSE: 211.44149421183153

1727329770.3884413


In [None]:
def combine_df(df1, df2):
    df1 = pd.DataFrame(df1)
    df1 = df1.reset_index(drop=True)
    df2 = df2.reset_index(drop=True)
    return pd.concat([df1, df2], axis=1)
    
def get_X_y(reduced, y, cat):
    combined_df = combine_df(reduced, y)
    cleaned_df = combined_df.dropna()
    X = cleaned_df.drop(columns=[cat])
    y = cleaned_df[cat]
    return X, y

def training(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Linear Regression
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    
    y_pred = lr_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Linear Regression MSE: {mse}")

    # Random Forest
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    y_pred_rf = rf_model.predict(X_test)
    mse_rf = mean_squared_error(y_test, y_pred_rf)
    print(f"Random Forest MSE: {mse_rf}")

    # # MLP
    # mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=5000, random_state=42)
    # mlp_model.fit(X_train, y_train)
    
    # y_pred_mlp = mlp_model.predict(X_test)
    # mse_mlp = mean_squared_error(y_test, y_pred_mlp)
    # print(f"MLP Regressor MSE: {mse_mlp}")

    # XGBoost
    xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    xgb_model.fit(X_train, y_train)
    
    y_pred_xgb = xgb_model.predict(X_test)
    mse_xgb = mean_squared_error(y_test, y_pred_xgb)
    print(f"XGBoost MSE: {mse_xgb}")

In [None]:
cat = 'host_age'
cat_y = meta[y_category]
cat_X = combine_df(reduced_data, binary_features)

In [None]:
X, y = get_X_y(cat_X, cat_y, cat)
training(X, y)

In [None]:
# Dim = 3
# Linear Regression MSE: 3.995264211009359e+28
# Random Forest MSE: 332.3066680347287
# /opt/conda/lib/python3.11/site-packages/sklearn/neural_network/_multilayer_perceptron.py:697: UserWarning: Training interrupted by user.
#   warnings.warn("Training interrupted by user.")
# MLP Regressor MSE: 685.7311446346571
# XGBoost MSE: 299.14464191813653

# Dim = 16
# Reconstruction MSE: 0.00022048968863458537
# Sample scores shape: (13436, 16)
# Feature scores shape: (13795, 16)
# Original scaled data shape: (13436, 13795)
# Reconstructed data shape: (13436, 13795)

# Linear Regression MSE: 1.1185584052698574e+29
# Random Forest MSE: 297.4191591508802
# XGBoost MSE: 306.4656906982071

print(y.max())
print(y.min())

In [None]:
scaled_d = numerical_features.copy()
scaler = MinMaxScaler()

scaled_d[numerical_features.columns] = scaler.fit_transform(numerical_features)
scaled = pd.DataFrame(scaler.fit_transform(numerical_features.T).T, index=numerical_features.index, columns=numerical_features.columns)

# print("Scaled Numerical Dataset:")
# print(scaled.head())
# print(scaled)

In [None]:
def classify_feature(row):
    unique_values = row.unique()
    if len(unique_values) == 2 and set(unique_values).issubset({0, 1}):
        return 'binary'
    else:
        return 'numerical'

def split_feature(biom_df):
    biom_df['feature_type'] = biom_df.apply(classify_feature, axis=1)

    binary_features = biom_df[biom_df['feature_type'] == 'binary']
    numerical_features = biom_df[biom_df['feature_type'] == 'numerical']
    
    binary_features = binary_features.drop(columns=['feature_type'])
    numerical_features = numerical_features.drop(columns=['feature_type'])
    
    print(f"Number of binary features: {binary_features.shape[0]}")
    print(f"Number of numerical features: {numerical_features.shape[0]}")

    return binary_features, numerical_features

def scale_data(numerical_features):
    scaled_d = numerical_features.copy()
    scaler = MinMaxScaler()
    scaled_d[numerical_features.columns] = scaler.fit_transform(numerical_features)
    scaled = scaler.fit_transform(numerical_features.T).T
    return scaled

def rpca_fr(scaled, numerical_features, dim):
    scaled += 1e-10
    sample_ids = numerical_features.columns.tolist()  # Sample IDs (columns)
    feature_ids = numerical_features.index.tolist()   # Feature IDs (rows)
    table_scaled = Table(scaled, feature_ids, sample_ids)

    rpca_results = rpca(table_scaled, n_components=dim)

    ordination, distance = rpca_results
    sample_scores = ordination.samples  # Scores for samples
    feature_scores = ordination.features  # Scores for features
    
    X_reconstructed = np.dot(sample_scores, feature_scores.T)
    
    mse = mean_squared_error(scaled.T, X_reconstructed)
    print(f"Reconstruction MSE: {mse}")
    
    print(f"Sample scores shape: {sample_scores.shape}")
    print(f"Feature scores shape: {feature_scores.shape}")
    print(f"Original scaled data shape: {scaled.T.shape}")
    print(f"Reconstructed data shape: {X_reconstructed.shape}")

    return sample_scores

In [None]:
def rpca(df, dim):
    bin_data, num_data = split_feature(df)
    scaled_data = scale_data(num_data)
    reduced_data = rpca_fr(scaled_data, num_data, dim)
    return reduced_data

In [None]:
rpca(biom_df, 3)

In [None]:
scaled += 1e-10
sample_ids = numerical_features.columns.tolist()  # Sample IDs (columns)
feature_ids = numerical_features.index.tolist()   # Feature IDs (rows)
table_scaled = Table(scaled, feature_ids, sample_ids)


In [None]:
dim = 3  # Number of components to reduce to
rpca_results = rpca(table_scaled, n_components=dim)

ordination, distance = rpca_results
sample_scores = ordination.samples  # Scores for samples
feature_scores = ordination.features  # Scores for features

X_reconstructed = np.dot(sample_scores, feature_scores.T)

mse = mean_squared_error(scaled.T, X_reconstructed)
print(f"Reconstruction MSE: {mse}")

print(f"Sample scores shape: {sample_scores.shape}")
print(f"Feature scores shape: {feature_scores.shape}")
print(f"Original scaled data shape: {scaled.T.shape}")
print(f"Reconstructed data shape: {X_reconstructed.shape}")

In [None]:
# Reconstruction MSE: 0.003958728646643008
# Sample scores shape: (1312, 3)
# Feature scores shape: (1102, 3)
# Original scaled data shape: (1312, 1102)
# Reconstructed data shape: (1312, 1102)


# Reconstruction MSE: 0.0033221211643852722
# Sample scores shape: (1312, 10)
# Feature scores shape: (8448, 10)
# Original scaled data shape: (1312, 8448)
# Reconstructed data shape: (1312, 8448)

In [None]:
print(ordination)
print(distance)



In [None]:
# Only using reduced scaled numerical dataset
reduced_df = pd.DataFrame(sample_scores, index=sample_ids)
reduced_df.index = reduced_df.index.astype(float)

meta_df = meta.set_index('sample_name')

merged_df = reduced_df.join(metadata_df[['host_age']], how='inner')

# print(merged_df.head())
# print(merged_df.shape)

cleaned_df = merged_df.dropna(subset=['host_age'])

# print(f"Original dataset shape: {merged_df.shape}")
# print(f"Cleaned dataset shape: {cleaned_df.shape}")
# Original dataset shape: (1312, 4)
# Cleaned dataset shape: (1301, 4)

X = cleaned_df.drop(columns=['host_age'])
Y = cleaned_df['host_age']

In [None]:
# # Using both binary and reduced numerical dataset
# reduced_df = pd.DataFrame(sample_scores, index=sample_ids)

# binary_transposed = binary_features.T 
# # reduced_df.index = reduced_df.index.astype(float)
# # print(reduced_df.index)
# # print(binary_transposed.index)
# # print("Do sample names match?", (reduced_df.index == binary_transposed.index).all())
# combined_df = pd.concat([reduced_df, binary_transposed], axis=1)
# combined_df.index = reduced_df.index.astype(float)

# meta_df = meta.set_index('sample_name')

# merged_df = combined_df.join(metadata_df[['host_age']], how='inner')
# cleaned_df = merged_df.dropna(subset=['host_age'])

# # print(f"Original dataset shape: {merged_df.shape}")
# # print(f"Cleaned dataset shape: {cleaned_df.shape}")
# # # Original dataset shape: (1312, 1749)
# # # Cleaned dataset shape: (1301, 1749)

# X = cleaned_df.drop(columns=['host_age'])
# Y = cleaned_df['host_age']

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize and fit the model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Linear Regression MSE: {mse}")

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and fit the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest MSE: {mse_rf}")

In [None]:
from sklearn.neural_network import MLPRegressor

# Initialize and fit the model
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
mlp_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_mlp = mlp_model.predict(X_test)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
print(f"MLP Regressor MSE: {mse_mlp}")

In [None]:
from xgboost import XGBRegressor

# Initialize and fit the model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost MSE: {mse_xgb}")

In [None]:
'''
--------RPCA Num ONLY--------
Dim = 3
Scaler: MinMax Scaler
Reconstruction MSE: 0.003958728646643008
Cleaned dataset shape: (1301, 4)
Linear Regression MSE: 68.19986181183467
Random Forest MSE: 75.95139961685823
MLP Regressor MSE: 68.22414791217268
XGBoost MSE: 76.82236484129864

Dim = 10
Scaler: MinMax Scaler
Reconstruction MSE: 0.003946154448124425
Linear Regression MSE: 59.808153281084984
Random Forest MSE: 62.939483908045986
MLP Regressor MSE: 59.78929765458566
XGBoost MSE: 64.94144790232647

'''

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

data_transposed = numerical_features.T

scaler = StandardScaler()
standardized_data = scaler.fit_transform(data_transposed)

reduced_dim = 512
pca = PCA(n_components=reduced_dim, svd_solver='auto')
pca_reduced = pca.fit_transform(standardized_data)
pca_reconstructed = pca.inverse_transform(pca_reduced)

mse_pca = mean_squared_error(standardized_data, pca_reconstructed)

print(mse_pca)

In [None]:
reduced_df = pd.DataFrame(pca_reduced, index=sample_ids)
reduced_df.index = reduced_df.index.astype(float)

meta_df = meta.set_index('sample_name')

merged_df = reduced_df.join(metadata_df[['host_age']], how='inner')

# print(merged_df.head())
# print(merged_df.shape)

cleaned_df = merged_df.dropna(subset=['host_age'])

# print(f"Original dataset shape: {merged_df.shape}")
# print(f"Cleaned dataset shape: {cleaned_df.shape}")
# Original dataset shape: (1312, 4)
# Cleaned dataset shape: (1301, 4)

X = cleaned_df.drop(columns=['host_age'])
Y = cleaned_df['host_age']

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train

# Initialize and fit the model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Linear Regression MSE: {mse}")

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialize and fit the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest MSE: {mse_rf}")

In [None]:
from sklearn.neural_network import MLPRegressor

# Initialize and fit the model
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
mlp_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_mlp = mlp_model.predict(X_test)
mse_mlp = mean_squared_error(y_test, y_pred_mlp)
print(f"MLP Regressor MSE: {mse_mlp}")

In [None]:
from xgboost import XGBRegressor

# Initialize and fit the model
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost MSE: {mse_xgb}")

In [None]:
'''
--------PCA--------
Dim = 128
Reconstruction MSE(full): 0.32280275488908244
Cleaned dataset shape: (1301, 129)
Linear Regression MSE: 2841.559584303109
Random Forest MSE: 66.89663639846744
MLP Regressor MSE: 895.1086733724248
XGBoost MSE: 70.36381670902817

Reconstruction MSE(auto): 0.32696752866827633
Linear Regression MSE: 97.59366339211996
Random Forest MSE: 64.57761494252874
MLP Regressor MSE: 970.4559274788406
XGBoost MSE: 64.4298729789169

Dim = 512
Reconstruction MSE(full): 0.01719558259971444
Linear Regression MSE: 551.90068430994
Random Forest MSE: 66.89663639846744
MLP Regressor MSE: 811.2290273839908
XGBoost MSE: 69.61207270397696

Reconstruction MSE(auto): 0.01743150302309283
Linear Regression MSE: 580.8346541828049
Random Forest MSE: 63.36988850574713
MLP Regressor MSE: 911.1563581922785
XGBoost MSE: 67.18452645782199
'''

In [None]:
print(cleaned_df['host_age'].max())
print(cleaned_df['host_age'].min())