In [1]:
import numpy as np
import pandas as pd
import biom
from biom import Table
from gemelli.rpca import rpca
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
meta_path = './adrc_full_metadata.csv'
meta = pd.read_csv(meta_path)

metab1_path = './../v2_dataset/metab_df_1_processed.csv'
metab2_path = './../v2_dataset/metab_df_2_processed.csv'
metab3_path = './../v2_dataset/metab_drugs_df_1_processed.csv'
metab4_path = './../v2_dataset/metab_drugs_df_2_processed.csv'


In [4]:
# print(df.shape)
# print(df.head(5))

# print(df2.shape)
# print(df2.head(5))

# print(df3.shape)
# print(df3.head(5))

# print(df4.shape)
# print(df4.head(5))

In [5]:
def classify_feature(row):
    unique_values = row.unique()
    if len(unique_values) == 2 and set(unique_values).issubset({0, 1}):
        return 'binary'
    else:
        return 'numerical'
        
        
def rpca_fr(path, dim):
    df = pd.read_csv(path, delimiter='\t')

    df['feature_type'] = df.apply(classify_feature, axis=1)
    binary_features = df[df['feature_type'] == 'binary']
    numerical_features = df[df['feature_type'] == 'numerical']
    binary_features = binary_features.drop(columns=['feature_type'])
    numerical_features = numerical_features.drop(columns=['feature_type'])
    print(f'Number of binary features: {binary_features.shape[0]}')
    print(f'Number of numerical features: {numerical_features.shape[0]}')

    scaled = numerical_features.copy()
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(numerical_features)

    scaled += 1e-10
    sample_ids = numerical_features.index.tolist()  # Sample IDs (rows)
    feature_ids = numerical_features.columns.tolist()   # Feature IDs (columns)
    table_scaled = Table(scaled.T, feature_ids, sample_ids)

    rpca_results = rpca(table_scaled, n_components=dim)

    ordination, distance = rpca_results
    sample_scores = ordination.samples  # Scores for samples
    feature_scores = ordination.features  # Scores for features
    
    X_reconstructed = np.dot(sample_scores, feature_scores.T)
    
    mse = mean_squared_error(scaled, X_reconstructed)
    print(f'Reconstruction MSE: {mse}')
    
    print(f'Sample scores shape: {sample_scores.shape}')
    print(f'Feature scores shape: {feature_scores.shape}')
    print(f'Original scaled data shape: {scaled.T.shape}')
    print(f'Reconstructed data shape: {X_reconstructed.shape}\n')

    reduced_df = pd.DataFrame(sample_scores, index=sample_ids)
    return reduced_df

In [6]:
def pca_fr(path, dim, svd='full'):
    df = pd.read_csv(path, delimiter='\t')

    df['feature_type'] = df.apply(classify_feature, axis=1)
    binary_features = df[df['feature_type'] == 'binary']
    numerical_features = df[df['feature_type'] == 'numerical']
    binary_features = binary_features.drop(columns=['feature_type'])
    numerical_features = numerical_features.drop(columns=['feature_type'])
    print(f'Number of binary features: {binary_features.shape[0]}')
    print(f'Number of numerical features: {numerical_features.shape[0]}')

    scaled = numerical_features.copy()
    scaler = StandardScaler()
    scaled = scaler.fit_transform(numerical_features)

    pca = PCA(n_components=dim, svd_solver=svd)
    reduced_df = pca.fit_transform(scaled)
    X_reconstructed = pca.inverse_transform(reduced_df)
    
    mse = mean_squared_error(scaled, X_reconstructed)
    print(f'Reconstruction MSE: {mse}')
    
    print(f'Reduced data shape: {reduced_df.shape}')    
    print(f'Original scaled data shape: {scaled.shape}')
    print(f'Reconstructed data shape: {X_reconstructed.shape}\n')

    return reduced_df

In [7]:
def combine_df(df1, df2):
    df1 = pd.DataFrame(df1)
    df1 = df1.reset_index(drop=True)
    df2 = df2.reset_index(drop=True)
    return pd.concat([df1, df2], axis=1)

In [8]:
def get_X_y(reduced, y):
    combined_df = combine_df(reduced, y)
    cleaned_df = combined_df.dropna(subset=['apoe4_binary'])
    X = cleaned_df.drop(columns=['apoe4_binary'])
    y = cleaned_df['apoe4_binary']
    return X, y

In [9]:
def training(X, Y):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Logistic Regression
    lr_model = LogisticRegression(random_state=42)
    lr_model.fit(X_train, y_train)
    
    y_pred = lr_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Logistic Regression Accuracy: {accuracy}")
    print(f"Classification Report for Logistic Regression:\n{classification_report(y_test, y_pred)}")

    # Random Forest Classifier
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    y_pred_rf = rf_model.predict(X_test)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    print(f"Random Forest Accuracy: {accuracy_rf}")
    print(f"Classification Report for Random Forest:\n{classification_report(y_test, y_pred_rf)}")

    # MLP Classifier (Neural Network)
    mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=4000, random_state=42)
    mlp_model.fit(X_train, y_train)
    
    y_pred_mlp = mlp_model.predict(X_test)
    accuracy_mlp = accuracy_score(y_test, y_pred_mlp)
    print(f"MLP Classifier Accuracy: {accuracy_mlp}")
    print(f"Classification Report for MLP Classifier:\n{classification_report(y_test, y_pred_mlp)}")

    # XGBoost Classifier
    xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
    xgb_model.fit(X_train, y_train)
    
    y_pred_xgb = xgb_model.predict(X_test)
    accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
    print(f"XGBoost Accuracy: {accuracy_xgb}")
    print(f"Classification Report for XGBoost:\n{classification_report(y_test, y_pred_xgb)}")


In [10]:
# host_age_1 = meta['host_age'][:611]
# host_age_2 = meta['host_age'][611:]
# print(meta.head)
apoe4_1 = meta['apoe4'][:611]
binary_data_1 = apoe4_1.map({'Carrier': 1, 'Non-carrier': 0})
apoe4_1 = pd.DataFrame({'apoe4_binary': binary_data_1})

apoe4_2 = meta['apoe4'][611:]
binary_data_2 = apoe4_2.map({'Carrier': 1, 'Non-carrier': 0})
apoe4_2 = pd.DataFrame({'apoe4_binary': binary_data_2})

In [None]:
counts = apoe4_1.value_counts()

print(counts)

In [None]:
counts = apoe4_2.value_counts()

print(counts)

In [11]:
def get_result(path, bi_path, rpca_dim, pca_dim, y, svd='full'):
    print(f'--------RPCA Results--------')
    rpca_df = rpca_fr(path, rpca_dim)

    # print(f'For Numerical Dataset Only')
    # X_rpca_nu, Y_rpca_nu = get_X_y(rpca_df, y)
    # training(X_rpca_nu, Y_rpca_nu)
    
    print(f'For Both Numerical And Binary Dataset')
    df = pd.read_csv(bi_path, delimiter='\t')
    bi = df.astype(int)
    combined_rpca = combine_df(rpca_df, bi)
    X_rpca, Y_rpca = get_X_y(combined_rpca, y)
    training(X_rpca, Y_rpca)

    print(f'--------PCA Results--------')
    pca_df = pca_fr(path, pca_dim, svd)
    
    # print(f'For Numerical Dataset Only')
    # X_pca_nu, Y_pca_nu = get_X_y(pca_df, y)
    # training(X_pca_nu, Y_pca_nu)
    
    print(f'For Both Numerical And Binary Dataset')
    df = pd.read_csv(bi_path, delimiter='\t')
    bi = df.astype(int)
    # pca_df = pd.DataFrame(pca_df)
    combined_pca = combine_df(pca_df, bi)
    X_pca, Y_pca = get_X_y(combined_pca, y)
    X_pca.columns = X_pca.columns.astype(str)
    training(X_pca, Y_pca)

In [13]:
# Data 1:
# RPCA Dim = 3
# PCA Dim = 128
get_result(metab1_path, metab3_path, 3, 128, apoe4_1)

--------RPCA Results--------
Number of binary features: 0
Number of numerical features: 611
Reconstruction MSE: 0.01250146068203957
Sample scores shape: (611, 3)
Feature scores shape: (20628, 3)
Original scaled data shape: (20628, 611)
Reconstructed data shape: (611, 20628)

For Both Numerical And Binary Dataset
Logistic Regression Accuracy: 0.5306122448979592
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

         0.0       0.56      0.74      0.63        54
         1.0       0.46      0.27      0.34        44

    accuracy                           0.53        98
   macro avg       0.51      0.51      0.49        98
weighted avg       0.51      0.53      0.50        98

Random Forest Accuracy: 0.5612244897959183
Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       0.56      0.91      0.70        54
         1.0       0.55      0.14      0.22        44

    accuracy   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest Accuracy: 0.5510204081632653
Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       0.57      0.80      0.66        54
         1.0       0.50      0.25      0.33        44

    accuracy                           0.55        98
   macro avg       0.53      0.52      0.50        98
weighted avg       0.54      0.55      0.51        98

MLP Classifier Accuracy: 0.5306122448979592
Classification Report for MLP Classifier:
              precision    recall  f1-score   support

         0.0       0.56      0.70      0.62        54
         1.0       0.47      0.32      0.38        44

    accuracy                           0.53        98
   macro avg       0.51      0.51      0.50        98
weighted avg       0.52      0.53      0.51        98

XGBoost Accuracy: 0.5816326530612245
Classification Report for XGBoost:
              precision    recall  f1-score   support

         0.0       0.58      0.89      0.70      

In [None]:
# Data 1:
# RPCA Dim = 10
# PCA Dim = 256
get_result(metab1_path, metab3_path, 10, 256, apoe4_1)

In [12]:
# Data 1:
# RPCA Dim = 16
# PCA Dim = 512
get_result(metab1_path, metab3_path, 16, 512, apoe4_1)

--------RPCA Results--------
Number of binary features: 0
Number of numerical features: 611
Reconstruction MSE: 0.012462772977431187
Sample scores shape: (611, 16)
Feature scores shape: (20628, 16)
Original scaled data shape: (20628, 611)
Reconstructed data shape: (611, 20628)

For Both Numerical And Binary Dataset
Logistic Regression Accuracy: 0.5306122448979592
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

         0.0       0.56      0.74      0.63        54
         1.0       0.46      0.27      0.34        44

    accuracy                           0.53        98
   macro avg       0.51      0.51      0.49        98
weighted avg       0.51      0.53      0.50        98

Random Forest Accuracy: 0.5510204081632653
Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       0.56      0.93      0.69        54
         1.0       0.50      0.09      0.15        44

    accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest Accuracy: 0.5714285714285714
Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       0.56      0.98      0.72        54
         1.0       0.75      0.07      0.12        44

    accuracy                           0.57        98
   macro avg       0.66      0.52      0.42        98
weighted avg       0.65      0.57      0.45        98

MLP Classifier Accuracy: 0.5
Classification Report for MLP Classifier:
              precision    recall  f1-score   support

         0.0       0.54      0.59      0.57        54
         1.0       0.44      0.39      0.41        44

    accuracy                           0.50        98
   macro avg       0.49      0.49      0.49        98
weighted avg       0.49      0.50      0.50        98

XGBoost Accuracy: 0.5408163265306123
Classification Report for XGBoost:
              precision    recall  f1-score   support

         0.0       0.55      0.89      0.68        54
         1

In [12]:
# Data 2:
# RPCA Dim = 3
# PCA Dim = 128
get_result(metab2_path, metab4_path, 3, 128, apoe4_2)

--------RPCA Results--------
Number of binary features: 0
Number of numerical features: 701
Reconstruction MSE: 0.009809758083284662
Sample scores shape: (701, 3)
Feature scores shape: (17354, 3)
Original scaled data shape: (17354, 701)
Reconstructed data shape: (701, 17354)

For Both Numerical And Binary Dataset
Logistic Regression Accuracy: 0.5714285714285714
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

         0.0       0.59      0.92      0.72        74
         1.0       0.40      0.08      0.13        52

    accuracy                           0.57       126
   macro avg       0.49      0.50      0.42       126
weighted avg       0.51      0.57      0.47       126

Random Forest Accuracy: 0.5952380952380952
Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       0.60      0.93      0.73        74
         1.0       0.55      0.12      0.19        52

    accuracy  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest Accuracy: 0.5634920634920635
Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       0.58      0.92      0.71        74
         1.0       0.33      0.06      0.10        52

    accuracy                           0.56       126
   macro avg       0.46      0.49      0.41       126
weighted avg       0.48      0.56      0.46       126

MLP Classifier Accuracy: 0.5396825396825397
Classification Report for MLP Classifier:
              precision    recall  f1-score   support

         0.0       0.59      0.69      0.64        74
         1.0       0.42      0.33      0.37        52

    accuracy                           0.54       126
   macro avg       0.51      0.51      0.50       126
weighted avg       0.52      0.54      0.53       126

XGBoost Accuracy: 0.5714285714285714
Classification Report for XGBoost:
              precision    recall  f1-score   support

         0.0       0.60      0.80      0.69      

In [None]:
# Data 2:
# RPCA Dim = 10
# PCA Dim = 256
get_result(metab2_path, metab4_path, 10, 256, apoe4_2)

In [13]:
# Data 2:
# RPCA Dim = 16
# PCA Dim = 512
get_result(metab2_path, metab4_path, 16, 512, apoe4_2)

--------RPCA Results--------
Number of binary features: 0
Number of numerical features: 701
Reconstruction MSE: 0.009772766867363093
Sample scores shape: (701, 16)
Feature scores shape: (17354, 16)
Original scaled data shape: (17354, 701)
Reconstructed data shape: (701, 17354)

For Both Numerical And Binary Dataset
Logistic Regression Accuracy: 0.5793650793650794
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

         0.0       0.59      0.93      0.72        74
         1.0       0.44      0.08      0.13        52

    accuracy                           0.58       126
   macro avg       0.52      0.50      0.43       126
weighted avg       0.53      0.58      0.48       126

Random Forest Accuracy: 0.5634920634920635
Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       0.58      0.93      0.72        74
         1.0       0.29      0.04      0.07        52

    accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest Accuracy: 0.5873015873015873
Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       0.59      0.97      0.73        74
         1.0       0.50      0.04      0.07        52

    accuracy                           0.59       126
   macro avg       0.55      0.51      0.40       126
weighted avg       0.55      0.59      0.46       126

MLP Classifier Accuracy: 0.49206349206349204
Classification Report for MLP Classifier:
              precision    recall  f1-score   support

         0.0       0.57      0.58      0.57        74
         1.0       0.38      0.37      0.37        52

    accuracy                           0.49       126
   macro avg       0.47      0.47      0.47       126
weighted avg       0.49      0.49      0.49       126

XGBoost Accuracy: 0.5555555555555556
Classification Report for XGBoost:
              precision    recall  f1-score   support

         0.0       0.58      0.86      0.70     

In [None]:
# 