In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the project root used by the
# following cells lives below this mount point.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

Mounted at /content/drive


# EPICAGE - ElasticNet on Internal Dataset

In [None]:
# Data Path
from pathlib import Path
import os

# Project folder on the mounted Google Drive
PROJECT_ROOT = Path("/content/drive/My Drive/Age_Prediction")

# Age labels, clinical table and fold assignment all sit in the same raw-data folder
raw_data_dir = PROJECT_ROOT / "data_merge" / "Raw Data"

meth_path = PROJECT_ROOT.joinpath("data_preprocessing", "deduplicated_methylation_data.parquet")
age_path = raw_data_dir / "age.csv"
clinical_path = raw_data_dir / "raw clinical data.csv"
fold_path = raw_data_dir / "fivefold_id_split.csv"

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import spearmanr
from tqdm import tqdm

# Load data: methylation matrix, raw clinical table, age labels and the
# predefined fold assignment (CSV tables use their first column as the index)
methylation_df = pd.read_parquet(meth_path)
clinical_df_raw = pd.read_csv(clinical_path, index_col=0)
age_df = pd.read_csv(age_path, index_col=0)
folds_df = pd.read_csv(fold_path, index_col=0)

# Align data: keep the IDs present in both the methylation matrix and the age
# table, then index every table by that same ID list
age_series = age_df["years_to_birth"]
common_idx = methylation_df.index.intersection(age_series.index)
methylation_df, age_series, clinical_df_raw, folds_df = (
    table.loc[common_idx]
    for table in (methylation_df, age_series, clinical_df_raw, folds_df)
)

# Hyperparameter grid searched by the inner cross-validation (3 x 3 combinations)
param_grid = dict(
    alpha=[0.1, 0.5, 1.0],
    l1_ratio=[0.2, 0.5, 0.8],
)

# Spearman correlation
def compute_spearman(X_df, y):
    """Return the Spearman correlation of every column of ``X_df`` with ``y``.

    Parameters
    ----------
    X_df : pandas.DataFrame
        One column per feature. Rows are paired with ``y`` by position.
    y : array-like
        Target values, same length as ``X_df``.

    Returns
    -------
    pandas.Series
        One correlation coefficient per column, indexed by ``X_df.columns``.
        A column still yields NaN when the coefficient is undefined
        (for example a constant column).
    """
    # scipy's default nan_policy="propagate" returns NaN as soon as a column
    # holds a single missing value, so such a column could never pass an
    # absolute-correlation threshold. "omit" computes the coefficient on the
    # observed (feature, target) pairs instead.
    return pd.Series(
        [spearmanr(X_df[col], y, nan_policy="omit")[0] for col in X_df.columns],
        index=X_df.columns,
        dtype=float,
    )

# Results: one entry per outer fold
maes, rmses, r2s, best_params_list = [], [], [], []

# Outer CV loop over the five predefined train/test splits in folds_df.
# NOTE(review): the Spearman filter, the encoder and the imputer are fitted on
# the whole outer-training split before GridSearchCV re-splits it, so the inner
# validation parts take part in those steps. The outer test split is only ever
# transformed, never used for fitting.
for i in tqdm(range(1, 6), desc="Running Nested CV"):
    fold_col = f"splitfold{i}"
    fold_labels = folds_df[fold_col]
    train_ids = folds_df.loc[fold_labels == "train"].index
    test_ids = folds_df.loc[fold_labels == "test"].index

    # Methylation data and age labels for this split
    X_train_meth, X_test_meth = methylation_df.loc[train_ids], methylation_df.loc[test_ids]
    y_train, y_test = age_series.loc[train_ids], age_series.loc[test_ids]

    # Spearman filtering on outer train only: keep CpGs with |rho| >= 0.2
    spearman_corrs = compute_spearman(X_train_meth, y_train)
    selected_features = spearman_corrs.loc[spearman_corrs.abs() >= 0.2].index
    X_train_sel = X_train_meth.loc[:, selected_features]
    X_test_sel = X_test_meth.loc[:, selected_features]

    # Clinical encoding: OneHotEncoder fitted on outer train only; categories
    # unseen during fitting are ignored when the test split is transformed
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    X_train_clin = clinical_df_raw.loc[train_ids]
    X_test_clin = clinical_df_raw.loc[test_ids]
    train_clin_matrix = encoder.fit_transform(X_train_clin)
    test_clin_matrix = encoder.transform(X_test_clin)
    clin_feature_names = encoder.get_feature_names_out()
    X_train_clin_enc = pd.DataFrame(
        train_clin_matrix, index=X_train_clin.index, columns=clin_feature_names
    )
    X_test_clin_enc = pd.DataFrame(
        test_clin_matrix, index=X_test_clin.index, columns=clin_feature_names
    )

    # Combine methylation + clinical (column-wise)
    X_train = pd.concat([X_train_sel, X_train_clin_enc], axis=1)
    X_test = pd.concat([X_test_sel, X_test_clin_enc], axis=1)

    # Impute missing values with column means learned on the training split
    imputer = SimpleImputer(strategy="mean")
    train_filled = imputer.fit_transform(X_train)
    test_filled = imputer.transform(X_test)
    X_train = pd.DataFrame(train_filled, columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(test_filled, columns=X_test.columns, index=X_test.index)

    # Inner CV for hyperparameter tuning (3-fold grid search scored by MAE)
    base_model = ElasticNet(max_iter=5000)
    grid = GridSearchCV(
        base_model,
        param_grid,
        cv=3,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)

    # Evaluate the refitted best estimator on the held-out outer test split
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    # Metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    for store, value in ((maes, mae), (rmses, rmse), (r2s, r2)):
        store.append(value)
    best_params = grid.best_params_
    best_params_list.append(best_params)

    # Print real-time result
    alpha = best_params['alpha']
    l1_ratio = best_params['l1_ratio']
    print("\n".join([
        f"\nFold {i}:",
        f"  Best Params : alpha={alpha}, l1_ratio={l1_ratio}",
        f"  MAE  = {mae:.2f}",
        f"  RMSE = {rmse:.2f}",
        f"  R²   = {r2:.2f}",
    ]))

# Final summary
def fmt(arr):
    """Format a sequence of scores as ``"mean ± std"`` with two decimals.

    The spread is ``np.std`` with its default settings (population standard
    deviation, ``ddof=0``).
    """
    center = np.mean(arr)
    spread = np.std(arr)
    return "{:.2f} ± {:.2f}".format(center, spread)

# Aggregate metrics over the outer folds
print("\nElasticNet Nested CV Results (Strict, With Clinical):")
for label, values in (("MAE :", maes), ("RMSE:", rmses), ("R²  :", r2s)):
    print(label, fmt(values))

# Hyperparameters picked by the inner grid search in each outer fold
print("\nBest Parameters per Fold:")
for i, p in enumerate(best_params_list, start=1):
    print(f"Fold {i}: {p}")

  model = cd_fast.enet_coordinate_descent(
Running Nested CV:  20%|██        | 1/5 [13:27<53:50, 807.65s/it]


Fold 1:
  Best Params : alpha=0.1, l1_ratio=0.2
  MAE  = 7.97
  RMSE = 9.97
  R²   = 0.45


  model = cd_fast.enet_coordinate_descent(
Running Nested CV:  40%|████      | 2/5 [27:41<41:44, 834.77s/it]


Fold 2:
  Best Params : alpha=0.1, l1_ratio=0.2
  MAE  = 8.27
  RMSE = 10.23
  R²   = 0.43


  model = cd_fast.enet_coordinate_descent(
Running Nested CV:  60%|██████    | 3/5 [42:01<28:12, 846.31s/it]


Fold 3:
  Best Params : alpha=0.1, l1_ratio=0.2
  MAE  = 8.07
  RMSE = 10.14
  R²   = 0.46


  model = cd_fast.enet_coordinate_descent(
Running Nested CV:  80%|████████  | 4/5 [55:18<13:47, 827.02s/it]


Fold 4:
  Best Params : alpha=0.1, l1_ratio=0.2
  MAE  = 7.71
  RMSE = 9.41
  R²   = 0.49


  model = cd_fast.enet_coordinate_descent(
Running Nested CV: 100%|██████████| 5/5 [1:07:50<00:00, 814.14s/it]


Fold 5:
  Best Params : alpha=0.1, l1_ratio=0.2
  MAE  = 8.03
  RMSE = 10.19
  R²   = 0.47

ElasticNet Nested CV Results (Strict, With Clinical):
MAE : 8.01 ± 0.18
RMSE: 9.99 ± 0.30
R²  : 0.46 ± 0.02

Best Parameters per Fold:
Fold 1: {'alpha': 0.1, 'l1_ratio': 0.2}
Fold 2: {'alpha': 0.1, 'l1_ratio': 0.2}
Fold 3: {'alpha': 0.1, 'l1_ratio': 0.2}
Fold 4: {'alpha': 0.1, 'l1_ratio': 0.2}
Fold 5: {'alpha': 0.1, 'l1_ratio': 0.2}





# EPICAGE - ElasticNet on External Dataset

1. Training Data

Internal dataset is used for feature selection and model training.

2. Feature Selection

Method: Spearman correlation between each CpG and age.

Threshold: CpGs with |ρ| ≥ 0.2 are selected.

Selection is done only on training data.

3. Data Processing
Clinical variables are one-hot encoded (fit on training only).

Missing values are filled using column means from training data.

Test data is transformed using the same encoder and imputer from training.

4. Model Training

Model: ElasticNet

Parameters: alpha=0.1, l1_ratio=0.2 (the combination selected in all five outer folds of the nested cross-validation on the internal dataset, where each outer training split was tuned with a 3-fold grid search)

Trained on: Full training data (methylation + clinical)

5. Test Data

External data is used only for prediction and evaluation.

In [None]:
# Data Path
from pathlib import Path
import os

# Project folder on the mounted Google Drive
PROJECT_ROOT = Path("/content/drive/My Drive/Age_Prediction")

# Folders shared by several of the files below
preprocessed_dir = PROJECT_ROOT / "data_preprocessing"
internal_raw_dir = PROJECT_ROOT / "data_merge" / "Raw Data"
external_raw_dir = PROJECT_ROOT / "data_merge" / "Outside Raw Data"

# Internal Data Paths
train_meth_path = preprocessed_dir / "deduplicated_methylation_data.parquet"
train_clinical_path = internal_raw_dir / "raw clinical data.csv"
train_age_path = internal_raw_dir / "age.csv"
# External data Paths
test_meth_path = preprocessed_dir / "outside_deduplicated_methylation_data.parquet"
test_clinical_path = external_raw_dir / "outside_raw clinical data.csv"
test_age_path = external_raw_dir / "outside_age.csv"

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from joblib import Parallel, delayed
from scipy.stats import spearmanr

# Load training data (internal dataset): methylation matrix, clinical table
# and the age column of the age table
X_meth_full = pd.read_parquet(train_meth_path)
X_clin_full = pd.read_csv(train_clinical_path, index_col=0)
age_table = pd.read_csv(train_age_path, index_col=0)
y_full = age_table["years_to_birth"]

# Align: keep methylation rows that have an age entry, then index the clinical
# table and the age series by those same IDs (same order as the methylation rows)
has_age = X_meth_full.index.isin(y_full.index)
X_meth_full = X_meth_full.loc[has_age]
aligned_ids = X_meth_full.index
X_clin_full = X_clin_full.loc[aligned_ids]
y_full = y_full.loc[aligned_ids]

# Spearman filtering
def compute_spearman(X_df, y):
    """Return the Spearman correlation of every column of ``X_df`` with ``y``.

    The per-column computations are dispatched through joblib using all
    available workers (``n_jobs=-1``).

    Parameters
    ----------
    X_df : pandas.DataFrame
        One column per feature. Rows are paired with ``y`` by position.
    y : array-like
        Target values, same length as ``X_df``.

    Returns
    -------
    pandas.Series
        One correlation coefficient per column, indexed by ``X_df.columns``.
        A column still yields NaN when the coefficient is undefined
        (for example a constant column).
    """
    # scipy's default nan_policy="propagate" returns NaN as soon as a column
    # holds a single missing value, so such a column could never pass an
    # absolute-correlation threshold. "omit" computes the coefficient on the
    # observed (feature, target) pairs instead.
    results = Parallel(n_jobs=-1)(
        delayed(spearmanr)(X_df[col], y, nan_policy="omit") for col in X_df.columns
    )
    corrs = pd.Series([r[0] for r in results], index=X_df.columns, dtype=float)
    return corrs

# Keep CpGs whose absolute Spearman correlation with age is at least 0.2
corrs = compute_spearman(X_meth_full, y_full)
selected_cpgs = corrs.loc[corrs.abs() >= 0.2].index
X_meth_full_sel = X_meth_full.loc[:, selected_cpgs]

# One-hot encode clinical variables; categories unseen during fitting are
# ignored when other data is transformed later
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
clin_train_matrix = encoder.fit_transform(X_clin_full)
X_clin_full_enc = pd.DataFrame(
    clin_train_matrix,
    index=X_clin_full.index,
    columns=encoder.get_feature_names_out(),
)

# Merge features (selected CpGs followed by encoded clinical columns)
X_train_raw = pd.concat([X_meth_full_sel, X_clin_full_enc], axis=1)

# Impute missing values with the training column means
imputer = SimpleImputer(strategy="mean")
train_filled = imputer.fit_transform(X_train_raw)
X_train = pd.DataFrame(
    train_filled,
    index=X_train_raw.index,
    columns=X_train_raw.columns,
)

# Train final ElasticNet model with best parameters
final_model = ElasticNet(alpha=0.1, l1_ratio=0.2, max_iter=5000).fit(X_train, y_full)

# Load test data (external dataset)
X_meth_test = pd.read_parquet(test_meth_path)
X_clin_test = pd.read_csv(test_clinical_path, index_col=0)
y_test = pd.read_csv(test_age_path, index_col=0)["years_to_birth"]

# Align the three external tables on sample ID, the same way the training
# tables are aligned. Without this, pd.concat(axis=1) below outer-joins
# mismatched indices, and the metrics pair y_test with y_pred row by row even
# if the files list the samples in a different order or contain extra samples.
X_meth_test = X_meth_test[X_meth_test.index.isin(y_test.index)]
X_clin_test = X_clin_test.loc[X_meth_test.index]
y_test = y_test.loc[X_meth_test.index]

# Subset CpGs to the ones selected on the training data
X_meth_test_sel = X_meth_test[selected_cpgs]

# One-hot encode clinical variables using training encoder
X_clin_test_enc = pd.DataFrame(
    encoder.transform(X_clin_test),
    index=X_clin_test.index,
    columns=encoder.get_feature_names_out()
)

# Merge test features
X_test_raw = pd.concat([X_meth_test_sel, X_clin_test_enc], axis=1)

# Impute missing values using training imputer (training column means)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw),
    index=X_test_raw.index,
    columns=X_test_raw.columns
)

# Predict and evaluate; X_test rows and y_test now share the same ID order
y_pred = final_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("ElasticNet on External Dataset:")
print(f"MAE : {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²  : {r2:.2f}")

  model = cd_fast.enet_coordinate_descent(


ElasticNet on External Dataset:
MAE : 10.28
RMSE: 12.62
R²  : 0.44
