In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import LeaveOneOut
import os

In [4]:
OUTPUT_DIR = 'liver_model_input'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
df_liver = pd.read_csv('MasterMatrix.tsv', sep='\t', index_col=0)

sample_ages = [int(col.split('_')[0]) for col in df_liver.columns]
y_liver_age = pd.Series(sample_ages, index=df_liver.columns, name='Chronological_Age')

print(f"Loaded {df_liver.shape[0]} proteins and {df_liver.shape[1]} samples.")
print(f"Age range: {y_liver_age.min()} to {y_liver_age.max()}")

df_liver_T = df_liver.transpose()

Loaded 10643 proteins and 20 samples.
Age range: 14 to 66


In [6]:
df_liver_T.head()

Protein,A0A024R1R8,A0A024R1R8;Q9Y2S6,A0A024RBG1,A0A075B6H9,A0A075B6I0,A0A075B6I1,A0A075B6I9,A0A075B6I9;P04211,A0A075B6J9,A0A075B6K0,...,Q9Y6X2,Q9Y6X3,Q9Y6X4,Q9Y6X5,Q9Y6X8,Q9Y6X9,Q9Y6Y0,Q9Y6Y8,Q9Y6Y9,Q9Y6Z7
14_F,,,5680500.0,,2094920.0,,,1335270.0,,1070060.0,...,,19092200.0,963100.0,6153130.0,1609050.0,305866.0,20765700.0,138314000.0,,6077240.0
14_F.1,,7196940.0,,372784.0,8912260.0,,,,,,...,,986241.0,1171160.0,2318020.0,57379400.0,4197180.0,2331490.0,46037200.0,,53629200.0
17_F,,5025300.0,,1583110.0,1173300.0,,,,,,...,,6455440.0,6352800.0,22618800.0,3906600.0,756909.0,2712820.0,619845000.0,,1284610.0
23_M,,,,,252769.0,,,1928550.0,,,...,,17430500.0,692701.0,16582100.0,3040170.0,533140.0,7023250.0,437019000.0,,5755110.0
25_M,,3420590.0,,2044870.0,1880380.0,,,8412360.0,4842210.0,,...,,20303.1,5534070.0,3498630.0,34796800.0,19340.2,1967860.0,798572000.0,,95435500.0


In [11]:
df_liver_log = np.log2(df_liver_T + 1) 

min_val = df_liver_log[~df_liver_log.isna()].min().min()

imputation_val = min_val - 2

df_liver_imputed = df_liver_log.fillna(imputation_val)

print(f"Data Log2-transformed and imputed with value: {imputation_val:.2f}")

Data Log2-transformed and imputed with value: 7.45


In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_liver_imputed, y_liver_age, test_size=0.25, random_state=42)

In [15]:
X_train.head()

Protein,A0A024R1R8,A0A024R1R8;Q9Y2S6,A0A024RBG1,A0A075B6H9,A0A075B6I0,A0A075B6I1,A0A075B6I9,A0A075B6I9;P04211,A0A075B6J9,A0A075B6K0,...,Q9Y6X2,Q9Y6X3,Q9Y6X4,Q9Y6X5,Q9Y6X8,Q9Y6X9,Q9Y6Y0,Q9Y6Y8,Q9Y6Y9,Q9Y6Z7
28_M,7.453989,21.717705,23.137116,19.636816,7.453989,7.453989,7.453989,20.542014,7.453989,7.453989,...,7.453989,21.342778,22.081813,23.85525,24.987172,22.934143,23.484388,28.740816,11.746464,17.354499
45_F,7.453989,22.928476,23.427622,7.453989,7.453989,7.453989,7.453989,7.453989,7.453989,7.453989,...,7.453989,20.875873,20.315376,22.957535,19.790285,20.081115,24.788873,26.064182,7.453989,21.081232
23_M,7.453989,7.453989,7.453989,7.453989,17.947466,7.453989,7.453989,20.879086,7.453989,7.453989,...,7.453989,24.055111,19.401875,23.983123,21.535721,19.024158,22.743708,28.703121,7.453989,22.456412
59_M,7.453989,7.453989,7.453989,20.496075,7.453989,7.453989,7.453989,7.453989,7.453989,7.453989,...,7.453989,21.754324,22.631336,21.410784,23.460478,19.268725,23.25504,29.297642,13.540564,23.378129
50_F,7.453989,7.453989,19.783714,14.320991,7.453989,7.453989,7.453989,21.34782,7.453989,7.453989,...,7.453989,20.664004,20.111412,22.566104,20.720372,19.186264,20.291349,26.290336,7.453989,23.538106


In [10]:
X_train.shape, X_test.shape

((15, 10643), (5, 10643))

In [20]:
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

In [21]:
X_train_scaled

Protein,A0A024R1R8,A0A024R1R8;Q9Y2S6,A0A024RBG1,A0A075B6H9,A0A075B6I0,A0A075B6I1,A0A075B6I9,A0A075B6I9;P04211,A0A075B6J9,A0A075B6K0,...,Q9Y6X2,Q9Y6X3,Q9Y6X4,Q9Y6X5,Q9Y6X8,Q9Y6X9,Q9Y6Y0,Q9Y6Y8,Q9Y6Y9,Q9Y6Z7
28_M,-0.267261,0.609897,1.176684,0.736091,-0.696129,-0.267261,-0.267261,0.679401,-0.392228,-0.392115,...,-0.267261,0.365465,-0.142577,1.338602,0.896066,1.492574,0.654633,1.054648,0.579924,-1.618372
45_F,-0.267261,0.781336,1.215709,-1.178235,-0.696129,-0.267261,-0.267261,-1.221058,-0.392228,-0.392115,...,-0.267261,0.198111,-1.333624,0.359697,-1.654763,0.29427,1.358053,-0.860978,-0.557604,-0.537859
23_M,-0.267261,-1.409766,-0.930097,-1.178235,0.917664,-0.267261,-0.267261,0.728346,-0.392228,-0.392115,...,-0.267261,1.337648,-1.949566,1.47804,-0.798037,-0.149664,0.255234,1.02767,-0.557604,-0.139144
59_M,-0.267261,-1.409766,-0.930097,0.871109,-0.696129,-0.267261,-0.267261,-1.221058,-0.392228,-0.392115,...,-0.267261,0.512975,0.227948,-1.326941,0.146707,-0.046943,0.530961,1.453159,1.05537,0.128094
50_F,-0.267261,-1.409766,0.726208,-0.099201,-0.696129,-0.267261,-0.267261,0.796409,-0.392228,-0.392115,...,-0.267261,0.122171,-1.47115,-0.067134,-1.198241,-0.081577,-1.067155,-0.699123,-0.557604,0.174477
47_M,-0.267261,0.619614,1.140034,0.664889,-0.696129,-0.267261,-0.267261,0.77229,-0.392228,-0.392115,...,-0.267261,0.544127,0.475685,0.62373,1.643002,1.029999,-1.240025,0.016365,-0.557604,1.26185
17_F,-0.267261,0.686794,-0.930097,0.886548,1.258259,-0.267261,-0.267261,-1.221058,-0.392228,-0.392115,...,-0.267261,0.824007,0.206119,1.966443,-0.62047,0.062696,-0.484778,1.388526,-0.557604,-0.766424
43_M,-0.267261,0.864344,1.156378,-1.178235,-0.696129,-0.267261,-0.267261,-1.221058,-0.392228,2.615693,...,-0.267261,-0.713194,0.190021,-1.313535,-0.624628,0.663134,-0.308234,-0.698778,-0.557604,0.113147
66_M,-0.267261,-1.409766,-0.930097,-1.178235,-0.696129,-0.267261,-0.267261,-1.221058,-0.392228,-0.392115,...,-0.267261,1.276459,2.540167,0.44155,-0.88164,0.687144,0.035852,-0.100099,-0.557604,-1.911139
25_M,-0.267261,0.608214,-0.930097,0.944569,1.362906,-0.267261,-0.267261,1.036908,2.561654,-0.392115,...,-0.267261,-2.155485,0.071905,-0.96974,0.928107,-2.15932,-0.734531,1.65012,-0.557604,1.035563


In [22]:
corr_results = []
for protein in X_train_scaled.columns:
    r, p = spearmanr(X_train_scaled[protein], y_train)

    corr_results.append({'Protein': protein, 'R_value': r, 'P_value': p})

df_corr = pd.DataFrame(corr_results)

SIGNIFICANCE_THRESHOLD = 0.05
df_degs = df_corr[df_corr['P_value'] < SIGNIFICANCE_THRESHOLD].copy()

dep_proteins = df_degs['Protein'].tolist()
df_final_features_X = X_train_scaled[dep_proteins]

print(f"Initial proteins: {len(df_liver.index)}")
print(f"Age-Dependent Proteins (DEPs) selected: {len(df_final_features_X.columns)}")

df_final_features_X.to_csv(os.path.join(OUTPUT_DIR, 'liver_final_features_X.tsv'), sep='\t')
y_train.to_frame().to_csv(os.path.join(OUTPUT_DIR, 'liver_target_Y.tsv'), sep='\t')

  r, p = spearmanr(X_train_scaled[protein], y_train)


Initial proteins: 10643
Age-Dependent Proteins (DEPs) selected: 345


In [23]:
df_final_features_X.head()

Protein,A0A075B6I0,A0A075B6K5,A0A0B4J1X8,A0A0C4DH35,A0A0C4DH41;P01824;P01825;P06331;P0DP06;P0DP08,A0FGR8,A0PJW6,A5YKK6,A6NIH7,A6PVS8,...,Q9Y3E2,Q9Y421,Q9Y4F1,Q9Y4R8,Q9Y5B8,Q9Y616,Q9Y673,Q9Y6I3,Q9Y6N7,Q9Y6R1
28_M,-0.696129,0.983759,-0.686188,0.833324,0.540949,1.935536,1.902298,0.551947,0.185726,-0.499918,...,-0.017567,-0.389127,1.365338,-0.57748,-0.42283,0.489977,0.394839,0.455098,-0.937869,-0.054402
45_F,-0.696129,-1.188285,-0.686188,0.448512,0.299535,-1.267823,-1.175062,-1.373378,0.834681,-0.499918,...,0.121326,-0.389127,0.188081,1.077756,-1.796669,-0.564403,0.740315,-1.180648,0.300578,1.590865
23_M,0.917664,0.252363,1.250771,0.91255,0.49913,0.72855,1.362028,0.400455,-1.788496,-0.499918,...,-0.419443,-0.389127,3.084986,-2.172701,-2.032314,-0.564403,0.766887,-0.836092,0.546465,0.402589
59_M,-0.696129,-1.188285,-0.686188,-1.787781,0.307986,-1.685684,-0.138784,-1.223534,0.931841,1.945029,...,-0.788032,2.873259,-0.621766,0.732482,0.438884,-0.564403,0.847624,-0.94569,1.241456,-0.331296
50_F,-0.696129,-1.188285,-0.686188,0.255832,0.34097,-1.297804,-1.305465,-0.720069,1.067518,2.01221,...,1.037065,-0.389127,-0.964932,1.060833,0.818767,-0.564403,1.285677,-1.000163,0.727476,-2.252731


In [24]:
y_train.head()

28_M    28
45_F    45
23_M    23
59_M    59
50_F    50
Name: Chronological_Age, dtype: int64

In [27]:
cv_strategy = LeaveOneOut()

model = ElasticNetCV(
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
    cv=cv_strategy, 
    random_state=42, 
    n_jobs=-1
)

In [28]:
print("Step 5: Training Elastic Net Model with LOOCV...")
model.fit(df_final_features_X, y_train)

Step 5: Training Elastic Net Model with LOOCV...


  model = cd_fast.enet_coordinate_descent(


0,1,2
,l1_ratio,"[0.1, 0.5, ...]"
,eps,0.001
,n_alphas,'deprecated'
,alphas,'warn'
,fit_intercept,True
,precompute,'auto'
,max_iter,1000
,tol,0.0001
,cv,LeaveOneOut()
,copy_X,True


In [32]:
from sklearn.metrics import r2_score

X_test_selected = X_test_scaled[df_final_features_X.columns]

y_pred = model.predict(X_test_selected)

mae = np.mean(np.abs(y_test.values - y_pred)) 
r_squared = r2_score(y_test.values, y_pred)

print("\n--- Model Training Complete ---")
print(f"Optimal Alpha (Regularization): {model.alpha_:.4f}")
print(f"Optimal L1 Ratio (Mixing): {model.l1_ratio_:.4f}")
print(f"Final Model Performance (R-squared): {r_squared:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f} years")

results = pd.DataFrame({
    'Actual_Age': y_test,
    'Predicted_Age': y_pred
})
results.to_csv(os.path.join(OUTPUT_DIR, 'liver_age_prediction_results.tsv'), sep='\t')
print(f"Results saved to: {OUTPUT_DIR}")


--- Model Training Complete ---
Optimal Alpha (Regularization): 0.1048
Optimal L1 Ratio (Mixing): 0.1000
Final Model Performance (R-squared): -0.1090
Mean Absolute Error (MAE): 14.26 years
Results saved to: liver_model_input


In [40]:
X_train_selected = X_train_scaled[df_final_features_X.columns]

y_pred = model.predict(X_train_selected)

mae = np.mean(np.abs(y_train.values - y_pred)) 
r_squared = r2_score(y_train.values, y_pred)

print("\n--- Model Training Complete ---")
print(f"Optimal Alpha (Regularization): {model.alpha_:.4f}")
print(f"Optimal L1 Ratio (Mixing): {model.l1_ratio_:.4f}")
print(f"Final Model Performance (R-squared): {r_squared:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f} years")


--- Model Training Complete ---
Optimal Alpha (Regularization): 0.1048
Optimal L1 Ratio (Mixing): 0.1000
Final Model Performance (R-squared): 1.0000
Mean Absolute Error (MAE): 0.03 years


In [33]:
from sklearn import tree

In [38]:
model2 = tree.DecisionTreeRegressor(
    max_depth=5,
    random_state=42
)
# Train on the selected training features and y_train
model2.fit(df_final_features_X, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [39]:
from sklearn.metrics import r2_score
# Predict on the test set using the same selected features
X_test_selected = X_test_scaled[df_final_features_X.columns]
y_pred_tree = model2.predict(X_test_selected)

mae = np.mean(np.abs(y_test.values - y_pred_tree)) 
r_squared = r2_score(y_test.values, y_pred_tree)

print("\n--- Decision Tree Training Complete ---")
print(f"Final Model Performance (R-squared): {r_squared:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f} years")

results = pd.DataFrame({
    'Actual_Age': y_test,
    'Predicted_Age': y_pred_tree
})
results.to_csv(os.path.join(OUTPUT_DIR, 'liver_age_prediction_results_tree.tsv'), sep='\t')
print(f"Results saved to: {OUTPUT_DIR}")


--- Decision Tree Training Complete ---
Final Model Performance (R-squared): 0.4990
Mean Absolute Error (MAE): 10.40 years
Results saved to: liver_model_input


In [41]:
X_train_selected = X_train_scaled[df_final_features_X.columns]
y_pred_tree = model2.predict(X_train_selected)

mae = np.mean(np.abs(y_train.values - y_pred_tree)) 
r_squared = r2_score(y_train.values, y_pred_tree)

print("\n--- Decision Tree Training Complete ---")
print(f"Final Model Performance (R-squared): {r_squared:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f} years")


--- Decision Tree Training Complete ---
Final Model Performance (R-squared): 1.0000
Mean Absolute Error (MAE): 0.00 years
