In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.stats import spearmanr
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import LeaveOneOut
import os

In [10]:
OUTPUT_DIR = 'liver_model_input'
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
df_liver = pd.read_csv('MasterMatrix.tsv', sep='\t', index_col=0)

# --- Metadata Extraction (Target Y) ---
# Column names are like '14_F', '17_F', etc.
sample_ages = [int(col.split('_')[0]) for col in df_liver.columns]
y_liver_age = pd.Series(sample_ages, index=df_liver.columns, name='Chronological_Age')

print(f"Loaded {df_liver.shape[0]} proteins and {df_liver.shape[1]} samples.")
print(f"Age range: {y_liver_age.min()} to {y_liver_age.max()}")

# Transpose the data so proteins are columns (features) and samples are rows (observations) 
# This is the standard ML format.
df_liver_T = df_liver.transpose()

Loaded 10643 proteins and 20 samples.
Age range: 14 to 66


In [3]:
df_liver_T.head()

Protein,A0A024R1R8,A0A024R1R8;Q9Y2S6,A0A024RBG1,A0A075B6H9,A0A075B6I0,A0A075B6I1,A0A075B6I9,A0A075B6I9;P04211,A0A075B6J9,A0A075B6K0,...,Q9Y6X2,Q9Y6X3,Q9Y6X4,Q9Y6X5,Q9Y6X8,Q9Y6X9,Q9Y6Y0,Q9Y6Y8,Q9Y6Y9,Q9Y6Z7
14_F,,,5680500.0,,2094920.0,,,1335270.0,,1070060.0,...,,19092200.0,963100.0,6153130.0,1609050.0,305866.0,20765700.0,138314000.0,,6077240.0
14_F.1,,7196940.0,,372784.0,8912260.0,,,,,,...,,986241.0,1171160.0,2318020.0,57379400.0,4197180.0,2331490.0,46037200.0,,53629200.0
17_F,,5025300.0,,1583110.0,1173300.0,,,,,,...,,6455440.0,6352800.0,22618800.0,3906600.0,756909.0,2712820.0,619845000.0,,1284610.0
23_M,,,,,252769.0,,,1928550.0,,,...,,17430500.0,692701.0,16582100.0,3040170.0,533140.0,7023250.0,437019000.0,,5755110.0
25_M,,3420590.0,,2044870.0,1880380.0,,,8412360.0,4842210.0,,...,,20303.1,5534070.0,3498630.0,34796800.0,19340.2,1967860.0,798572000.0,,95435500.0


In [5]:
# 1. Log2 Transformation
# Adds 1 before log to avoid errors on zero values (Log2(0) is undefined)
df_liver_log = np.log2(df_liver_T + 1) 

# 2. Imputation (MinProb Approximation)
# Calculate the overall minimum observed value in the dataset
min_val = df_liver_log[~df_liver_log.isna()].min().min()

# Determine the imputation value: a small fraction below the minimum.
# This mimics the idea that the protein was present but below the detection limit.
imputation_val = min_val - 2 # 2 units below the lowest Log2 value

# Replace NaNs with the imputation value
df_liver_imputed = df_liver_log.fillna(imputation_val)

print(f"Data Log2-transformed and imputed with value: {imputation_val:.2f}")

# 3. Standardization (Z-scoring)
# Essential for Elastic Net to prevent high-abundance proteins from dominating the model
scaler = StandardScaler()
df_liver_scaled = pd.DataFrame(
    scaler.fit_transform(df_liver_imputed), 
    columns=df_liver_imputed.columns, 
    index=df_liver_imputed.index
)

Data Log2-transformed and imputed with value: 7.45


In [11]:
# Calculate Spearman correlation between each protein (column) and the age vector (y_liver_age)
# We must ensure the age vector is aligned with the DataFrame rows (samples)

corr_results = []
for protein in df_liver_scaled.columns:
    # Calculate R-value (correlation coefficient) and p-value
    r, p = spearmanr(df_liver_scaled[protein], y_liver_age)
    
    # Store results
    corr_results.append({'Protein': protein, 'R_value': r, 'P_value': p})

df_corr = pd.DataFrame(corr_results)

# Apply Feature Selection Criteria (p < 0.05)
# This identifies the Age-Dependent Differentially Expressed Proteins (DEPs)
SIGNIFICANCE_THRESHOLD = 0.05
df_degs = df_corr[df_corr['P_value'] < SIGNIFICANCE_THRESHOLD].copy()

# Filter the scaled data to keep only the selected DEP features
dep_proteins = df_degs['Protein'].tolist()
df_final_features_X = df_liver_scaled[dep_proteins]

print(f"Initial proteins: {len(df_liver.index)}")
print(f"Age-Dependent Proteins (DEPs) selected: {len(df_final_features_X.columns)}")

# Save the final model inputs
df_final_features_X.to_csv(os.path.join(OUTPUT_DIR, 'liver_final_features_X.tsv'), sep='\t')
y_liver_age.to_frame().to_csv(os.path.join(OUTPUT_DIR, 'liver_target_Y.tsv'), sep='\t')

Initial proteins: 10643
Age-Dependent Proteins (DEPs) selected: 314


In [12]:
df_final_features_X.head()

Protein,A0A075B6I0,A0A0U1RQS6,A6PVS8,A7KAX9,B7ZAQ6;P0CG08,O00186,O00187,O00192,O00533,O14521,...,Q9Y371,Q9Y421,Q9Y4B6,Q9Y4F1,Q9Y4R8,Q9Y570,Q9Y5B8,Q9Y5Z4,Q9Y6N7,Q9Y6X4
14_F,1.013183,2.985149,-0.49991,-1.918042,0.226323,-0.604859,0.49821,-0.054867,-0.499208,0.511412,...,-1.225821,-0.330791,-1.669854,1.918909,-0.404899,-0.466746,-0.338262,-0.350447,-0.085721,-1.475355
14_F.1,1.30721,-0.33333,-0.49991,0.298526,1.012461,-1.474138,-2.401099,-2.288915,-0.499208,1.144316,...,-0.344784,-0.330791,-1.411055,-0.296164,-1.005672,-1.308068,-1.022883,-0.241799,0.098612,-1.287051
17_F,0.895465,-0.33333,-0.49991,0.129606,0.989243,0.174584,-0.486906,-0.373298,-0.499208,0.136025,...,-0.624663,-0.330791,-0.2935,0.162301,-1.121567,-1.561719,-1.699422,-0.791421,-0.058591,0.340842
23_M,0.583733,3.014785,-0.49991,-0.077757,1.559222,-1.766399,-0.217086,-0.789542,-0.499208,-0.051597,...,-1.111286,-0.330791,-0.002931,2.909699,-2.243528,-1.296532,-2.068195,-0.797938,0.569541,-1.792634
25_M,0.991243,-0.33333,-0.49991,0.107323,0.470974,-0.856008,0.763141,-1.660845,-0.499208,-0.193406,...,-0.819464,-0.330791,0.70416,0.456148,-0.924858,-1.67724,0.222888,-0.706046,-1.976162,0.208011


In [13]:
y_liver_age.head()

14_F      14
14_F.1    14
17_F      17
23_M      23
25_M      25
Name: Chronological_Age, dtype: int64

In [15]:
cv_strategy = LeaveOneOut() 

# ElasticNetCV automatically searches for the best alpha (regularization) and l1_ratio (mixing)
model = ElasticNetCV(
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1], # Test a range of ratios (1=Lasso, small=Ridge)
    cv=cv_strategy, 
    random_state=42, 
    n_jobs=-1 # Use all available cores
)

In [17]:
print("Step 5: Training Elastic Net Model with LOOCV...")
# X is the DEP features, Y is the Chronological Age
model.fit(df_final_features_X, y_liver_age)

Step 5: Training Elastic Net Model with LOOCV...


  model = cd_fast.enet_coordinate_descent(


0,1,2
,l1_ratio,"[0.1, 0.5, ...]"
,eps,0.001
,n_alphas,'deprecated'
,alphas,'warn'
,fit_intercept,True
,precompute,'auto'
,max_iter,1000
,tol,0.0001
,cv,LeaveOneOut()
,copy_X,True


In [18]:
y_pred = model.predict(df_final_features_X)

# Calculate key metrics
mae = np.mean(np.abs(y_liver_age - y_pred)) # Mean Absolute Error (Primary metric)
r_squared = model.score(df_final_features_X, y_liver_age)

print("\n--- Model Training Complete ---")
print(f"Optimal Alpha (Regularization): {model.alpha_:.4f}")
print(f"Optimal L1 Ratio (Mixing): {model.l1_ratio_:.4f}")
print(f"Final Model Performance (R-squared): {r_squared:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.2f} years")

# Save model coefficients and performance
results = pd.DataFrame({
    'Actual_Age': y_liver_age,
    'Predicted_Age': y_pred
})
results.to_csv(os.path.join(OUTPUT_DIR, 'liver_age_prediction_results.tsv'), sep='\t')
print(f"Results saved to: {OUTPUT_DIR}")


--- Model Training Complete ---
Optimal Alpha (Regularization): 0.1193
Optimal L1 Ratio (Mixing): 0.1000
Final Model Performance (R-squared): 1.0000
Mean Absolute Error (MAE): 0.05 years
Results saved to: liver_model_input
