In [119]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Variable declaration

In [120]:
# Location of the merged multi-clock prediction table for GSE90124.
Clock_result = "/mnt/NAS_PROJECT/vol_Phucteam/CONGNGUYEN/pipeline/Biological_age/dnaMethyAge_Multi-model/GSE90124/Results/GSE90124_Merged_Multi_Clock.tsv"

# Column headers of the clock predictions to keep.
# NOTE: despite the variable's name, this is the list of selected model
# names, not a count.
number_of_selected_model = [
    "mAge_HorvathS2018",
    "mAge_BernabeuE2023c",
    "mAge_ShirebyG2020",
]

## Select the best-performing models

In [121]:
# Build the working table in one pass:
#   1. read the clock results (the file has a .tsv extension but is parsed
#      here as comma-separated),
#   2. drop every column whose header contains "AgeAcc_",
#   3. keep only "Age" followed by the columns named in number_of_selected_model.
predicted_value = (
    pd.read_csv(Clock_result, sep=",")
    .loc[:, lambda table: ~table.columns.str.contains("AgeAcc_")]
    .loc[:, ["Age", *number_of_selected_model]]
)

# Quick sanity check: first rows and overall dimensions.
print(predicted_value.head())
print(predicted_value.shape)


     Age  mAge_HorvathS2018  mAge_BernabeuE2023c  mAge_ShirebyG2020
0  65.14          64.016003            52.031383          64.252379
1  65.14          68.855760            53.288873          71.942673
2  46.02          53.501029            44.105950          60.317771
3  50.62          54.211210            43.264171          54.296012
4  50.62          52.464813            42.919258          53.578556
(322, 4)


In [122]:
# Split data into training and testing sets
# Features are the selected clock predictions; the target is the "Age" column.
X = predicted_value.loc[:, number_of_selected_model]
y = predicted_value.loc[:, "Age"]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(x_train.shape, x_test.shape, sep="\n")

# Fit a linear regression on the training portion (fit() returns the
# fitted estimator, so construction and fitting can be chained).
regressor = LinearRegression().fit(x_train, y_train)

print(regressor)

(257, 3)
(65, 3)
LinearRegression()


## Apply the new regression model to the test dataset

In [123]:
# Predict on the test data
# Run the fitted model on the held-out features.
y_pred = regressor.predict(x_test)

# Place observed and predicted values side by side; the rows keep the
# index labels of y_test.
comparison_df = y_test.to_frame(name="Actual").assign(Predicted=y_pred)
print(comparison_df)


     Actual  Predicted
173   59.10  60.843220
132   63.35  66.567618
197   52.24  42.213540
9     63.50  60.765942
104   60.61  64.391832
..      ...        ...
229   56.21  56.136407
60    62.08  69.474134
244   44.95  47.291552
261   70.10  62.203221
118   74.41  77.444178

[65 rows x 2 columns]


## Calculate the Pearson correlation coefficient and the error metrics (MAD, RMSD)

In [124]:
# Calculate the correlation matrix
# Pairwise correlations between the columns of comparison_df
# (DataFrame.corr uses the Pearson method by default).
correlation_matrix = comparison_df.corr()

# The off-diagonal entry is the actual-vs-predicted correlation.
pearson_corr = correlation_matrix.at["Actual", "Predicted"]
print(f"Pearson correlation coefficient: {pearson_corr:.4f}")

# Error metrics on the test set:
#   MAD  = mean(|actual - predicted|)          (mean absolute difference)
#   RMSD = sqrt(mean((actual - predicted)^2))  (root mean squared difference)
mean_abs_diff = (y_test - y_pred).abs().mean()
root_mean_sq_diff = np.sqrt(((y_test - y_pred) ** 2).mean())
print(f"Mean absolute difference: {mean_abs_diff:.4f}")
print(f"Root mean squared difference: {root_mean_sq_diff:.4f}")

Pearson correlation coefficient: 0.8981
Mean absolute difference: 3.6870
Root mean squared difference: 4.5532
