In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact, widgets
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge, Lasso, Ridge
import xgboost as xgb
from scipy.stats import pearsonr

In [None]:
# Load the CMR phenotype table. The path is a placeholder written with a
# Windows-style separator.
cmr_phenotypes_34K = pd.read_csv('path\\ukb_cmr_34K.csv')

In [None]:
# Load the covariate table (same placeholder path convention).
covariates = pd.read_csv('path\\ukb_covariates_34K.csv')

In [None]:
# Feature-only view of the phenotypes: 'f.eid' is removed so that it is not
# passed to the imputer later on.
all_cmr_phenotypes_34K = cmr_phenotypes_34K.drop(columns=['f.eid'])

# Preprocess data: label-encode categorical covariates and standardize age

In [None]:
# List the available covariate columns.
covariates.columns

In [None]:
# Count missing values per covariate column.
covariates.isna().sum()

In [None]:
# Shared transformer instances. Each later fit_transform call re-fits them, so
# they only ever remember the last column(s) they were fitted on.
encoder = LabelEncoder()
scaler = StandardScaler()

In [None]:
# Integer-encode each categorical covariate into its own '*_encoded' column.
# The shared encoder is re-fitted on every column, in the order listed here.
categorical_to_encoded = {
    'highest_qualification_category': 'highest_qualification_category_encoded',
    'sex': 'Sex_encoded',
    'ethnicity': 'ethnicity_encoded',
    'smoking_status': 'smoking_status_encoded',
    'drinking_status': 'drinking_status_encoded',
}
for source_col, encoded_col in categorical_to_encoded.items():
    covariates[encoded_col] = encoder.fit_transform(covariates[source_col])

In [None]:
# Standardize age with the shared scaler and store it as a new column.
covariates['age_at_recruitment_visit2_encoded'] = scaler.fit_transform(covariates[['age_at_recruitment_visit2']])

In [None]:
# Number of distinct raw age values ...
len(covariates['age_at_recruitment_visit2'].value_counts())

In [None]:
# ... and of distinct standardized values, for comparison with the count above.
len(covariates['age_at_recruitment_visit2_encoded'].value_counts())

# KNN Imputer

In [None]:
# Fill missing phenotype values with a 10-nearest-neighbour imputer that is
# fitted on, and applied to, the full phenotype table.
imputer = KNNImputer(n_neighbors=10)
data_imputed = imputer.fit_transform(all_cmr_phenotypes_34K)

In [None]:
# Wrap the imputed array in a DataFrame, reusing the original column names.
# NOTE(review): assumes the imputer returns exactly one column per input column — TODO confirm.
heart_34K_imputed = pd.DataFrame(data_imputed, columns=all_cmr_phenotypes_34K.columns)

In [None]:
heart_34K_imputed.columns

In [None]:
# Put 'f.eid' back in front of the imputed features (column-wise concat, aligned on index).
df_knn_ = pd.concat([cmr_phenotypes_34K['f.eid'], heart_34K_imputed ], axis=1)

# Deconfound features

In [None]:
# Keep only 'f.eid', the obesity group and the three encoded columns that are
# later passed to deconfound_feature as confounders.
covariates_ = covariates[['f.eid', 'obesity_groups','Sex_encoded', 'ethnicity_encoded', 'age_at_recruitment_visit2_encoded']]

In [None]:
covariates_

In [None]:
df_knn_cmr_covariates = df_knn_.merge(covariates_)

In [None]:
df_knn_cmr_covariates#.columns

In [None]:
def deconfound_feature(feature, sex, ethnic, age):
    """Return `feature` with a linear fit on the three confounders subtracted.

    An ordinary least-squares model is fitted with `sex`, `ethnic` and `age`
    stacked column-wise as predictors and `feature` as the response. The
    result is `feature` minus that model's prediction on the same rows.
    """
    confounders = np.column_stack((sex, ethnic, age))
    fitted_values = LinearRegression().fit(confounders, feature).predict(confounders)
    return feature - fitted_values

In [None]:
# Columns that are identifiers, labels or confounders rather than CMR measurements.
excluded_columns = [
    'f.eid',
    'obesity_groups',
    'Sex_encoded',
    'ethnicity_encoded',
    'age_at_recruitment_visit2_encoded',
]
# Every other column of the merged frame is treated as a CMR feature, in column order.
cmr_features = list(filter(lambda name: name not in excluded_columns, df_knn_cmr_covariates.columns))

In [None]:
# Regress the three confounders out of every CMR feature and store each
# residual as '<feature>_deconfounded'.
# NOTE(review): age is removed from the features here, yet age is also the
# prediction target further down. The confound models are also fitted on all
# participants, before the train/test split. Confirm both are intended.
sex_confound = df_knn_cmr_covariates['Sex_encoded']
ethnicity_confound = df_knn_cmr_covariates['ethnicity_encoded']
age_confound = df_knn_cmr_covariates['age_at_recruitment_visit2_encoded']

deconfounded_columns = {
    f"{feature}_deconfounded": deconfound_feature(
        df_knn_cmr_covariates[feature], sex_confound, ethnicity_confound, age_confound
    )
    for feature in cmr_features
}
# A single assign call adds (or overwrites) all residual columns together.
# Inserting them one at a time fragments the frame when there are many features.
df_knn_cmr_covariates = df_knn_cmr_covariates.assign(**deconfounded_columns)

In [None]:
# Keep only the deconfounded columns. This also removes 'f.eid', so from here
# on the rows of this frame are identified by position only.
df_knn_cmr_covariates.drop(columns=cmr_features + ['f.eid', 'obesity_groups', 'Sex_encoded', 'ethnicity_encoded', 'age_at_recruitment_visit2_encoded'], inplace=True)

In [None]:
# Names of the residual columns, in the same order as cmr_features.
features = [f"{f}_deconfounded" for f in cmr_features]

In [None]:
# Standardize the residuals. This re-fits the shared `scaler`, which therefore
# no longer holds the age statistics it was fitted on earlier. The fit uses all
# rows, i.e. it happens before any train/test split.
df_knn_cmr_covariates[features] = scaler.fit_transform(df_knn_cmr_covariates[features])

In [None]:
df_knn_cmr_covariates

# Prepare train, validation and test sets

In [None]:
# Join the covariates with the deconfounded features on 'f.eid' instead of on
# row position. A positional join silently pairs features with the wrong
# participant whenever the two input files are not in identical row order.
# 'f.eid' is no longer in df_knn_cmr_covariates, so it is rebuilt by repeating
# the key join that produced that frame's rows (same keys, hence same row order).
if 'f.eid' in df_knn_cmr_covariates.columns:
    deconfounded_with_eid = df_knn_cmr_covariates
else:
    deconfounded_eids = df_knn_[['f.eid']].merge(covariates_[['f.eid']], on='f.eid')['f.eid']
    assert len(deconfounded_eids) == len(df_knn_cmr_covariates), \
        "row count of df_knn_cmr_covariates no longer matches the f.eid join"
    deconfounded_with_eid = df_knn_cmr_covariates.reset_index(drop=True)
    deconfounded_with_eid.insert(0, 'f.eid', deconfounded_eids.to_numpy())

df_merge = covariates.merge(deconfounded_with_eid, on='f.eid')

In [None]:
# Remove the raw and encoded covariates that are not wanted in the modelling
# table. 'f.eid', 'obesity_groups' and 'age_at_recruitment_visit2' are not in
# this list; they are used by the cells below.
df_merge_ = df_merge.drop(columns=['deprivation_index', 'highest_qualification_category', 'smoking_status', 'sex', 'drinking_status', 'physical_moderate', 'physical_vigorous', 'sbp','ethnicity', 'ethnicity_encoded','highest_qualification_category_encoded', 'Sex_encoded','smoking_status_encoded', 'drinking_status_encoded','age_at_recruitment_visit2_encoded'])

In [None]:
df_merge_['obesity_groups'].value_counts()

In [None]:
# Everything outside the 'healthy range' group goes to the test pool.
# Rows with a missing group also satisfy `!=` and therefore land here too.
df_test = df_merge_[df_merge_['obesity_groups'] != 'healthy range'].reset_index(drop=True)

In [None]:
len(df_test)

In [None]:
# Participants in the 'healthy range' group.
healthy = df_merge_[df_merge_['obesity_groups'] == 'healthy range'].reset_index(drop=True)

In [None]:
# Hold out a fixed-size, seeded random sample of healthy participants for testing.
test_healthy = healthy.sample(n=2189, random_state=42).sort_values(by='f.eid').reset_index(drop=True)

In [None]:
# The healthy participants whose 'f.eid' is not in the held-out sample form the training pool.
train_healthy = healthy[~healthy['f.eid'].isin(test_healthy['f.eid'])].sort_values(by='f.eid').reset_index(drop=True)

In [None]:
# Final test set: the non-healthy pool plus the held-out healthy sample.
df_all_test = pd.concat([df_test, test_healthy]).sort_values(by='f.eid').reset_index(drop=True)

In [None]:
# Test features and target. The identifier, the group label and the target
# itself are dropped from the features.
X_test = df_all_test.drop(columns=['f.eid', 'obesity_groups', 'age_at_recruitment_visit2'])
y_test = df_all_test['age_at_recruitment_visit2']

In [None]:
# Training-pool features, built with the same column drops as the test features.
X = train_healthy.drop(columns=['f.eid', 'obesity_groups', 'age_at_recruitment_visit2'])

In [None]:
# Training-pool target.
y = train_healthy['age_at_recruitment_visit2']

In [None]:
# Seeded 80/20 split of the training pool into train and validation parts.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Report the shape of every split.
print('X train', X_train.shape)
print('X val', X_val.shape)
print('X test', X_test.shape)
print('Y train', y_train.shape)
print('Y val', y_val.shape)
print('Y test', y_test.shape)

# Training Model

## Linear regression

In [None]:
scoring_metrics = {
    'MAE': 'neg_mean_absolute_error',
    'MSE': 'neg_mean_squared_error',
    'R2': 'r2'
}

In [None]:
random_grid = {'fit_intercept': [True, False] }

In [None]:
optimized_lg = RandomizedSearchCV(
    estimator=LinearRegression(),
    param_distributions=random_grid,
    n_iter=100,
    cv=10,
    scoring=scoring_metrics,  # Multiple metrics
    refit='MAE',  # Choose which metric to use for selecting the best model
    verbose=1,
    n_jobs=-1,
    return_train_score=True
)

In [None]:
np.ravel(y)

In [None]:
optimized_lg.fit(X, np.ravel(y))

In [None]:
# Cross-validated scores of the selected candidate. The error scorers are
# negated by scikit-learn, so the sign is flipped back before printing. All
# three values are read at best_index_ so they describe the same model.
lg_cv_results = optimized_lg.cv_results_
lg_best_idx = optimized_lg.best_index_
print("Best MAE:", -lg_cv_results['mean_test_MAE'][lg_best_idx])
print("Best MSE:", -lg_cv_results['mean_test_MSE'][lg_best_idx])
print("Best R2:", lg_cv_results['mean_test_R2'][lg_best_idx])

In [None]:
# Predict the test set with the refitted best estimator.
best_model = optimized_lg.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:
# Full cross-validation table as a DataFrame.
cv_10_lg = optimized_lg.cv_results_
results_df_10cv_lg = pd.DataFrame(cv_10_lg)

In [None]:
# NOTE(review): Y_test (capital Y) is not read anywhere in the visible cells — confirm it is still needed.
Y_test = y_test.values.flatten()

In [None]:
# Test-set metrics of the uncorrected predictions.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test MAE: {mae:.4f}")
print(f"Test MSE: {mse:.4f}")
print(f"Test R²: {r2:.4f}")

In [None]:
# Show the linear-regression test predictions. They are stored as `y_pred`;
# no variable named `y_pred_lg` is ever assigned.
y_pred

### Bias correction

In [None]:
# Predictions on the training pool and on the test set.
y_pred_train = optimized_lg.best_estimator_.predict(X)
y_pred_test = optimized_lg.best_estimator_.predict(X_test)

In [None]:
# Heart age delta (bias)
heart_age_delta_train = y_pred_train - y
heart_age_delta_test = y_pred_test - y_test

In [None]:
# Train Bias Correction Model
# (regresses the training delta on the true age)
bias_model = LinearRegression()
bias_model.fit(y.values.reshape(-1, 1), heart_age_delta_train)

In [None]:
# Get bias correction parameters
beta1 = bias_model.coef_[0]
beta0 = bias_model.intercept_

In [None]:
# Apply Bias Correction
# (subtracts the delta the bias model predicts for each true test age)
y_pred_test_corrected = y_pred_test - (beta1 * y_test + beta0)

In [None]:
def evaluate_model(y_true, y_pred, label="Model"):
    """Print MAE, MSE, R² and the correlation of `y_pred` against `y_true`.

    `label` is used in the heading of the printed report. Nothing is returned.
    """
    pearson_r = np.corrcoef(y_true, y_pred)[0, 1]
    report_lines = [
        f"\n🔹 {label} Results:",
        f"   MAE: {mean_absolute_error(y_true, y_pred):.4f}",
        f"   MSE: {mean_squared_error(y_true, y_pred):.4f}",
        f"   R²: {r2_score(y_true, y_pred):.4f}",
        f"   Correlation: {pearson_r:.2f}",
    ]
    for line in report_lines:
        print(line)

In [None]:
# Before correction
evaluate_model(y_test, y_pred_test, "Linear regression (Before Bias Correction)")

In [None]:
# After correction
evaluate_model(y_test, y_pred_test_corrected, "Linear regression (After Bias Correction)")

In [None]:
# Recomputes the linear-regression predictions on the training pool and the test set.
y_pred_train = optimized_lg.best_estimator_.predict(X)
y_pred_test = optimized_lg.best_estimator_.predict(X_test)

In [None]:
# Compute heart age delta before and after correction
heart_age_delta_test_before = y_pred_test - y_test
heart_age_delta_test_after = y_pred_test_corrected - y_test

In [None]:
# Train Bias Correction Model
# (heart_age_delta_train is not recomputed in this cell; the value from the earlier cell is reused)
bias_model = LinearRegression()
bias_model.fit(y.values.reshape(-1, 1), heart_age_delta_train)

In [None]:
# Bias correction parameters
beta1 = bias_model.coef_[0]
beta0 = bias_model.intercept_

In [None]:
beta0

In [None]:
y_pred_test_corrected = y_pred_test - (beta1 * y_test + beta0)

In [None]:
# Correlation of the predicted heart age with the actual age ...
corr_pred_actual_before, _ = pearsonr(y_pred_test, y_test)
corr_pred_actual_after, _ = pearsonr(y_pred_test_corrected, y_test)

In [None]:
# ... and of the heart age delta with the actual age.
corr_delta_actual_before, _ = pearsonr(heart_age_delta_test_before, y_test)
corr_delta_actual_after, _ = pearsonr(heart_age_delta_test_after, y_test)

In [None]:
# Print correlation results
print("\n🔹 Correlation Between Predicted Heart Age & Actual Age:")
print(f"   Before Correction: {corr_pred_actual_before:.4f}")
print(f"   After Correction: {corr_pred_actual_after:.4f}")

In [None]:
print("\n🔹 Correlation Between Heart Age Delta & Actual Age:")
print(f"   Before Correction: {corr_delta_actual_before:.4f}")
print(f"   After Correction: {corr_delta_actual_after:.4f}")

## Random Forest

In [None]:
random_grid = {
    'n_estimators': [10, 50, 100, 200, 500],  # Num of trees
    'max_depth': [1, 3, 6, 10, 12],  # maximum depth of the tree
    'min_samples_split': [1, 10, 50, 100],  # minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 10, 50, 100],  # minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2', None],  # number of features
    'bootstrap': [False, True]  # bootstrap samples 
}

In [None]:
optimized_rf = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=random_grid,
    n_iter=100,  # Number of iterations
    cv=10,  # CV con 10 folds
    scoring=scoring_metrics,  # Metrics
    refit='MAE', 
    verbose=1,
    n_jobs=-1,
    return_train_score=True
)

In [None]:
# Fitted on the training split only; X_val is used for the evaluation cells below.
optimized_rf.fit(X_train, np.ravel(y_train))

In [None]:
optimized_rf.best_params_

In [None]:
print("Best MAE:", optimized_rf.best_score_)
print("Best MSE:", max(optimized_rf.cv_results_['mean_test_MSE']))
print("Best R2:", max(optimized_rf.cv_results_['mean_test_R2']))

In [None]:
optimized_rf.cv_results_

In [None]:
print("Best MAE:", optimized_rf.best_score_)
print("Best MSE:", max(optimized_rf.cv_results_['mean_train_MSE']))
print("Best R2:", max(optimized_rf.cv_results_['mean_train_R2']))

In [None]:
# Random-forest predictions. Note that `y_pred_test` holds predictions for the
# validation split (X_val) in this section, not for X_test.
y_pred_train = optimized_rf.best_estimator_.predict(X_train)
y_pred_test = optimized_rf.best_estimator_.predict(X_val)

In [None]:
# Heart age delta (bias)
heart_age_delta_train = y_pred_train - y_train
heart_age_delta_test = y_pred_test - y_val

In [None]:
# Train Bias Correction Model
bias_model = LinearRegression()
bias_model.fit(y_train.values.reshape(-1, 1), heart_age_delta_train)

In [None]:
# Bias correction parameters
beta1 = bias_model.coef_[0]
beta0 = bias_model.intercept_

In [None]:
# Subtract the delta the bias model predicts for each true validation age.
y_pred_test_corrected = y_pred_test - (beta1 * y_val + beta0)

In [None]:
# Compute heart age delta before and after correction
heart_age_delta_test_before = y_pred_test - y_val
heart_age_delta_test_after = y_pred_test_corrected - y_val

In [None]:
# Correlations of prediction and delta with actual age, all on the validation split.
corr_pred_actual_before, _ = pearsonr(y_pred_test, y_val)
corr_pred_actual_after, _ = pearsonr(y_pred_test_corrected, y_val)
corr_delta_actual_before, _ = pearsonr(heart_age_delta_test_before, y_val)
corr_delta_actual_after, _ = pearsonr(heart_age_delta_test_after, y_val)

In [None]:
# Print correlation results
print("\n🔹 Correlation Between Predicted Heart Age & Actual Age:")
print(f"   Before Correction: {corr_pred_actual_before:.4f}")
print(f"   After Correction: {corr_pred_actual_after:.4f}")
print("\n🔹 Correlation Between Heart Age Delta & Actual Age:")
print(f"   Before Correction: {corr_delta_actual_before:.4f}")
print(f"   After Correction: {corr_delta_actual_after:.4f}")

In [None]:
# Before correction
evaluate_model(y_train, y_pred_train, "XGBoost (Before Bias Correction)")

In [None]:
# Before correction
evaluate_model(y_val, y_pred_test, "XGBoost (Before Bias Correction)")

In [None]:
# After correction
evaluate_model(y_val, y_pred_test_corrected, "XGBoost (After Bias Correction)")

## BayesianRidge

In [None]:
random_grid = {
    'alpha_1': np.logspace(-6, -1, 10),
    'alpha_2': np.logspace(-6, -1, 10),
    'lambda_1': np.logspace(-6, -1, 10),
    'lambda_2': np.logspace(-6, -1, 10),  
    'tol': [1e-4, 1e-3, 1e-2, 1e-1] 
}

In [None]:
optimized_br = RandomizedSearchCV(
    estimator=BayesianRidge(),
    param_distributions=random_grid,
    n_iter=100,  
    cv=10, 
    scoring=scoring_metrics, 
    refit='MAE',  
    verbose=1,
    n_jobs=-1, 
    return_train_score=True
)

In [None]:
optimized_br.fit(X, y)

In [None]:
print("Best MAE:", optimized_br.best_score_)
print("Best MSE:", max(optimized_br.cv_results_['mean_test_MSE']))
print("Best R2:", max(optimized_br.cv_results_['mean_test_R2']))

In [None]:
# BayesianRidge predictions on the training pool and on the test set.
y_pred_train = optimized_br.best_estimator_.predict(X)
y_pred_test = optimized_br.best_estimator_.predict(X_test)

In [None]:
# Heart age delta (bias)
heart_age_delta_train = y_pred_train - y
heart_age_delta_test = y_pred_test - y_test

In [None]:
# Bias model: regresses the training delta on the true age.
bias_model = LinearRegression()
bias_model.fit(y.values.reshape(-1, 1), heart_age_delta_train)

In [None]:
# Bias correction parameters
beta1 = bias_model.coef_[0]
beta0 = bias_model.intercept_

In [None]:
# Subtract the delta the bias model predicts for each true test age.
y_pred_test_corrected = y_pred_test - (beta1 * y_test + beta0)

In [None]:
# Correlation of the predicted heart age with the actual age, before and after correction.
corr_pred_actual_before, _ = pearsonr(y_pred_test, y_test)
corr_pred_actual_after, _ = pearsonr(y_pred_test_corrected, y_test)

In [None]:
corr_delta_actual_before, _ = pearsonr(heart_age_delta_test_before, y_test)
corr_delta_actual_after, _ = pearsonr(heart_age_delta_test_after, y_test)

In [None]:
# Print correlation results
print("\n🔹 Correlation Between Predicted Heart Age & Actual Age:")
print(f"   Before Correction: {corr_pred_actual_before:.4f}")
print(f"   After Correction: {corr_pred_actual_after:.4f}")

In [None]:
print("\n🔹 Correlation Between Heart Age Delta & Actual Age:")
print(f"   Before Correction: {corr_delta_actual_before:.4f}")
print(f"   After Correction: {corr_delta_actual_after:.4f}")

In [None]:
# Before correction
evaluate_model(y_test, y_pred_test, "XGBoost (Before Bias Correction)")

In [None]:
# After correction
evaluate_model(y_test, y_pred_test_corrected, "XGBoost (After Bias Correction)")

## LASSO

In [None]:
lasso_grid = {
    'alpha': np.logspace(-4, 1, 50)  # Regularization strength
}

In [None]:
lasso_search = RandomizedSearchCV(
    estimator=Lasso(),
    param_distributions=lasso_grid,
    n_iter=100, 
    cv=10, 
    scoring=scoring_metrics,  
    refit='MAE',
    verbose=1,
    n_jobs=-1, 
    return_train_score=True
)


In [None]:
lasso_search.fit(X, np.ravel(y))

In [None]:
# Cross-validated scores of the selected candidate. The error scorers are
# negated by scikit-learn, so the sign is flipped back before printing. All
# three values are read at best_index_ so they describe the same model.
lasso_cv_results = lasso_search.cv_results_
lasso_best_idx = lasso_search.best_index_
print("Best MAE:", -lasso_cv_results['mean_test_MAE'][lasso_best_idx])
print("Best MSE:", -lasso_cv_results['mean_test_MSE'][lasso_best_idx])
print("Best R2:", lasso_cv_results['mean_test_R2'][lasso_best_idx])

In [None]:
# Lasso predictions on the training pool and on the test set.
y_pred_train = lasso_search.best_estimator_.predict(X)
y_pred_test = lasso_search.best_estimator_.predict(X_test)

In [None]:
# Heart age delta (bias)
heart_age_delta_train = y_pred_train - y
heart_age_delta_test = y_pred_test - y_test

In [None]:
# Bias model: regresses the training delta on the true age.
bias_model = LinearRegression()
bias_model.fit(y.values.reshape(-1, 1), heart_age_delta_train)

In [None]:
# Bias correction parameters
beta1 = bias_model.coef_[0]
beta0 = bias_model.intercept_

In [None]:
# Subtract the delta the bias model predicts for each true test age.
y_pred_test_corrected = y_pred_test - (beta1 * y_test + beta0)

In [None]:
# Before correction
evaluate_model(y_test, y_pred_test, "XGBoost (Before Bias Correction)")

In [None]:
# After correction
evaluate_model(y_test, y_pred_test_corrected, "XGBoost (After Bias Correction)")

In [None]:
corr_pred_actual_before, _ = pearsonr(y_pred_test, y_test)
corr_pred_actual_after, _ = pearsonr(y_pred_test_corrected, y_test)

In [None]:
corr_delta_actual_before, _ = pearsonr(heart_age_delta_test_before, y_test)
corr_delta_actual_after, _ = pearsonr(heart_age_delta_test_after, y_test)

In [None]:
# Print correlation results
print("\n🔹 Correlation Between Predicted Heart Age & Actual Age:")
print(f"   Before Correction: {corr_pred_actual_before:.4f}")
print(f"   After Correction: {corr_pred_actual_after:.4f}")

In [None]:
print("\n🔹 Correlation Between Heart Age Delta & Actual Age:")
print(f"   Before Correction: {corr_delta_actual_before:.4f}")
print(f"   After Correction: {corr_delta_actual_after:.4f}")

## Ridge

In [None]:
ridge_grid = {
    'alpha': np.logspace(-4, 1, 50)  # Regularization strength
}

In [None]:
ridge_search = RandomizedSearchCV(
    estimator=Ridge(),
    param_distributions=ridge_grid,
    n_iter=20,
    cv=5,
    scoring=scoring_metrics,
    refit='MAE',
    verbose=1,
    n_jobs=-1,
    return_train_score=True
)

In [None]:
ridge_search.fit(X, y)

In [None]:
print("Best MAE:", ridge_search.best_score_)
print("Best MSE:", max(ridge_search.cv_results_['mean_test_MSE']))
print("Best R2:", max(ridge_search.cv_results_['mean_test_R2']))

In [None]:
# Predict the test set with the refitted best Ridge estimator.
y_pred_ridge = ridge_search.best_estimator_.predict(X_test)

In [None]:
# Test-set metrics of the uncorrected Ridge predictions (no bias correction in this section).
print("Ridge Test MAE:", mean_absolute_error(y_test, y_pred_ridge))
print("Ridge Test MSE:", mean_squared_error(y_test, y_pred_ridge))
print("Ridge Test R²:", r2_score(y_test, y_pred_ridge))