# Imports

In [1]:
import glob
import logging
import os
import textwrap

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb

from hyperopt import fmin, tpe, hp, Trials
from hyperopt.pyll.base import scope
#from matplotlib import pyplot
#from skopt import BayesSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, mean_squared_log_error, r2_score
from sklearn.model_selection import GridSearchCV, ParameterSampler
#from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid, train_test_split, ParameterSampler
#from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from tqdm import tqdm

#from keras.models import Sequential
#from keras.layers import LSTM, Dense, Dropout, Input
#from keras.callbacks import EarlyStopping

In [2]:
# Set up logging
# Configure the root logger once for the whole notebook: records at INFO level
# and above are sent to two handlers (a file and a stream), each line formatted
# as "timestamp - level - message".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.FileHandler("logging.log"),  # relative path: resolved against the working directory
        logging.StreamHandler()  # no stream given, so the handler's default stream is used
    ]
)

# Load - Train - Save Models without HP

In [3]:
# === SETTINGS ===
# Input location and experiment identifiers used to build the CSV file names.
base_path = "../../07_Imputation/CSV/exports/CIR-16/impute/"
observation_window = 'o4'
label = 'los'

# Output folders for the saved models and for each family of plots.
model_output_dir = "models/"
plot_dir_error = "plots/03_Error_Metric_Plots"
plot_dir_most_important_shap = "plots/01_Most_Important_SHAP"
plot_dir_true_vs_predict = "plots/02_Prediction_Plot/02_true_vs_pred"
plot_dir_residuals = "plots/02_Prediction_Plot/01_residuals"
plot_dir_calibration = "plots/04_Calibration_Plots"

# Create every output folder up front; exist_ok makes re-running the cell harmless.
for _output_dir in (
    model_output_dir,
    plot_dir_error,
    plot_dir_most_important_shap,
    plot_dir_true_vs_predict,
    plot_dir_residuals,
    plot_dir_calibration,
):
    os.makedirs(_output_dir, exist_ok=True)

In [4]:
# === HELPER FUNCTION TO MATCH FILES ===
def find_file(path, pattern):
    """Return the first file in ``path`` whose name matches the glob ``pattern``.

    ``glob.glob`` returns matches in arbitrary, filesystem-dependent order, so
    the matches are sorted and the lexicographically smallest one is returned.
    This keeps the choice reproducible when a wildcard hits several files.

    Args:
        path: Directory to search.
        pattern: Glob pattern relative to ``path`` (e.g. ``"o4_X_train*.csv"``).

    Returns:
        The matching path as a string, or ``None`` when nothing matches.
    """
    matches = sorted(glob.glob(os.path.join(path, pattern)))
    return matches[0] if matches else None

In [5]:
# === FUNCTION TO RUN XGBOOST ===
def _read_matching_csv(directory, pattern):
    """Load the CSV in ``directory`` that matches ``pattern``, failing loudly if there is none."""
    csv_path = find_file(directory, pattern)
    if csv_path is None:
        # find_file returns None on no match; passing that to pd.read_csv would
        # raise an error that does not name the missing file.
        raise FileNotFoundError(f"No file matching '{pattern}' found in '{directory}'")
    return pd.read_csv(csv_path)


def run_xgboost():
    """Train, save and evaluate one default XGBoost regressor per ``seq_*`` folder.

    For every sub-folder of ``base_path`` whose name starts with ``seq_`` the
    function loads the train/validate/test/external feature CSVs from that
    folder and the matching label CSVs from ``base_path/labels``, fits an
    ``XGBRegressor`` with default hyperparameters, saves it as JSON and writes
    the prediction, residual, calibration, error-metric, feature-importance and
    SHAP plots. A failure in one folder is logged (with traceback) and the loop
    continues with the next folder.

    After the loop the collected metrics are written to ``all_seq_metrics.csv``
    in ``plot_dir_error`` and one internal-vs-external bar chart per metric is
    saved next to it. If no folder succeeded, the summary step is skipped.

    Returns:
        None. All results are written to disk or to the log.
    """
    all_metrics = []  # one dict per (folder, dataset) pair; feeds the summary CSV and plots

    # All folders share one label directory, so resolve it once outside the loop.
    load_path_label = os.path.join(base_path, "labels")
    seq_folders = sorted(
        f for f in os.listdir(base_path)
        if f.startswith("seq_") and os.path.isdir(os.path.join(base_path, f))
    )

    for folder in seq_folders:
        logging.info(f"Processing folder: {folder}")
        load_path = os.path.join(base_path, folder)

        try:
            # Load data
            X_train = _read_matching_csv(load_path, f"{observation_window}_X_train*.csv")
            y_train = _read_matching_csv(load_path_label, f"{observation_window}_y_train_{label}.csv")

            # The validation split is still loaded (so a missing file fails the folder as
            # before) but it is not used by this default-hyperparameter run.
            X_validate = _read_matching_csv(load_path, f"{observation_window}_X_validate*.csv")
            y_validate = _read_matching_csv(load_path_label, f"{observation_window}_y_validate_{label}.csv")

            X_test = _read_matching_csv(load_path, f"{observation_window}_X_test*.csv")
            y_test = _read_matching_csv(load_path_label, f"{observation_window}_y_test_{label}.csv")

            X_external = _read_matching_csv(load_path, f"{observation_window}_X_external*.csv")
            y_external = _read_matching_csv(load_path_label, f"{observation_window}_y_external_{label}.csv")

            # Train model
            model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
            model.fit(X_train, y_train)

            # Save model; log the path that was actually written (no doubled slash)
            file_prefix = f"{folder}_{observation_window}_{label}"
            model_path = os.path.join(model_output_dir, f"{file_prefix}_model.json")
            model.save_model(model_path)
            logging.info(f"Saved model to {model_path}")

            # Predict
            y_pred_test = model.predict(X_test)
            y_pred_ext = model.predict(X_external)

            # Plot True vs Pred for test and external sets
            plot_true_vs_pred(y_test, y_pred_test, file_prefix, set_name="test")
            plot_true_vs_pred(y_external, y_pred_ext, file_prefix, set_name="external")

            # Log metrics (each metric is computed once and reused)
            mse_test = mean_squared_error(y_test, y_pred_test)
            mae_test = mean_absolute_error(y_test, y_pred_test)
            mse_ext = mean_squared_error(y_external, y_pred_ext)
            mae_ext = mean_absolute_error(y_external, y_pred_ext)
            logging.info(f"[{folder}] Test MSE: {mse_test:.4f} | MAE: {mae_test:.4f} | RMSE:{np.sqrt(mse_test):.4f} | R2: {r2_score(y_test, y_pred_test)*100:.2f}")
            logging.info(f"[{folder}] External MSE: {mse_ext:.4f} | MAE: {mae_ext:.4f} | RMSE: {np.sqrt(mse_ext):.4f} | R2: {r2_score(y_external, y_pred_ext)*100:.2f}")

            # Residuals plot
            plot_residuals(y_test, y_pred_test, mae=mae_test, file_prefix=file_prefix)

            # Calibration plots
            plot_calibration(y_test, y_pred_test, file_prefix, set_name="internal")
            plot_calibration(y_external, y_pred_ext, file_prefix, set_name="external")

            # Plot + metrics
            internal_metrics = plot_error_metrics(y_test, y_pred_test, file_prefix, plot_label='internal', config_label="no HP")
            external_metrics = plot_error_metrics(y_external, y_pred_ext, file_prefix, plot_label='external', config_label="no HP")

            # Plot top N feature importances
            feature_importance_plot(model, X_train, file_prefix, top_n=20)

            # Shap plot
            generate_shap_plot(model, X_train, file_prefix, top_n=20)

            all_metrics.append({"folder": folder, "dataset": "internal", **internal_metrics})
            all_metrics.append({"folder": folder, "dataset": "external", **external_metrics})

        except Exception as e:
            # Best-effort loop: keep going, but record the traceback for diagnosis.
            logging.exception(f"Failed in folder {folder}: {str(e)}")

    # === AFTER MAIN LOOP ===
    if not all_metrics:
        # An empty frame has no "folder"/"dataset" columns, so melt() below would raise.
        logging.warning("No folder was processed successfully; skipping summary metrics and comparison plots.")
        return

    metrics_df = pd.DataFrame(all_metrics)
    summary_csv_path = os.path.join(plot_dir_error, "all_seq_metrics.csv")
    metrics_df.to_csv(summary_csv_path, index=False)
    logging.info(f"Saved summary metrics to: {summary_csv_path}")

    # Long format: one row per (folder, dataset, metric); NaN rows (e.g. MSLE) are dropped.
    metrics_melted = metrics_df.melt(
        id_vars=["folder", "dataset"],
        value_vars=["MSE", "MAE", "RMSE", "R2", "MSLE"],
        var_name="Metric",
        value_name="Value"
    ).dropna()

    for metric in metrics_melted["Metric"].unique():
        plt.figure(figsize=(12, 6))
        subset = metrics_melted[metrics_melted["Metric"] == metric]

        sns.barplot(data=subset, x="folder", y="Value", hue="dataset", palette="Set2", errorbar=None)
        plt.title(f"{metric} Comparison (Internal vs External)")
        plt.ylabel(metric)
        plt.xlabel("Sequence Folder")
        plt.xticks(rotation=45)
        plt.legend(title="Dataset")
        plt.tight_layout()

        metric_plot_path = os.path.join(plot_dir_error, f"metric_{metric}_comparison_plot.png")
        plt.savefig(metric_plot_path, dpi=300)
        plt.close()

# === MAIN FUNCTION ENTRY POINT ===
def main(model_type="xgboost"):
    """Dispatch to the training pipeline for ``model_type``.

    Only ``"xgboost"`` (compared case-insensitively) is handled; any other
    value is reported through the log and nothing is run.
    """
    if model_type.lower() != "xgboost":
        logging.error(f"Model type '{model_type}' is not supported.")
        return
    run_xgboost()

# Plots

In [6]:
"""
Plots error metrics (MSE, MAE, RMSE, MSLE if possible) and R².
"""

def plot_error_metrics(y_true, y_pred, file_prefix: str, plot_label: str, config_label: str = "no HP"):
    """Compute the regression metrics, save a bar chart and an R² pie chart, and return the metrics.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.
        file_prefix: Prefix for the two output file names.
        plot_label: Name of the evaluated set; used in titles, log lines and file names.
        config_label: Configuration tag shown in the titles; spaces become underscores in file names.

    Returns:
        dict with keys ``MSE``, ``MAE``, ``RMSE``, ``R2`` (in percent) and ``MSLE``
        (``nan`` when scikit-learn raises ``ValueError`` while computing it).
    """
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred) * 100

    set_title = plot_label.title()
    # Common stem of both output files: <dir>/<prefix>_<set>_<config>
    out_stem = f"{plot_dir_error}/{file_prefix}_{plot_label}_{config_label.replace(' ', '_')}"

    # Metric name -> value, in the order the bars are drawn.
    bars = {'MSE': mse, 'MAE': mae, 'RMSE': rmse}
    try:
        msle = mean_squared_log_error(y_true, y_pred)
    except ValueError:
        msle = np.nan
        logging.info(f"{plot_label.title()} Set MSLE: Not computable due to negative values.")
    else:
        logging.info(f"{set_title} Set MSLE: {msle:.4f}")
        bars['MSLE'] = msle

    # Bar plot
    palette = ['blue', 'green', 'red', 'orange']
    bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
    bar_ax.bar(list(bars.keys()), list(bars.values()), color=palette[:len(bars)])
    bar_ax.set_xlabel('Error Metric')
    bar_ax.set_ylabel('Value')
    bar_ax.set_title(f'Error Metrics ({set_title} Set) - {config_label}')
    bar_fig.savefig(f"{out_stem}_error_metrics.png", dpi=300, bbox_inches='tight')
    plt.close(bar_fig)

    # R² pie plot: a non-negative score is split into explained/unexplained,
    # anything else is drawn as fully unexplained.
    if r2 >= 0:
        wedges = [r2, 100 - r2]
        wedge_labels = ['Explained Variance (R2)', 'Unexplained Variance']
        wedge_colors = ['lightblue', 'lightgrey']
    else:
        wedges = [100]
        wedge_labels = ['Unexplained Variance']
        wedge_colors = ['lightgrey']

    pie_fig, pie_ax = plt.subplots(figsize=(6, 6))
    pie_ax.pie(wedges, labels=wedge_labels, colors=wedge_colors, autopct='%1.1f%%')
    pie_ax.set_title(f'Explained Variance by R² ({set_title} Set) - {config_label}')
    pie_fig.savefig(f"{out_stem}_R2.png", dpi=300, bbox_inches='tight')
    plt.close(pie_fig)

    return dict(MSE=mse, MAE=mae, RMSE=rmse, R2=r2, MSLE=msle)

In [7]:
"""
Generate and save a plot of the top N most important features.
"""

def feature_importance_plot(model, X_train, file_prefix: str, top_n: int = 20):
    """Save a horizontal bar chart of the ``top_n`` largest feature importances.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        X_train: Frame whose ``columns`` are paired positionally with the importances.
        file_prefix: Prefix used in the plot title and the output file name.
        top_n: Number of highest-ranked features to draw.

    The figure is written into ``plot_dir_most_important_shap``; nothing is returned.
    """
    ranked = (
        pd.DataFrame({'Feature': X_train.columns, 'Importance': model.feature_importances_})
        .sort_values(by='Importance', ascending=False)
        .head(top_n)
        # Multiply by a fixed factor so the axis shows larger numbers (axis label says "scaled").
        .assign(Importance=lambda frame: frame['Importance'] * 100000)
    )

    # Plot
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=ranked, x='Importance', y='Feature', color='steelblue', ax=ax)
    ax.set_title(f'Top {top_n} Most Important Features - {file_prefix}')
    ax.set_xlabel('Importance (scaled)')
    ax.set_ylabel('Feature')
    fig.tight_layout()

    # Save plot
    importance_path = os.path.join(
        plot_dir_most_important_shap,
        f"{file_prefix}_top{top_n}_feature_importance.png",
    )
    fig.savefig(importance_path, dpi=300)
    plt.close(fig)
    logging.info(f"Saved top {top_n} feature importance plot to {importance_path}")


In [8]:
"""
Generate and save a SHAP summary plot (dot type) for the given model and training data.
"""

def generate_shap_plot(model, X_train, file_prefix: str, top_n: int = 20):
    """Log the ``top_n`` features by mean |SHAP| and save a SHAP dot summary plot.

    Args:
        model: Fitted tree model accepted by ``shap.TreeExplainer``.
        X_train: Feature frame the SHAP values are computed on.
        file_prefix: Prefix used in log lines and the output file name.
        top_n: Number of features listed in the log and shown in the plot.

    The plot is written into ``plot_dir_most_important_shap``. Any exception is
    logged with its traceback and swallowed, so a SHAP failure does not abort
    the caller; the figure is closed in every case.
    """
    fig = None
    try:
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_train)

        # Rank features by the mean absolute SHAP value over all rows.
        mean_abs_shap = np.abs(shap_values).mean(axis=0)
        shap_df = pd.DataFrame({
            'Feature': X_train.columns,
            'Mean Absolute SHAP Value': mean_abs_shap
        }).sort_values(by='Mean Absolute SHAP Value', ascending=False)

        logging.info(f"Top {top_n} features by SHAP importance for {file_prefix}:")
        logging.info("\n" + shap_df.head(top_n).to_string(index=False))

        # SHAP summary plot; max_display keeps the plot in step with the top_n
        # ranking logged above instead of the library default.
        fig = plt.figure()
        shap.summary_plot(shap_values, X_train, plot_type="dot", max_display=top_n, show=False)
        plt.grid(True)

        shap_path = os.path.join(plot_dir_most_important_shap, f"{file_prefix}_shap_plot.png")
        plt.savefig(shap_path, dpi=300, bbox_inches='tight')
        plt.close()

        logging.info(f"Saved SHAP summary plot to {shap_path}")

    except Exception as e:
        logging.exception(f"Failed to generate SHAP plot for {file_prefix}: {str(e)}")

    finally:
        # Release the figure even when plotting or saving failed part-way;
        # closing an already-closed figure is a no-op.
        if fig is not None:
            plt.close(fig)


In [9]:
"""
Plot predicted vs. true values for LOS.
"""

def plot_true_vs_pred(y_true, y_pred, file_prefix: str, set_name: str):
    """Save a scatter plot of predicted against true values with the identity line.

    Args:
        y_true: Ground-truth values; flattened to 1-D before plotting.
        y_pred: Predicted values; flattened to 1-D before plotting.
        file_prefix: Prefix for the output file name.
        set_name: Name of the evaluated set; used in the title, file name and log line.

    The figure is written into ``plot_dir_true_vs_predict``; nothing is returned.
    """
    # Ensure 1D arrays
    truth = np.ravel(y_true)
    preds = np.ravel(y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(truth, preds, color='blue', label='Prediction', alpha=0.6)

    # The reference line spans the joint range of both arrays.
    lower = float(np.minimum(truth.min(), preds.min()))
    upper = float(np.maximum(truth.max(), preds.max()))
    diagonal = np.linspace(lower, upper, 100)
    ax.plot(diagonal, diagonal, color='red', linestyle='--', label='Perfect Prediction')

    ax.set_xlabel('True LOS')
    ax.set_ylabel('Predicted LOS')
    ax.legend()
    ax.grid(True)
    ax.set_title(f'Predicted vs. True LOS ({set_name.title()})')

    # Save plot
    out_path = os.path.join(plot_dir_true_vs_predict, f"{file_prefix}_true_vs_pred_{set_name}_plot.png")
    fig.savefig(out_path, dpi=300, bbox_inches='tight')
    plt.close(fig)

    logging.info(f"Saved {set_name} True vs. Predicted plot to {plot_dir_true_vs_predict}")

In [10]:
"""
Plot residuals (true - predicted) with MAE bounds and save to file.
"""

def plot_residuals(y_true, y_pred, mae: float, file_prefix: str):
    """Save a residual scatter plot (true minus predicted) with horizontal lines at 0 and ±MAE.

    Args:
        y_true: Ground-truth values; flattened to 1-D before plotting.
        y_pred: Predicted values; flattened to 1-D before plotting.
        mae: Mean absolute error, drawn as the two green bounds.
        file_prefix: Prefix for the output file name.

    The figure is written into ``plot_dir_residuals``; nothing is returned.
    """
    truth = np.ravel(y_true)
    errors = truth - np.ravel(y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(truth, errors, color='blue', alpha=0.5, label="Residuals")
    ax.axhline(y=0, color='red', linestyle='--', label="Zero Line")
    # Only the upper bound carries a label so the legend shows the MAE once.
    ax.axhline(y=mae, color='green', linestyle='--', label=f"MAE = {mae:.2f}")
    ax.axhline(y=-mae, color='green', linestyle='--')

    ax.set_xlabel('True LOS')
    ax.set_ylabel('Residuals (True - Predicted)')
    ax.set_title('Residuals Plot with MAE Bounds')
    ax.grid(True)
    # Anchor the legend outside the axes, to the right of the plot area.
    ax.legend(loc="upper left", bbox_to_anchor=(1, 1))

    # Save plot
    out_path = os.path.join(plot_dir_residuals, f"{file_prefix}_residuals_plot.png")
    fig.savefig(out_path, dpi=300, bbox_inches='tight')
    plt.close(fig)

    logging.info(f"Saved residuals plot to {plot_dir_residuals}")

In [11]:
"""
Generate and save a calibration plot comparing predicted and actual LOS values.
"""

def plot_calibration(y_true, y_pred, file_prefix: str, set_name: str):
    """Save a calibration plot: scatter plus LOWESS trend of predicted vs. actual values.

    Args:
        y_true: Ground-truth values; flattened to 1-D before plotting.
        y_pred: Predicted values; flattened to 1-D before plotting.
        file_prefix: Prefix for the output file name.
        set_name: Name of the evaluated set; used in the title and file name.

    The figure is written into ``plot_dir_calibration``; nothing is returned.
    """
    actual = np.ravel(y_true)
    predicted = np.ravel(y_pred)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.regplot(
        x=actual,
        y=predicted,
        lowess=True,
        line_kws={'color': 'red'},
        scatter_kws={'alpha': 0.4},
        ax=ax,
    )

    # Perfect calibration line (y = x) across the joint range of both arrays
    lower = float(np.minimum(actual.min(), predicted.min()))
    upper = float(np.maximum(actual.max(), predicted.max()))
    ax.plot([lower, upper], [lower, upper], 'k--', lw=2)

    ax.set_xlabel('Actual LOS')
    ax.set_ylabel('Predicted LOS')
    ax.set_title(f'Calibration Plot: {set_name.title()}')
    ax.grid(True)

    # Save plot
    out_path = os.path.join(plot_dir_calibration, f"{file_prefix}_calibration_{set_name}.png")
    fig.savefig(out_path, dpi=300, bbox_inches='tight')
    plt.close(fig)

    logging.info(f"Saved calibration plot to {plot_dir_calibration}")

# Call the Models

In [12]:
# Entry point: main() dispatches to run_xgboost(), which trains and evaluates
# one default-hyperparameter model per seq_* folder and writes models/plots to disk.
main(model_type="xgboost")

2025-05-26 22:06:30,189 - INFO - Processing folder: seq_00_mean_mean_xgboost_xgboost_lstm_lstm_gan
2025-05-26 22:06:43,197 - INFO - Saved model to models//seq_00_mean_mean_xgboost_xgboost_lstm_lstm_gan_o4_los_model.json
2025-05-26 22:06:44,339 - INFO - Saved test True vs. Predicted plot to plots/02_Prediction_Plot/02_true_vs_pred
2025-05-26 22:06:46,699 - INFO - Saved external True vs. Predicted plot to plots/02_Prediction_Plot/02_true_vs_pred
2025-05-26 22:06:46,714 - INFO - [seq_00_mean_mean_xgboost_xgboost_lstm_lstm_gan] Test MSE: 4.9665 | MAE: 1.7880 | RMSE:2.2286 | R2: 1.66
2025-05-26 22:06:46,714 - INFO - [seq_00_mean_mean_xgboost_xgboost_lstm_lstm_gan] External MSE: 8.5075 | MAE: 2.5435 | RMSE: 2.9168 | R2: -94.11
2025-05-26 22:06:47,432 - INFO - Saved residuals plot to plots/02_Prediction_Plot/01_residuals
2025-05-26 22:06:48,303 - INFO - Saved calibration plot to plots/04_Calibration_Plots
2025-05-26 22:06:58,969 - INFO - Saved calibration plot to plots/04_Calibration_Plots
20

# HP GridSearchCV
## Too slow

In [None]:
"""
A smaller learning rate makes the boosting
process more robust and can lead to better
generalization but requires more trees
(higher n_estimators) to achieve the same result.
A larger learning rate speeds up training but
may risk overfitting.
"""

# NOTE(review): this cell reads X_train, y_train, X_validate and y_validate from
# the notebook's global namespace. No earlier cell in view defines them
# (run_xgboost only holds them as locals) -- confirm they are loaded before running.

# Define the parameter grid
# The grid is exhaustive: 3 x 6 x 9 x 15 x 15 value combinations, each fitted
# once per CV fold, which is why this search is impractically slow.
param_grid = {
    'n_estimators': [100, 200, 300],  # controls the total number of trees in the ensemble
    'learning_rate': np.arange(0.01, 1.02, 0.2),
    'max_depth': np.arange(1, 10, 1),
    'reg_lambda': np.arange(0.1, 15.1, 1),
    'reg_alpha': np.arange(0.1, 15.1, 1)
}

# Create an XGBoost Regressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Create GridSearchCV (imported in the notebook's import cell)
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,  # Number of folds for cross-validation
    verbose=1
)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score (negative MSE):", grid_search.best_score_)

# Predict on the validation set with the best model
y_pred_validate = grid_search.predict(X_validate)

# Optionally: Evaluate the model on the validation set
# (mean_squared_error comes from the import cell; no local re-import needed)
mse = mean_squared_error(y_validate, y_pred_validate)
print("Validation MSE:", mse)

# HP RandomizedSearchCV & Train Model
Randomly samples a subset of the hyperparameter combinations instead of evaluating all of them.

In [None]:
# Random search over a fixed candidate grid: draw n_iter parameter combinations,
# fit one model per combination on the training set, keep the one with the lowest
# validation MSE, then report that model's error on the test and external sets.
#
# NOTE(review): ParameterSampler must be importable from sklearn.model_selection in
# the import cell. X_train, y_train, X_validate, y_validate, X_test, y_test,
# X_external and y_external are read from the notebook's global namespace and are
# not defined by any earlier cell in view -- confirm they are loaded first.

# Define the parameter grid
param_distributions = {
    'n_estimators': [100, 200, 300],
    'learning_rate': np.arange(0.01, 1.02, 0.2),
    'max_depth': np.arange(1, 10, 1),
    'reg_lambda': np.arange(0.1, 15.1, 1),
    'reg_alpha': np.arange(0.1, 15.1, 1)
}

# Number of random samples
n_iter = 50

# Generate random combinations
# (fixed random_state, so the same combinations are drawn on every run)
param_list = list(ParameterSampler(param_distributions, n_iter=n_iter, random_state=42))

# Tracking best model
best_score = float('inf')  # lowest validation MSE seen so far
best_params = None
best_model = None

# Progress bar
for params in tqdm(param_list, desc="Hyperparameter tuning"):
    model = xgb.XGBRegressor(objective='reg:squarederror', **params)
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_pred_val = model.predict(X_validate)
    
    # Evaluate with MSE
    mse = mean_squared_error(y_validate, y_pred_val)
    
    # Keep the fitted model itself, so no refit is needed after the loop
    if mse < best_score:
        best_score = mse
        best_params = params
        best_model = model

# Evaluate best model on test set
y_pred = best_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_pred)
mae_test = mean_absolute_error(y_test, y_pred)

# Evaluate on external validation set
y_pred_external = best_model.predict(X_external)
mse_external = mean_squared_error(y_external, y_pred_external)
mae_external = mean_absolute_error(y_external, y_pred_external)

# Results
logging.info(f"Best parameters: {best_params}")
logging.info(f"Best validation MSE: {best_score}")
logging.info(f"Test Set - MSE: {mse_test}, MAE: {mae_test}")
logging.info(f"External Validation Set - MSE: {mse_external}, MAE: {mae_external}")

# HP Bayesian Optimization & Train Model

In [None]:
# Bayesian hyperparameter search with BayesSearchCV (3-fold CV on the training
# set, 50 iterations), followed by a refit with the best parameters and an
# evaluation on the validation, test and external sets.
#
# NOTE(review): the only import of BayesSearchCV in the import cell is commented
# out (`#from skopt import BayesSearchCV`), so this cell raises NameError until it
# is re-enabled. X_train, y_train, X_validate, y_validate, X_test, y_test,
# X_external and y_external are read from the notebook's global namespace and are
# not defined by any earlier cell in view -- confirm they are loaded first.

# Initialize tqdm progress bar
# (total matches n_iter=50 passed to BayesSearchCV below)
pbar = tqdm(total=50, desc="Bayesian Optimization Progress")

# Callback to update tqdm
def on_step(optim_result):
    # Passed as `callback` to bayes_search.fit; advances the bar by one per call.
    pbar.update(1)

# Define the parameter search space
param_space = {
    'n_estimators': (100, 300),
    'learning_rate': (0.01, 1.0, 'log-uniform'),
    'max_depth': (1, 10),
    'reg_lambda': (0.1, 15.0),
    'reg_alpha': (0.1, 15.0)
}

# Create the XGBoost Regressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Create BayesSearchCV for Bayesian Optimization
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    scoring='neg_mean_squared_error',
    n_iter=50,
    cv=3,
    verbose=0,
    random_state=42
)

# Fit BayesSearchCV with tqdm callback
bayes_search.fit(X_train, y_train, callback=on_step)
pbar.close()

# Log best parameters and score
logging.info("Best parameters: %s", bayes_search.best_params_)
logging.info("Best score (negative MSE): %.4f", bayes_search.best_score_)

# Predict on the validation set with the best model
y_pred_validate = bayes_search.predict(X_validate)

# Evaluate the model on the validation set
mse_validate = mean_squared_error(y_validate, y_pred_validate)
mae_validate = mean_absolute_error(y_validate, y_pred_validate)
logging.info("Validation MSE: %.4f", mse_validate)
logging.info("Validation MAE: %.4f", mae_validate)

# Extract the best hyperparameters from BayesSearchCV
best_params = bayes_search.best_params_

# Initialize the XGBoost model with the best hyperparameters
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    reg_lambda=best_params['reg_lambda'],
    reg_alpha=best_params['reg_alpha']
)

# Train the model on the training set
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Predict on the external validation set (eICU data)
y_pred_external = model.predict(X_external)

# Evaluate the model on the test set
mse_test = mean_squared_error(y_test, y_pred)
mae_test = mean_absolute_error(y_test, y_pred)

# Evaluate the model on the external validation set
mse_external = mean_squared_error(y_external, y_pred_external)
mae_external = mean_absolute_error(y_external, y_pred_external)

# Log final evaluation metrics
logging.info("Test Set - MSE: %.4f, MAE: %.4f", mse_test, mae_test)
logging.info("External Validation Set (eICU) - MSE: %.4f, MAE: %.4f", mse_external, mae_external)

# HP HyperOpt & Train Model

In [None]:
# HyperOpt (TPE) search: minimise the validation MSE of an XGBoost regressor over
# MAX_EVALS trials, then refit a model with the best parameters and evaluate it on
# the test and external sets.
#
# NOTE(review): X_train, y_train, X_validate, y_validate, X_test, y_test,
# X_external and y_external are read from the notebook's global namespace and are
# not defined by any earlier cell in view -- confirm they are loaded first.

# Define the number of evaluations
MAX_EVALS = 50

# Initialize tqdm progress bar
pbar = tqdm(total=MAX_EVALS, desc="HyperOpt Progress")

# Define the wrapped objective function
def objective(params):
    # Called by fmin once per trial with one sampled parameter dict; fits a model
    # on the training set and returns its validation MSE as the loss to minimise.
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=int(params['n_estimators']),
        learning_rate=params['learning_rate'],
        max_depth=int(params['max_depth']),
        reg_lambda=params['reg_lambda'],
        reg_alpha=params['reg_alpha']
    )
    
    # Fit the model
    model.fit(X_train, y_train)
    
    # Predict on the validation set
    y_pred_validate = model.predict(X_validate)
    
    # Compute the MSE
    mse = mean_squared_error(y_validate, y_pred_validate)

    # Log the result
    logging.info("Params: %s | Validation MSE: %.4f", params, mse)
    
    # Update progress bar
    pbar.update(1)

    return {'loss': mse, 'status': 'ok'}

# Define the parameter search space
param_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 300, 50)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1.0)),
    'max_depth': scope.int(hp.quniform('max_depth', 1, 10, 1)),
    'reg_lambda': hp.uniform('reg_lambda', 0.1, 15.0),
    'reg_alpha': hp.uniform('reg_alpha', 0.1, 15.0)
}

# Create a Trials object to keep track of the search
trials = Trials()

# Perform the hyperparameter search
best = fmin(
    fn=objective,
    space=param_space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=trials,
    show_progressbar=False  # Disable internal bar to avoid overlap with tqdm
)

# Close progress bar
pbar.close()

# Log the best parameters
logging.info("Best parameters: %s", best)

# Initialize the XGBoost model with the best hyperparameters
# (n_estimators and max_depth are cast with int() before being handed to XGBoost)
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=int(best['n_estimators']),
    learning_rate=best['learning_rate'],
    max_depth=int(best['max_depth']),
    reg_lambda=best['reg_lambda'],
    reg_alpha=best['reg_alpha']
)

# Train the model on the training set
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Predict on the external validation set (eICU data)
y_pred_external = model.predict(X_external)

# Evaluate the model on the test set
mse_test = mean_squared_error(y_test, y_pred)
mae_test = mean_absolute_error(y_test, y_pred)

# Evaluate the model on the external validation set
mse_external = mean_squared_error(y_external, y_pred_external)
mae_external = mean_absolute_error(y_external, y_pred_external)

# Log final evaluation results
logging.info("Test Set - MSE: %.4f, MAE: %.4f", mse_test, mae_test)
logging.info("External Validation Set (eICU) - MSE: %.4f, MAE: %.4f", mse_external, mae_external)

In [None]:
# Path
save_path = 'CSV/exports/impute/o03_Interpolation/'

# Create the export folder if it is missing. exist_ok=True replaces the separate
# existence check and matches how the settings cell creates its folders.
os.makedirs(save_path, exist_ok=True)

# NOTE(review): the eight frames below are read from the notebook's global
# namespace; no earlier cell in view defines them -- confirm they are loaded first.

# Save the external validation set (eICU), then the training, validation and
# test sets, one "<name>.csv" per frame and without the index column.
for _split_name, _split_frame in (
    ('X_external', X_external),
    ('y_external', y_external),
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_validate', X_validate),
    ('y_validate', y_validate),
    ('X_test', X_test),
    ('y_test', y_test),
):
    _split_frame.to_csv(os.path.join(save_path, f"{_split_name}.csv"), index=False)

# Save Model

In [None]:
# Define the directory and file path
# Saves the current global `model` as "<file_name>_model.json" inside models/.
# NOTE(review): `file_name` is not assigned in any earlier cell in view (its first
# assignment is in the "Load Model" cell below), so on a fresh top-to-bottom run
# this line raises NameError -- define it before this cell.

name = f"{file_name}_model.json"
directory = 'models/'

file_path = os.path.join(directory, name)

# Create the directory if it does not exist
os.makedirs(directory, exist_ok=True)

# Save the model as a JSON file
model.save_model(file_path)

# Load Model

In [None]:
# Define the model file path
# Reloads a model saved as "models/<file_name>_model.json" and predicts with it.
# NOTE(review): X_test and X_external are read from the notebook's global
# namespace; no earlier cell in view defines them -- confirm they are loaded first.
file_name = "06"  # replace with the actual name you used before saving
directory = 'models/'
file_path = os.path.join(directory, f"{file_name}_model.json")

# Load the model
# (an empty regressor is created first, then populated from the JSON file)
model = xgb.XGBRegressor()
model.load_model(file_path)

# Predict on the test set
y_pred = model.predict(X_test)

# Predict on the external validation set
y_pred_external = model.predict(X_external)