# Imports | Reads | Filter Patients

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from matplotlib import pyplot
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, mean_squared_log_error, r2_score

In [2]:
# Load the two source tables used throughout the notebook.
# The relative paths use forward slashes rather than escaped backslashes:
# forward slashes are accepted on Windows as well as on Linux/macOS, whereas a
# backslash is only a path separator on Windows.

# Read MIMICs CSV file
mimic_df = pd.read_csv("CSV/exports/final/mimic_mean_final.csv")

# Read eICUs CSV file
eicu_df = pd.read_csv("CSV/exports/final/eicu_mean_final.csv")

In [3]:
# Exclusive upper bound applied to the 'los' column of both tables.
day = 10

# MIMIC: keep the rows whose 'los' is strictly below `day`
mimic_df = mimic_df.loc[mimic_df['los'].lt(day)]

# eICU: same strict cut-off
eicu_df = eicu_df.loc[eicu_df['los'].lt(day)]

In [4]:
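# Filter Time Zone (optional, currently disabled)
# Uncomment the three lines below to keep only the rows of both dataframes
# whose 'Time_Zone' column equals `time_zone`.

#time_zone = 16
#mimic_df = mimic_df[mimic_df['Time_Zone'] == time_zone]
#eicu_df = eicu_df[eicu_df['Time_Zone'] == time_zone]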

In [5]:
# Number of MIMIC rows left after the filters above
row_count = len(mimic_df)
print(f"Row count: {row_count}")

Row count: 48992


In [6]:
# Build the modelling matrices.
#
# MIMIC and eICU are stacked and one-hot encoded together so both receive the
# same dummy columns, then separated again. MIMIC is divided into train /
# validation / test sets; eICU is kept whole as the external validation set.

# Take the MIMIC row count right before stacking so the split point used below
# always matches the frames being concatenated, even if an earlier filter cell
# was re-run without refreshing a previously stored count.
row_count = len(mimic_df)

# concatenate dataframes (MIMIC rows first, eICU rows after them)
df_combined = pd.concat([mimic_df, eicu_df], ignore_index=True)

# Find all categorical columns in the combined frame
categorical_columns = df_combined.select_dtypes(include=['object', 'category']).columns.tolist()

# Apply one-hot encoding to all categorical columns
df_encoded = pd.get_dummies(df_combined, columns=categorical_columns)

# Split the concatenated dataframe back into its two sources
mimic_df = df_encoded.iloc[:row_count, :]  # Rows from 0 to row_count
eicu_df = df_encoded.iloc[row_count:, :]  # Rows from row_count to the end

# --------------------------

# Unique (`subject_id`, `hadm_id`) pairs; the split is performed on these pairs
# so that all rows sharing a pair end up in the same set.
# NOTE(review): because the unit is the pair, a `subject_id` that appears with
# several `hadm_id` values can be spread over different sets - confirm that this
# is intended.
unique_patients = mimic_df[['subject_id', 'hadm_id']].drop_duplicates()

# Split the unique pairs into train, validation, and test sets
train_patients, test_patients = train_test_split(unique_patients, test_size=0.10, random_state=42)
train_patients, validate_patients = train_test_split(train_patients, test_size=0.11, random_state=42)  # 0.11 * 90% ~= 10%

# Merge the pairs back with the original data to get the full records
train_set = mimic_df.merge(train_patients, on=['subject_id', 'hadm_id'])
validate_set = mimic_df.merge(validate_patients, on=['subject_id', 'hadm_id'])
test_set = mimic_df.merge(test_patients, on=['subject_id', 'hadm_id'])

# Columns removed from every feature matrix ('los' is the regression target).
# Defined once so the four matrices below cannot drift apart.
non_feature_columns = ['hospital_expire_flag', 'los', 'subject_id', 'hadm_id', 'row_count', 'Time_Zone']

# External validation from eICU
X_external = eicu_df.drop(columns=non_feature_columns)
y_external = eicu_df['los']

# Separate features and target for the training, validation, and test sets
X_train = train_set.drop(columns=non_feature_columns)
y_train = train_set['los']

X_validate = validate_set.drop(columns=non_feature_columns)
y_validate = validate_set['los']

X_test = test_set.drop(columns=non_feature_columns)
y_test = test_set['los']

# Train Model without Hyperparameter Tuning (XGBoost defaults)

In [None]:
# Baseline: XGBoost regressor left at its default hyperparameters and fitted
# on the training set (fit() hands back the fitted estimator itself).
model = xgb.XGBRegressor(objective='reg:squarederror').fit(X_train, y_train)

# Baseline predictions: first for the test set, then for the external
# validation set (eICU data)
y_pred, y_pred_external = model.predict(X_test), model.predict(X_external)

# Train Model with Hyperparameter Tuning (RandomizedSearchCV)

In [None]:
# Candidate values per hyperparameter; each sampled configuration takes one
# value from every entry.
param_dist = dict(
    learning_rate=np.arange(0.01, 1.01, 0.1),
    max_depth=np.arange(1, 11),
    min_child_weight=np.arange(1, 6),
    reg_lambda=np.arange(0.1, 15.1, 1),
    reg_alpha=np.arange(0.1, 15.1, 1),
    n_estimators=[100, 200, 300],
)

# Estimator whose hyperparameters are tuned
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Randomised search: 100 sampled configurations, 2-fold cross-validation,
# scored by negative MSE (higher is better), all cores, fixed seed.
random_search = RandomizedSearchCV(
    xgb_model,
    param_dist,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=2,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Run the search. The bar is advanced a single time, after fit() has returned,
# so it shows 0 while the search runs and then jumps to 100.
with tqdm(total=100, desc="Hyperparameter Tuning") as pbar:
    random_search.fit(X_train, y_train)
    pbar.update(100)

# Winning configuration, its cross-validated MSE (sign flipped back to
# positive) and the corresponding estimator
best_params = random_search.best_params_
best_score = -random_search.best_score_
best_model = random_search.best_estimator_

print(
    f"Best hyperparameters: {best_params}",
    f"Best validation MSE: {best_score}",
    sep="\n",
)

# Predictions of the tuned model on the test set and on the external
# validation set
y_pred_test = best_model.predict(X_test)
y_pred_external = best_model.predict(X_external)

# Test-set metrics (R2 is reported on a 0-100 scale)
mse_test = mean_squared_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)
r2_test = 100 * r2_score(y_test, y_pred_test)

print(
    f"Test Set MSE: {mse_test}",
    f"Test Set RMSE: {rmse_test}",
    f"Test Set MAE: {mae_test}",
    f"Test Set R2 Score: {r2_test}",
    sep="\n",
)

# External-validation metrics, same four quantities
mse_external = mean_squared_error(y_external, y_pred_external)
mae_external = mean_absolute_error(y_external, y_pred_external)
rmse_external = np.sqrt(mse_external)
r2_external = 100 * r2_score(y_external, y_pred_external)

print(
    f"External Validation Set MSE: {mse_external}",
    f"External Validation Set RMSE: {rmse_external}",
    f"External Validation Set MAE: {mae_external}",
    f"External Validation Set R2 Score: {r2_external}",
    sep="\n",
)

# Train Model with Hyperparameter Tuning (GridSearchCV)

In [None]:
from tqdm import tqdm
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score

# Exhaustive grid: every combination of the values below is evaluated.
# With 4 x 10 x 5 x 15 x 15 x 3 values this is 135,000 configurations, each
# fitted on 2 folds, so expect a very long run time.
param_grid = dict(
    learning_rate=np.arange(0.01, 1.01, 0.3),
    max_depth=np.arange(1, 11),
    min_child_weight=np.arange(1, 6),
    reg_lambda=np.arange(0.1, 15.1, 1),
    reg_alpha=np.arange(0.1, 15.1, 1),
    n_estimators=[100, 200, 300],
)

# Estimator whose hyperparameters are tuned
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Grid search: 2-fold cross-validation, scored by negative MSE (higher is
# better), all cores; verbose=1 leaves scikit-learn's own progress messages on.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=2,
    n_jobs=-1,
    verbose=1,
)

# Run the search
grid_search.fit(X_train, y_train)

# Winning configuration, its cross-validated MSE (sign flipped back to
# positive) and the corresponding estimator
best_params = grid_search.best_params_
best_score = -grid_search.best_score_
best_model = grid_search.best_estimator_

print(
    f"Best hyperparameters: {best_params}",
    f"Best validation MSE: {best_score}",
    sep="\n",
)

# Predictions of the tuned model on the test set and on the external
# validation set
y_pred_test = best_model.predict(X_test)
y_pred_external = best_model.predict(X_external)

# Test-set metrics (R2 is reported on a 0-100 scale)
mse_test = mean_squared_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = root_mean_squared_error(y_test, y_pred_test)
r2_test = 100 * r2_score(y_test, y_pred_test)

print(
    f"Test Set MSE: {mse_test}",
    f"Test Set RMSE: {rmse_test}",
    f"Test Set MAE: {mae_test}",
    f"Test Set R2 Score: {r2_test}",
    sep="\n",
)

# External-validation metrics, same four quantities
mse_external = mean_squared_error(y_external, y_pred_external)
mae_external = mean_absolute_error(y_external, y_pred_external)
rmse_external = root_mean_squared_error(y_external, y_pred_external)
r2_external = 100 * r2_score(y_external, y_pred_external)

print(
    f"External Validation Set MSE: {mse_external}",
    f"External Validation Set RMSE: {rmse_external}",
    f"External Validation Set MAE: {mae_external}",
    f"External Validation Set R2 Score: {r2_external}",
    sep="\n",
)

# Test Set Plots

In [None]:
# Metrics and plots for the test-set predictions stored in `y_pred`
# (R2 is expressed on a 0-100 scale).
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred) * 100

print(f"Test Set MSE: {mse}")
print(f"Test Set MAE: {mae}")
print(f"Test Set RMSE: {rmse}")
print(f"Test Set R2: {r2}")

# Plotting error metrics
error_metrics = ['MSE', 'MAE', 'RMSE']
values = [mse, mae, rmse]

plt.figure(figsize=(10, 6))
plt.bar(error_metrics, values, color=['blue', 'green', 'red'])
plt.xlabel('Error Metric')
plt.ylabel('Value')
plt.title('Comparison of Error Metrics')
plt.show()

# Plotting R-squared (R2)
plt.figure(figsize=(6, 6))

if r2 >= 0:
    plt.pie([r2, 100 - r2], labels=['Explained Variance (R2)', 'Unexplained Variance'], colors=['lightblue', 'lightgrey'], autopct='%1.1f%%')
else:
    # R2 is negative when the model is worse than predicting the mean; pie
    # wedges cannot be negative, so show everything as unexplained (same
    # handling as the external-validation plot).
    plt.pie([100], labels=['Unexplained Variance'], colors=['lightgrey'], autopct='%1.1f%%')

plt.title('Explained Variance by R-squared (R2)')
plt.show()

# Plotting MSLE if applicable
try:
    # Raises ValueError when y_test or y_pred contains a negative value
    msle = mean_squared_log_error(y_test, y_pred)
except ValueError:
    print("Mean Squared Logarithmic Error cannot be calculated because targets or predictions contain negative values.")
else:
    print(f"Test Set MSLE: {msle}")

    # Per-prediction squared log error; the mean of these values is the MSLE.
    # (The plot previously referenced an undefined `msle_values`.)
    msle_values = (
        np.log1p(np.asarray(y_test, dtype=float))
        - np.log1p(np.asarray(y_pred, dtype=float))
    ) ** 2

    plt.figure(figsize=(10, 6))
    plt.plot(msle_values, marker='o', linestyle='-')
    plt.xlabel('Prediction')
    plt.ylabel('Squared Logarithmic Error')
    plt.title('Mean Squared Logarithmic Error Across Predictions')
    plt.grid(True)
    plt.show()

# External Validation Plots

In [None]:
# Metrics and plots for the default model (`model`) on the external validation
# set (eICU data). Predictions are recomputed here so the cell does not pick
# up `y_pred_external` values left behind by one of the tuning cells.
y_pred_external = model.predict(X_external)

# Metrics for external validation set (R2 is expressed on a 0-100 scale)
mse_external = mean_squared_error(y_external, y_pred_external)
mae_external = mean_absolute_error(y_external, y_pred_external)
rmse_external = np.sqrt(mse_external)
r2_external = r2_score(y_external, y_pred_external) * 100

print(f"External Validation Set MSE: {mse_external}")
print(f"External Validation Set MAE: {mae_external}")
print(f"External Validation Set RMSE: {rmse_external}")
print(f"External Validation Set R2: {r2_external}")

# Plotting error metrics for the external validation set
error_metrics_external = ['MSE', 'MAE', 'RMSE']
values_external = [mse_external, mae_external, rmse_external]

plt.figure(figsize=(10, 6))
plt.bar(error_metrics_external, values_external, color=['blue', 'green', 'red'])
plt.xlabel('Error Metric')
plt.ylabel('Value')
plt.title('Comparison of Error Metrics (External Validation Set)')
plt.show()

# Plotting R-squared (R2) for the external validation set
plt.figure(figsize=(6, 6))

if r2_external >= 0:
    plt.pie([r2_external, 100 - r2_external],
            labels=['Explained Variance (R2)', 'Unexplained Variance'],
            colors=['lightblue', 'lightgrey'], autopct='%1.1f%%')
else:
    # Negative R2 cannot be drawn as a wedge; show everything as unexplained
    plt.pie([100], labels=['Unexplained Variance'], colors=['lightgrey'], autopct='%1.1f%%')

plt.title('Explained Variance by R-squared (R2) - External Validation Set')
plt.show()

# Plotting MSLE for the external validation set if applicable
try:
    # Raises ValueError when y_external or y_pred_external contains a negative value
    msle_external = mean_squared_log_error(y_external, y_pred_external)
except ValueError:
    print("Mean Squared Logarithmic Error cannot be calculated because targets or predictions contain negative values.")
else:
    print(f"External Validation Set MSLE: {msle_external}")

    # Per-prediction squared log error; the mean of these values is the MSLE.
    # (The plot previously drew actual vs. predicted values as a connected
    # line while labelling the axes as MSLE.)
    squared_log_errors_external = (
        np.log1p(np.asarray(y_external, dtype=float))
        - np.log1p(np.asarray(y_pred_external, dtype=float))
    ) ** 2

    plt.figure(figsize=(10, 6))
    plt.plot(squared_log_errors_external, marker='o', linestyle='-')
    plt.xlabel('Prediction')
    plt.ylabel('Squared Logarithmic Error')
    plt.title('Mean Squared Logarithmic Error (MSLE) - External Validation Set')
    plt.grid(True)
    plt.show()

# Most important features

In [None]:
# Importance scores of the default model, one per training column
importances = model.feature_importances_

# Pair every score with its column name, rank from highest to lowest, then
# multiply the scores by 100000 for display
most_important_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
    .assign(Importance=lambda ranked: ranked['Importance'] * 100000)
)

# How many of the top-ranked features to print
top_n = 20
print(
    f"Top {top_n} most important features:",
    most_important_df.head(top_n),
    sep="\n",
)

In [None]:
# Use seaborn's "whitegrid" theme for the chart
sns.set_style("whitegrid")

# The 20 highest-ranked rows of the importance table
# (the variable name says 10, but 20 rows are taken)
top_10_features = most_important_df.iloc[:20]

# Horizontal bar chart, one bar per feature
plt.figure(figsize=(12, 6))
plot = sns.barplot(data=top_10_features, x='Importance', y='Feature',
                   hue='Feature', palette="Blues", legend=False)

# Axis captions and title
plot.set_xlabel('Importance', fontsize=18)
plot.set_ylabel('Feature', fontsize=18)
plot.set_title('Top 20 Features with Highest Importance', fontsize=20)

# Tilt the x tick labels
plt.xticks(rotation=45)

# Both axis captions are blanked again, so only the title remains visible
plot.set_ylabel('')
plot.set_xlabel('')

# Save the plot in high resolution
#plt.savefig('plots/top_20_most_important_features.jpeg', dpi=300)
plt.show()

# Testing field

In [None]:
# Experimental grid search over a linspace-based grid.
# Every combination is evaluated: 10 x 10 x 5 x 15 x 15 x 3 = 337,500
# configurations, each fitted on 2 folds, so expect a very long run time.
param_grid = {
    'learning_rate': np.linspace(0.01, 0.5, 10),
    'max_depth': np.arange(1, 11),
    'min_child_weight': np.arange(1, 6),
    'reg_lambda': np.linspace(0.1, 15, 15),
    'reg_alpha': np.linspace(0.1, 15, 15),
    'n_estimators': [100, 200, 300]
}

# Initialize the XGBoost Regressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model,
                           param_grid=param_grid,
                           scoring='neg_mean_squared_error',  # Negative MSE: higher is better
                           cv=2,  # Number of folds for cross-validation
                           n_jobs=-1,  # Use all available cores
                           verbose=1)  # Print scikit-learn's progress messages

# Perform the GridSearchCV
grid_search.fit(X_train, y_train)

# Retrieve best model and parameters from the search that was just fitted.
# (These were previously read from `random_search`, i.e. from a different
# search object than the one fitted in this cell.)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Convert from negative MSE to positive MSE

print(f"Best hyperparameters: {best_params}")
print(f"Best validation MSE: {best_score}")

# Predict and evaluate
y_pred_test = best_model.predict(X_test)
y_pred_external = best_model.predict(X_external)

# Test-set metrics (R2 is expressed on a 0-100 scale)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_test = r2_score(y_test, y_pred_test) * 100

print(f"Test Set MSE: {mse_test}")
print(f"Test Set RMSE: {rmse_test}")
print(f"Test Set MAE: {mae_test}")
print(f"Test Set R2 Score: {r2_test}")

# External-validation metrics
mse_external = mean_squared_error(y_external, y_pred_external)
rmse_external = np.sqrt(mse_external)
mae_external = mean_absolute_error(y_external, y_pred_external)
r2_external = r2_score(y_external, y_pred_external) * 100

print(f"External Validation Set MSE: {mse_external}")
print(f"External Validation Set RMSE: {rmse_external}")
print(f"External Validation Set MAE: {mae_external}")
print(f"External Validation Set R2 Score: {r2_external}")