In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

In [2]:
# Read the stroke_diagnosis.csv file
df = pd.read_csv("CSV\imports\o05_30_percent_filled_dataset.csv")

In [3]:
# Assuming your DataFrame is named df
df = df[df['los'] < 10]

In [4]:
display (df)

Unnamed: 0,row_count,subject_id,hadm_id,Time_Zone,gender,age,language,marital_status,race,Base Excess,...,CK-MB,Glucose.2,Potassium Whole Blood,Glucose (whole blood),Potassium (whole blood),Creatine Kinase MB Isoenzyme,hospital_expire_flag,los,GCS,Braden
0,1,10004733,27411876,1,M,51,ENGLISH,SINGLE,UNKNOWN,0.0,...,,,,,,,0,8.357373,8.000000,11.0
1,2,10004733,27411876,2,M,51,ENGLISH,SINGLE,UNKNOWN,0.0,...,,,,,,,0,8.357373,8.500000,11.0
2,3,10004733,27411876,3,M,51,ENGLISH,SINGLE,UNKNOWN,0.0,...,,,,,,,0,8.357373,8.000000,11.0
3,4,10004733,27411876,4,M,51,ENGLISH,SINGLE,UNKNOWN,0.0,...,,,,,,,0,8.357373,8.333333,11.0
4,5,10004733,27411876,5,M,51,ENGLISH,SINGLE,UNKNOWN,0.0,...,,,,,,,0,8.357373,8.333333,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55787,55788,19999987,23865745,12,F,57,ENGLISH,,UNKNOWN,1.0,...,43.0,,,,,43.0,0,1.937847,8.250000,12.8
55788,55789,19999987,23865745,13,F,57,ENGLISH,,UNKNOWN,1.0,...,45.5,,,,,45.5,0,1.937847,8.111111,12.5
55789,55790,19999987,23865745,14,F,57,ENGLISH,,UNKNOWN,1.0,...,43.0,,,,,43.0,0,1.937847,8.250000,12.8
55790,55791,19999987,23865745,15,F,57,ENGLISH,,UNKNOWN,1.0,...,44.0,,,,,44.0,0,1.937847,7.000000,13.0


In [5]:
# Set training percentage. The difference goes to test set
training_percentage = 0.7

# It's already sorted. Just for precaution. Sort by 'subject_id' and 'Time_Zone')
df = df.sort_values(by=['subject_id', 'Time_Zone'])

# Calculate the total number of unique subject IDs
unique_subject_ids = df['subject_id'].nunique()

# Calculate the number of unique subject IDs to include in the training set
train_subject_ids_count = int(training_percentage * unique_subject_ids)

# Initialize variables to track the number of subject IDs included in the training set
subject_ids_in_training = 0

# Initialize empty DataFrames for the training and test sets
train_df = pd.DataFrame(columns=df.columns)
test_df = pd.DataFrame(columns=df.columns)

# Iterate through the sorted DataFrame
for subject_id, subject_data in df.groupby('subject_id'):
    if subject_ids_in_training < train_subject_ids_count:
        # Add this subject's data to the training set
        train_df = pd.concat([train_df, subject_data])
        subject_ids_in_training += 1
    else:
        # Add this subject's data to the test set
        test_df = pd.concat([test_df, subject_data])

# Reset the index of the resulting DataFrames
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# I'm going to use those numbers as the split point in rapidminer filter operator
display("The last row of the training set is -> " + str(train_df.tail(1)["row_count"].values[0]))

  train_df = pd.concat([train_df, subject_data])
  test_df = pd.concat([test_df, subject_data])


'The last row of the training set is -> 39008'

In [10]:
# Concatenate train_df and test_df for consistent encoding of categorical variables
combined_df = pd.concat([train_df, test_df], axis=0)

# Encode categorical variables
categorical_cols = ['gender', 'language', 'marital_status', 'race']
combined_df_encoded = pd.get_dummies(combined_df, columns=categorical_cols)

# Convert 'age' column to numeric type
combined_df_encoded['age'] = pd.to_numeric(combined_df_encoded['age'], errors='coerce')

# Convert 'hospital_expire_flag' column to boolean type
combined_df_encoded['hospital_expire_flag'] = combined_df_encoded['hospital_expire_flag'].astype(bool)


# Split the dataframe at the original row index (before concatenation)
combined_df_encoded_train = combined_df_encoded.iloc[:len(train_df)]
combined_df_encoded_test = combined_df_encoded.iloc[len(train_df):]

# Split data into features and target variable again
X_train = combined_df_encoded_train.drop(['row_count', 'subject_id', 'hadm_id', 'Time_Zone', 'los'], axis=1)
y_train = combined_df_encoded_train['los']
X_test = combined_df_encoded_test.drop(['row_count', 'subject_id', 'hadm_id', 'Time_Zone', 'los'], axis=1)
y_test = combined_df_encoded_test['los']

# Train XGBoost model
#model = xgb.XGBRegressor(objective='reg:squarederror')

model = xgb.XGBRegressor(objective='reg:squarederror', learning_rate=0.069)
#model = xgb.XGBRegressor(objective='reg:squarederror', learning_rate=0.7, max_depth=6)
#model = xgb.XGBRegressor(objective='reg:squarederror', learning_rate=0.7, max_depth=6, reg_lambda=3.7)
#model = xgb.XGBRegressor(objective='reg:squarederror', learning_rate=0.7, max_depth=6, reg_lambda=3.7, reg_alpha=0.1)

model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

In [11]:
# Metrics
print("Mean Square Error (MSE):", mean_squared_error(y_test, y_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", mean_squared_error(y_test, y_pred, squared=False))

# MSLE calculation must not have negative values in y_test and y_pred
try:
    msle = mean_squared_log_error(y_test, y_pred)
    print("Mean Squared Logarithmic Error (MSLE):", msle)
except ValueError:
    print("Mean Squared Logarithmic Error cannot be calculated because targets contain negative values.")

Mean Square Error (MSE): 2.420547587842547
Mean Absolute Error (MAE): 1.0985796399938885
Root Mean Squared Error (RMSE): 1.5558109100538366
Mean Squared Logarithmic Error (MSLE): 0.12038340984295598


# Most important features

In [8]:
# Get feature importances
feature_importance = model.feature_importances_

# Create a DataFrame to store feature importances along with their corresponding names
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})

# Sort the DataFrame by feature importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the top N most important features
top_n = 20  # Change this value to display more or fewer top features
print(f"Top {top_n} most important features:")
print(feature_importance_df.head(top_n))

Top 20 most important features:
                                            Feature  Importance
92                                    Spont Vt (mL)    0.116994
45                                             pH.1    0.050216
202      Arterial Blood Pressure Alarm - Low (mmHg)    0.045596
153                   High risk (>51) interventions    0.034696
121                                        Eye Care    0.033455
138                              ICU Consent Signed    0.030525
9                                         Albumin.1    0.025447
98                  Tidal Volume (spontaneous) (mL)    0.024918
72                                    PH (dipstick)    0.023688
208                    Arterial Line Zero/Calibrate    0.020562
5                                                pH    0.017388
183                             Pain Level Response    0.017156
198                    20 Gauge placed in the field    0.016680
46                                 Specific Gravity    0.014439
70      

# Hyperparameter

In [9]:
"""
Testing field
"""

from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'learning_rate': np.arange(0.01, 1.00, 0.01),  # Learning rate from 0.01 to 0.51 with step 0.01
    #'learning_rate': [], # Learning rate set value
    #------------------------------------------------
    #'max_depth': np.arange(1, 11, 1),  # Max depth from 1 to 10 with step 1
    #'max_depth': [], # Max depth set value
    #------------------------------------------------
    #'lambda': np.arange(0.0, 10.0, 0.1),  # L2 from 0.0 to 10.0 with step 0.1
    #'lambda': [], # L2 set value
    #------------------------------------------------
    #'alpha': np.arange(0.0, 10.0, 0.1),  # L1 regularization from 0.0 to 10.0 with step 0.1
    #'alpha': [], # L1 set value
    #------------------------------------------------
    #'n_estimators': np.arange(1, 100, 1), # Number of trees from 1 to 100 with step 1
    #'n_estimators': [],  # Number of trees
    #-------------------------------------------------
    #'gamma': np.arange(0.0, 1.0, 0.1), # Minimum loss reduction required to make a further partition on a leaf node
    #'gamma': [0, 0.1, 0.2]  # Minimum loss reduction value
}

# Create a grid search object
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror'),
                            param_grid=param_grid,
                            cv=5,  # Number of folds in cross-validation
                            scoring='neg_mean_squared_error',  # Scoring metric
                            verbose=1,  # Controls the verbosity
                            n_jobs=-1)  # Number of jobs to run in parallel (-1: all processors)

# Search for best hyperparameters
grid_search.fit(X_train, y_train)

# Best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Best model
best_model = grid_search.best_estimator_

# Predict on test set using the best model
y_pred_best_stand_alone = best_model.predict(X_test)

# Best model evaluation
print("Mean Square Error (MSE):", mean_squared_error(y_test, y_pred_best_stand_alone))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred_best_stand_alone))
print("Root Mean Squared Error (RMSE):", mean_squared_error(y_test, y_pred_best_stand_alone, squared=False))
print("R-squared (R2):", r2_score(y_test, y_pred_best_stand_alone))

# MSLE calculation must not have negative values in y_test and y_pred
try:
    msle = mean_squared_log_error(y_test, y_pred_best_stand_alone)
    print("Mean Squared Logarithmic Error (MSLE):", msle)
except ValueError:
    print("Mean Squared Logarithmic Error cannot be calculated because targets contain negative values.")

Fitting 5 folds for each of 99 candidates, totalling 495 fits
Best Hyperparameters: {'learning_rate': 0.06999999999999999}
Mean Square Error (MSE): 2.3923009896945806
Mean Absolute Error (MAE): 1.0991349113100208
Root Mean Squared Error (RMSE): 1.5467064975924103
R-squared (R2): 0.5852396136150441


In [None]:
# Learning Rate HyperParameter
param_grid_learning_rate = {
    'learning_rate': np.arange(0.01, 1.01, 0.01),  # Learning rate from 0.01 to 1.00 with step 0.01
}

# Create a grid search object for learning rate
grid_search_learning_rate = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror'),
                            param_grid=param_grid_learning_rate,
                            cv=5,  # Number of folds in cross-validation
                            scoring='neg_mean_squared_error',  # Scoring metric
                            verbose=1,  # Controls the verbosity
                            n_jobs=-1)  # Number of jobs to run in parallel (-1: all processors)

# Search best learning rate
grid_search_learning_rate.fit(X_train, y_train)

# Best learning rate
best_learning_rate = grid_search_learning_rate.best_params_['learning_rate']

#----------------------------------------------------------------------------------------------------

# Max Depth HyperParameter
param_grid_max_depth = {
    'max_depth': np.arange(1, 11, 1),  # Max depth from 1 to 10 with step 1
}

# Create a grid search object for max depth
grid_search_max_depth = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror', learning_rate=best_learning_rate),
                            param_grid=param_grid_max_depth,
                            cv=5,  # Number of folds in cross-validation
                            scoring='neg_mean_squared_error',  # Scoring metric
                            verbose=1,  # Controls the verbosity
                            n_jobs=-1)  # Number of jobs to run in parallel (-1: all processors)

# Search best max depth
grid_search_max_depth.fit(X_train, y_train)

# Best max depth
best_max_depth = grid_search_max_depth.best_params_['max_depth']

#----------------------------------------------------------------------------------------------------

# L2 HyperParameter
param_grid_lambda = {
    'lambda': np.arange(0.0, 10.1, 0.1),  # Lambda (L2 regularization) from 0.0 to 10.0 with step 0.1
}

# Create a grid search object for lambda
grid_search_lambda = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror', learning_rate=best_learning_rate, max_depth=best_max_depth),
                            param_grid=param_grid_lambda,
                            cv=5,  # Number of folds in cross-validation
                            scoring='neg_mean_squared_error',  # Scoring metric
                            verbose=1,  # Controls the verbosity
                            n_jobs=-1)  # Number of jobs to run in parallel (-1: all processors)

# Search for best L2
grid_search_lambda.fit(X_train, y_train)

# Best L2
best_lambda = grid_search_lambda.best_params_['lambda']

#----------------------------------------------------------------------------------------------------

# L1 HyperParameter
param_grid_alpha = {
    'alpha': np.arange(0.0, 10.1, 0.1),  # Alpha (L1 regularization) from 0.0 to 10.0 with step 0.1
}

# Create a grid search object for alpha
grid_search_alpha = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror', learning_rate=best_learning_rate, max_depth=best_max_depth, lambda=best_lambda),
                            param_grid=param_grid_alpha,
                            cv=5,  # Number of folds in cross-validation
                            scoring='neg_mean_squared_error',  # Scoring metric
                            verbose=1,  # Controls the verbosity
                            n_jobs=-1)  # Number of jobs to run in parallel (-1: all processors)

# Search for best L1
grid_search_alpha.fit(X_train, y_train)

# Best L1
best_alpha = grid_search_alpha.best_params_['alpha']

#----------------------------------------------------------------------------------------------------

# Gamma HyperParameter
param_grid_gamma = {
    'gamma': np.arange(0.0, 1.1, 0.1),  # Gamma from 0.0 to 1.0 with step 0.1
}

# Create a grid search object for gamma
grid_search_gamma = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror', learning_rate=best_learning_rate, max_depth=best_max_depth, lambda=best_lambda, alpha=best_alpha),
                            param_grid=param_grid_gamma,
                            cv=5,  # Number of folds in cross-validation
                            scoring='neg_mean_squared_error',  # Scoring metric
                            verbose=1,  # Controls the verbosity
                            n_jobs=-1)  # Number of jobs to run in parallel (-1: all processors)

# Search for best gamma
grid_search_gamma.fit(X_train, y_train)

# Best gamma
best_gamma = grid_search_gamma.best_params_['gamma']

#----------------------------------------------------------------------------------------------------

# Train the model using the best hyperparameter values
best_model = xgb.XGBRegressor(objective='reg:squarederror', learning_rate=best_learning_rate, max_depth=best_max_depth, lambda=best_lambda, alpha=best_alpha, gamma=best_gamma)
best_model.fit(X_train, y_train)

# Predict on test set using the best model
y_pred_best = best_model.predict(X_test)

# Best model evaluation
print("Mean Square Error (MSE):", mean_squared_error(y_test, y_pred_best))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred_best))
print("Root Mean Squared Error (RMSE):", mean_squared_error(y_test, y_pred_best, squared=False))
print("R-squared (R2):", r2_score(y_test, y_pred_best))


# MSLE calculation must not have negative values in y_test and y_pred
try:
    msle = mean_squared_log_error(y_test, y_pred_best)
    print("Mean Squared Logarithmic Error (MSLE):", msle)
except ValueError:
    print("Mean Squared Logarithmic Error cannot be calculated because targets contain negative values.")