In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score

In [2]:
# Read the stroke_diagnosis.csv file
df = pd.read_csv("CSV\imports\o05_30_percent_filled_dataset.csv")

In [3]:
display (df)

Unnamed: 0,row_count,subject_id,hadm_id,Time_Zone,gender,age,language,marital_status,race,Base Excess,...,CK-MB,Glucose.2,Potassium Whole Blood,Glucose (whole blood),Potassium (whole blood),Creatine Kinase MB Isoenzyme,hospital_expire_flag,los,GCS,Braden
0,1,10004733,27411876,1,M,51,ENGLISH,SINGLE,UNKNOWN,0.0,...,,,,,,,0,8.357373,8.000000,11.0
1,2,10004733,27411876,2,M,51,ENGLISH,SINGLE,UNKNOWN,0.0,...,,,,,,,0,8.357373,8.500000,11.0
2,3,10004733,27411876,3,M,51,ENGLISH,SINGLE,UNKNOWN,0.0,...,,,,,,,0,8.357373,8.000000,11.0
3,4,10004733,27411876,4,M,51,ENGLISH,SINGLE,UNKNOWN,0.0,...,,,,,,,0,8.357373,8.333333,11.0
4,5,10004733,27411876,5,M,51,ENGLISH,SINGLE,UNKNOWN,0.0,...,,,,,,,0,8.357373,8.333333,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55787,55788,19999987,23865745,12,F,57,ENGLISH,,UNKNOWN,1.0,...,43.0,,,,,43.0,0,1.937847,8.250000,12.8
55788,55789,19999987,23865745,13,F,57,ENGLISH,,UNKNOWN,1.0,...,45.5,,,,,45.5,0,1.937847,8.111111,12.5
55789,55790,19999987,23865745,14,F,57,ENGLISH,,UNKNOWN,1.0,...,43.0,,,,,43.0,0,1.937847,8.250000,12.8
55790,55791,19999987,23865745,15,F,57,ENGLISH,,UNKNOWN,1.0,...,44.0,,,,,44.0,0,1.937847,7.000000,13.0


In [4]:
# Set training percentage. The difference goes to test set
training_percentage = 0.7

# It's already sorted. Just for precaution. Sort by 'subject_id' and 'Time_Zone')
df = df.sort_values(by=['subject_id', 'Time_Zone'])

# Calculate the total number of unique subject IDs
unique_subject_ids = df['subject_id'].nunique()

# Calculate the number of unique subject IDs to include in the training set
train_subject_ids_count = int(training_percentage * unique_subject_ids)

# Initialize variables to track the number of subject IDs included in the training set
subject_ids_in_training = 0

# Initialize empty DataFrames for the training and test sets
train_df = pd.DataFrame(columns=df.columns)
test_df = pd.DataFrame(columns=df.columns)

# Iterate through the sorted DataFrame
for subject_id, subject_data in df.groupby('subject_id'):
    if subject_ids_in_training < train_subject_ids_count:
        # Add this subject's data to the training set
        train_df = pd.concat([train_df, subject_data])
        subject_ids_in_training += 1
    else:
        # Add this subject's data to the test set
        test_df = pd.concat([test_df, subject_data])

# Reset the index of the resulting DataFrames
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# I'm going to use those numbers as the split point in rapidminer filter operator
display("The last row of the training set is -> " + str(train_df.tail(1)["row_count"].values[0]))

  train_df = pd.concat([train_df, subject_data])
  test_df = pd.concat([test_df, subject_data])


'The last row of the training set is -> 39040'

In [5]:
# Concatenate df1 and df2 for consistent encoding of categorical variables
combined_df = pd.concat([train_df, test_df], axis=0)

# Encode categorical variables
categorical_cols = ['gender', 'language', 'marital_status', 'race']
combined_df_encoded = pd.get_dummies(combined_df, columns=categorical_cols)

# Convert 'age' column to numeric type
combined_df_encoded['age'] = pd.to_numeric(combined_df_encoded['age'], errors='coerce')

# Convert 'hospital_expire_flag' column to boolean type
combined_df_encoded['hospital_expire_flag'] = combined_df_encoded['hospital_expire_flag'].astype(bool)


# Split the dataframe at the original row index (before concatenation)
combined_df_encoded_train = combined_df_encoded.iloc[:len(train_df)]
combined_df_encoded_test = combined_df_encoded.iloc[len(train_df):]

# Split data into features and target variable again
X_train = combined_df_encoded_train.drop(['row_count', 'subject_id', 'hadm_id', 'Time_Zone', 'los'], axis=1)
y_train = combined_df_encoded_train['los']
X_test = combined_df_encoded_test.drop(['row_count', 'subject_id', 'hadm_id', 'Time_Zone', 'los'], axis=1)
y_test = combined_df_encoded_test['los']

# Train XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror')
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

In [6]:
# Metrics
print("Mean Square Error (MSE):", mean_squared_error(y_test, y_pred))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", mean_squared_error(y_test, y_pred, squared=False))

# For MSLE calculation must not have negative values in y_test and y_pred
try:
    msle = mean_squared_log_error(y_test, y_pred)
    print("Mean Squared Logarithmic Error (MSLE):", msle)
except ValueError:
    print("Mean Squared Logarithmic Error cannot be calculated because targets contain negative values.")

Mean Square Error (MSE): 15.10616076357753
Mean Absolute Error (MAE): 2.149766447260332
Root Mean Squared Error (RMSE): 3.886664477875281
Mean Squared Logarithmic Error cannot be calculated because targets contain negative values.


# Most important features

In [9]:
# Get feature importances
feature_importance = model.feature_importances_

# Create a DataFrame to store feature importances along with their corresponding names
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})

# Sort the DataFrame by feature importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the top N most important features
top_n = 20  # Change this value to display more or fewer top features
print(f"Top {top_n} most important features:")
print(feature_importance_df.head(top_n))

Top 20 most important features:
                             Feature  Importance
92                     Spont Vt (mL)    0.119741
255                  race_PORTUGUESE    0.076467
46                  Specific Gravity    0.045117
62                               WBC    0.039946
217            Glucose (whole blood)    0.039477
138               ICU Consent Signed    0.026535
26                         Basophils    0.026392
127                 Differential-Eos    0.024652
93                    Spont RR (bpm)    0.019715
142                    Height (Inch)    0.019189
218          Potassium (whole blood)    0.017708
153    High risk (>51) interventions    0.015754
262      race_WHITE - OTHER EUROPEAN    0.015468
216            Potassium Whole Blood    0.014413
8                            Albumin    0.014256
14                   Bilirubin Total    0.014198
190                        Back Care    0.014106
98   Tidal Volume (spontaneous) (mL)    0.013241
183              Pain Level Response 

# Hyperparameter

In [None]:
"""
Testing field
"""

from sklearn.model_selection import RandomizedSearchCV

# Define the hyperparameter grid
param_grid = {
    'max_depth': np.arange(1, 11, 1),  # Maximum depth from 1 to 10 with step 1
    'learning_rate': np.arange(0.01, 0.51, 0.01),  # Learning rate from 0.01 to 0.51 with step 0.01
    'lambda': np.arange(0.0, 10.0, 0.1)  # L2 regularization from 0.0 to 10.0 with step 0.1 
    'alpha': np.arange(0.0, 10.0, 0.1),  # L1 regularization from 0.0 to 10.0 with step 0.1 
    #'n_estimators': [100, 200, 300],  # Number of trees in the forest
    #'subsample': [0.7, 0.8, 0.9],  # Subsample ratio of the training instances
    #'colsample_bytree': [0.7, 0.8, 0.9],  # Subsample ratio of columns when constructing each tree
    #'gamma': [0, 0.1, 0.2]  # Minimum loss reduction required to make a further partition on a leaf node
}

# Create a randomized search object
random_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror'),
                                   param_distributions=param_grid,
                                   n_iter=50,  # Number of parameter settings that are sampled
                                   cv=5,  # Number of folds in cross-validation
                                   scoring='neg_mean_squared_error',  # Scoring metric
                                   verbose=1,  # Controls the verbosity
                                   n_jobs=-1)  # Number of jobs to run in parallel (-1: all processors)

# Random search for best hyperparameters
random_search.fit(X_train, y_train)

# Show the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Get the best model
best_model = random_search.best_estimator_

# Predict on test set using the best model
y_pred_best = best_model.predict(X_test)

# Evaluate the performance of the best model
print("Mean Square Error (MSE):", mean_squared_error(y_test, y_pred_best))
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred_best))
print("Root Mean Squared Error (RMSE):", mean_squared_error(y_test, y_pred_best, squared=False))
print("R-squared (R2):", r2_score(y_test, y_pred_best))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
