In [1]:
import pandas as pd
import numpy as np, warnings
from pathlib import Path
import os
import sweetviz as sv
from importlib import reload
import matplotlib.pyplot as plt
import seaborn as sns

# Re-attach the standard-library `warnings` module as an attribute of numpy.
# NOTE(review): presumably a compatibility shim for a dependency that still
# looks up `np.warnings` - confirm which import needs it.
np.warnings = warnings

# Base directory (one level up) from which the data paths below are built.
ROOT_DIR = Path('..')

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_validate, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_selection import SelectFromModel

# model comparison
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
# from xgboost import XGBClassifier

# evaluation metrics
from sklearn.metrics import cohen_kappa_score,classification_report 
from sklearn.metrics import mean_squared_error, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, auc, make_scorer
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, balanced_accuracy_score
from sklearn.metrics import PrecisionRecallDisplay

In [3]:
# Load the t0 feature table and drop the SUBJECT_ID / HADM_ID columns so they
# are not carried into the model inputs.
# NOTE(review): the displayed frame contains many -999.0 values, which look
# like a missing-value sentinel - confirm how they should be treated before
# the features are standardised.
path_t0_df = ROOT_DIR / 'data/output_csv/t0_text.csv'
t0_df = pd.read_csv(path_t0_df).drop(columns=["SUBJECT_ID", "HADM_ID"])
t0_df

Unnamed: 0,AGE,GENDER_NUM,IS_SEPSIS,ITEMID_50813,ITEMID_50820,ITEMID_50821,ITEMID_50882,ITEMID_50893,ITEMID_50902,ITEMID_50912,...,TEXT_15,TEXT_16,TEXT_17,TEXT_18,TEXT_19,TEXT_20,TEXT_21,TEXT_22,TEXT_23,TEXT_24
0,76,0,0,-999.0,-999.0,-999.0,25.0,8.2,99.0,3.2,...,0,0,0,0,0,0,0,0,0,0
1,47,1,0,-999.0,-999.0,-999.0,24.0,8.9,97.0,0.5,...,0,0,0,0,0,0,0,0,0,0
2,41,0,0,-999.0,-999.0,-999.0,30.0,-999.0,100.0,1.2,...,0,0,0,0,0,0,0,0,0,0
3,50,1,0,-999.0,-999.0,-999.0,25.0,-999.0,104.0,0.7,...,0,0,0,0,0,0,0,0,0,0
4,72,0,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34420,77,0,0,-999.0,-999.0,-999.0,21.0,9.0,105.0,1.3,...,0,0,0,0,0,0,0,0,0,0
34421,65,1,0,-999.0,-999.0,-999.0,13.0,-999.0,118.0,1.5,...,0,0,0,0,0,0,0,0,0,0
34422,65,0,0,-999.0,-999.0,-999.0,34.0,-999.0,94.0,1.7,...,0,0,0,0,0,0,0,0,0,0
34423,53,0,1,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Separate the target from the predictors, then hold out 20% of the rows as a
# test set. Stratifying on the target keeps the sepsis rate the same in both
# splits; the fixed seed makes the split reproducible.
t0_target = t0_df['IS_SEPSIS']
t0_features = t0_df.drop(columns='IS_SEPSIS')

X_t0_train, X_t0_test, y_t0_train, y_t0_test = train_test_split(
    t0_features,
    t0_target,
    test_size=0.2,
    stratify=t0_target,
    random_state=42,
)

# Keep a DataFrame view of each split (features followed by the target column).
train_t0_df = pd.concat([X_t0_train, y_t0_train], axis=1)
test_t0_df = pd.concat([X_t0_test, y_t0_test], axis=1)

In [None]:
# Standardise only the ITEMID_* features; every other feature column is passed
# through this cell unchanged.
#
# Two things are deliberate here:
#   * The unscaled features are read from train_t0_df / test_t0_df instead of
#     X_t0_train / X_t0_test. This cell replaces X_t0_train / X_t0_test with
#     numpy arrays, so reading from them would make the cell fail (no .filter
#     on an ndarray) when it is run a second time.
#   * The output keeps the original column order. Stacking "other columns"
#     first and "ITEMID_ columns" last would silently reorder the features, and
#     train_t0_df.columns[:-1] would then no longer line up with the columns of
#     X_t0_train (e.g. when pairing model coefficients with feature names).
raw_train = train_t0_df.drop(columns='IS_SEPSIS')
raw_test = test_t0_df.drop(columns='IS_SEPSIS')

item_names = [col for col in raw_train.columns if str(col).startswith('ITEMID_')]
demog_names = [col for col in raw_train.columns if not str(col).startswith('ITEMID_')]

# Feature scaling - Standardisation (statistics come from the training split only)
scaler = StandardScaler()

item_columns_train = scaler.fit_transform(raw_train[item_names])
item_columns_test = scaler.transform(raw_test[item_names])

demog_columns_train = raw_train[demog_names].to_numpy()
demog_columns_test = raw_test[demog_names].to_numpy()

# Write the scaled values back into copies of the raw frames so that each
# column stays at its original position, then convert to numpy arrays.
scaled_train = raw_train.copy()
scaled_test = raw_test.copy()
scaled_train[item_names] = item_columns_train
scaled_test[item_names] = item_columns_test

X_t0_train = scaled_train.to_numpy()
X_t0_test = scaled_test.to_numpy()

In [None]:
# Feature scaling - Standardisation of every column of the feature matrices.
# The scaler learns its statistics from the training split and the same
# transformation is then applied to both splits.
# NOTE(review): columns that were already standardised earlier are rescaled
# here together with all remaining columns - confirm this is intended.
scaler = StandardScaler().fit(X_t0_train)

X_t0_train = scaler.transform(X_t0_train)
X_t0_test = scaler.transform(X_t0_test)

## Model Training

In [5]:
# Share of each class in the training split, expressed as a percentage.
label_counts = train_t0_df["IS_SEPSIS"].value_counts()
label_proportions = label_counts / len(train_t0_df)*100

# Weight each class by the inverse of its share (rarer class -> larger weight),
# rounded to four decimals.
class_weights = {
    label: round(1 / (label_proportions[label] / 100), 4)
    for label in (0, 1)
}

print("Class Weights:", class_weights)

Class Weights: {0: 1.1273, 1: 8.8553}


In [8]:
# Raw number of rows per class in the training split (the class weights are derived from these counts).
label_counts

IS_SEPSIS
0    24430
1     3110
Name: count, dtype: int64

In [None]:
# Logistic regression baseline: class-weighted to offset the class imbalance,
# with a fixed seed for reproducibility.
t0_lr = LogisticRegression(
    class_weight=class_weights,
    random_state=901,
)
t0_lr.fit(X_t0_train, y_t0_train)

In [None]:
# Function for performance measures on (i) confusion matrix, (ii) balanced accuracy, (iii) precision, (iv) recall and (v) F1 score
def performance_measure(model, X_train, X_test, y_train, y_test):
    """Report how a fitted classifier performs on the training and test splits.

    Draws the confusion matrix of each split as a heatmap (training on the
    left, test on the right), then prints balanced accuracy, precision,
    recall and F1 score for both splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, X_test : feature matrices handed to ``model.predict``.
    y_train, y_test : true labels of the corresponding split.

    Returns
    -------
    None. The results are the displayed figure and the printed lines.
    """
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Confusion matrices, one heatmap panel per split
    matrices = (confusion_matrix(y_train, train_pred), confusion_matrix(y_test, test_pred))
    panel_titles = ("CM in training set", "CM in test set")
    predicted_ticks = ["Predicted\nNon-Sepsis", "Predicted\nSepsis"]
    actual_ticks = ["Actual Non-Sepsis", "Actual Sepsis"]

    _, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,4))
    for ax, matrix in zip(axes, matrices):
        sns.heatmap(matrix, annot=True, fmt='d', xticklabels=predicted_ticks, yticklabels=actual_ticks, ax=ax)
    for ax, title in zip(axes, panel_titles):
        ax.set_title(title, fontsize = 10)
        ax.tick_params(labelsize=9)
    plt.tight_layout()
    plt.show()

    # Balanced accuracy, as a (train, test) pair
    balanced_acc = (
        balanced_accuracy_score(y_train, train_pred),
        balanced_accuracy_score(y_test, test_pred),
    )
    print("\nBalanced Accuracy: Training set {:.4f}".format(balanced_acc[0]), "; Test set {:.4f}".format(balanced_acc[1]))

    # Precision and recall, each as a (train, test) pair
    precision = (precision_score(y_train, train_pred), precision_score(y_test, test_pred))
    recall = (recall_score(y_train, train_pred), recall_score(y_test, test_pred))
    print("Precision: Training set {:.4f}".format(precision[0]), "; Test set {:.4f}".format(precision[1]))
    print("Recall: Training set {:.4f}".format(recall[0]), "; Test set {:.4f}".format(recall[1]))

    # F1 score, as a (train, test) pair
    f1 = (f1_score(y_train, train_pred), f1_score(y_test, test_pred))
    print("F1 score: Training set {:.4f}".format(f1[0]), "; Test set {:.4f}".format(f1[1]))

# Plotting ROC curve to determine the threshold
def plot_roc_curve(fpr, tpr, label = None):
    """Draw a ROC curve on the current matplotlib axes.

    Parameters
    ----------
    fpr : false positive rates (x values of the curve).
    tpr : true positive rates (y values of the curve).
    label : optional label attached to the curve.

    The dashed diagonal from (0, 0) to (1, 1) is drawn as a reference line and
    both axes are fixed to the [0, 1] range. No legend is drawn and nothing
    is returned.
    """
    ax = plt.gca()
    ax.plot(fpr, tpr, linewidth=2, label=label)
    ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal
    ax.axis([0, 1, 0, 1])
    ax.set_xlabel('False Positive Rate (Fall-Out)', fontsize=11)
    ax.set_ylabel('True Positive Rate (Recall)', fontsize=11)
    ax.grid(True)

In [None]:
# Spot check: compare the logistic regression predictions for the first five
# test rows with their true labels.
some_data = X_t0_test[:5]
some_labels = y_t0_test.iloc[:5]
print("Predictions:", t0_lr.predict(some_data))
print("Labels:", some_labels.tolist())

In [None]:
# Test-set outputs of the logistic regression model: per-class probabilities
# and the predicted class labels.
t0_lr_predicted_probabilities = t0_lr.predict_proba(X_t0_test)
t0_lr_predictions = t0_lr.predict(X_t0_test)

In [None]:
# Confusion matrices and summary metrics for the logistic regression model on both splits.
print("Performance measures on Logistic Regression Classifier:\n")
performance_measure(t0_lr, X_t0_train, X_t0_test, y_t0_train, y_t0_test)

In [None]:
# Pair every logistic regression coefficient with a feature name and list them
# from largest to smallest absolute value.
# NOTE(review): this assumes the columns of the matrix the model was fitted on
# are in the same order as train_t0_df.columns[:-1] - verify against the
# preprocessing cells.
feature_coeff = t0_lr.coef_[0]
feature_names = train_t0_df.columns[:-1]
feature_importance = list(zip(feature_names, feature_coeff))
sorted_features = sorted(feature_importance, key=lambda pair: abs(pair[1]), reverse=True)

print('Feature importance from Logistic Regression:\n')

# Width of the longest feature name, used to pad the names into an aligned column
max_length = max(len(name) for name, _ in sorted_features)

for feature, coef in sorted_features:
    padded_name = feature.ljust(max_length)
    print('{} Coefficient: {:.4f}'.format(padded_name, coef))


## Random Forest

In [None]:
# Define the RMSE scoring metric as a callable
def rmse(y_true, y_pred):
    """Return the root mean squared error between true and predicted values.

    Parameters
    ----------
    y_true : array-like of true target values.
    y_pred : array-like of predicted values, same length as ``y_true``.

    Returns
    -------
    float
        Square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    # `sqrt` was never imported in this notebook, so calling it raised a
    # NameError; numpy is already imported and provides the square root.
    return float(np.sqrt(mse))

# Wrap rmse as a scorer object; greater_is_better=False declares it a loss, i.e. lower RMSE is better.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

In [None]:
# Class-weighted random forest with a fixed seed; tree depth and the minimum
# leaf / split sizes are capped.
t0_rf = RandomForestClassifier(random_state=901,class_weight=class_weights, max_depth=7, min_samples_leaf=25, min_samples_split=250)

t0_rf.fit(X_t0_train, y_t0_train)

# Looking at some predictions from the testing set.
# The predictions are taken from the forest fitted above (t0_rf); this cell
# previously queried the logistic regression model (t0_lr) by mistake.
some_data = X_t0_test[:5]
some_labels = y_t0_test[:5]
print("Predictions:", t0_rf.predict(some_data))
print("Labels:", some_labels.values.flatten().tolist())

In [None]:
# Evaluate the random forest on the test set. This cell previously re-ran the
# logistic regression model (t0_lr) and printed its heading, so the forest was
# never evaluated.
t0_rf_predictions = t0_rf.predict(X_t0_test)
t0_rf_predicted_probabilities = t0_rf.predict_proba(X_t0_test)
print("Performance measures on Random Forest Classifier:\n")
performance_measure(t0_rf, X_t0_train, X_t0_test, y_t0_train, y_t0_test)