In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, fbeta_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


Downloading...
From (original): https://drive.google.com/uc?id=1m2zrxuamOjejPA_283824cvmtQ6EzRSM
From (redirected): https://drive.google.com/uc?id=1m2zrxuamOjejPA_283824cvmtQ6EzRSM&confirm=t&uuid=a99203f8-594a-439e-8ce3-1f2dd8b88153
To: /content/Dataset_ previous values_gpbyID+NoShift+F.Engv3.0.csv
100% 420M/420M [00:03<00:00, 138MB/s]


  df = pd.read_csv('/content/Dataset_ previous values_gpbyID+NoShift+F.Engv3.0.csv')


In [6]:

# Load the data
!gdown https://drive.google.com/uc?id=1m2zrxuamOjejPA_283824cvmtQ6EzRSM
df = pd.read_csv('/content/Dataset_ previous values_gpbyID+NoShift+F.Engv3.0.csv')


Downloading...
From (original): https://drive.google.com/uc?id=1m2zrxuamOjejPA_283824cvmtQ6EzRSM
From (redirected): https://drive.google.com/uc?id=1m2zrxuamOjejPA_283824cvmtQ6EzRSM&confirm=t&uuid=12402d93-fa5f-466a-8b13-cc0fc6629f4c
To: /content/Dataset_ previous values_gpbyID+NoShift+F.Engv3.0.csv
100% 420M/420M [00:04<00:00, 92.6MB/s]


  df = pd.read_csv('/content/Dataset_ previous values_gpbyID+NoShift+F.Engv3.0.csv')


In [7]:

# Dropping unnecessary columns
df.drop(['Unnamed: 0', '...1', 'Hour', 'Unit1', 'Unit2', 'Apache_II_Score', 'Patient_ID', 'SOFA_Score', 'SIRS', 'Bilirubin_Creatinine_Ratio'], axis=1, inplace=True)

# Data preparation
df['datetime'] = pd.to_datetime(df['datetime'])
df['hour'] = df['datetime'].dt.hour
df['day'] = df['datetime'].dt.day
df['month'] = df['datetime'].dt.month
df['weekday'] = df['datetime'].dt.weekday
df.drop('datetime', axis=1, inplace=True)

# Fill NaN values
df.fillna(-1, inplace=True)

# Addressing class imbalance
class_labels = np.unique(df['SepsisLabel'])
class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=df['SepsisLabel'].values)
class_weight_dict = {class_label: weight for class_label, weight in zip(class_labels, class_weights)}
print("Class Weight Dictionary:", class_weight_dict)

# Split the data into features (X) and target (y)
X = df.drop(['SepsisLabel'], axis=1)
y = df['SepsisLabel'].values

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


Class Weight Dictionary: {0: 0.5091570261380023, 1: 27.801440034388882}


In [8]:

# Logistic Regression
lr = LogisticRegression(class_weight=class_weight_dict, random_state=42)
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)
lr_prob_predictions = lr.predict_proba(X_test)[:, 1]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression:
Accuracy: 0.7429
Precision: 0.0413
Recall: 0.5981
F1 Score: 0.0772
F2 Score: 0.1617
ROC AUC: 0.7233
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.75      0.85    304859
           1       0.04      0.60      0.08      5583

    accuracy                           0.74    310442
   macro avg       0.52      0.67      0.46    310442
weighted avg       0.97      0.74      0.84    310442

Confusion Matrix:
[[227303  77556]
 [  2244   3339]]

Random Forest:
Accuracy: 0.9913
Precision: 0.9432
Recall: 0.5470
F1 Score: 0.6924
F2 Score: 0.5972
ROC AUC: 0.9966
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    304859
           1       0.94      0.55      0.69      5583

    accuracy                           0.99    310442
   macro avg       0.97      0.77      0.84    310442
weighted avg       0.99      0.99      0.99    310442

Confusion 

In [10]:

# Random Forest
rf = RandomForestClassifier(n_estimators=300, class_weight=class_weight_dict, random_state=42)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)
rf_prob_predictions = rf.predict_proba(X_test)[:, 1]


In [None]:

# XGBoost
xgb = XGBClassifier(scale_pos_weight=class_weight_dict[1], random_state=42)
xgb.fit(X_train, y_train)
xgb_predictions = xgb.predict(X_test)
xgb_prob_predictions = xgb.predict_proba(X_test)[:, 1]


In [11]:

# Evaluation metrics for Logistic Regression
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_f1 = f1_score(y_test, lr_predictions)
lr_roc_auc = roc_auc_score(y_test, lr_prob_predictions)
lr_precision = precision_score(y_test, lr_predictions)
lr_recall = recall_score(y_test, lr_predictions)
lr_f2 = fbeta_score(y_test, lr_predictions, beta=2)
lr_classification_report = classification_report(y_test, lr_predictions)
lr_confusion_matrix = confusion_matrix(y_test, lr_predictions)

# Evaluation metrics for Random Forest
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions)
rf_roc_auc = roc_auc_score(y_test, rf_prob_predictions)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_f2 = fbeta_score(y_test, rf_predictions, beta=2)
rf_classification_report = classification_report(y_test, rf_predictions)
rf_confusion_matrix = confusion_matrix(y_test, rf_predictions)

# Evaluation metrics for XGBoost
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
xgb_f1 = f1_score(y_test, xgb_predictions)
xgb_roc_auc = roc_auc_score(y_test, xgb_prob_predictions)
xgb_precision = precision_score(y_test, xgb_predictions)
xgb_recall = recall_score(y_test, xgb_predictions)
xgb_f2 = fbeta_score(y_test, xgb_predictions, beta=2)
xgb_classification_report = classification_report(y_test, xgb_predictions)
xgb_confusion_matrix = confusion_matrix(y_test, xgb_predictions)


In [12]:

# Print the evaluation metrics for Logistic Regression
print("Logistic Regression:")
print(f"Accuracy: {lr_accuracy:.4f}")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall: {lr_recall:.4f}")
print(f"F1 Score: {lr_f1:.4f}")
print(f"F2 Score: {lr_f2:.4f}")
print(f"ROC AUC: {lr_roc_auc:.4f}")
print("Classification Report:")
print(lr_classification_report)
print("Confusion Matrix:")
print(lr_confusion_matrix)

# Print the evaluation metrics for Random Forest
print("\nRandom Forest:")
print(f"Accuracy: {rf_accuracy:.4f}")
print(f"Precision: {rf_precision:.4f}")
print(f"Recall: {rf_recall:.4f}")
print(f"F1 Score: {rf_f1:.4f}")
print(f"F2 Score: {rf_f2:.4f}")
print(f"ROC AUC: {rf_roc_auc:.4f}")
print("Classification Report:")
print(rf_classification_report)
print("Confusion Matrix:")
print(rf_confusion_matrix)

# Print the evaluation metrics for XGBoost
print("\nXGBoost:")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1 Score: {xgb_f1:.4f}")
print(f"F2 Score: {xgb_f2:.4f}")
print(f"ROC AUC: {xgb_roc_auc:.4f}")
print("Classification Report:")
print(xgb_classification_report)
print("Confusion Matrix:")
print(xgb_confusion_matrix)

Logistic Regression:
Accuracy: 0.7429
Precision: 0.0413
Recall: 0.5981
F1 Score: 0.0772
F2 Score: 0.1617
ROC AUC: 0.7233
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.75      0.85    304859
           1       0.04      0.60      0.08      5583

    accuracy                           0.74    310442
   macro avg       0.52      0.67      0.46    310442
weighted avg       0.97      0.74      0.84    310442

Confusion Matrix:
[[227303  77556]
 [  2244   3339]]

Random Forest:
Accuracy: 0.9914
Precision: 0.9406
Recall: 0.5554
F1 Score: 0.6984
F2 Score: 0.6050
ROC AUC: 0.9976
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    304859
           1       0.94      0.56      0.70      5583

    accuracy                           0.99    310442
   macro avg       0.97      0.78      0.85    310442
weighted avg       0.99      0.99      0.99    310442

Confusion 

In [13]:
# Feature importances for Logistic Regression
lr_feature_importances = lr.coef_[0]
lr_feature_importances_df = pd.DataFrame({'Feature': X.columns, 'Importance': lr_feature_importances})
lr_feature_importances_df['Absolute Importance'] = lr_feature_importances_df['Importance'].abs()
lr_feature_importances_df = lr_feature_importances_df.sort_values(by='Absolute Importance', ascending=False)

# Feature importances for Random Forest
rf_feature_importances = rf.feature_importances_
rf_feature_importances_df = pd.DataFrame({'Feature': X.columns, 'Importance': rf_feature_importances})
rf_feature_importances_df = rf_feature_importances_df.sort_values(by='Importance', ascending=False)

# Feature importances for XGBoost
xgb_feature_importances = xgb.feature_importances_
xgb_feature_importances_df = pd.DataFrame({'Feature': X.columns, 'Importance': xgb_feature_importances})
xgb_feature_importances_df = xgb_feature_importances_df.sort_values(by='Importance', ascending=False)

# Print feature importances
print("Logistic Regression Feature Importances:")
print(lr_feature_importances_df)

print("\nRandom Forest Feature Importances:")
print(rf_feature_importances_df)

print("\nXGBoost Feature Importances:")
print(xgb_feature_importances_df)

Logistic Regression Feature Importances:
                   Feature  Importance  Absolute Importance
6                     Resp    0.027621             0.027621
31                     WBC    0.025231             0.025231
17                 Calcium   -0.019087             0.019087
39              NEWS_Score    0.018775             0.018775
8               BaseExcess    0.018028             0.018028
37                  ICULOS    0.016983             0.016983
0                       HR    0.014529             0.014529
28                     Hct   -0.010154             0.010154
4                      MAP   -0.010072             0.010072
15                     BUN    0.008740             0.008740
9                     HCO3   -0.007124             0.007124
1                    O2Sat   -0.006050             0.006050
2                     Temp    0.005909             0.005909
7                    EtCO2   -0.005194             0.005194
38  Incomplete_qSOFA_Score    0.004938             0.004938