In [1]:
# importing all the required libraries

import pandas as pd
import seaborn as sns
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_absolute_error, mean_squared_error
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import scipy.stats as stats

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

#new imports from 16/07/2024
import random


# Data Loading

In [2]:
df = pd.read_csv('../raw_data/Dataset.csv')

In [42]:
(df1.isnull().sum()/len(df1)).sort_values(ascending=False)

EtCO2          0.962868
Temp           0.661627
DBP            0.313459
Resp           0.153546
SBP            0.145770
O2Sat          0.130611
MAP            0.124513
HR             0.098826
Hour           0.000000
Age            0.000000
Gender         0.000000
Patient_ID     0.000000
SepsisLabel    0.000000
dtype: float64

# Features Selection and Data Cleaning

In [3]:
#features to drop in the dataset

columns_drop = ['Unnamed: 0','EtCO2','BaseExcess', 'HCO3','pH','PaCO2','Alkalinephos', 'Calcium','Magnesium',
  'Phosphate','Potassium','PTT','Fibrinogen','Unit1','Unit2']

columns_for_baseline = ['Hour', 'HR','O2Sat', 'Temp','SBP', 'MAP', 'DBP', 'Resp', 'EtCO2','Age','Gender','Patient_ID','SepsisLabel']

In [4]:
df1 = df[columns_for_baseline]

In [5]:
# Fill NaN values with the next available value
df_filled = df1.bfill().ffill()

In [45]:
#df.head(50)

# Data Scaling

In [37]:
columns_to_scale = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2']

In [38]:
# Initialize the scalers
min_max_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# Apply MinMaxScaler to the O2Sat column
df_filled['O2Sat'] = min_max_scaler.fit_transform(df_filled[['O2Sat']])

# Apply StandardScaler to the rest of the columns
columns_to_standardize = [col for col in columns_to_scale if col != 'O2Sat']
df_filled[columns_to_standardize] = standard_scaler.fit_transform(df_filled[columns_to_standardize])


# Balancing Dataset

In [8]:
# Filter patient IDs where at some point sepsis label is 1
sepsis_patients_ids = list(set(df_filled.loc[df_filled['SepsisLabel'] == 1, 'Patient_ID'].tolist()))

In [9]:
# Get unique Patient_IDs from df_encoded
all_patient_ids = set(df_filled['Patient_ID'])
# Find Patient_IDs not in sepsis_patients_ids
no_sepsis_patients_ids = list(all_patient_ids - set(sepsis_patients_ids))

In [10]:
#Now we have the list with the ID's with patients that had sepsis and patients which doesn't
len(sepsis_patients_ids), len(no_sepsis_patients_ids)

(2932, 37404)

In [11]:
non_sepsis_selected_patients_ids = random.sample(no_sepsis_patients_ids, 4000)

In [12]:
non_sepsis_selected_df = df_filled.loc[df_filled['Patient_ID'].isin(non_sepsis_selected_patients_ids)]
sepsis_selected_df = df_filled.loc[df_filled['Patient_ID'].isin(sepsis_patients_ids)]

In [13]:
final_df = pd.concat([non_sepsis_selected_df,sepsis_selected_df])

In [14]:
patient_ids = final_df['Patient_ID'].unique()
np.random.shuffle(patient_ids)

In [15]:
final_df.Patient_ID = final_df.Patient_ID.astype("category")
final_df.Patient_ID = final_df.Patient_ID.cat.set_categories(patient_ids)
final_df_shuffled = final_df.sort_values(["Patient_ID","Hour"]).set_index('Patient_ID')  ## 'sort' changed to 'patient_ids'

In [44]:
final_df_shuffled

Unnamed: 0_level_0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,Age,Gender,SepsisLabel
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
11771,0,1.009310,0.98125,0.272072,-0.705743,-0.078955,0.560970,-1.291060,0.096771,-1.471327,1,0
11771,1,0.894250,1.00000,0.747842,-0.533984,0.042734,0.669638,-0.514133,0.096771,-1.471327,1,0
11771,2,1.296960,1.00000,0.747842,-0.491045,0.103579,0.742084,-0.514133,0.096771,-1.471327,1,0
11771,3,1.181900,0.99375,1.019710,-0.684273,-0.139799,0.488524,-0.514133,0.096771,-1.471327,1,0
11771,4,1.527079,0.98750,1.019710,-0.813092,-0.109377,0.524747,-0.514133,0.096771,-1.471327,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
107956,44,-1.176828,1.00000,-1.155238,0.732736,1.229204,1.249203,-1.096828,-0.559937,-0.061605,0,0
107956,45,-0.831649,1.00000,-1.155238,0.732736,1.533427,1.249203,-0.902597,-0.559937,-0.061605,0,0
107956,46,-0.371409,0.98750,-1.155238,0.861555,1.290049,1.394094,-0.514133,-0.559937,-0.061605,0,0
107956,47,-0.716589,1.00000,-0.747435,0.689796,2.141873,1.321649,-1.096828,-0.559937,-0.061605,0,0


# Tabular Data Shifting for 1rst Baseline Model Approach

In [16]:
features_df = final_df_shuffled.drop(columns = ['Hour','Age','Gender','SepsisLabel'])
to_add_df = final_df_shuffled[['Hour','Age','Gender','SepsisLabel']]

In [17]:
balanced_shifted_df = pd.concat([

                    features_df,
                    features_df.groupby(features_df.index).shift(1).add_suffix('_1'),
                    features_df.groupby(features_df.index).shift(2).add_suffix('_2'),
                    features_df.groupby(features_df.index).shift(3).add_suffix('_3'),
                    features_df.groupby(features_df.index).shift(4).add_suffix('_4'),
                    features_df.groupby(features_df.index).shift(5).add_suffix('_5'),
                    to_add_df],
                    axis=1
)

  features_df.groupby(features_df.index).shift(1).add_suffix('_1'),
  features_df.groupby(features_df.index).shift(2).add_suffix('_2'),
  features_df.groupby(features_df.index).shift(3).add_suffix('_3'),
  features_df.groupby(features_df.index).shift(4).add_suffix('_4'),
  features_df.groupby(features_df.index).shift(5).add_suffix('_5'),


In [18]:
pd.set_option('display.max_columns', None)

In [19]:
balanced_shifted_cleaned_df = balanced_shifted_df.dropna()

In [20]:
balanced_shifted_cleaned_df

Unnamed: 0_level_0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,HR_1,O2Sat_1,Temp_1,SBP_1,MAP_1,DBP_1,Resp_1,EtCO2_1,HR_2,O2Sat_2,Temp_2,SBP_2,MAP_2,DBP_2,Resp_2,EtCO2_2,HR_3,O2Sat_3,Temp_3,SBP_3,MAP_3,DBP_3,Resp_3,EtCO2_3,HR_4,O2Sat_4,Temp_4,SBP_4,MAP_4,DBP_4,Resp_4,EtCO2_4,HR_5,O2Sat_5,Temp_5,SBP_5,MAP_5,DBP_5,Resp_5,EtCO2_5,Hour,Age,Gender,SepsisLabel
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1
11771,1.469549,0.9750,1.155645,-0.984851,-0.231066,0.379856,-0.514133,0.096771,1.527079,0.9875,1.019710,-0.813092,-0.109377,0.524747,-0.514133,0.096771,1.181900,0.99375,1.019710,-0.684273,-0.139799,0.488524,-0.514133,0.096771,1.296960,1.00000,0.747842,-0.491045,0.103579,0.742084,-0.514133,0.096771,0.894250,1.00000,0.747842,-0.533984,0.042734,0.669638,-0.514133,0.096771,1.009310,0.98125,0.272072,-0.705743,-0.078955,0.560970,-1.291060,0.096771,5,-1.471327,1,0
11771,1.584609,0.9875,1.563447,-0.941911,-0.291911,0.379856,-0.514133,0.096771,1.469549,0.9750,1.155645,-0.984851,-0.231066,0.379856,-0.514133,0.096771,1.527079,0.98750,1.019710,-0.813092,-0.109377,0.524747,-0.514133,0.096771,1.181900,0.99375,1.019710,-0.684273,-0.139799,0.488524,-0.514133,0.096771,1.296960,1.00000,0.747842,-0.491045,0.103579,0.742084,-0.514133,0.096771,0.894250,1.00000,0.747842,-0.533984,0.042734,0.669638,-0.514133,0.096771,6,-1.471327,1,0
11771,1.757199,0.9875,1.971250,-0.856032,-0.231066,0.452301,-0.514133,0.096771,1.584609,0.9875,1.563447,-0.941911,-0.291911,0.379856,-0.514133,0.096771,1.469549,0.97500,1.155645,-0.984851,-0.231066,0.379856,-0.514133,0.096771,1.527079,0.98750,1.019710,-0.813092,-0.109377,0.524747,-0.514133,0.096771,1.181900,0.99375,1.019710,-0.684273,-0.139799,0.488524,-0.514133,0.096771,1.296960,1.00000,0.747842,-0.491045,0.103579,0.742084,-0.514133,0.096771,7,-1.471327,1,0
11771,1.814729,0.9500,1.971250,-0.684273,-0.170222,0.379856,-0.514133,0.096771,1.757199,0.9875,1.971250,-0.856032,-0.231066,0.452301,-0.514133,0.096771,1.584609,0.98750,1.563447,-0.941911,-0.291911,0.379856,-0.514133,0.096771,1.469549,0.97500,1.155645,-0.984851,-0.231066,0.379856,-0.514133,0.096771,1.527079,0.98750,1.019710,-0.813092,-0.109377,0.524747,-0.514133,0.096771,1.181900,0.99375,1.019710,-0.684273,-0.139799,0.488524,-0.514133,0.096771,8,-1.471327,1,0
11771,1.412019,0.9625,1.971250,-0.727213,-0.352756,0.017628,-0.514133,0.096771,1.814729,0.9500,1.971250,-0.684273,-0.170222,0.379856,-0.514133,0.096771,1.757199,0.98750,1.971250,-0.856032,-0.231066,0.452301,-0.514133,0.096771,1.584609,0.98750,1.563447,-0.941911,-0.291911,0.379856,-0.514133,0.096771,1.469549,0.97500,1.155645,-0.984851,-0.231066,0.379856,-0.514133,0.096771,1.527079,0.98750,1.019710,-0.813092,-0.109377,0.524747,-0.514133,0.096771,9,-1.471327,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107956,-1.176828,1.0000,-1.155238,0.732736,1.229204,1.249203,-1.096828,-0.559937,-1.176828,1.0000,-1.155238,-0.040178,0.316535,0.452301,-1.291060,-0.559937,-0.601529,1.00000,-0.883369,1.333891,1.959339,1.249203,-0.902597,-0.559937,-0.716589,1.00000,-0.883369,1.119193,1.290049,0.886975,-0.708365,-0.559937,-0.486469,1.00000,-0.883369,0.904495,0.864137,1.249203,-0.514133,-0.559937,-0.371409,1.00000,-0.883369,0.904495,0.681603,0.886975,-0.125669,-0.559937,44,-0.061605,0,0
107956,-0.831649,1.0000,-1.155238,0.732736,1.533427,1.249203,-0.902597,-0.559937,-1.176828,1.0000,-1.155238,0.732736,1.229204,1.249203,-1.096828,-0.559937,-1.176828,1.00000,-1.155238,-0.040178,0.316535,0.452301,-1.291060,-0.559937,-0.601529,1.00000,-0.883369,1.333891,1.959339,1.249203,-0.902597,-0.559937,-0.716589,1.00000,-0.883369,1.119193,1.290049,0.886975,-0.708365,-0.559937,-0.486469,1.00000,-0.883369,0.904495,0.864137,1.249203,-0.514133,-0.559937,45,-0.061605,0,0
107956,-0.371409,0.9875,-1.155238,0.861555,1.290049,1.394094,-0.514133,-0.559937,-0.831649,1.0000,-1.155238,0.732736,1.533427,1.249203,-0.902597,-0.559937,-1.176828,1.00000,-1.155238,0.732736,1.229204,1.249203,-1.096828,-0.559937,-1.176828,1.00000,-1.155238,-0.040178,0.316535,0.452301,-1.291060,-0.559937,-0.601529,1.00000,-0.883369,1.333891,1.959339,1.249203,-0.902597,-0.559937,-0.716589,1.00000,-0.883369,1.119193,1.290049,0.886975,-0.708365,-0.559937,46,-0.061605,0,0
107956,-0.716589,1.0000,-0.747435,0.689796,2.141873,1.321649,-1.096828,-0.559937,-0.371409,0.9875,-1.155238,0.861555,1.290049,1.394094,-0.514133,-0.559937,-0.831649,1.00000,-1.155238,0.732736,1.533427,1.249203,-0.902597,-0.559937,-1.176828,1.00000,-1.155238,0.732736,1.229204,1.249203,-1.096828,-0.559937,-1.176828,1.00000,-1.155238,-0.040178,0.316535,0.452301,-1.291060,-0.559937,-0.601529,1.00000,-0.883369,1.333891,1.959339,1.249203,-0.902597,-0.559937,47,-0.061605,0,0


In [29]:
log_reg_df = balanced_shifted_cleaned_df.drop('Hour',axis=1)

# Data Split

In [30]:
# Split the data into training and testing sets, we have 5545 patients for train and 1386 for test so a total of 1386 + 5545 = 6931 patients
split_point = int(0.8 * len(patient_ids))  # 80% train, 20% test
split_point

5545

In [31]:
train_ids = patient_ids[:split_point]
test_ids = patient_ids[split_point:]

In [32]:
train_df = log_reg_df.loc[log_reg_df.index.isin(train_ids)]
test_df = log_reg_df.loc[log_reg_df.index.isin(test_ids)]


# Baseline Model Logistic Regression 1rst approach

In [33]:
X_train = train_df.drop('SepsisLabel',axis=1)
y_train = train_df['SepsisLabel']

In [34]:
X_test = test_df.drop('SepsisLabel',axis=1)
y_test = test_df['SepsisLabel']

In [35]:
# Initialize the Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=300)

# Perform cross-validation and print recall scores
cv_recall_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='recall')
print(f"Cross-Validation Recall Scores: {cv_recall_scores}")
print(f"Mean Cross-Validation Recall Score: {cv_recall_scores.mean()}")

# Train the model on the entire training data
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate recall on the test data
test_recall = recall_score(y_test, y_pred)
print(f"Test Recall Score: {test_recall}")

Cross-Validation Recall Scores: [0.         0.         0.00024956 0.         0.        ]
Mean Cross-Validation Recall Score: 4.991265285749938e-05
Test Recall Score: 0.0
