In [1]:
# =======================
# 1. Importing Libraries
# =======================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from imblearn.over_sampling import SMOTE
import optuna

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Bidirectional
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score


In [5]:
# =======================
# 2. Load and Explore Dataset
# =======================
df = pd.read_csv('sepsis_Data2.csv')

# Row previews. Passing the heading and the frame to one print() with sep="\n"
# puts each on its own line.
print("First 5 rows:", df.head(), sep="\n")
print("\nLast 5 rows:", df.tail(), sep="\n")
print("\nRandom sample of 5 rows:", df.sample(5), sep="\n")  # no seed given: rows differ between runs

# Structural overview: dimensions first, then the dtype of every column.
print("\nData Shape:", df.shape)
print("\nData Types:", df.dtypes, sep="\n")

# Descriptive statistics as produced by DataFrame.describe().
print("\nSummary Statistics (Numerical):", df.describe(), sep="\n")


First 5 rows:
   Unnamed: 0    HR  O2Sat   Temp    SBP    MAP   DBP  Resp  EtCO2  \
0           1  61.0   99.0  36.44  124.0   65.0  43.0  17.5    0.0   
1           2  64.0   98.0  36.44  125.0   64.0  41.0  27.0    0.0   
2           3  56.0  100.0  36.44  123.0   65.0  41.0   9.0    0.0   
3           4  66.0   99.0  36.22  120.0   67.0  43.0  23.0    0.0   
4           5  94.0  100.0  36.22  194.0  116.0  66.0  14.0    0.0   

   BaseExcess  ...   WBC  Fibrinogen  Platelets    Age  Gender  Unit1  Unit2  \
0         0.0  ...  11.0         0.0      158.0  75.91       0    0.0    1.0   
1         0.0  ...  11.0         0.0      158.0  75.91       0    0.0    1.0   
2         0.0  ...  11.0         0.0      158.0  75.91       0    0.0    1.0   
3         0.0  ...  11.0         0.0      158.0  75.91       0    0.0    1.0   
4         0.0  ...  11.0         0.0      158.0  75.91       0    0.0    1.0   

   HospAdmTime  ICULOS  SepsisLabel  
0        -98.6       2            0  
1       

In [7]:
# Bare last expression: the notebook renders the DataFrame as this cell's output
# (the rendering below is elided with "..." rather than listing every row/column).
df


Unnamed: 0.1,Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,1,61.0,99.0,36.44,124.0,65.00,43.0,17.5,0.0,0.0,...,11.0,0.0,158.0,75.91,0,0.0,1.0,-98.6,2,0
1,2,64.0,98.0,36.44,125.0,64.00,41.0,27.0,0.0,0.0,...,11.0,0.0,158.0,75.91,0,0.0,1.0,-98.6,3,0
2,3,56.0,100.0,36.44,123.0,65.00,41.0,9.0,0.0,0.0,...,11.0,0.0,158.0,75.91,0,0.0,1.0,-98.6,4,0
3,4,66.0,99.0,36.22,120.0,67.00,43.0,23.0,0.0,0.0,...,11.0,0.0,158.0,75.91,0,0.0,1.0,-98.6,5,0
4,5,94.0,100.0,36.22,194.0,116.00,66.0,14.0,0.0,0.0,...,11.0,0.0,158.0,75.91,0,0.0,1.0,-98.6,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58371,31,72.0,98.0,37.50,178.0,98.00,70.0,16.0,0.0,-3.0,...,7.2,220.0,98.0,60.47,1,0.0,0.0,-10.9,39,0
58372,32,75.0,98.0,37.60,170.5,90.67,69.0,16.0,0.0,-3.0,...,7.2,220.0,98.0,60.47,1,0.0,0.0,-10.9,40,0
58373,33,72.0,98.0,37.60,170.5,88.33,67.0,18.0,0.0,-3.0,...,7.2,220.0,98.0,60.47,1,0.0,0.0,-10.9,41,0
58374,34,83.0,97.0,37.90,182.0,97.33,84.0,16.0,0.0,-3.0,...,7.2,220.0,98.0,60.47,1,0.0,0.0,-10.9,42,0


In [11]:


# Balance the two classes by random undersampling of the majority class.
# The label column is 'SepsisLabel' (1 = positive, 0 = negative): every positive
# row is kept and an equal number of negative rows is drawn at random.
positive_df = df[df['SepsisLabel'] == 1]
negative_df = df[df['SepsisLabel'] == 0]

# Derive the per-class sample size from the data instead of hard-coding it, so the
# cell keeps working when the input file changes. Fail early if balancing is impossible.
n_positive = len(positive_df)
assert n_positive > 0, "No positive samples found!"
assert len(negative_df) >= n_positive, "Not enough negative samples to downsample!"

# Draw as many negatives as there are positives (fixed seed for reproducibility).
negative_sampled_df = negative_df.sample(n=n_positive, random_state=42)

# Concatenate positive and sampled negative examples
balanced_df = pd.concat([positive_df, negative_sampled_df])

# Shuffle so the classes are interleaved, and discard the original row index.
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Post-condition: exactly n_positive rows from each class.
assert len(balanced_df) == 2 * n_positive, "Balanced dataset has an unexpected size!"

# Save the balanced dataset to a new CSV (index=False: do not write the row index).
balanced_df.to_csv('balanced_dataset.csv', index=False)

print("Balanced dataset created with shape:", balanced_df.shape)


Balanced dataset created with shape: (2612, 42)
