In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Read 'Sepsis_cleaned.csv' (path relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='Sepsis_cleaned.csv')

In [3]:
# Inspect the dtype of every column of the loaded frame (shown as the cell output)
data.dtypes

HR             float64
O2Sat          float64
Temp           float64
SBP            float64
MAP            float64
DBP            float64
Resp           float64
BaseExcess     float64
HCO3           float64
FiO2           float64
pH             float64
PaCO2          float64
BUN            float64
Chloride       float64
Glucose        float64
Potassium      float64
Hct            float64
Hgb            float64
Age            float64
Gender           int64
Unit1          float64
Unit2          float64
HospAdmTime    float64
ICULOS           int64
SepsisLabel      int64
dtype: object

In [4]:
# Keep only the rows whose SepsisLabel is one of the two expected classes (0 or 1)
sepsis_data = data[data['SepsisLabel'].isin([0, 1])]

In [5]:
# Feature/target split on the filtered data (before any class balancing).
# NOTE(review): X and y are reassigned from `balanced_data` further down, before
# anything reads them, so this split looks redundant -- confirm and consider removing.
y = sepsis_data['SepsisLabel']               # target vector
X = sepsis_data.drop('SepsisLabel', axis=1)  # every column except the target

In [6]:
# Build two equally sized class subsets by undersampling the SepsisLabel == 0 rows.
# All SepsisLabel == 1 rows are kept, and exactly that many label-0 rows are drawn
# at random. The sample size is taken from the data instead of the previously
# hard-coded 17136, so the two subsets stay the same size if the CSV changes.
# sample() draws without replacement, so this assumes there are at least as many
# label-0 rows as label-1 rows.
sepsis_label_1 = sepsis_data[sepsis_data['SepsisLabel'] == 1]
sepsis_label_0 = sepsis_data[sepsis_data['SepsisLabel'] == 0].sample(
    n=len(sepsis_label_1),  # one label-0 row per label-1 row
    random_state=42,        # fixed seed -> the same rows are drawn on every run
)

In [7]:
# Display the SepsisLabel == 1 subset (bare last expression -> rendered as the cell output)
sepsis_label_1

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,BaseExcess,HCO3,FiO2,...,Potassium,Hct,Hgb,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
606,119.0,100.0,37.940,140.0,106.0,85.0,26.5,5.046,35.409,0.350,...,4.299,28.724,6.540,27.92,1,0.407,-0.029,-0.03,249,1
607,118.0,96.0,37.714,138.0,108.0,88.0,26.0,-1.596,20.058,0.336,...,4.409,20.394,8.493,27.92,1,0.452,0.016,-0.03,250,1
608,111.0,97.0,37.390,136.0,106.0,86.0,26.0,2.345,22.001,0.627,...,3.252,23.279,9.209,27.92,1,0.865,0.429,-0.03,251,1
609,116.0,96.0,37.720,143.0,109.0,88.0,30.0,0.004,23.406,0.808,...,3.857,29.742,9.265,27.92,1,0.750,0.314,-0.03,252,1
610,120.0,97.0,37.465,138.0,106.0,85.0,32.0,-6.041,18.821,0.350,...,2.946,26.233,11.687,27.92,1,1.403,0.967,-0.03,253,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790264,88.0,98.0,37.510,135.0,81.0,64.0,16.0,-11.292,21.715,0.500,...,5.448,28.824,10.042,62.29,1,0.836,0.400,-0.03,31,1
790265,96.0,98.0,38.720,174.0,97.0,72.0,16.0,2.000,20.795,0.505,...,3.900,27.800,9.516,62.29,1,1.299,0.863,-0.03,32,1
790266,140.0,97.0,37.273,133.0,81.5,62.5,16.0,-1.211,12.180,0.663,...,4.717,28.775,7.520,62.29,1,1.762,1.326,-0.03,33,1
790267,120.0,96.0,37.064,154.0,118.0,105.0,16.0,-2.842,22.569,0.187,...,3.568,20.331,7.669,62.29,1,0.780,0.344,-0.03,34,1


In [8]:
# Display the sampled SepsisLabel == 0 subset (bare last expression -> rendered as the cell output)
sepsis_label_0

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,BaseExcess,HCO3,FiO2,...,Potassium,Hct,Hgb,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
720867,73.770,98.0,36.945,118.5,77.00,58.5,18.0,5.053,21.269,0.559,...,4.450,35.300,10.089,88.08,1,1.000,0.000,-15.28,4,0
248349,89.000,92.0,37.087,118.5,70.00,58.5,23.0,-1.709,28.280,0.494,...,3.869,32.794,12.204,84.78,1,0.000,1.000,-205.43,45,0
109449,67.000,95.0,36.626,97.0,58.33,58.5,19.0,-4.776,20.634,0.569,...,3.091,30.225,9.905,71.78,1,0.621,0.615,-1.72,19,0
627472,84.000,100.0,38.283,136.0,88.00,65.0,14.0,1.580,31.735,0.733,...,3.185,31.086,12.320,69.74,1,0.000,1.000,-13.38,37,0
657776,91.000,91.0,37.060,143.0,96.00,70.0,17.0,-7.541,26.080,0.914,...,3.131,26.490,7.771,65.73,1,0.254,0.248,-68.17,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175273,73.000,100.0,36.280,116.0,66.00,41.0,15.0,5.280,23.880,0.400,...,3.786,31.135,11.266,81.68,0,1.034,1.028,-179.12,4,0
693139,77.347,98.0,37.386,118.5,77.00,58.5,18.0,1.263,23.679,0.518,...,4.053,24.741,7.707,63.77,1,0.000,1.000,-0.01,20,0
625573,64.000,100.0,36.527,148.0,106.67,58.5,16.0,1.407,20.781,0.350,...,3.774,24.358,9.910,67.52,1,1.047,1.040,-0.02,3,0
522889,98.000,100.0,36.488,144.0,97.00,58.5,16.0,0.320,23.900,0.327,...,4.381,33.224,7.648,22.00,1,0.908,0.902,-0.02,8,0


In [7]:
# Stack the sampled label-0 rows on top of the label-1 rows to form one frame.
# Rows keep their original index labels and are not shuffled here.
balanced_data = pd.concat(objs=[sepsis_label_0, sepsis_label_1], axis=0)
balanced_data

Unnamed: 0,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,BaseExcess,HCO3,FiO2,...,Potassium,Hct,Hgb,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
720867,73.77,98.0,36.945,118.5,77.00,58.5,18.0,5.053,21.269,0.559,...,4.450,35.300,10.089,88.08,1,1.000,0.000,-15.28,4,0
248349,89.00,92.0,37.087,118.5,70.00,58.5,23.0,-1.709,28.280,0.494,...,3.869,32.794,12.204,84.78,1,0.000,1.000,-205.43,45,0
109449,67.00,95.0,36.626,97.0,58.33,58.5,19.0,-4.776,20.634,0.569,...,3.091,30.225,9.905,71.78,1,0.621,0.615,-1.72,19,0
627472,84.00,100.0,38.283,136.0,88.00,65.0,14.0,1.580,31.735,0.733,...,3.185,31.086,12.320,69.74,1,0.000,1.000,-13.38,37,0
657776,91.00,91.0,37.060,143.0,96.00,70.0,17.0,-7.541,26.080,0.914,...,3.131,26.490,7.771,65.73,1,0.254,0.248,-68.17,15,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
790264,88.00,98.0,37.510,135.0,81.00,64.0,16.0,-11.292,21.715,0.500,...,5.448,28.824,10.042,62.29,1,0.836,0.400,-0.03,31,1
790265,96.00,98.0,38.720,174.0,97.00,72.0,16.0,2.000,20.795,0.505,...,3.900,27.800,9.516,62.29,1,1.299,0.863,-0.03,32,1
790266,140.00,97.0,37.273,133.0,81.50,62.5,16.0,-1.211,12.180,0.663,...,4.717,28.775,7.520,62.29,1,1.762,1.326,-0.03,33,1
790267,120.00,96.0,37.064,154.0,118.00,105.0,16.0,-2.842,22.569,0.187,...,3.568,20.331,7.669,62.29,1,0.780,0.344,-0.03,34,1


In [9]:
# balanced_data.to_csv("Sepsis_balanced.csv")  # optional: save the balanced set for later work (use a new file name -- "Sepsis_cleaned.csv" is this notebook's input and would be overwritten)

In [10]:
# Separate the balanced frame into the feature matrix (X) and the target vector (y)
X = balanced_data.loc[:, balanced_data.columns != 'SepsisLabel']  # all columns except the target
y = balanced_data.loc[:, 'SepsisLabel']

In [11]:
# Hold out 20% of the balanced rows for testing; the remaining 80% are used for training.
# NOTE(review): the split is random and row-level. The displayed rows look like
# consecutive ICULOS readings sharing the same Age/HospAdmTime, so rows from one
# patient may land in both sets -- confirm whether a patient-level split is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,     # library default, spelled out
    random_state=42,  # fixed seed for a reproducible split
)


In [12]:
# Create the (not yet fitted) random forest: 100 trees, fixed seed for reproducibility
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [13]:
# Fit the forest on the training features and their labels
rf_classifier.fit(X=X_train, y=y_train)

In [14]:
# Predict a class label for every row of the held-out test set
y_pred = rf_classifier.predict(X=X_test)

In [15]:
# Score the predictions: share of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.8662290299051787


In [16]:
# Print the heading followed by the per-class precision / recall / F1 table
print(
    'Classification Report:',
    classification_report(y_true=y_test, y_pred=y_pred),
    sep='\n',
)

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      3459
           1       0.86      0.87      0.87      3396

    accuracy                           0.87      6855
   macro avg       0.87      0.87      0.87      6855
weighted avg       0.87      0.87      0.87      6855



In [17]:
# Count test rows per (true label, predicted label) pair and print the table under its heading
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')

Confusion Matrix:
[[2996  463]
 [ 454 2942]]


In [19]:
import joblib

# Persist the fitted classifier to 'sepsis_model.pkl' in the working directory.
# The call returns the list of written file names, which the cell displays.
joblib.dump(value=rf_classifier, filename='sepsis_model.pkl')

['sepsis_model.pkl']