In [1]:
import numpy as np 
import pandas as pd 

In [2]:
df_train = pd.read_csv('Dataset.csv')
df_train = df_train.drop(columns=['Unnamed: 0'])
df_train.head()

Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_ID
0,0,,,,,,,,,,...,,,68.54,0,,,-0.02,1,0,17072
1,1,65.0,100.0,,,72.0,,16.5,,,...,,,68.54,0,,,-0.02,2,0,17072
2,2,78.0,100.0,,,42.5,,,,,...,,,68.54,0,,,-0.02,3,0,17072
3,3,73.0,100.0,,,,,17.0,,,...,,,68.54,0,,,-0.02,4,0,17072
4,4,70.0,100.0,,129.0,74.0,69.0,14.0,,,...,,330.0,68.54,0,,,-0.02,5,0,17072


In [None]:
df_train

In [7]:
label = df_train.columns[-2]
label_deplace = df_train.pop(label)
df_train.insert(0, label, label_deplace)
df_train.head()

Unnamed: 0,Patient_ID,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,17072,0,,,,,,,,,...,,,,68.54,0,,,-0.02,1,0
1,17072,1,65.0,100.0,,,72.0,,16.5,,...,,,,68.54,0,,,-0.02,2,0
2,17072,2,78.0,100.0,,,42.5,,,,...,,,,68.54,0,,,-0.02,3,0
3,17072,3,73.0,100.0,,,,,17.0,,...,,,,68.54,0,,,-0.02,4,0
4,17072,4,70.0,100.0,,129.0,74.0,69.0,14.0,,...,11.3,,330.0,68.54,0,,,-0.02,5,0


In [5]:
df_train['SepsisLabel'].value_counts()

SepsisLabel
0    1524294
1      27916
Name: count, dtype: int64

In [8]:
df_train['Patient_ID'].nunique()

40336

In [9]:
X = df_train.iloc[:,:-1]
y = df_train.iloc[:,-1]

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2023)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1241768, 42), (310442, 42), (1241768,), (310442,))

# Optuna

In [11]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

In [12]:
def objective(trial):
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
    max_depth = trial.suggest_int('max_depth', 2, 11)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    gamma = trial.suggest_uniform('gamma', 0.01, 5)
    subsample = trial.suggest_uniform('subsample', 0.01, 1)
    clf1 = XGBClassifier(learning_rate=learning_rate,
                         max_depth=max_depth,
                         n_estimators=n_estimators,
                         min_child_weight=min_child_weight,
                         gamma=gamma)
    score = cross_val_score(clf1, X_train, y_train, cv=5)
    return np.mean(score)

In [13]:
study = optuna.create_study(direction='maximize',sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=30)
study.best_params

[I 2024-06-10 14:43:17,199] A new study created in memory with name: no-name-3253aaa5-4099-4f87-8b55-09bc86f9771b
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-10 14:44:56,064] Trial 0 finished with value: 0.98416209817719 and parameters: {'learning_rate': 0.3807947176588889, 'max_depth': 11, 'n_estimators': 160, 'min_child_weight': 6, 'gamma': 0.7885330158077583, 'subsample': 0.16443457513284063}. Best is trial 0 with value: 0.98416209817719.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-10 14:46:59,301] Trial 1 finished with value: 0.9824443857499482 and parameters: {'learning_rate': 0.06750277604651747, 'max_depth': 10, 'n_estimators': 140, 'min_child_weight': 8, 'gamma': 0.11271662653605422, 'subsample': 0.97

[I 2024-06-10 14:50:41,431] Trial 6 finished with value: 0.9820119377975184 and parameters: {'learning_rate': 0.31156763148163696, 'max_depth': 2, 'n_estimators': 153, 'min_child_weight': 5, 'gamma': 0.6189707918754463, 'subsample': 0.5002251410101575}. Best is trial 0 with value: 0.98416209817719.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-10 14:52:21,431] Trial 7 finished with value: 0.9822567500504868 and parameters: {'learning_rate': 0.044044635904066216, 'max_depth': 11, 'n_estimators': 89, 'min_child_weight': 7, 'gamma': 1.5654382696861608, 'subsample': 0.5248673409660327}. Best is trial 0 with value: 0.98416209817719.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-10 14:53:10,227] Trial 8 finished with v

[I 2024-06-10 14:59:57,139] Trial 13 finished with value: 0.9828905237668961 and parameters: {'learning_rate': 0.7373709438079181, 'max_depth': 7, 'n_estimators': 117, 'min_child_weight': 3, 'gamma': 2.1401200965525535, 'subsample': 0.22466514443570168}. Best is trial 9 with value: 0.9852540893124806.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-10 15:01:41,857] Trial 14 finished with value: 0.9846162889590657 and parameters: {'learning_rate': 0.9758850456446434, 'max_depth': 9, 'n_estimators': 174, 'min_child_weight': 4, 'gamma': 0.8643074534032664, 'subsample': 0.6705731543738487}. Best is trial 9 with value: 0.9852540893124806.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-10 15:02:24,482] Trial 15 finished w

[I 2024-06-10 15:08:37,397] Trial 20 finished with value: 0.9823115107819739 and parameters: {'learning_rate': 0.7153561465489235, 'max_depth': 5, 'n_estimators': 176, 'min_child_weight': 3, 'gamma': 1.1195083209366277, 'subsample': 0.6473831646229633}. Best is trial 9 with value: 0.9852540893124806.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-10 15:10:19,603] Trial 21 finished with value: 0.9850640377099344 and parameters: {'learning_rate': 0.6848718621582176, 'max_depth': 8, 'n_estimators': 200, 'min_child_weight': 4, 'gamma': 0.33073779082807697, 'subsample': 0.02351895403947442}. Best is trial 9 with value: 0.9852540893124806.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-10 15:12:14,436] Trial 22 finished 

[I 2024-06-10 15:18:38,838] Trial 27 finished with value: 0.9841717613485391 and parameters: {'learning_rate': 0.5198181672394189, 'max_depth': 11, 'n_estimators': 75, 'min_child_weight': 1, 'gamma': 1.8860414736181843, 'subsample': 0.1796566021007477}. Best is trial 25 with value: 0.9857090852473069.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-10 15:19:32,074] Trial 28 finished with value: 0.9839656039891576 and parameters: {'learning_rate': 0.33909239727080287, 'max_depth': 11, 'n_estimators': 64, 'min_child_weight': 2, 'gamma': 1.1625051092266454, 'subsample': 0.34766897036296085}. Best is trial 25 with value: 0.9857090852473069.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-10 15:20:45,443] Trial 29 finishe

{'learning_rate': 0.554709989725495,
 'max_depth': 11,
 'n_estimators': 189,
 'min_child_weight': 2,
 'gamma': 0.430871760151312,
 'subsample': 0.13225635058741714}

In [14]:
xgbc = XGBClassifier(learning_rate= list(study.best_params.values())[0],
 max_depth= list(study.best_params.values())[1],
 n_estimators= list(study.best_params.values())[2],
 min_child_weight= list(study.best_params.values())[3],
 gamma= list(study.best_params.values())[4],
 subsample= list(study.best_params.values())[5]
                     )

In [15]:
from sklearn.metrics import classification_report

In [16]:
xgbc.fit(X_train, y_train)

In [17]:
y_predicted = xgbc.predict(X_test)

In [18]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99    304859
           1       0.21      0.14      0.17      5583

    accuracy                           0.98    310442
   macro avg       0.60      0.56      0.58    310442
weighted avg       0.97      0.98      0.97    310442



**I also tried using standardization and SMOTE, but the ACCURACY was low. I wonder why?**