In [12]:
import numpy as np 
import pandas as pd 

In [13]:
# Read the raw CSV and discard its 'Unnamed: 0' column
# (presumably an index column written by an earlier export; verify).
df_train = (
    pd.read_csv('Dataset.csv')
    .drop(columns=['Unnamed: 0'])
)
df_train.head()

Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_ID
0,0,,,,,,,,,,...,,,68.54,0,,,-0.02,1,0,17072
1,1,65.0,100.0,,,72.0,,16.5,,,...,,,68.54,0,,,-0.02,2,0,17072
2,2,78.0,100.0,,,42.5,,,,,...,,,68.54,0,,,-0.02,3,0,17072
3,3,73.0,100.0,,,,,17.0,,,...,,,68.54,0,,,-0.02,4,0,17072
4,4,70.0,100.0,,129.0,74.0,69.0,14.0,,,...,,330.0,68.54,0,,,-0.02,5,0,17072


In [4]:
# Dimensions of the loaded table as (rows, columns).
df_train.shape

(1552210, 43)

In [5]:
# Names of all columns available in the loaded table.
df_train.columns

Index(['Hour', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel', 'Patient_ID'],
      dtype='object')

In [14]:
# Columns whose completeness decides whether a row is kept.
interest_columns = ['Patient_ID', 'Hour', 'HR', 'Temp', 'WBC', 'SBP', 'DBP', 'MAP', 'Age', 'Creatinine', 'Gender', 'BUN', 'ICULOS', 'Platelets', 'SepsisLabel']

# Largest share of the columns of interest that may be missing in a kept row.
MAX_MISSING_FRACTION = 0.3

# Tolerated number of missing values per row (fractional: 0.3 * 15 = 4.5).
seuil = MAX_MISSING_FRACTION * len(interest_columns)

# DataFrame.dropna documents `thresh` as an integer count of required non-NA
# values. Round the fractional bound up explicitly (15 - 4.5 = 10.5 -> 11)
# instead of passing a float: the kept rows are the same, but the requirement
# is now stated in the unit the API expects.
min_non_null = int(np.ceil(len(interest_columns) - seuil))

# Keep only rows with at least `min_non_null` non-missing columns of interest.
cleaned_df = df_train.dropna(subset=interest_columns, thresh=min_non_null)

cleaned_df.shape

(502739, 43)

In [7]:
cleaned_df.head()

Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_ID
4,4,70.0,100.0,,129.0,74.0,69.0,14.0,,,...,,330.0,68.54,0,,,-0.02,5,0,17072
7,7,68.0,100.0,35.78,142.0,93.5,78.0,16.0,,,...,,,68.54,0,,,-0.02,8,0,17072
11,11,84.0,100.0,36.39,128.0,80.0,60.0,14.0,,,...,,,68.54,0,,,-0.02,12,0,17072
13,13,85.0,100.0,,141.0,95.0,69.0,14.0,,,...,,303.0,68.54,0,,,-0.02,14,0,17072
16,16,89.0,100.0,37.5,112.0,82.5,63.0,14.0,,,...,,,68.54,0,,,-0.02,17,0,17072


In [31]:
# Distribution of the target label after the row filter.
cleaned_df['SepsisLabel'].value_counts()

SepsisLabel
0    493307
1      9432
Name: count, dtype: int64

In [32]:
# Class balancing: undersample label 0 down to the number of label-1 rows.
from sklearn.utils import resample

# NOTE(review): balancing is applied to the whole frame, before the
# train/test split performed later in the notebook, so the held-out set is
# also balanced 50/50 -- confirm this is intended for evaluation.
minor_classe = cleaned_df.loc[cleaned_df['SepsisLabel'].eq(1)]
major_classe = cleaned_df.loc[cleaned_df['SepsisLabel'].eq(0)]

# Draw, without replacement, as many label-0 rows as there are label-1 rows.
major_classe_resample = resample(
    major_classe,
    n_samples=len(minor_classe),
    replace=False,
    random_state=123,
)

# Stack the sampled label-0 rows on top of all label-1 rows and show the counts.
df_final = pd.concat([major_classe_resample, minor_classe])
print(df_final.SepsisLabel.value_counts())




SepsisLabel
0    9432
1    9432
Name: count, dtype: int64


In [33]:
# Number of non-null Patient_ID values in the balanced frame.
df_final['Patient_ID'].count()

18864

In [34]:
# Columns retained for modelling; the target 'SepsisLabel' is the last entry.
sub_columns = [
    'Hour', 'HR', 'O2Sat', 'Temp',
    'MAP', 'Resp', 'BUN', 'Chloride',
    'Creatinine', 'Glucose', 'Hct', 'Hgb',
    'WBC', 'Platelets', 'Age', 'HospAdmTime',
    'ICULOS', 'SepsisLabel',
]

# Restrict the balanced frame to those columns, in the listed order.
df_final = df_final.loc[:, sub_columns]

In [35]:
# Separate the target column from the predictor columns.
y = df_final['SepsisLabel']
X = df_final.drop(columns=['SepsisLabel'])

# Hold out 20% of the rows for testing, stratified on the target, with a fixed seed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2023,
    stratify=y,
)

# Report the shapes of the four resulting pieces on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(15091, 17) (3773, 17) (15091,) (3773,)


In [36]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
import numpy as np

# Objective function for Optuna
def objective(trial):
    """Score one XGBoost hyper-parameter configuration for an Optuna trial.

    Six hyper-parameters are sampled from ``trial``; a classifier built with
    them is evaluated by 7-fold cross-validation on the notebook-level
    ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean of the 7 cross-validation scores (``cross_val_score`` is called
        with its default scoring).
    """
    # suggest_float replaces the deprecated suggest_uniform; with only
    # (low, high) given it samples from the same linear range.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 1)
    max_depth = trial.suggest_int('max_depth', 2, 15)
    n_estimators = trial.suggest_int('n_estimators', 50, 250)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    gamma = trial.suggest_float('gamma', 0.01, 5)
    subsample = trial.suggest_float('subsample', 0.01, 1)

    clf1 = XGBClassifier(learning_rate=learning_rate,
                         max_depth=max_depth,
                         n_estimators=n_estimators,
                         min_child_weight=min_child_weight,
                         gamma=gamma,
                         subsample=subsample,
                         use_label_encoder=False,
                         eval_metric='logloss')

    # One score per fold; the study receives their average.
    score = cross_val_score(clf1, X_train, y_train, cv=7)
    return np.mean(score)


In [37]:
# Build the Optuna study: maximise the objective using a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=tpe_sampler, direction='maximize')

# Run 30 trials of the objective.
study.optimize(objective, n_trials=30)

# Show the best hyper-parameters found by Optuna.
print(study.best_params)

[I 2024-06-14 14:47:50,543] A new study created in memory with name: no-name-6c383ca5-3d4b-4cd9-a915-06f33dd24f56
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-14 14:47:53,897] Trial 0 finished with value: 0.6910086504243426 and parameters: {'learning_rate': 0.3807947176588889, 'max_depth': 15, 'n_estimators': 197, 'min_child_weight': 6, 'gamma': 0.7885330158077583, 'subsample': 0.16443457513284063}. Best is trial 0 with value: 0.6910086504243426.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-14 14:47:59,941] Trial 1 finished with value: 0.7679417745945517 and parameters: {'learning_rate': 0.06750277604651747, 'max_depth': 14, 'n_estimators': 170, 'min_child_weight': 8, 'gamma': 0.11271662653605422, 'subsample': 

[I 2024-06-14 14:48:09,183] Trial 6 finished with value: 0.7407726962180298 and parameters: {'learning_rate': 0.31156763148163696, 'max_depth': 3, 'n_estimators': 187, 'min_child_weight': 5, 'gamma': 0.6189707918754463, 'subsample': 0.5002251410101575}. Best is trial 1 with value: 0.7679417745945517.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-14 14:48:14,884] Trial 7 finished with value: 0.7560805097643964 and parameters: {'learning_rate': 0.044044635904066216, 'max_depth': 14, 'n_estimators': 102, 'min_child_weight': 7, 'gamma': 1.5654382696861608, 'subsample': 0.5248673409660327}. Best is trial 1 with value: 0.7679417745945517.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-14 14:48:17,610] Trial 8 finished w

[I 2024-06-14 14:48:46,438] Trial 13 finished with value: 0.7654903905697031 and parameters: {'learning_rate': 0.2236776803057568, 'max_depth': 8, 'n_estimators': 159, 'min_child_weight': 4, 'gamma': 0.10247270807374692, 'subsample': 0.9580994031539216}. Best is trial 1 with value: 0.7679417745945517.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-14 14:48:49,054] Trial 14 finished with value: 0.752502239935357 and parameters: {'learning_rate': 0.25494993582657577, 'max_depth': 7, 'n_estimators': 158, 'min_child_weight': 4, 'gamma': 0.8643074534032664, 'subsample': 0.9954276348729897}. Best is trial 1 with value: 0.7679417745945517.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-14 14:48:52,409] Trial 15 finished w

[I 2024-06-14 14:49:13,146] Trial 20 finished with value: 0.7257312458837151 and parameters: {'learning_rate': 0.9646105346403706, 'max_depth': 7, 'n_estimators': 122, 'min_child_weight': 7, 'gamma': 1.1889954415682085, 'subsample': 0.781648809357816}. Best is trial 1 with value: 0.7679417745945517.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-14 14:49:16,973] Trial 21 finished with value: 0.7623093133960126 and parameters: {'learning_rate': 0.17051543896485719, 'max_depth': 9, 'n_estimators': 115, 'min_child_weight': 8, 'gamma': 0.19116101802827012, 'subsample': 0.9987729260949273}. Best is trial 1 with value: 0.7679417745945517.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-14 14:49:22,277] Trial 22 finished w

[I 2024-06-14 14:49:50,185] Trial 27 finished with value: 0.7588634718659815 and parameters: {'learning_rate': 0.11556875111598885, 'max_depth': 8, 'n_estimators': 169, 'min_child_weight': 3, 'gamma': 1.231960789556175, 'subsample': 0.5877969342907241}. Best is trial 1 with value: 0.7679417745945517.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-14 14:49:58,066] Trial 28 finished with value: 0.7655561588844414 and parameters: {'learning_rate': 0.0804164499191974, 'max_depth': 10, 'n_estimators': 149, 'min_child_weight': 5, 'gamma': 0.43166872346973245, 'subsample': 0.8525030848685566}. Best is trial 1 with value: 0.7679417745945517.
  learning_rate = trial.suggest_uniform('learning_rate', 0.01, 1)
  gamma = trial.suggest_uniform('gamma', 0.01, 5)
  subsample = trial.suggest_uniform('subsample', 0.01, 1)
[I 2024-06-14 14:50:05,751] Trial 29 finished 

{'learning_rate': 0.06750277604651747, 'max_depth': 14, 'n_estimators': 170, 'min_child_weight': 8, 'gamma': 0.11271662653605422, 'subsample': 0.9702107536403743}


In [38]:
# Build the XGBoost model from the best hyper-parameters of the study.
# The study's best_params holds exactly the six values sampled in `objective`
# (learning_rate, max_depth, n_estimators, min_child_weight, gamma, subsample),
# so they are unpacked directly as keyword arguments.
best_params = study.best_params
xgbc = XGBClassifier(
    **best_params,
    use_label_encoder=False,
    eval_metric='logloss',
)

# Fit the model on the training split.
xgbc.fit(X_train, y_train)


In [46]:
# Run label: all variables
# NOTE(review): this cell's execution count (46) is higher than that of the
# cells around it, so its output presumably comes from a different run than
# the model fitted just above -- verify before relying on these numbers.
# Prediction and evaluation: predict labels for the held-out test set.
y_predicted = xgbc.predict(X_test)

# Print the evaluation report (precision / recall / f1 per class).
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.83      0.80      0.81      1886
           1       0.81      0.83      0.82      1887

    accuracy                           0.82      3773
   macro avg       0.82      0.82      0.82      3773
weighted avg       0.82      0.82      0.82      3773



In [39]:
# Run label: subset of variables
# NOTE(review): the code is identical to the previous evaluation cell; only
# the label and the recorded output differ -- consider a single helper
# function called once per feature set.
# Prediction and evaluation: predict labels for the held-out test set.
y_predicted = xgbc.predict(X_test)

# Print the evaluation report (precision / recall / f1 per class).
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      1886
           1       0.78      0.78      0.78      1887

    accuracy                           0.78      3773
   macro avg       0.78      0.78      0.78      3773
weighted avg       0.78      0.78      0.78      3773

