# Sepsis Pipeline

Этот документ содержит конвейер обучения и использования модели для определения септических больных на подмножестве датасета MIMIC 3.

Шаги
- Берем датасет
- Подготавливаем (заполняем пропуски, балансируем классы, считаем MEWS-столбец, sepsislabel и т. п.)
- Выбираем признаки (список названий колонок)
- Опционально PCA
- Обучаем модель
- Валидируем, получаем оценки
- Инференс для заданных значений (условно “пришел новый больной”)

## Загрузка и подготовка данных

In [None]:
# Mount Google Drive into the Colab runtime; later cells read and write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Load the raw dataset from the mounted Drive (the path is specific to the author's Drive layout).
dataset_path = '/content/drive/MyDrive/Проекты/Без названия/Dataset 2.csv'
df = pd.read_csv(dataset_path)

# Preview the first rows.
df.head()

Unnamed: 0.1,Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_ID
0,0,0,,,,,,,,,...,,,68.54,0,,,-0.02,1,0,17072
1,1,1,65.0,100.0,,,72.0,,16.5,,...,,,68.54,0,,,-0.02,2,0,17072
2,2,2,78.0,100.0,,,42.5,,,,...,,,68.54,0,,,-0.02,3,0,17072
3,3,3,73.0,100.0,,,,,17.0,,...,,,68.54,0,,,-0.02,4,0,17072
4,4,4,70.0,100.0,,129.0,74.0,69.0,14.0,,...,,330.0,68.54,0,,,-0.02,5,0,17072


In [None]:
# Fill in the labels: every row of a patient receives that patient's maximum
# SepsisLabel (for 0/1 labels: 1 on all rows if any row is 1), stored as int.
df['SepsisLabel'] = (
    df.groupby('Patient_ID')['SepsisLabel']
    .transform('max')
    .astype(int)
)

In [None]:
# Fill the gaps in the data with scikit-learn's IterativeImputer.
# The experimental import must come first: it is what makes IterativeImputer importable.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Take an explicit copy of the selected columns. Without .copy() pandas treats
# df_ex as a possible slice of df, and the assignment at the end of this cell
# raises SettingWithCopyWarning ("A value is trying to be set on a copy of a
# slice from a DataFrame"). With the copy, df_ex is an independent frame and
# df is guaranteed to stay untouched.
df_ex = df[['Hour', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP',
       'Resp', 'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2',
       'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine',
       'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate',
       'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel', 'Patient_ID']].copy()

# Columns that get imputed. Hour, Age, Gender, ICULOS, SepsisLabel and
# Patient_ID are deliberately absent, so they are passed through unchanged.
# NOTE(review): Unit1/Unit2 are imputed as continuous values here although
# they look like 0/1 flags in the data - confirm that this is intended.
columns_for_imputation = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP',
       'Resp', 'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2',
       'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine',
       'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate',
       'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Unit1', 'Unit2',
       'HospAdmTime']

# Fixed random_state keeps the imputation reproducible between runs.
imputer = IterativeImputer(max_iter=10, random_state=0)
imputed_data = imputer.fit_transform(df_ex[columns_for_imputation])
# fit_transform returns a plain array in the same column order, so it can be
# written straight back over the selected columns.
df_ex[columns_for_imputation] = imputed_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ex[columns_for_imputation] = imputed_data


In [None]:
# Display the frame after imputation for a visual check.
df_ex

Unnamed: 0,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,BaseExcess,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,Patient_ID
0,0,84.279959,97.178126,36.931550,123.498621,82.431335,63.011864,18.682164,32.970181,-0.541309,...,336.239531,191.662298,68.54,0,0.496571,0.503429,-0.02,1,0,17072
1,1,65.000000,100.000000,36.693758,110.646229,72.000000,54.687923,16.500000,32.946200,-0.638068,...,310.425018,189.063069,68.54,0,0.496571,0.503429,-0.02,2,0,17072
2,2,78.000000,100.000000,36.900298,67.977105,42.500000,36.104727,17.188583,32.930573,-0.845859,...,319.009770,191.248619,68.54,0,0.496571,0.503429,-0.02,3,0,17072
3,3,73.000000,100.000000,36.775440,124.353017,82.309014,62.080678,17.000000,32.949803,-0.591128,...,319.239940,190.925780,68.54,0,0.496571,0.503429,-0.02,4,0,17072
4,4,70.000000,100.000000,36.841445,129.000000,74.000000,69.000000,14.000000,34.909359,-0.183245,...,415.675192,330.000000,68.54,0,0.496572,0.503428,-0.02,5,0,17072
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1552205,21,83.000000,99.000000,36.850558,121.000000,77.000000,54.000000,22.000000,32.968722,-0.617389,...,355.833964,204.875423,88.00,0,1.000000,0.000000,-2.93,22,0,113911
1552206,22,80.000000,92.000000,36.791271,102.000000,73.000000,51.000000,24.000000,33.008331,-0.468429,...,368.655043,200.843008,88.00,0,1.000000,0.000000,-2.93,23,0,113911
1552207,23,95.000000,97.000000,36.700000,128.500000,83.000000,58.500000,25.000000,32.982829,-0.639308,...,358.325051,204.515428,88.00,0,1.000000,0.000000,-2.93,24,0,113911
1552208,24,104.000000,99.000000,37.097703,127.000000,85.000000,59.000000,24.000000,32.972840,-0.622101,...,380.227735,209.704234,88.00,0,1.000000,0.000000,-2.93,25,0,113911


In [None]:
# Save the imputed table to Drive; index=False keeps the row index out of the CSV.
end_dataset_path = '/content/drive/MyDrive/Проекты/Без названия/End_Dataset.csv'
df_ex.to_csv(end_dataset_path, index=False)

In [None]:
# Keep only the rows whose Hour value is below 10.
df_first_10_hours = df_ex.loc[df_ex['Hour'].lt(10)]

In [None]:
# Reshape to long format: Hour and Patient_ID stay as identifier columns, and
# every listed measurement column becomes rows of (variable, value).
df_melted = df_first_10_hours.melt(
    id_vars=['Hour', 'Patient_ID'],
    value_vars=[
        'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP',
        'Resp', 'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2',
        'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine',
        'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate',
        'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
        'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
        'HospAdmTime', 'ICULOS', 'SepsisLabel',
    ],
)


In [None]:
# Encode the hour into the variable name: 'patient_h<Hour>_<variable>'.
df_melted['variable'] = (
    ('patient_h' + df_melted['Hour'].astype(str))
    .str.cat(df_melted['variable'], sep='_')
)

In [None]:
# Back to wide format: one row per Patient_ID, one column per hour-tagged
# variable; if a (patient, variable) pair occurs more than once the first value is kept.
df_pivot = pd.pivot_table(
    df_melted,
    values='value',
    index='Patient_ID',
    columns='variable',
    aggfunc='first',
)

In [None]:
# Replace every literal dot in the column names with an underscore
# (regex=False so that '.' is not interpreted as a regex wildcard).
df_pivot.columns = df_pivot.columns.str.replace('.', '_', regex=False)

In [None]:
# Display the wide per-patient table built from the rows with Hour < 10.
df_pivot

variable,patient_h0_AST,patient_h0_Age,patient_h0_Alkalinephos,patient_h0_BUN,patient_h0_BaseExcess,patient_h0_Bilirubin_direct,patient_h0_Bilirubin_total,patient_h0_Calcium,patient_h0_Chloride,patient_h0_Creatinine,...,patient_h9_Resp,patient_h9_SBP,patient_h9_SaO2,patient_h9_SepsisLabel,patient_h9_Temp,patient_h9_TroponinI,patient_h9_Unit1,patient_h9_Unit2,patient_h9_WBC,patient_h9_pH
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,275.064176,83.14,93.204673,23.473382,-0.541307,1.233080,2.076439,7.598193,105.744994,1.509974,...,30.000000,134.0,92.619893,0.0,37.316083,8.292412,0.496571,0.503429,13.311836,7.380835
2,304.821110,75.91,85.079708,21.045160,-0.544803,1.450268,2.020577,6.979024,106.033111,1.341316,...,11.000000,143.0,92.616538,0.0,36.110000,8.364433,0.000000,1.000000,11.000000,7.362068
3,-47.534393,45.82,143.192408,37.274798,-0.299041,5.275016,2.575849,7.863526,103.855174,1.580345,...,33.000000,140.0,92.630656,0.0,37.060931,8.286299,1.000000,0.000000,12.874655,7.382771
4,325.096519,65.71,82.184994,20.258936,-0.561354,1.154758,1.989879,7.004150,106.143888,1.348292,...,18.000000,111.5,92.642657,0.0,37.309918,8.293950,0.000000,1.000000,14.337517,7.380287
5,253.259637,28.09,104.336536,23.970306,-0.364772,1.081065,2.126438,8.452754,105.239612,1.748610,...,15.000000,124.0,92.639076,0.0,36.509935,8.290560,1.000000,0.000000,9.367366,7.377395
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119996,273.560956,84.00,93.419193,23.531675,-0.540080,1.254988,2.078715,7.596330,105.736781,1.509457,...,18.083711,153.0,92.638267,0.0,36.809120,8.300704,0.496571,0.503429,10.710417,7.384020
119997,275.066433,30.00,93.204351,23.473295,-0.541309,1.233047,2.076436,7.598196,105.745006,1.509975,...,22.000000,105.0,92.622676,0.0,37.300000,8.279667,0.496571,0.503429,11.431132,7.382178
119998,210.240695,60.00,106.389929,27.278994,-0.509473,1.517995,2.185553,8.182966,105.263553,1.669038,...,21.000000,141.0,92.620078,0.0,36.611119,8.282564,1.000000,0.000000,9.928726,7.379377
119999,219.923597,84.00,105.007504,26.903518,-0.517377,1.376868,2.170892,8.194966,105.316456,1.672369,...,22.000000,143.0,92.618786,0.0,36.924524,8.281033,1.000000,0.000000,11.507965,7.377341


In [None]:
# Collapse df_ex to one row per patient using GroupBy.last (the last entry of
# each column within the patient's rows); Patient_ID is kept as a regular column.
# NOTE(review): "last" follows the row order of df_ex - this presumes each
# patient's rows are stored chronologically; confirm.
new_df = df_ex.groupby('Patient_ID', as_index=False).last()

In [None]:
# Display the one-row-per-patient table.
new_df

Unnamed: 0,Patient_ID,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,WBC,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel
0,1,53,84.000000,85.000000,37.050737,78.000000,44.000000,36.340019,18.000000,33.036951,...,13.103808,347.120154,180.520310,83.14,0,0.496571,0.503429,-0.03,54,0
1,2,22,55.000000,95.000000,36.703151,81.617284,51.000000,39.294563,11.000000,32.950262,...,11.814043,286.422685,168.975375,75.91,0,0.000000,1.000000,-98.60,23,0
2,3,47,78.000000,97.000000,36.763996,138.000000,83.000000,53.000000,26.000000,32.962212,...,11.738358,408.944834,204.354997,45.82,0,1.000000,0.000000,-1195.71,48,0
3,4,28,84.592037,97.273965,37.098524,123.910669,82.617903,62.188710,18.255916,32.954813,...,13.031676,325.561527,180.576408,65.71,0,0.000000,1.000000,-8.77,29,0
4,5,47,84.009899,97.082939,36.761865,123.069938,82.241554,63.841993,19.122869,32.985574,...,10.615625,347.426560,202.917875,28.09,1,1.000000,0.000000,-0.05,49,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40331,119996,47,81.000000,97.000000,36.927964,119.000000,74.000000,61.000000,19.000000,32.970968,...,11.699846,330.075270,191.126412,84.00,0,0.496571,0.503429,-6.69,48,0
40332,119997,24,64.000000,99.000000,36.352343,111.000000,97.000000,68.000000,16.697083,32.944601,...,11.051430,346.233398,186.142617,30.00,1,0.496568,0.503433,-0.02,25,0
40333,119998,48,57.000000,98.000000,36.405701,141.000000,101.000000,72.000000,22.000000,32.978679,...,8.588067,338.624127,201.514991,60.00,0,1.000000,0.000000,-53.64,49,0
40334,119999,19,89.000000,96.000000,37.900000,155.000000,109.000000,78.500000,19.000000,33.046138,...,10.197999,421.404409,210.671765,84.00,0,1.000000,0.000000,-10.74,20,0


## MEWS

In [None]:
def assign_systolic_bp_category(value):
    """Return the MEWS sub-score for a systolic blood pressure value.

    Bands (value -> score): below 71 -> 3; 71 to 80 -> 2; above 80 up to
    100 -> 1; above 100 up to 199 -> 0; above 199 -> 2. A value for which
    no comparison holds (e.g. NaN) yields None.
    """
    if value < 71:
        return 3
    # Upper bounds are inclusive and checked in ascending order, so the first
    # bound that is not exceeded identifies the band.
    for upper_bound, score in ((80, 2), (100, 1), (199, 0)):
        if value <= upper_bound:
            return score
    return 2 if value > 199 else None

# Score every patient's SBP value and store the result in its own column.
new_df['Systolic BP (mmHg)'] = new_df['SBP'].apply(assign_systolic_bp_category)

def assign_heart_rate_category(value):
    """Return the MEWS sub-score for a heart rate value.

    Bands (value -> score): below 41 -> 2; 41 to 50 -> 1; above 50 up to
    100 -> 0; above 100 up to 110 -> 1; above 110 up to 129 -> 2;
    above 129 -> 3. A value for which no comparison holds (e.g. NaN)
    yields None.
    """
    if value < 41:
        return 2
    # Inclusive upper bounds in ascending order: the first one that is not
    # exceeded identifies the band.
    for upper_bound, score in ((50, 1), (100, 0), (110, 1), (129, 2)):
        if value <= upper_bound:
            return score
    return 3 if value > 129 else None

# Score every patient's HR value and store the result in its own column.
new_df['Heart rate (beats per minute)'] = new_df['HR'].apply(assign_heart_rate_category)

def assign_respiratory_rate_category(value):
    """Return the MEWS sub-score for a respiratory rate value.

    Bands (value -> score): below 9 -> 2; 9 to 14 -> 0; above 14 up to
    20 -> 1; above 20 up to 29 -> 2; above 29 -> 3. A value for which no
    comparison holds (e.g. NaN) yields None.
    """
    if value < 9:
        return 2
    # Inclusive upper bounds in ascending order: the first one that is not
    # exceeded identifies the band.
    for upper_bound, score in ((14, 0), (20, 1), (29, 2)):
        if value <= upper_bound:
            return score
    return 3 if value > 29 else None

# Score every patient's Resp value and store the result in its own column.
new_df['Respiratory rate (breaths per minute)'] = new_df['Resp'].apply(assign_respiratory_rate_category)

def assign_temperature_category(value):
    """Return the MEWS sub-score for a body temperature value.

    Values from 35 to 38.4 inclusive score 0; anything below 35 or above
    38.4 scores 2. A value for which no comparison holds (e.g. NaN)
    yields None.
    """
    # Both out-of-range sides carry the same score, so test them together.
    if value < 35 or value > 38.4:
        return 2
    if value <= 38.4:
        return 0
    return None

# Score every patient's Temp value and store the result in its own column.
new_df['Temperature in °C'] = new_df['Temp'].apply(assign_temperature_category)

# Total score per patient: row-wise sum of the four component score columns.
mews_component_columns = [
    'Systolic BP (mmHg)',
    'Heart rate (beats per minute)',
    'Respiratory rate (breaths per minute)',
    'Temperature in °C',
]
new_df['Sum'] = new_df[mews_component_columns].sum(axis=1)

In [None]:
# Binary flag: 1 when the total score is 5 or more, else 0.
# Note that the threshold is inclusive even though the column name says "greater than".
new_df['count_greater_than_5'] = new_df['Sum'].ge(5).astype(int)

In [None]:
# The component scores and their sum were only needed to derive the flag;
# remove them from the table.
columns_to_drop = [
    'Systolic BP (mmHg)',
    'Heart rate (beats per minute)',
    'Respiratory rate (breaths per minute)',
    'Temperature in °C',
    'Sum',
]
new_df = new_df.drop(columns_to_drop, axis=1)

In [None]:
# Give the binary flag its final name, 'MEWS'.
new_df = new_df.rename({'count_greater_than_5': 'MEWS'}, axis='columns')

## Выбор признаков, по которым будет строиться модель

Здесь можно добавить объяснение, что из них что означает.

In [None]:
# Model inputs: the vital-sign and laboratory columns plus the derived MEWS
# flag. Hour, Age, Gender, Unit1, Unit2, HospAdmTime, ICULOS and Patient_ID
# are not part of the feature set.
features = [
    'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
    'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2',
    'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine',
    'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate',
    'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
    'Fibrinogen', 'Platelets', 'MEWS',
]
target_variable = 'SepsisLabel'

# Feature matrix and target vector, one row per patient.
X = new_df.loc[:, features]
y = new_df.loc[:, target_variable]

## Обучение модели:

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# Hold out half of the rows for validation. The target is strongly imbalanced,
# so stratify=y keeps the share of positive labels equal in both halves
# instead of leaving it to chance; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42, stratify=y
)

# Per-class weights: errors on the positive class (1) are weighted 56 vs 4
# for the negative class (0) to compensate for the imbalance.
class_weights = {0: 4, 1: 56}

model = CatBoostClassifier(
    learning_rate=0.25,
    depth=5,
    l2_leaf_reg=3,
    iterations=45,
    border_count=21,
    custom_metric=['Logloss', 'AUC'],
    # early_stopping_rounds=50 was dropped: fit() below receives no eval_set,
    # and a patience of 50 rounds can never trigger within 45 iterations, so
    # the setting had no effect and only suggested that early stopping was on.
    class_weights=class_weights,
    boosting_type='Plain',
    # bootstrap_type='Bayesian',
    verbose=100
)

model.fit(X_train, y_train)

0:	learn: 0.6652322	total: 65.4ms	remaining: 2.88s
44:	learn: 0.5142869	total: 460ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7bfb630d6650>

## Валидация
Самые важные показатели для нас — это confusion matrix, recall, F2- или F3-score

In [None]:
from sklearn.metrics import (
    confusion_matrix, accuracy_score, precision_score, recall_score,
    f1_score, fbeta_score, roc_auc_score, classification_report,
)

# Hard class labels, used by every threshold-based metric below.
y_pred = model.predict(X_test)
# Predicted probability of the positive class. ROC AUC is a ranking metric and
# must be fed scores: computing it from hard 0/1 labels collapses the curve to
# a single operating point and misreports the model's discrimination.
# With labels {0, 1}, column 1 of predict_proba is the probability of class 1.
y_proba = model.predict_proba(X_test)[:, 1]

conf_matrix_val = confusion_matrix(y_test, y_pred)
accuracy_val = accuracy_score(y_test, y_pred)
precision_val = precision_score(y_test, y_pred)
recall_val = recall_score(y_test, y_pred)
f1_val = f1_score(y_test, y_pred)
# F2 weights recall higher than precision; the notebook text names F2/F3 as a
# key metric, so it is reported alongside F1.
f2_val = fbeta_score(y_test, y_pred, beta=2)
roc_auc_val = roc_auc_score(y_test, y_proba)

print("Accuracy:", accuracy_val)
print("Classification Report:\n", classification_report(y_test, y_pred))

print("Confusion Matrix (Validation):")
print(conf_matrix_val)
print(f"Validation Accuracy: {accuracy_val}")
print(f"Validation Precision: {precision_val}")
print(f"Validation Recall: {recall_val}")
print(f"Validation F1 Score: {f1_val}")
print(f"Validation F2 Score: {f2_val}")
print(f"Validation ROC AUC: {roc_auc_val}")

Accuracy: 0.7315053550178501
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.74      0.84     18696
           1       0.17      0.68      0.27      1472

    accuracy                           0.73     20168
   macro avg       0.57      0.71      0.55     20168
weighted avg       0.91      0.73      0.79     20168

Confusion Matrix (Validation):
[[13755  4941]
 [  474   998]]
Validation Accuracy: 0.7315053550178501
Validation Precision: 0.16804175787169556
Validation Recall: 0.6779891304347826
Validation F1 Score: 0.26932937525300227
Validation ROC AUC: 0.7068540003906905


## Вывод
Обработка нового экземпляра

In [None]:
# Display the per-patient table, which now includes the MEWS column.
new_df

Unnamed: 0,Patient_ID,Hour,HR,O2Sat,Temp,SBP,MAP,DBP,Resp,EtCO2,...,Fibrinogen,Platelets,Age,Gender,Unit1,Unit2,HospAdmTime,ICULOS,SepsisLabel,MEWS
0,1,53,84.000000,85.000000,37.050737,78.000000,44.000000,36.340019,18.000000,33.036951,...,347.120154,180.520310,83.14,0,0.496571,0.503429,-0.03,54,0,0
1,2,22,55.000000,95.000000,36.703151,81.617284,51.000000,39.294563,11.000000,32.950262,...,286.422685,168.975375,75.91,0,0.000000,1.000000,-98.60,23,0,0
2,3,47,78.000000,97.000000,36.763996,138.000000,83.000000,53.000000,26.000000,32.962212,...,408.944834,204.354997,45.82,0,1.000000,0.000000,-1195.71,48,0,0
3,4,28,84.592037,97.273965,37.098524,123.910669,82.617903,62.188710,18.255916,32.954813,...,325.561527,180.576408,65.71,0,0.000000,1.000000,-8.77,29,0,0
4,5,47,84.009899,97.082939,36.761865,123.069938,82.241554,63.841993,19.122869,32.985574,...,347.426560,202.917875,28.09,1,1.000000,0.000000,-0.05,49,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40331,119996,47,81.000000,97.000000,36.927964,119.000000,74.000000,61.000000,19.000000,32.970968,...,330.075270,191.126412,84.00,0,0.496571,0.503429,-6.69,48,0,0
40332,119997,24,64.000000,99.000000,36.352343,111.000000,97.000000,68.000000,16.697083,32.944601,...,346.233398,186.142617,30.00,1,0.496568,0.503433,-0.02,25,0,0
40333,119998,48,57.000000,98.000000,36.405701,141.000000,101.000000,72.000000,22.000000,32.978679,...,338.624127,201.514991,60.00,0,1.000000,0.000000,-53.64,49,0,0
40334,119999,19,89.000000,96.000000,37.900000,155.000000,109.000000,78.500000,19.000000,33.046138,...,421.404409,210.671765,84.00,0,1.000000,0.000000,-10.74,20,0,0


In [None]:
# New sample: a single hand-written patient record with one value per model
# feature, listed in the same order as the training feature list.
# NOTE(review): several values (e.g. pH 35, SaO2 4) are far from the ranges
# visible in the training table and look like placeholders - confirm.
next_patient = {
    'HR': 84,
    'O2Sat': 99,
    'Temp': 37,
    'SBP': 138,
    'MAP': 50,
    'DBP': 55,
    'Resp': 19,
    'EtCO2': 32,
    'BaseExcess': 23,
    'HCO3': 67,
    'FiO2': 33,
    'pH': 35,
    'PaCO2': 12,
    'SaO2': 4,
    'AST': 45,
    'BUN': 12,
    'Alkalinephos': 13,
    'Calcium': 5,
    'Chloride': 10,
    'Creatinine': 34,
    'Bilirubin_direct': 12,
    'Glucose': 56,
    'Lactate': 7,
    'Magnesium': 23,
    'Phosphate': 24,
    'Potassium': 25,
    'Bilirubin_total': 24,
    'TroponinI': 12,
    'Hct': 56,
    'Hgb': 34,
    'PTT': 9,
    'WBC': 34,
    'Fibrinogen': 17,
    'Platelets': 21,
    'MEWS': 1,
}
# One dict -> a one-row DataFrame whose columns follow the dict's key order.
X_next = pd.DataFrame([next_patient])

# Predict the class for the new record. This rebinds y_pred, which previously
# held the validation predictions.
y_pred = model.predict(X_next)

In [None]:
# Turn the predicted class of the single sample into a readable verdict:
# class 0 maps to the "healthy" message, anything else to the "sick" message.
if y_pred[0] == 0:
    diagnosis_message = "Здоров"
else:
    diagnosis_message = "Болен"
print(f"Диагноз: {diagnosis_message}")

Диагноз: Здоров
