<h1> Prepare Data </h1>
To prepare the data for our models we need to:
<ol>
<li> cut unnecessary rows from the csv's (the rows that will not be used as input for the model, as explained in the instructions) </li>
<li> create train and validation datasets </li>
</ol>
<h2> Adding Columns </h2>
For each patients (and row) we will add 3 columns to the data frame:
<ol>
<li> max_ICULOS - the total time a patient was in the ICU </li>
<li> time_bm - the difference between the current time and the total time a patient was in the ICU. defined as $time_bm = ICULOS-max ICULOS$
</li>
<li> Label column -  1 if the patient had sepsis after some time in the ICU and 0 otherwise
</li>
</ol>


In [4]:
import pandas as pd
import os
import tqdm
from random import sample

In [28]:
def create_patients_df(patients, data_path):
    tmp_df = pd.read_csv(os.path.join(data_path,patients[0]),delimiter ='|')
    tmp_df['ID'] = patients[0].split('_')[-1].split('.')[0]
    new_df = tmp_df[tmp_df['SepsisLabel']==0]
    if max(tmp_df['SepsisLabel'])==1:
        new_df=new_df.append(tmp_df[tmp_df['SepsisLabel']==1][:1])
        new_df['Label'] = [1]*new_df.shape[0]
    else:
        new_df['Label'] = [0]*new_df.shape[0]
    new_df['max_ICULOS'] = [new_df['ICULOS'].values[-1]]*new_df.shape[0]
    new_df['time_bm'] =  new_df['ICULOS']-new_df['max_ICULOS']
    for patient in tqdm.tqdm(patients[1:]):
        patient_path = os.path.join(data_path,patient)
        patient_number = patient.split('_')[-1].split('.')[0]
        tmp_df = pd.read_csv(os.path.join(data_path,patient_path),delimiter ='|')
        tmp_df['ID'] = patient_number
        tmp_new_df = tmp_df[tmp_df['SepsisLabel']==0]
        if max(tmp_df['SepsisLabel'])==1:
            tmp_new_df=tmp_new_df.append(tmp_df[tmp_df['SepsisLabel']==1][:1])
            tmp_new_df['Label'] = [1]*tmp_new_df.shape[0]
        else:
            tmp_new_df['Label'] = [0]*tmp_new_df.shape[0]
        tmp_new_df['max_ICULOS'] = [tmp_new_df['ICULOS'].values[-1]]*tmp_new_df.shape[0]
        tmp_new_df['time_bm'] =  tmp_new_df['ICULOS']-tmp_new_df['max_ICULOS']
        new_df = new_df.append(tmp_new_df)
    return new_df

In [30]:
for d_type in ['train', 'test']:
    data_path = f'/home/student/Early_Prediction_of_Sepsis/data/{d_type}/'
    patients = os.listdir(f'data/{d_type}')
    if d_type=='train':
        train_patients = sample(patients,int(len(patients)*0.8))
        val_patients = [x for x in patients if x not in train_patients]
        train_df = create_patients_df(train_patients,data_path)
        train_df.to_csv('filtered_train_df_0705.csv',index=False)
        val_df = create_patients_df(val_patients,data_path)
        val_df.to_csv('filtered_val_df_0705.csv',index=False)
    else:
        test_df = create_patients_df(patients,data_path)
        test_df.to_csv('filtered_test_df_0705.csv',index=False)

100%|██████████| 15999/15999 [43:41<00:00,  6.10it/s]
100%|██████████| 3999/3999 [02:23<00:00, 27.90it/s]
100%|██████████| 9999/9999 [17:54<00:00,  9.31it/s]


In [2]:
train_df = pd.read_csv('filtered_train_df_0705.csv')

In [24]:
train_df.columns

Index(['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2',
       'HospAdmTime', 'ICULOS', 'SepsisLabel', 'ID', 'Label', 'max_ICULOS',
       'time_bm'],
      dtype='object')

In [28]:
frequency_used_attributes = ['BaseExcess',  'FiO2', 'pH', 'PaCO2', 'Glucose','Lactate', 'PTT']
values_used_attributes = [ 'Hct',  'Glucose','Potassium']
constant_attributes = ['ID','max_ICULOS','Gender']
other_attributes = ['time_bm','HR','MAP','O2Sat', 'Resp','SBP','ICULOS']
label_attributes= ['Label','SepsisLabel']

In [4]:
def add_rolling_window(df, attr, window_size):
    df = df.sort_values(by=['ID','ICULOS'], ascending =[True,True])
    rolling = df[['ID']+attr].groupby('ID').rolling(window=window_size, closed='both').count()
    rolling= rolling.rename(columns={at: f'{window_size}w_sum_{at}' for at in attr})
    rolling=rolling[list(rolling.columns)[1:]].reset_index().set_index('level_1')
    combined = df.join(rolling,how='left', rsuffix= 'r')
    return combined, rolling

In [6]:
train_df_with_roll, train_roll = add_rolling_window(train_df,frequency_used_attributes,5)



In [33]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(max_iter=50, random_state=0)
cols =values_used_attributes+constant_attributes+other_attributes+[ f'{5}w_sum_{at}' for at in frequency_used_attributes]
imp.fit(train_df_with_roll[cols])

IterativeImputer(max_iter=50, random_state=0)

In [34]:
train_df_with_roll_imputed = pd.DataFrame(imp.transform(train_df_with_roll[cols]), columns = cols)

In [35]:
train_df_with_roll_imputed

Unnamed: 0,Hct,Glucose,Potassium,ID,max_ICULOS,Gender,time_bm,HR,MAP,O2Sat,Resp,SBP,ICULOS,5w_sum_BaseExcess,5w_sum_FiO2,5w_sum_pH,5w_sum_PaCO2,5w_sum_Glucose,5w_sum_Lactate,5w_sum_PTT
0,31.879524,137.262885,3.999588,0.0,23.0,0.0,-22.0,82.693367,86.854252,97.213871,18.327825,125.630528,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,28.877697,131.196166,4.023412,0.0,23.0,0.0,-21.0,61.000000,65.000000,99.000000,17.500000,124.000000,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,29.299494,136.441808,4.156775,0.0,23.0,0.0,-20.0,64.000000,64.000000,98.000000,27.000000,125.000000,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28.344263,125.479708,3.903682,0.0,23.0,0.0,-19.0,56.000000,65.000000,100.000000,9.000000,123.000000,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,29.344315,134.214942,4.103983,0.0,23.0,0.0,-18.0,66.000000,67.000000,99.000000,23.000000,120.000000,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603450,28.731362,113.457522,4.209190,19999.0,54.0,0.0,-4.0,76.000000,53.000000,97.615033,17.000000,85.000000,50.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0
603451,26.694765,118.651314,4.146776,19999.0,54.0,0.0,-3.0,81.000000,51.000000,99.000000,17.000000,99.000000,51.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0
603452,25.650483,131.589612,4.179673,19999.0,54.0,0.0,-2.0,85.000000,48.000000,100.000000,26.000000,103.000000,52.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
603453,28.574842,128.339147,4.263596,19999.0,54.0,0.0,-1.0,86.000000,44.000000,93.000000,22.000000,87.000000,53.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [32]:
train_df_with_roll[cols]

Unnamed: 0,Hct,Glucose,Potassium,ID,max_ICULOS,Gender,time_bm,HR,MAP,O2Sat,Resp,SBP,ICULOS,5w_sum_BaseExcess,5w_sum_FiO2,5w_sum_pH,5w_sum_PaCO2,5w_sum_Glucose,5w_sum_Lactate,5w_sum_PTT
75951,,,,0,23,0,-22,,,,,,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75952,,,,0,23,0,-21,61.0,65.0,99.0,17.5,124.0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75953,,,,0,23,0,-20,64.0,64.0,98.0,27.0,125.0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75954,,,,0,23,0,-19,56.0,65.0,100.0,9.0,123.0,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75955,,,,0,23,0,-18,66.0,67.0,99.0,23.0,120.0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154924,,,,19999,54,0,-4,76.0,53.0,,17.0,85.0,50,1.0,3.0,1.0,0.0,0.0,0.0,0.0
154925,,,,19999,54,0,-3,81.0,51.0,99.0,17.0,99.0,51,1.0,2.0,1.0,0.0,0.0,0.0,0.0
154926,,,,19999,54,0,-2,85.0,48.0,100.0,26.0,103.0,52,0.0,1.0,0.0,0.0,0.0,0.0,0.0
154927,,,,19999,54,0,-1,86.0,44.0,93.0,22.0,87.0,53,0.0,1.0,0.0,0.0,0.0,0.0,0.0


train: Label 0: 14857/16000=0.9285625
train: Label 1: 1143/16000=0.0714375 
val: Label 0: 3728/4000=0.932
val: Label 1: 272/4000=0.068 
test: Label 0: 9259/10000=0.9259
test: Label 1: 741/10000=0.0741 
