<h1> Prepare Data </h1>
To prepare the data for our models we need to:
<ol>
<li> cut unnecessary rows from the csv's (the rows that will not be used as input for the model, as explained in the instructions) </li>
<li> create train and validation datasets </li>
</ol>
<h2> Adding Columns </h2>
For each patient (and each row) we will add 3 columns to the data frame:
<ol>
<li> max_ICULOS - the total time a patient was in the ICU </li>
<li> time_bm - the difference between the current time and the total time a patient was in the ICU, defined as $\text{time\_bm} = \text{ICULOS} - \text{max\_ICULOS}$
</li>
<li> Label column -  1 if the patient had sepsis after some time in the ICU and 0 otherwise
</li>
</ol>


In [1]:
import os
import random
from random import sample

import pandas as pd
import tqdm
# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
# Base names (without extension) of the per-split CSV files. The cells below
# write/read '<name>.csv' and later write '<name>_LSTM.csv'.
TRAIN_PATH = 'filtered_train_df_0705'
VAL_PATH = 'filtered_val_df_0705'
TEST_PATH = 'filtered_test_df_0705'

In [28]:
def _load_patient_rows(data_path, patient):
    """Read one patient file and return the rows that are kept as model input.

    Kept rows are every row with ``SepsisLabel == 0`` followed by the first row
    with ``SepsisLabel == 1`` (if the patient has one). Four columns are added:
    ``ID`` (parsed from the file name), ``Label`` (1 if the patient ever has
    ``SepsisLabel == 1``, else 0), ``max_ICULOS`` (``ICULOS`` of the last kept
    row) and ``time_bm`` (``ICULOS - max_ICULOS``).

    Returns ``None`` when the file contains no usable rows.
    """
    raw = pd.read_csv(os.path.join(data_path, patient), delimiter='|')
    raw['ID'] = patient.split('_')[-1].split('.')[0]

    sepsis = raw['SepsisLabel']
    had_sepsis = sepsis.max() == 1
    keep_idx = list(raw.index[sepsis == 0])
    if had_sepsis:
        # Only the first septic hour is kept; it is placed after the
        # non-septic rows so that it defines max_ICULOS.
        keep_idx.append(raw.index[sepsis == 1][0])
    if not keep_idx:
        return None

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of `raw`.
    kept = raw.loc[keep_idx].copy()
    kept['Label'] = int(had_sepsis)
    kept['max_ICULOS'] = kept['ICULOS'].iloc[-1]
    kept['time_bm'] = kept['ICULOS'] - kept['max_ICULOS']
    return kept


def create_patients_df(patients, data_path):
    """Build one data frame holding the model-input rows of all `patients`.

    Parameters
    ----------
    patients : list of str
        File names of the '|'-delimited patient files inside `data_path`.
        The patient ID is the part of the name after the last '_' and before
        the first '.'.
    data_path : str
        Directory that contains the patient files.

    Returns
    -------
    pandas.DataFrame
        The kept rows of every patient stacked in input order, each patient
        keeping its original row index, with the extra columns ``ID``,
        ``Label``, ``max_ICULOS`` and ``time_bm``. An empty frame is returned
        when `patients` is empty or no file yields any rows.
    """
    frames = []
    for patient in tqdm.tqdm(patients):
        rows = _load_patient_rows(data_path, patient)
        if rows is not None:
            frames.append(rows)
    if not frames:
        return pd.DataFrame()
    # A single concat instead of DataFrame.append in the loop: append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    return pd.concat(frames)

In [30]:
# Fixed seed so the train/validation split is the same on every run.
SPLIT_SEED = 42
# Fraction of the 'train' patients used for training; the rest is validation.
TRAIN_FRACTION = 0.8

for d_type in ['train', 'test']:
    # The same directory is used for listing and for reading the patient
    # files. It is resolved against the working directory instead of a
    # user-specific home directory.
    data_path = os.path.abspath(os.path.join('data', d_type))
    # os.listdir order is arbitrary; sort so the seeded sample is stable.
    patients = sorted(os.listdir(data_path))
    if d_type=='train':
        n_train = int(len(patients)*TRAIN_FRACTION)
        train_patients = random.Random(SPLIT_SEED).sample(patients, n_train)
        # Set membership keeps the complement computation linear.
        train_set = set(train_patients)
        val_patients = [x for x in patients if x not in train_set]
        train_df = create_patients_df(train_patients,data_path)
        train_df.to_csv(f'{TRAIN_PATH}.csv',index=False)
        val_df = create_patients_df(val_patients,data_path)
        val_df.to_csv(f'{VAL_PATH}.csv',index=False)
    else:
        test_df = create_patients_df(patients,data_path)
        test_df.to_csv(f'{TEST_PATH}.csv',index=False)

100%|██████████| 15999/15999 [43:41<00:00,  6.10it/s]
100%|██████████| 3999/3999 [02:23<00:00, 27.90it/s]
100%|██████████| 9999/9999 [17:54<00:00,  9.31it/s]


In [3]:
# Raw measurement columns whose per-window measurement counts are computed by
# data_preperator.add_rolling_window.
frequency_used_attributes = ['BaseExcess',  'FiO2', 'pH', 'PaCO2', 'Glucose','Lactate', 'PTT']
# Names of the derived rolling columns for a window of 5 ('{window}w_sum_{attr}').
# Despite the 'sum' in the name they hold rolling counts of non-missing values.
FREQUENCY_ATTR =['5w_sum_BaseExcess', '5w_sum_FiO2', '5w_sum_pH', '5w_sum_PaCO2', '5w_sum_Glucose', '5w_sum_Lactate', '5w_sum_PTT']
# Lab columns used directly as features.
LAB_ATTR = [ 'Hct',  'Glucose','Potassium']
# Per-patient columns carried along with every row.
CONST_ATTR = ['ID','max_ICULOS','Gender']
# Remaining per-row feature columns.
OTHER_ATTR = ['HR','MAP','O2Sat', 'Resp','SBP','ICULOS']
# Full list of columns kept by data_preperator.prepare_data ('Label' is added there).
COLS = FREQUENCY_ATTR+LAB_ATTR+CONST_ATTR+OTHER_ATTR

In [15]:
class data_preperator():
    """Turns the filtered per-row patient frame into the model input frame.

    The instance remembers the column means of the frame it was built from and
    reuses them as fallback values when imputing any other frame.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame whose numeric column means are used as imputation fallbacks.
    seq_len : int
        Rows with ``time_bm >= -seq_len`` are kept for each patient.
    window : int
        Window size passed to the rolling measurement count.
    """
    def __init__(self,train_df,seq_len=10,window=5):
        # numeric_only=True: non-numeric columns have no mean, and recent
        # pandas raises on them instead of silently dropping them.
        self.all_data_means= train_df.mean(numeric_only=True)
        self.seq_len=seq_len
        self.window = window

    def impute_per_patient(self,df):
        """Impute missing values separately for every patient in `df`.

        For each patient, a LAB_ATTR/OTHER_ATTR column that is missing in
        *all* of that patient's rows is filled with the stored global mean.
        The remaining gaps are filled by an IterativeImputer fitted on that
        patient's rows only. Patients whose imputation fails are reported and
        left out of the result.

        Returns a frame with the columns ``COLS + ['Label']``; each patient's
        rows are indexed 0..n-1. An empty frame is returned if no patient
        could be imputed.
        """
        columns = COLS+['Label']
        imputed_frames = []
        # groupby visits each patient's rows once, instead of re-filtering the
        # whole frame for every patient.
        for patient, patient_rows in df.groupby('ID', sort=False):
            tmp_df = patient_rows[columns].copy()
            for f in LAB_ATTR+OTHER_ATTR:
                # .all() must be called: the bare attribute is a bound method,
                # which is always truthy and would mean-fill every column.
                if tmp_df[f].isnull().all():
                    tmp_df[f]=tmp_df[f].fillna(self.all_data_means[f])
            imp = IterativeImputer(max_iter=50, random_state=0)
            try:
                imp.fit(tmp_df)
                imputed_frames.append(pd.DataFrame(imp.transform(tmp_df), columns = columns))
            except Exception as err:
                # Best effort: report the patient and continue. Catching
                # Exception (not a bare except) lets Ctrl-C stop the loop.
                print(f'patient {patient}: imputation failed for frame of shape {tmp_df.shape} ({err!r})')
        if not imputed_frames:
            return pd.DataFrame()
        # One concat at the end; DataFrame.append was removed in pandas 2.0.
        return pd.concat(imputed_frames)

    def add_rolling_window(self,df, attr, window_size):
        """Add per-patient rolling counts of non-missing values for `attr`.

        `df` is sorted by ``ID`` and ``ICULOS``; for every column in `attr` a
        column ``'{window_size}w_sum_{column}'`` is added holding the number
        of non-missing values in the rolling window ending at that row.
        Returns the sorted frame joined with the new columns (the join also
        carries the grouping key as an extra ``IDr`` column).
        """
        df = df.sort_values(by=['ID','ICULOS'], ascending =[True,True])
        count_cols = {at: f'{window_size}w_sum_{at}' for at in attr}
        # min_periods=0 is spelled out so rows at the start of a stay get a
        # count instead of depending on the pandas-version default.
        rolling = (
            df[['ID']+attr]
            .groupby('ID')
            .rolling(window=window_size, min_periods=0, closed='both')
            .count()
            .rename(columns=count_cols)
        )
        # Select the count columns by name: whether the grouping column is
        # also returned as a value column differs between pandas versions.
        # Moving the 'ID' index level to a column leaves the original row
        # index for the join.
        rolling = rolling[list(count_cols.values())].reset_index(level=0)
        combined = df.join(rolling,how='left', rsuffix= 'r')
        return combined

    def prepare_data(self,df):
        """Run the full preparation: trim, add rolling counts, select, impute.

        Rows within ``seq_len + window`` of the end of the stay are kept first
        so the rolling counts have history; the frame is then cut down to the
        last ``seq_len`` range, reduced to ``COLS + ['Label']`` and imputed
        per patient.
        """
        df = df[df['time_bm']>=-1*(self.seq_len+self.window)]
        df = self.add_rolling_window(df, frequency_used_attributes, self.window)
        df = df[df['time_bm']>=-1*(self.seq_len)]
        df= df[COLS+['Label']]
        df = self.impute_per_patient(df)
        return df

In [16]:
# Load the filtered training rows and build the preparator from them (its
# fallback column means are taken from this raw training frame). `p` is reused
# by the validation and test cells below.
train_df = pd.read_csv(f'{TRAIN_PATH}.csv')
p = data_preperator(train_df)
# train_df is rebound to the prepared frame from here on.
train_df = p.prepare_data(train_df)
train_df.to_csv(f'{TRAIN_PATH}_LSTM.csv')



In [17]:
# Prepare the validation rows with the preparator `p` that was built from the
# training frame, then save them.
val_df = pd.read_csv(f'{VAL_PATH}.csv')
val_df = p.prepare_data(val_df)
val_df.to_csv(f'{VAL_PATH}_LSTM.csv')



In [18]:
# Prepare the test rows with the preparator `p` that was built from the
# training frame, then save them.
test_df = pd.read_csv(f'{TEST_PATH}.csv')
test_df = p.prepare_data(test_df)
test_df.to_csv(f'{TEST_PATH}_LSTM.csv')

