<h1> Prepare Data </h1>
To prepare the data for our models we need to:
<ol>
<li> remove unnecessary rows from the CSV files (the rows that will not be used as input for the model, as explained in the instructions) </li>
<li> create train and validation datasets </li>
</ol>
<h2> Adding Columns </h2>
For each patient (and each row) we will add 3 columns to the data frame:
<ol>
<li> max_ICULOS - the total time a patient was in the ICU </li>
<li> time_bm - the difference between the current time and the total time a patient was in the ICU, defined as $\mathrm{time\_bm} = \mathrm{ICULOS} - \mathrm{max\_ICULOS}$
</li>
<li> Label column -  1 if the patient had sepsis after some time in the ICU and 0 otherwise
</li>
</ol>


In [1]:
import pandas as pd
import os
import tqdm
from random import sample
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from DataPreaparators import create_patients_df, DataPreparator

In [2]:
# Base names (without extension) of the filtered train / validation / test
# tables; the '.csv' suffix is appended where the files are read or written.
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    'filtered_train_df_0705',
    'filtered_val_df_0705',
    'filtered_test_df_0705',
)
# CSV holding the per-column means of the training table; it is read by
# DataPreparator and used to fill features that are entirely missing for a patient.
TRAIN_MEAN_PATH = 'filtered_train_mean.csv'

In [3]:
# def create_patients_df(patients, data_path):
#     tmp_df = pd.read_csv(os.path.join(data_path,patients[0]),delimiter ='|')
#     tmp_df['ID'] = patients[0].split('_')[-1].split('.')[0]
#     new_df = tmp_df[tmp_df['SepsisLabel']==0]
#     if max(tmp_df['SepsisLabel'])==1:
#         new_df=new_df.append(tmp_df[tmp_df['SepsisLabel']==1][:1])
#         new_df['Label'] = [1]*new_df.shape[0]
#     else:
#         new_df['Label'] = [0]*new_df.shape[0]
#     new_df['max_ICULOS'] = [new_df['ICULOS'].values[-1]]*new_df.shape[0]
#     new_df['time_bm'] =  new_df['ICULOS']-new_df['max_ICULOS']
#     for patient in tqdm.tqdm(patients[1:]):
#         patient_path = os.path.join(data_path,patient)
#         patient_number = patient.split('_')[-1].split('.')[0]
#         tmp_df = pd.read_csv(os.path.join(data_path,patient_path),delimiter ='|')
#         tmp_df['ID'] = patient_number
#         tmp_new_df = tmp_df[tmp_df['SepsisLabel']==0]
#         if max(tmp_df['SepsisLabel'])==1:
#             tmp_new_df=tmp_new_df.append(tmp_df[tmp_df['SepsisLabel']==1][:1])
#             tmp_new_df['Label'] = [1]*tmp_new_df.shape[0]
#         else:
#             tmp_new_df['Label'] = [0]*tmp_new_df.shape[0]
#         tmp_new_df['max_ICULOS'] = [tmp_new_df['ICULOS'].values[-1]]*tmp_new_df.shape[0]
#         tmp_new_df['time_bm'] =  tmp_new_df['ICULOS']-tmp_new_df['max_ICULOS']
#         new_df = new_df.append(tmp_new_df)
#     return new_df

In [4]:
# for d_type in ['train', 'test']:
#     data_path = f'/home/student/Early_Prediction_of_Sepsis/data/{d_type}/'
#     patients = os.listdir(f'data/{d_type}')
#     if d_type=='train':
#         train_patients = sample(patients,int(len(patients)*0.8))
#         val_patients = [x for x in patients if x not in train_patients]
#         train_df = create_patients_df(train_patients,data_path)
#         train_df.to_csv(f'{TRAIN_PATH}.csv',index=False)
#         val_df = create_patients_df(val_patients,data_path)
#         val_df.to_csv(f'{VAL_PATH}.csv',index=False)
#     else:
#         test_df = create_patients_df(patients,data_path)
#         test_df.to_csv(f'{TEST_PATH}.csv',index=False)

In [5]:
# Subset of lab attributes (not referenced by the other visible cells).
frequency_used_attributes = [
    'BaseExcess', 'FiO2', 'pH', 'PaCO2', 'Glucose', 'Lactate', 'PTT',
]
# FREQUENCY_ATTR =['5w_sum_BaseExcess', '5w_sum_FiO2', '5w_sum_pH', '5w_sum_PaCO2', '5w_sum_Glucose', '5w_sum_Lactate', '5w_sum_PTT']
# LAB_ATTR = ['Hct',  'Glucose','Potassium']

# Attribute groups; CONST_ATTR and OTHER_ATTR together form the base feature set.
CONST_ATTR = ['max_ICULOS', 'Gender']
OTHER_ATTR = ['HR', 'MAP', 'O2Sat', 'Resp', 'SBP', 'ICULOS']

# Lab attributes for which a measurement-frequency feature is derived
# (passed as ``freq_columns`` to DataPreparator).
ALL_LAB_ATTR = [
    'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2',
    'SaO2', 'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride',
    'Creatinine', 'Bilirubin_direct', 'Glucose', 'Lactate',
    'Magnesium', 'Phosphate', 'Potassium', 'Bilirubin_total',
    'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC', 'Fibrinogen', 'Platelets',
]

# Base feature columns handed to DataPreparator as ``columns``.
COLS = [*CONST_ATTR, *OTHER_ATTR]

<h2> RNN Pre Process </h2>

In order to use the data as input for RNN models, we need some additional preprocessing and imputation, as explained in the report.
The DataPreparator class adds frequency and window columns and uses an iterative imputer to impute each patient's missing data.

In [6]:
class DataPreparator():
    """Add frequency / rolling-window features and impute missing values per patient.

    NOTE: this in-notebook definition shadows the ``DataPreparator`` name that
    the import cell brings in from ``DataPreaparators``.

    Parameters
    ----------
    columns : list of str
        Base feature columns kept in the prepared table. The list is copied,
        so the caller's list is never modified.
    window_columns : list of str, optional
        Columns for which a rolling count of measurements is added
        (``'{window}w_sum_{col}'``). Defaults to no columns.
    freq_columns : list of str, optional
        Columns for which a measurement-frequency feature is added
        (``'freq_{col}'``). Defaults to no columns.
    seq_len : int
        Only rows with ``time_bm >= -seq_len`` are kept by ``prepare_data``.
    window : int
        Window size passed to the rolling count.
    """

    def __init__(self,columns,window_columns=None, freq_columns=None, seq_len=10,window=5):
        # Table of per-feature training means; ``impute_per_patient`` looks a
        # feature up in its 'index' column and reads the value from column '0'.
        self.all_data_means= pd.read_csv(TRAIN_MEAN_PATH)
        self.seq_len=seq_len
        self.window = window
        # ``None`` means "no such columns"; normalising to lists keeps the
        # list concatenations and comprehensions below from failing.
        self.window_columns = list(window_columns) if window_columns is not None else []
        self.freq_columns = list(freq_columns) if freq_columns is not None else []
        self.freq_columns_final =  [f'freq_{at}' for at in self.freq_columns]
        # Copy so that extending the column list later never leaks into the
        # list object owned by the caller.
        self.columns = list(columns)

    def _active_freq_columns(self, df):
        """Return the ``freq_*`` columns that are actually present in ``df``."""
        return [col for col in self.freq_columns_final if col in df.columns]

    def impute_per_patient(self,df):
        """Impute missing feature values separately for every patient.

        For each ``ID`` group:
          1. a base column that is entirely missing for the patient is filled
             with that feature's training mean (an imputer fitted on the
             patient alone has nothing to learn from for such a column);
          2. the remaining gaps are filled by an ``IterativeImputer`` fitted on
             that patient's rows only.

        Returns a frame with the feature columns, ``Label`` and ``ID``.
        Patients appear in order of first occurrence in ``df``. An empty input
        yields an empty frame.

        Raises
        ------
        KeyError
            If a feature that is entirely missing for a patient has no entry
            in the training-means table.
        """
        feature_cols = self.columns + self._active_freq_columns(df) + ['time_bm']
        imputed_frames = []
        # One groupby pass instead of a boolean filter over the full frame for
        # every patient.
        for patient, patient_rows in df.groupby('ID', sort=False):
            tmp_df = patient_rows[feature_cols].copy()
            for f in self.columns:
                # ``.all()`` must be *called*: the bare bound method is always
                # truthy, which would mean-fill every gap and leave nothing for
                # the iterative imputer to do.
                if tmp_df[f].isnull().all():
                    matches = self.all_data_means.loc[self.all_data_means['index']==f, '0'].values
                    if len(matches) == 0:
                        raise KeyError(f'no training mean available for feature {f!r}')
                    tmp_df[f]=tmp_df[f].fillna(matches[0])
            imp = IterativeImputer(max_iter=50, random_state=0)
            imp.fit(tmp_df)
            tmp_df= pd.DataFrame(imp.transform(tmp_df), columns = feature_cols)
            tmp_df['Label'] = patient_rows['Label'].values
            tmp_df['ID'] = [patient]*tmp_df.shape[0]
            imputed_frames.append(tmp_df)
        if not imputed_frames:
            return pd.DataFrame()
        # ``DataFrame.append`` no longer exists in pandas 2.x; a single concat
        # also avoids re-copying the accumulated frame on every iteration.
        return pd.concat(imputed_frames)

    def add_rolling_window(self,df):
        """Add, per patient, a rolling count of non-missing values for each window column.

        The new columns are named ``'{window}w_sum_{col}'`` and are registered
        in ``self.columns`` (once, even if this method is called repeatedly).
        The returned frame is sorted by ``ID`` and ``ICULOS`` and also carries
        the ``'IDr'`` column produced by the join.
        """
        df = df.sort_values(by=['ID','ICULOS'], ascending =[True,True])
        if not self.window_columns:
            return df
        renamed = {at: f'{self.window}w_sum_{at}' for at in self.window_columns}
        # min_periods=1 keeps the count defined for the first rows of every
        # patient on pandas versions where it would otherwise default to the
        # window size.
        rolling = (
            df[['ID']+self.window_columns]
            .groupby('ID')
            .rolling(window=self.window, min_periods=1, closed='both')
            .count()
        )
        # Older pandas returns the grouping column among the values, newer
        # pandas does not; drop it by name instead of by position so a real
        # feature column is never discarded.
        rolling = rolling.drop(columns='ID', errors='ignore').rename(columns=renamed)
        rolling = rolling.reset_index().set_index('level_1')
        combined = df.join(rolling,how='left', rsuffix= 'r')
        # Rebind instead of ``+=`` so no shared list is mutated, and skip names
        # that are already registered.
        self.columns = self.columns + [name for name in renamed.values() if name not in self.columns]
        return combined

    def add_frequency(self,df):
        """Add ``freq_{col}`` = (measurements of ``col`` so far for the patient) / ``ICULOS``.

        The returned frame is sorted by ``ID`` and ``ICULOS`` and contains an
        extra ``'old_index'`` column holding the original row index.
        """
        df = df.sort_values(by=['ID', 'ICULOS'], ascending=[True, True])
        if not self.freq_columns:
            return df
        # Expanding count = number of non-missing values seen up to each row.
        rolling = df[['ID','ICULOS']+self.freq_columns].groupby(by=['ID'])[self.freq_columns].expanding().count().reset_index().rename(columns={'level_1':'old_index'})
        df=df.reset_index().rename(columns={'index':'old_index'})
        rolling = rolling.rename(columns={at: f'freq_{at}' for at in self.freq_columns})
        combined = pd.merge(df,rolling, on=['ID','old_index'])
        for at in self.freq_columns_final:
            combined[at] = combined[at] / combined['ICULOS']
        return combined

    def prepare_data(self,df, rolling=False,freq=True):
        """Run the full preparation pipeline and return the imputed table.

        Optionally adds the rolling-window and frequency features, keeps only
        rows with ``time_bm >= -seq_len``, restricts the frame to the feature
        columns plus ``time_bm``, ``ID`` and ``Label``, and imputes per patient.
        """
        if rolling:
            df = self.add_rolling_window(df)
        if freq:
            df = self.add_frequency(df)
        df = df[df['time_bm']>=-1*(self.seq_len)]
        # Only select frequency columns that were actually created, so that
        # ``freq=False`` does not fail on missing columns.
        df = df[self.columns+self._active_freq_columns(df)+['time_bm','ID','Label']]
        df = self.impute_per_patient(df)
        return df


When a patient has a feature whose values are all NULL, the iterative imputer has nothing to learn from for that feature, so we fill it with the training-set mean of that feature.

In [7]:
# train_df = pd.read_csv(f'{TRAIN_PATH}.csv')
# all_data_mean = train_df.mean().reset_index().to_csv(TRAIN_MEAN_PATH,index=False)

In [8]:
# One preparator instance, reused for the train, validation and test tables.
p = DataPreparator(
    columns=COLS,
    freq_columns=ALL_LAB_ATTR,
)

In [9]:
# Load the filtered training table, add the frequency features, impute per
# patient, and store the result next to the input file.
train_df = p.prepare_data(
    pd.read_csv(f'{TRAIN_PATH}.csv'),
    rolling=False,
    freq=True,
)
train_df.to_csv(f'{TRAIN_PATH}_LSTM_new.csv', index=False)

In [10]:
# Same preparation for the validation table (prepare_data defaults:
# no rolling window, frequency features on).
val_df = p.prepare_data(pd.read_csv(f'{VAL_PATH}.csv'))
val_df.to_csv(f'{VAL_PATH}_LSTM_new.csv', index=False)

In [11]:
# Same preparation for the test table (prepare_data defaults:
# no rolling window, frequency features on).
test_df = p.prepare_data(pd.read_csv(f'{TEST_PATH}.csv'))
test_df.to_csv(f'{TEST_PATH}_LSTM_new.csv', index=False)