In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import pickle

In [2]:
# Load the artifacts produced by the EDA step.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# project itself produced.

# Dictionary of column-name lists (read below via the keys
# 'cols_with_missing' and 'cols_with_missing_not_binary').
with open('EDA_pickles/col_lists_dict.pkl', 'rb') as col_lists_file:
    col_lists_dict = pickle.load(col_lists_file)

# Dictionary of per-column fill values, looked up by column name.
with open('EDA_pickles/global_values_dict.pkl', 'rb') as global_values_file:
    global_values_dict = pickle.load(global_values_file)

In [3]:
def table_to_feature_vec(df):
    """Collapse one patient's time-series table into a single flat vector.

    The vector is laid out as:
      1. 'Age', 'Gender', 'HospAdmTime' taken from the first row;
      2. the values of the repeated columns (columns with missing values,
         their '_miss_flag' companions and 'ICULOS') sampled at the rows
         located at 0%, 25%, 50%, 75% and 100% of the table, row by row;
      3. the per-column mean of the same repeated columns;
      4. a 0/1 label that is 1 when any 'SepsisLabel' value is non-zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Table for a single patient. It is not modified; the function works
        on a copy.

    Returns
    -------
    numpy.ndarray
        1-D array with the layout described above.

    Raises
    ------
    ValueError
        If ``df`` has no rows.

    Notes
    -----
    Reads the module-level ``col_lists_dict`` and ``global_values_dict``.
    """
    if len(df) == 0:
        raise ValueError('table_to_feature_vec requires at least one row')

    # Work on a private copy: callers pass slices of a larger frame, and
    # adding columns to the caller's object would be a hidden side effect.
    df = df.copy()

    missing_cols = list(col_lists_dict['cols_with_missing'])
    flag_cols = [col + '_miss_flag' for col in missing_cols]

    # Companion flag for every column with missing values.
    # NOTE: despite the '_miss_flag' suffix the flag is 1 when the value is
    # PRESENT and 0 when it is missing (notnull); kept as-is so existing
    # feature semantics do not change.
    for col, flag_col in zip(missing_cols, flag_cols):
        df[flag_col] = df[col].notnull().astype(int)

    # log(|x| + 1) transform of the non-binary columns, except a fixed set
    # of columns that are left on their original scale.
    skip_log = {'BaseExcess', 'O2Sat', 'FiO2', 'SaO2', 'Hct'}
    for col in col_lists_dict['cols_with_missing_not_binary']:
        if col in skip_log:
            continue
        df[col] = np.log1p(df[col].abs())

    # linear interpolation (also fills leading/trailing gaps)
    df = df.interpolate(limit_direction='both')

    # Whatever is still NaN is filled with the global per-column value
    # from the EDA.
    # NOTE(review): this happens after the log transform, so it presumably
    # expects the stored values to be on the transformed scale - confirm.
    for col in missing_cols:
        df[col] = df[col].fillna(global_values_dict[col])

    # Sample rows at fixed fractions of the table. iloc needs integer
    # positions, so fractional positions are truncated explicitly.
    last_pos = len(df) - 1
    cols_to_repeat = missing_cols + flag_cols + ['ICULOS']
    sample_positions = [int(last_pos * frac) for frac in (0, 0.25, 0.5, 0.75, 1)]
    sampled_values = df[cols_to_repeat].iloc[sample_positions].fillna(0).to_numpy().ravel()

    static_values = df[['Age', 'Gender', 'HospAdmTime']].iloc[0].fillna(0).to_numpy()
    mean_values = df[cols_to_repeat].mean().fillna(0).to_numpy()
    label = int(bool(df['SepsisLabel'].fillna(0).sum()))

    return np.hstack([static_values, sampled_values, mean_values, label])

In [4]:
train_path = 'data/train_orig/'

# Build one feature vector per file in the training directory.
train_rows = []
skipped_train_files = []  # (file name, error) for every file that could not be processed
# os.listdir returns entries in arbitrary order; sort so that the row order
# of the saved array is reproducible between runs and machines.
for file in sorted(os.listdir(train_path)):
    try:
        # load original df
        df = pd.read_csv(os.path.join(train_path, file), sep='|')
        if df.empty:
            raise ValueError('file contains no rows')
        # positions of the rows with SepsisLabel == 1
        sepsis_positions = np.flatnonzero(df['SepsisLabel'].to_numpy() == 1)
        # trim after the first such row; keep the whole table if there is none
        last_pos = sepsis_positions[0] if sepsis_positions.size else len(df) - 1
        # copy so that later column assignments do not target a slice view
        df = df.iloc[:last_pos + 1, :].copy()
        train_rows.append(table_to_feature_vec(df))
    except Exception as exc:
        # best effort: skip the file, but keep a record instead of hiding it
        skipped_train_files.append((file, repr(exc)))
        continue

if skipped_train_files:
    print(f'Skipped {len(skipped_train_files)} train file(s); first: {skipped_train_files[0]}')

train_rows = np.array(train_rows)
np.save('data/train_rows.npy', train_rows)

In [5]:
test_path = 'data/test/'

# Build one feature vector per file in the test directory.
test_rows = []
skipped_test_files = []  # (file name, error) for every file that could not be processed
# os.listdir returns entries in arbitrary order; sort so that the row order
# of the saved array is reproducible between runs and machines.
for file in sorted(os.listdir(test_path)):
    try:
        # load original df
        df = pd.read_csv(os.path.join(test_path, file), sep='|')
        if df.empty:
            raise ValueError('file contains no rows')
        # positions of the rows with SepsisLabel == 1
        sepsis_positions = np.flatnonzero(df['SepsisLabel'].to_numpy() == 1)
        # trim after the first such row; keep the whole table if there is none
        last_pos = sepsis_positions[0] if sepsis_positions.size else len(df) - 1
        # copy so that later column assignments do not target a slice view
        df = df.iloc[:last_pos + 1, :].copy()
        test_rows.append(table_to_feature_vec(df))
    except Exception as exc:
        # best effort: skip the file, but keep a record instead of hiding it
        skipped_test_files.append((file, repr(exc)))
        continue

if skipped_test_files:
    print(f'Skipped {len(skipped_test_files)} test file(s); first: {skipped_test_files[0]}')

test_rows = np.array(test_rows)
np.save('data/test_rows.npy', test_rows)

In [6]:
def get_features_vec_names(df=None):
    """Return the names of the feature entries, in vector order.

    The order is: 'Age', 'Gender', 'HospAdmTime'; then, for each sampling
    fraction 0, 0.25, 0.5, 0.75 and 1, one '<column>_<fraction>' name per
    repeated column; then one '<column>_mean' name per repeated column.
    The repeated columns are the columns with missing values, their
    '_miss_flag' companions and 'ICULOS'.

    No name is produced for the trailing label element that
    table_to_feature_vec appends to its vector.

    Parameters
    ----------
    df : optional
        Accepted for backward compatibility and ignored. The names depend
        only on the module-level ``col_lists_dict``, so the frame is
        neither read nor modified.

    Returns
    -------
    list of str
    """
    missing_cols = list(col_lists_dict['cols_with_missing'])
    cols_to_repeat = missing_cols + [f'{col}_miss_flag' for col in missing_cols] + ['ICULOS']

    # fraction-major order, matching a row-by-row flattening of the sampled rows
    sampled_names = [
        f'{col}_{frac}'
        for frac in [0, 0.25, 0.5, 0.75, 1]
        for col in cols_to_repeat
    ]
    mean_names = [f'{col}_mean' for col in cols_to_repeat]

    return ['Age', 'Gender', 'HospAdmTime'] + sampled_names + mean_names

In [7]:
# The names are derived from col_lists_dict only, so hand the function a
# fresh empty frame instead of whatever `df` happened to be left over from
# the last loop iteration (which may not exist on a fresh kernel if no file
# was loaded, and would otherwise be modified by the call).
features_names = get_features_vec_names(pd.DataFrame())

# Persist the names next to the saved feature arrays.
with open('data/features_names.pkl', 'wb') as file:
    pickle.dump(features_names, file)