In [1]:
# Imports and global configuration for the notebook (data handling, plotting,
# modelling, resampling) plus the random seed.
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size=14)  # default font size for matplotlib figures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression  # NOTE(review): duplicate of the import two lines above
from sklearn import metrics
import seaborn as sns
sns.set(style="white")
# NOTE(review): second sns.set call in a row with a different style;
# presumably "whitegrid" is the one that takes effect - confirm which was intended.
sns.set(style="whitegrid", color_codes=True)
from sklearn.impute import KNNImputer
from imblearn.over_sampling import SMOTE  # TODO: add this dependency to ENV.yml
from imblearn.under_sampling import RandomUnderSampler
import statsmodels.api as sm
import random
seed=42
# Seeds Python's stdlib `random` module.
# NOTE(review): no NumPy seed is set in this cell - confirm whether the
# downstream estimators/samplers receive `seed` explicitly.
random.seed(seed)
from xgboost import XGBClassifier
# Import Random Forest model
from sklearn.ensemble import RandomForestClassifier

In [2]:
def lr_preprocess_sepsis(df, window_size=5, time_bm=-10):
    """Aggregate per-hour patient records into one feature row per patient.

    Processing steps, in order:

    1. For every lab variable in ``frequency_used_attributes`` add a column
       ``{window_size}w_sum_<lab>`` holding the rolling count of non-null
       readings, computed within each ``ID`` on rows ordered by ``ICULOS``
       (window closed on both ends).
    2. Keep only rows whose ``time_bm`` column is ``>=`` the ``time_bm``
       argument (original author's note: crop to the last 10 ICULOS hours
       of each patient).
    3. Derive ``Unit3`` (1 when ``Unit1 + Unit2 < 1`` or when both are
       missing, else 0) and replace missing ``Unit1`` / ``Unit2`` with 0.
    4. Group by ``ID, Label, max_ICULOS, Gender`` and aggregate the vitals,
       labs and rolling-count columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-row records. Must contain the grouping keys, ``ICULOS``,
        ``time_bm``, ``Unit1``, ``Unit2`` and every aggregated column.
        The frame is not modified.
    window_size : int, default 5
        Window passed to ``rolling``; also used in the generated column names.
    time_bm : numeric, default -10
        Lower bound (inclusive) applied to the ``time_bm`` column.

    Returns
    -------
    pandas.DataFrame
        One row per ``(ID, Label, max_ICULOS, Gender)`` group. Aggregated
        columns are named ``<column>__<aggfunc>``; the four grouping keys
        keep their plain names.

    Raises
    ------
    KeyError
        If ``df`` lacks any required column.
    """
    # Lab variables whose *measurement frequency* is used as a feature.
    frequency_used_attributes = ['BaseExcess', 'FiO2', 'pH', 'PaCO2', 'Glucose', 'Lactate', 'PTT']
    group_keys = ['ID', 'Label', 'max_ICULOS', 'Gender']
    count_columns = [f'{window_size}w_sum_{x}' for x in frequency_used_attributes]

    # Aggregation spec; insertion order defines the output column order.
    aggregations = {
        'Unit1': 'max',
        'Unit2': 'max',
        'Unit3': 'max',
        'HR': ['median', 'max'],
        'MAP': ['median', 'min'],
        'O2Sat': ['mean'],
        'Resp': ['median', 'max'],
        'SBP': ['median', 'min'],
        'Hct': ['median', 'min'],
        'Potassium': 'mean',
        'Glucose': 'mean',
        'Temp': ['mean', 'min'],
        'DBP': 'mean',
        'WBC': ['median', 'min'],
        'EtCO2': 'mean',
        'BaseExcess': 'mean',
        'HCO3': 'mean',
        'FiO2': 'mean',
        'SaO2': 'mean',
        'AST': 'mean',
        'Lactate': 'mean',
        'Magnesium': 'mean',
        'Phosphate': 'mean',
        'TroponinI': 'mean',
        'Hgb': 'mean',
        'PTT': 'mean',
        'Platelets': 'mean',
        'Age': 'mean',
        'HospAdmTime': 'mean',
    }
    aggregations.update({col: 'mean' for col in count_columns})

    # Fail early with one clear message instead of a KeyError deep inside pandas.
    derived = {'Unit3', *count_columns}
    required = (group_keys + ['ICULOS', 'time_bm'] + frequency_used_attributes
                + [col for col in aggregations if col not in derived])
    missing = sorted(set(required) - set(df.columns))
    if missing:
        raise KeyError(f'lr_preprocess_sepsis: input is missing required columns: {missing}')

    def add_rolling_window(df, attr, window_size):
        """Return ``(df sorted by ID/ICULOS with count columns joined, count columns alone)``."""
        df = df.sort_values(by=['ID', 'ICULOS'], ascending=[True, True])
        # Select the lab columns explicitly so the result never contains the
        # grouping key; whether pandas includes it otherwise depends on the
        # pandas release, so positional slicing of the result is unsafe.
        # NOTE(review): `min_periods` is left at the pandas default, which has
        # differed between pandas releases - confirm the intended handling of
        # each patient's first rows.
        rolling = (df.groupby('ID')[attr]
                     .rolling(window=window_size, closed='both')
                     .count())
        rolling = rolling.rename(columns={at: f'{window_size}w_sum_{at}' for at in attr})
        rolling = rolling[[f'{window_size}w_sum_{at}' for at in attr]]
        # Drop the ID index level so the index lines up with df's own index.
        rolling = rolling.reset_index(level=0, drop=True)
        combined = df.join(rolling, how='left', rsuffix='r')
        return combined, rolling

    df_with_roll, _ = add_rolling_window(df, frequency_used_attributes, window_size)

    # Keep only rows at or after the time_bm threshold. The explicit copy
    # makes the column assignments below operate on an independent frame.
    df_with_roll = df_with_roll[df_with_roll['time_bm'] >= time_bm].copy()

    # Unit handling: Unit3 must be derived *before* the NaNs are filled,
    # because it depends on whether both unit flags were missing.
    both_missing = df_with_roll['Unit1'].isna() & df_with_roll['Unit2'].isna()
    neither_set = (1 * (df_with_roll['Unit1'] + df_with_roll['Unit2'])) < 1
    df_with_roll['Unit3'] = (neither_set | both_missing) * 1
    # Whole-column assignment instead of chained indexing, which is unreliable
    # (SettingWithCopyWarning; no effect under Copy-on-Write).
    df_with_roll['Unit1'] = df_with_roll['Unit1'].fillna(0)
    df_with_roll['Unit2'] = df_with_roll['Unit2'].fillna(0)

    # One row per patient group.
    data_final = df_with_roll.groupby(group_keys).agg(aggregations).reset_index()

    # Flatten the (column, aggfunc) MultiIndex to 'column__aggfunc'; the
    # grouping keys come out as 'key__' and are renamed back to 'key'.
    data_final.columns = ['__'.join(col).strip() for col in data_final.columns]
    data_final = data_final.rename(columns={f'{key}__': key for key in group_keys})

    return data_final


In [3]:
# Load the training table from CSV.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
train_df = pd.read_csv("/home/student/filtered_train_df_0705.csv")

In [4]:
# Replace the per-row table with its one-row-per-patient aggregation.
# NOTE(review): rebinding `train_df` makes this cell non-idempotent - the
# aggregated frame no longer has columns such as ICULOS/time_bm, so re-running
# this cell requires re-running the load cell first.
train_df = lr_preprocess_sepsis(train_df)



In [5]:
# Display the flattened feature column names produced by the preprocessing step.
train_df.columns

Index(['ID', 'Label', 'max_ICULOS', 'Gender', 'Unit1__max', 'Unit2__max',
       'Unit3__max', 'HR__median', 'HR__max', 'MAP__median', 'MAP__min',
       'O2Sat__mean', 'Resp__median', 'Resp__max', 'SBP__median', 'SBP__min',
       'Hct__median', 'Hct__min', 'Potassium__mean', 'Glucose__mean',
       'Temp__mean', 'Temp__min', 'DBP__mean', 'WBC__median', 'WBC__min',
       'EtCO2__mean', 'BaseExcess__mean', 'HCO3__mean', 'FiO2__mean',
       'SaO2__mean', 'AST__mean', 'Lactate__mean', 'Magnesium__mean',
       'Phosphate__mean', 'TroponinI__mean', 'Hgb__mean', 'PTT__mean',
       'Platelets__mean', 'Age__mean', 'HospAdmTime__mean',
       '5w_sum_BaseExcess__mean', '5w_sum_FiO2__mean', '5w_sum_pH__mean',
       '5w_sum_PaCO2__mean', '5w_sum_Glucose__mean', '5w_sum_Lactate__mean',
       '5w_sum_PTT__mean'],
      dtype='object')