In [1]:
# Data handling.
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
# Default matplotlib font size for figures.
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
# NOTE(review): two consecutive sns.set calls; the second one requests the
# "whitegrid" style, so the "white" request above it looks redundant — confirm
# and keep only one.
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
from sklearn.impute import KNNImputer
from imblearn.over_sampling import SMOTE # TODO: add imbalanced-learn (imblearn) to ENV.yml

In [1]:
# Location of the training CSV. Kept in a single named constant so the path
# can be changed in one place when the notebook runs on another machine.
TRAIN_CSV_PATH = "/home/student/filtered_train_df_0705.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
def lr_preprocess_sepsis(df, window_size=5):
    """Turn hourly patient rows into one imputed, class-balanced row per patient.

    Pipeline, in order:

    1. For every lab column in ``frequency_used_attributes`` count the
       non-null measurements inside a per-patient rolling window (rows ordered
       by ``ICULOS``). The result columns are named
       ``f'{window_size}w_sum_<lab>'`` (they hold counts despite the ``sum``
       in the name).
    2. Keep only the rows with ``time_bm >= -10``.
    3. Aggregate per ``(ID, Label, max_ICULOS, Gender)`` group and flatten the
       resulting two-level column names to ``'<column>__<statistic>'``.
    4. Fill missing values with ``KNNImputer(n_neighbors=3)``.
    5. Over-sample with ``SMOTE(random_state=0)``, using ``Label`` as target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ID``, ``max_ICULOS``, ``Gender``,
        ``time_bm``, ``HR``, ``MAP``, ``O2Sat``, ``Resp``, ``SBP``, ``ICULOS``,
        ``Hct``, ``Glucose``, ``Potassium``, ``BaseExcess``, ``FiO2``, ``pH``,
        ``PaCO2``, ``Lactate``, ``PTT``, ``Label`` and ``SepsisLabel``.
        The frame is not modified.
    window_size : int, default 5
        ``window`` argument of the rolling count; the window is closed on both
        ends (``closed='both'``).
    """

    # columns for using
    frequency_used_attributes = ['BaseExcess', 'FiO2', 'pH', 'PaCO2', 'Glucose', 'Lactate', 'PTT']
    values_used_attributes = ['Hct', 'Glucose', 'Potassium']
    constant_attributes = ['ID', 'max_ICULOS', 'Gender']
    other_attributes = ['time_bm', 'HR', 'MAP', 'O2Sat', 'Resp', 'SBP', 'ICULOS']
    label_attributes = ['Label', 'SepsisLabel']

    # create frequency columns for some lab variables
    def add_rolling_window(df, attr, window_size):
        """Return ``(df joined with rolling counts, rolling counts)`` for ``attr``."""
        df = df.sort_values(by=['ID', 'ICULOS'], ascending=[True, True])
        # min_periods=0 is spelled out because the default used by count()
        # differs between pandas versions; with 0 the first rows of every
        # patient get a real count instead of NaN.
        rolling = (
            df[['ID'] + attr]
            .groupby('ID')
            .rolling(window=window_size, min_periods=0, closed='both')
            .count()
        )
        rolling = rolling.rename(columns={at: f'{window_size}w_sum_{at}' for at in attr})
        # Select the count columns by name. Whether the group key 'ID' shows
        # up among the rolling result columns depends on the pandas version,
        # so dropping "the first column" by position is not reliable.
        rolling = rolling[[f'{window_size}w_sum_{at}' for at in attr]]
        # The result is indexed by (ID, original row label); move both levels
        # to columns and re-index by the original row label so the join below
        # aligns with df. 'level_1' is the name reset_index gives the unnamed
        # second level.
        rolling = rolling.reset_index().set_index('level_1')
        # 'ID' exists on both sides; the right-hand copy becomes 'IDr'.
        combined = df.join(rolling, how='left', rsuffix='r')
        return combined, rolling

    df_with_roll, df_roll = add_rolling_window(df, frequency_used_attributes, window_size)
    frequency_used_attributes_fixed = [f'{window_size}w_sum_{x}' for x in frequency_used_attributes]
    df_with_roll = df_with_roll[constant_attributes + other_attributes
                                + values_used_attributes + frequency_used_attributes_fixed
                                + label_attributes]

    # keep only rows with time_bm >= -10 (the last ICULOS hours of each patient)
    df_with_roll = df_with_roll[df_with_roll['time_bm'] >= -10]

    # aggregations
    aggregations = {
        'HR': ['median', 'max'],
        'MAP': ['median', 'min'],
        'O2Sat': 'mean',
        'Resp': ['median', 'max'],
        'SBP': ['median', 'min'],
        'Hct': ['median', 'min'],
        'Potassium': 'mean',
        'Glucose': 'mean',
    }
    # Every rolling-count column is averaged; derive the entries from the list
    # built above so the two cannot drift apart.
    aggregations.update({col: 'mean' for col in frequency_used_attributes_fixed})
    # Fixed: this previously referenced the undefined name `train_df_with_roll`.
    data_final = (
        df_with_roll
        .groupby(['ID', 'Label', 'max_ICULOS', 'Gender'])
        .agg(aggregations)
        .reset_index()
    )
    # Flatten ('HR', 'median') -> 'HR__median'; the group keys come out as
    # 'ID__' etc. and are renamed back right after.
    data_final.columns = ['__'.join(col).strip() for col in data_final.columns.values]
    data_final.rename(columns={"ID__": "ID", "Label__": "Label", "max_ICULOS__": "max_ICULOS", "Gender__": "Gender"}, inplace=True)

    # Imputation with KNNImputer
    # NOTE(review): the imputer is fitted on every column, including ID and
    # Label — confirm that using them as neighbour features is intended.
    data_knn_imputed = data_final.copy(deep=True)    # Copy the data
    knn_imp = KNNImputer(n_neighbors=3) # Init the transformer
    data_knn_imputed.loc[:, :] = knn_imp.fit_transform(data_knn_imputed) # Fit/transform
    data_final = data_knn_imputed

    # Over-sampling using SMOTE
    # NOTE(review): X holds every column except Label, so ID is part of the
    # features handed to SMOTE — confirm.
    X = data_final.loc[:, data_final.columns != 'Label']
    y = data_final.loc[:, data_final.columns == 'Label']
    os = SMOTE(random_state=0)
    columns = X.columns
    os_data_X, os_data_y = os.fit_resample(X, y)
    os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
    os_data_y = pd.DataFrame(data=os_data_y, columns=['Label'])
    # NOTE(review): no return statement is visible in this section; if the
    # function ends here the balanced frames are discarded — confirm whether
    # `return os_data_X, os_data_y` is missing.

