In [None]:
import pandas as pd
import numpy as np
import glob
import os
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score

In [None]:
! pip install fancyimpute

In [None]:
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, BiScaler, IterativeImputer

In [None]:
# Mount Google Drive at /content/drive; the data-loading cell below changes
# into a folder under '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive') 

In [None]:
# Read data and impute missing values
os.chdir('/content/drive/My Drive/Colab Notebooks/PhysionetSepsisChallenge')

# Load every patient file. The paths are sorted so the row order of `df` is
# reproducible, and the frames are collected in a list and concatenated once
# (concatenating inside the loop copies the growing frame on every iteration).
patient_frames = []
for file in sorted(glob.iglob('Data/Data_partial/*.psv')):
    # Drop the first character of the file stem and parse the rest as the
    # numeric patient identifier.
    f_name = float(os.path.splitext(os.path.basename(file))[0][1:])
    tempdf = pd.read_csv(file, sep='|', index_col=None, header=0)
    # Rows are read in file order, so the default RangeIndex is the row's
    # position within the patient's record.
    tempdf['Hour'] = tempdf.index
    tempdf['Identifier'] = f_name
    patient_frames.append(tempdf)

if not patient_frames:
    raise FileNotFoundError(
        "No .psv files found under 'Data/Data_partial' in " + os.getcwd())

df = pd.concat(patient_frames, axis=0)

# Names of all columns in the data that contain physiological data
physiological_cols = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets']

# Names of all columns in the data that contain demographic data
demographic_cols = ['Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS']

# Columns of features
feature_cols = physiological_cols + ['Hour'] + demographic_cols + ['Identifier']

# The name of the column that contains the value we are trying to predict
label_col = 'SepsisLabel'

# Keep only the known columns, with the label as the last column
cols = feature_cols + [label_col]
df = df.loc[:, cols]

# Plot percentage of missing values (NaNs) for each feature
cutoff = 60
fig = plt.figure(figsize=(20,10))
percent_missing = (df.isna().sum()/df.shape[0])*100
percent_missing.plot(kind="bar")
# Horizontal reference line at the cutoff. axhline spans the full width of the
# axes; plotting the cutoff against `percent_missing` would use the percentages
# as x-coordinates and misplace the line relative to the bars.
plt.axhline(cutoff, color='r', linestyle='--')
fig.suptitle('Percentage Missing Values', fontsize=20)
plt.xlabel('Feature', fontsize=16)
plt.ylabel('% Missing Values', fontsize=16)

# Retain columns in dataframe with <= cutoff% missing values
df = df.loc[:, percent_missing <= cutoff]
print('Retained features:')
print(df.columns.values)

# Every retained column except the label is a feature. Selecting by name
# (instead of "all but the last column") fails loudly if the label was dropped.
feature_cols = df.columns.drop(label_col).values
# Adjust physiological and demographic column names
physiological_set = set(physiological_cols)
demographic_set = set(demographic_cols)
physiological_cols = [x for x in feature_cols if x in physiological_set]
demographic_cols = [x for x in feature_cols if x in demographic_set]

# Impute missing data using fancyimpute package.
# The label and the arbitrary patient identifier are passed through untouched:
# using them as predictors would leak the target (and a meaningless id) into
# the imputed feature values.
# NOTE(review): the imputer is still fit on all patients before the
# cross-validation split further down — confirm this is acceptable.
impute_cols = [c for c in df.columns if c not in (label_col, 'Identifier')]
df_filled = df.astype(float)
df_filled[impute_cols] = IterativeImputer().fit_transform(df[impute_cols].to_numpy())
pd.set_option('display.expand_frame_repr', False)
print(df_filled.head(5))

In [None]:
# Column that holds the value the model predicts
label_col = "SepsisLabel"

# Physiological measurements: one copy per hour is kept in each flattened window
physiological_cols = ['HR', 'O2Sat', 'SBP', 'MAP', 'DBP', 'Resp']

# Per-window columns (demographics plus the patient identifier): a single
# value is kept for each flattened window
demographic_cols = ['Age', 'Gender', 'HospAdmTime', 'ICULOS', 'Identifier']

# Model features: physiological columns followed by the demographic ones
feature_cols = [*physiological_cols, *demographic_cols]

def flatten(in_df, hours=4):
    res = []

    new_cols = []
    for i in range(hours):
        new_cols.append([c + "_" + str(i) for c in feature_cols])

    #print(new_cols) #list that has names of new columns

    df = in_df.sort_values("Hour")
   # print(df.iloc[-500:,:])
    for patient, _df in df.groupby("Identifier"):
        n = int(len(_df) / hours) #_df has the number of rows in each unique group of "Identifier"

        for i in range(n):
            window = _df.iloc[i*hours:(i+1)*hours]
            window_dict = {}

            for j in range(hours):
                for c in physiological_cols:
                    window_dict[c + "_" + str(j)] = window[c].iloc[j]

            for c in demographic_cols:
                window_dict[c] = window[c].iloc[0]

            window_dict[label_col] = window[label_col].mean()
            window_dict["patient"] = patient

            res.append(window_dict)

    res = pd.DataFrame(res)

    res = res[res[label_col] <= 1 / hours]
    res[label_col] = res[label_col].apply(lambda x: 1 if x else 0)

    return res
  
df_filled_flatten = flatten(df_filled)
# Show the windows of patient 9 only. Rows are selected by patient id, because a
# fixed positional slice only matches that patient for one particular set of files.
df_filled_flatten[df_filled_flatten["patient"] == 9]



In [None]:
# Feature matrix: every column except the two patient-id columns and the label
X = df_filled_flatten.drop(columns=['Identifier', 'SepsisLabel', 'patient']).to_numpy()
# Target vector: the binarised per-window label
y = df_filled_flatten['SepsisLabel'].to_numpy()
print(y.shape)
y

In [None]:
# Group k-fold cross-validation to ensure a
# specific patient's data is either in the
# training or test set

group = df_filled_flatten['Identifier'].to_numpy()

# Predictions and true labels are pooled over all folds
train_pred = []
train_actual = []

test_pred = []
test_actual = []

kf = GroupKFold(n_splits=5)
for train_idx, test_idx in kf.split(X, y, groups=group):
    X_train, y_train = X[train_idx, :], y[train_idx]
    X_test, y_test = X[test_idx, :], y[test_idx]

    # Decision tree classifier with higher penalty
    # for misclassifying the low frequency output
    # label. random_state is fixed so repeated runs build the same trees.
    clf = DecisionTreeClassifier(class_weight="balanced",
                                 max_depth=20,
                                 max_leaf_nodes=20,
                                 random_state=0)
    model = clf.fit(X_train, y_train)

    train_pred.extend(model.predict(X_train))
    train_actual.extend(y_train)

    test_pred.extend(model.predict(X_test))
    test_actual.extend(y_test)

In [None]:
# Function for evaluating train and test accuracy
def evaluate(actual, predicted, prefix=""):
    """Print precision, recall and accuracy (as percentages) on one line.

    `actual` and `predicted` are passed unchanged to the three scikit-learn
    scoring functions; `prefix` is printed at the start of the line.
    """
    scorers = (precision_score, recall_score, accuracy_score)
    percentages = tuple(scorer(actual, predicted) * 100 for scorer in scorers)

    print("%s Precision: %.3f%%, Recall: %.3f%%, Accuracy: %.3f%%" % ((prefix,) + percentages))

In [None]:
# Report the metrics pooled over all folds: first on the training rows,
# then on the held-out rows
evaluate(actual=train_actual, predicted=train_pred, prefix="Train")
evaluate(actual=test_actual, predicted=test_pred, prefix="Test")