In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score
from imblearn.under_sampling import RandomUnderSampler

from cache_em_all import Cachable

import os

In [2]:
DATA_DIR = "training" # Path to the data

# Names of all columns in the data that contain physiological data
# (34 names; their order matters, see the note on the mean/std arrays below)
physiological_cols = ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2',
       'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN',
       'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct',
       'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium',
       'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC',
       'Fibrinogen', 'Platelets']

# Names of all columns in the data that contain demographic data (6 names, order matters as well)
demographic_cols = ['Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS']

# The combination of physiological and demographic data is what we will use as features in our model
feature_cols = physiological_cols + demographic_cols

# The name of the column that contains the value we are trying to predict
label_col = "SepsisLabel"

# Pre-calculated means and standard deviation of all physiological and demographic columns. We will use this to normalize
# data using their z-score. This isn't as important for simpler models such as random forests and decision trees,
# but can result in significant improvements when using neural networks
#
# These are plain arrays without column labels: clean_data() combines them with data[physiological_cols] and
# data[demographic_cols], so the i-th value is applied to the i-th name of the matching column list.
# Keep the arrays and the column lists in the same order (and the same length) when editing either.
physiological_mean = np.array([
        83.8996, 97.0520,  36.8055,  126.2240, 86.2907,
        66.2070, 18.7280,  33.7373,  -3.1923,  22.5352,
        0.4597,  7.3889,   39.5049,  96.8883,  103.4265,
        22.4952, 87.5214,  7.7210,   106.1982, 1.5961,
        0.6943,  131.5327, 2.0262,   2.0509,   3.5130,
        4.0541,  1.3423,   5.2734,   32.1134,  10.5383,
        38.9974, 10.5585,  286.5404, 198.6777])
physiological_std = np.array([
        17.6494, 3.0163,  0.6895,   24.2988, 16.6459,
        14.0771, 4.7035,  11.0158,  3.7845,  3.1567,
        6.2684,  0.0710,  9.1087,   3.3971,  430.3638,
        19.0690, 81.7152, 2.3992,   4.9761,  2.0648,
        1.9926,  45.4816, 1.6008,   0.3793,  1.3092,
        0.5844,  2.5511,  20.4142,  6.4362,  2.2302,
        29.8928, 7.0606,  137.3886, 96.8997])
demographic_mean = np.array([60.8711, 0.5435, 0.0615, 0.0727, -59.6769, 28.4551])
demographic_std = np.array([16.1887, 0.4981, 0.7968, 0.8029, 160.8846, 29.5367])

In [3]:
def load_single_file(file_path):
    """Placeholder for loading one data file; not implemented yet.

    The body is only ``pass``, so every call currently returns ``None``.

    Parameters
    ----------
    file_path : str
        Path of the file to load. NOTE(review): presumably one of the paths
        returned by ``get_data_files()`` -- confirm once this is implemented.
    """
    # TODO: Implement helper function: load_single_file
    pass

In [4]:
def get_data_files(data_dir=None):
    """List the data files to use, skipping every file whose number is a multiple of 5.

    A file's number is read from its name by dropping the first character and
    the last four characters (for a name shaped like ``p000012.psv`` that
    leaves ``000012``). Files whose number is divisible by 5 are left out.
    NOTE(review): presumably those are held out for evaluation -- confirm.

    Directory entries whose name does not contain a number in that position
    (for example ``.DS_Store`` or Jupyter's ``.ipynb_checkpoints`` folder) are
    ignored instead of aborting the whole listing with a ``ValueError``.

    Parameters
    ----------
    data_dir : str, optional
        Directory to scan. Defaults to the module-level ``DATA_DIR``.

    Returns
    -------
    list of str
        Paths (``data_dir`` joined with the file name), sorted by file name.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    selected = []
    for name in sorted(os.listdir(data_dir)):
        try:
            file_number = int(name[1:-4])
        except ValueError:
            # Not a numbered data file; skip it rather than crash.
            continue
        if file_number % 5 > 0:
            selected.append(os.path.join(data_dir, name))
    return selected

def clean_data(data):
    """Z-score normalize the feature columns and replace non-finite values.

    The frame is modified in place (index reset, columns overwritten) and the
    same object is returned. Because of that, calling this twice on the same
    frame normalizes the already-normalized values again.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named in ``physiological_cols`` and
        ``demographic_cols``.

    Returns
    -------
    pandas.DataFrame
        The ``data`` object that was passed in.
    """
    # Replace whatever index the frame arrived with by a fresh 0..n-1 range.
    data.reset_index(drop=True, inplace=True)

    # z = (x - mean) / std, one column group at a time. The statistics are
    # plain arrays, so they line up with the columns by position.
    normalization_groups = (
        (physiological_cols, physiological_mean, physiological_std),
        (demographic_cols, demographic_mean, demographic_std),
    )
    for cols, mean, std in normalization_groups:
        data[cols] = (data[cols] - mean) / std

    # NaN becomes 0 (the mean after z-scoring); +inf / -inf become very large
    # positive / negative finite numbers.
    data[feature_cols] = np.nan_to_num(data[feature_cols])

    return data


# The Cachable decorator (imported from cache_em_all) is commented out below,
# so no caching is applied to load_data at the moment.
# @Cachable("data.csv")
def load_data():
    """Placeholder for loading the full dataset; not implemented yet.

    The body is only ``pass``, so every call currently returns ``None``.

    NOTE(review): presumably this should read every path from
    ``get_data_files()`` with ``load_single_file``, combine the results and
    run ``clean_data`` -- confirm the intended pipeline before implementing.
    """
    # TODO: Implement helper function: load_data
    pass

In [5]:
def evaluate(actual, predicted, prefix=""):
    """Print precision, recall and accuracy of a set of predictions.

    Each metric is computed with scikit-learn's default settings and shown as
    a percentage with three decimals on a single line.

    Parameters
    ----------
    actual : sequence
        True labels.
    predicted : sequence
        Predicted labels, in the same order as ``actual``.
    prefix : str, optional
        Text printed at the start of the line (e.g. "Train" or "Test").

    Returns
    -------
    None
        The result is only printed.
    """
    # Order matters: it must match the placeholders in the message below.
    metric_fns = (precision_score, recall_score, accuracy_score)
    percentages = tuple(fn(actual, predicted) * 100 for fn in metric_fns)

    print("%s Precision: %.3f%%, Recall: %.3f%%, Accuracy: %.3f%%" % ((prefix,) + percentages))

def train_simple(data, feature_cols, label_col):
    """Placeholder for a basic train-and-evaluate routine; not implemented yet.

    The body is only ``pass``, so every call currently returns ``None``.

    Parameters
    ----------
    data : pandas.DataFrame
        NOTE(review): presumably the frame produced by ``load_data()`` -- confirm.
    feature_cols : list of str
        Names of the columns to use as model inputs.
    label_col : str
        Name of the column holding the target.

    NOTE(review): ``train_test_split`` is imported at the top of the notebook
    but unused in the visible code; presumably it is meant for this function
    -- confirm before implementing.
    """
    # TODO: Implement helper function: train_simple
    pass

In [6]:
def train_stratified(data, feature_cols, label_col, stratify_col, make_classifier=None):
    """Cross-validate a classifier over 5 group-aware folds and print its metrics.

    Despite the function name, the splitter is ``GroupKFold``: all rows that
    share a value in ``stratify_col`` land in the same fold, so a group never
    appears in both the training and the test part of a split. Class
    proportions are not balanced across folds.

    Predictions from all folds are pooled, then precision / recall / accuracy
    are printed once for the training parts and once for the test parts via
    ``evaluate``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature, label and grouping columns.
    feature_cols : list of str
        Names of the columns used as model inputs.
    label_col : str
        Name of the column holding the target.
    stratify_col : str
        Name of the column whose values define the groups.
    make_classifier : callable, optional
        Zero-argument callable returning a new, unfitted estimator that
        provides ``fit(X, y)`` and ``predict(X)``; an estimator class such as
        ``RandomForestClassifier`` or a ``lambda`` both work. It is called
        once per fold. Defaults to ``DecisionTreeClassifier`` as a cheap
        baseline.
        TODO: pass your own classifier here once you have chosen one.

    Returns
    -------
    None
        Results are only printed.

    Raises
    ------
    ValueError
        Raised by ``GroupKFold`` when ``stratify_col`` has fewer than 5
        distinct values.
    """
    if make_classifier is None:
        make_classifier = DecisionTreeClassifier

    X = data[feature_cols]
    y = data[label_col]
    group = data[stratify_col]

    train_pred, train_actual = [], []
    test_pred, test_actual = [], []

    kf = GroupKFold(n_splits=5)
    for train_idx, test_idx in kf.split(X, y, group):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

        # Build and fit a fresh estimator for every fold so that nothing
        # learned on one split carries over into the next.
        clf = make_classifier()
        clf.fit(X_train, y_train)

        train_pred.extend(clf.predict(X_train))
        train_actual.extend(y_train)

        test_pred.extend(clf.predict(X_test))
        test_actual.extend(y_test)

    evaluate(train_actual, train_pred, "Train")
    evaluate(test_actual, test_pred, "Test")