In [1]:
import numpy as np
import pandas as pd

# Directory prefix for the per-patient files; each file is read from
# RELATIVE_PATH + "<patient id>.txt".
RELATIVE_PATH = "x_all/"

# Column names shared by the input tables. ID also doubles as the variable
# name under which the patient id is injected into each patient's records.
ID, HOUR = "ID", "Hour"
VARIABLE, VALUE = "Variable", "Value"
# Not referenced by the cells below; presumably the label column of the
# outcome file -- TODO confirm.
OUTCOME = "Outcome"

This notebook merges all the feature values into a single table.

Based on the assumption that a patient's physiological status does not change significantly over consecutive time periods, a forward fill is applied to each patient's records. After that, a backward fill is applied to each patient's records to cover any hours before the first recorded value.

Note: If a patient has at least one recorded value for a feature, then after
filling, none of that patient's values for that feature are NaN.

In [2]:
# Build the training feature table: one row per (patient, hour) and one column
# per variable, written to "x_train.csv".

# Leading column order of the output; any variable not listed here is appended
# after these columns in order of first appearance.
feature_columns = [
    "ID", "Age", "Gender", "Unit1", "Unit2",
    "HR", "O2Sat", "Temp", "SBP", "MAP", "DBP", "Resp", "EtCO2",
    "BaseExcess", "HCO3", "FiO2", "pH", "PaCO2", "SaO2", "AST", "BUN",
    "Alkalinephos", "Calcium", "Chloride", "Creatinine", "Bilirubin_direct",
    "Glucose", "Lactate", "Magnesium", "Phosphate", "Potassium", "Bilirubin_total",
    "TroponinI", "Hct", "Hgb", "PTT", "WBC", "Fibrinogen", "Platelets"
]

outcome = pd.read_csv("train_outcome.csv")

# Collect the per-patient tables and concatenate once at the end; calling
# pd.concat inside the loop re-copies the whole accumulated table every time.
patient_tables = []
for cur_id in outcome[ID]:
    file_path = RELATIVE_PATH + str(cur_id) + ".txt"
    cur_patient = pd.read_csv(file_path)

    # Anchor records: one (hour, ID, cur_id) row for every hour 1..max hour, so
    # the pivoted table has a row for each hour and carries the patient id.
    cur_max_hour = np.max(cur_patient[HOUR])
    anchor = pd.DataFrame({
        HOUR: np.arange(1, cur_max_hour + 1),
        VARIABLE: ID,
        VALUE: cur_id,
    })
    cur_patient = pd.concat([cur_patient, anchor])

    # Long -> wide (rows: hour, columns: variable), then fill gaps forward in
    # time and finally backward for hours before a variable's first value.
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    cur_patient = cur_patient.pivot_table(index=HOUR, columns=VARIABLE, values=VALUE)
    cur_patient = cur_patient.ffill().bfill()

    patient_tables.append(cur_patient)

if patient_tables:
    data = pd.concat(patient_tables)
    extra_columns = [col for col in data.columns if col not in feature_columns]
    data = data.reindex(columns=feature_columns + extra_columns)
    # Drop the axis names picked up from the pivot so the CSV keeps an unnamed
    # index column.
    data.index.name = None
    data.columns.name = None
else:
    # No patient ids listed: still emit a header-only table.
    data = pd.DataFrame(columns=feature_columns)
data.to_csv("x_train.csv")

The above code will generate the training set and the following code will generate the test set.

In [3]:
# Build the test feature table: one row per (patient, hour) and one column per
# variable, written to "x_test.csv".

# Leading column order of the output; any variable not listed here is appended
# after these columns in order of first appearance.
feature_columns = [
    "ID", "Age", "Gender", "Unit1", "Unit2",
    "HR", "O2Sat", "Temp", "SBP", "MAP", "DBP", "Resp", "EtCO2",
    "BaseExcess", "HCO3", "FiO2", "pH", "PaCO2", "SaO2", "AST", "BUN",
    "Alkalinephos", "Calcium", "Chloride", "Creatinine", "Bilirubin_direct",
    "Glucose", "Lactate", "Magnesium", "Phosphate", "Potassium", "Bilirubin_total",
    "TroponinI", "Hct", "Hgb", "PTT", "WBC", "Fibrinogen", "Platelets"
]

# Only the ID column of this file is used here, to enumerate the patients.
outcome = pd.read_csv("test_nolabel.csv")

# Collect the per-patient tables and concatenate once at the end; calling
# pd.concat inside the loop re-copies the whole accumulated table every time.
patient_tables = []
for cur_id in outcome[ID]:
    file_path = RELATIVE_PATH + str(cur_id) + ".txt"
    cur_patient = pd.read_csv(file_path)

    # Anchor records: one (hour, ID, cur_id) row for every hour 1..max hour, so
    # the pivoted table has a row for each hour and carries the patient id.
    cur_max_hour = np.max(cur_patient[HOUR])
    anchor = pd.DataFrame({
        HOUR: np.arange(1, cur_max_hour + 1),
        VARIABLE: ID,
        VALUE: cur_id,
    })
    cur_patient = pd.concat([cur_patient, anchor])

    # Long -> wide (rows: hour, columns: variable), then fill gaps forward in
    # time and finally backward for hours before a variable's first value.
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    cur_patient = cur_patient.pivot_table(index=HOUR, columns=VARIABLE, values=VALUE)
    cur_patient = cur_patient.ffill().bfill()

    patient_tables.append(cur_patient)

if patient_tables:
    data = pd.concat(patient_tables)
    extra_columns = [col for col in data.columns if col not in feature_columns]
    data = data.reindex(columns=feature_columns + extra_columns)
    # Drop the axis names picked up from the pivot so the CSV keeps an unnamed
    # index column.
    data.index.name = None
    data.columns.name = None
else:
    # No patient ids listed: still emit a header-only table.
    data = pd.DataFrame(columns=feature_columns)
data.to_csv("x_test.csv")