# Loading dataset

* y : (N,) discrete for classification, real values for regression
* x : (N, D, tn) input multivariate time series data.
  * N is the number of data cases, D is the number of dimensions (channels) of the sparse and irregularly sampled time series, and tn is the length of the union of observed time stamps across all dimensions for data case n. Since tn varies from case to case, we pad with zeros to obtain an array representation.

* m : (N, D, tn) where m[i,j,k] = 0 means that x[i,j,k] is not observed.
* T : (N, D, tn) represents the actual time stamps of observation.

In [None]:
import pickle
import copy
import numpy as np

# Load the pickled vitals records. The records are later paired by position
# with the admission table (vitals[i] goes with the i-th admission id).
# NOTE(review): pickle can execute arbitrary code while loading; only use
# files from a trusted source.
with open('vitals_records.p', 'rb') as file:
    vitals = pickle.load(file)

In [None]:
# Load the pickled admission table. Later cells read field 0 of each record
# as the admission id, compare field 2 against 48 and use field 3 as the label.
# Presumably field 2 is length of stay in hours and field 3 the mortality
# flag (going by the file name) - TODO confirm.
# NOTE(review): pickle can execute arbitrary code while loading; only use
# files from a trusted source.
with open('adm_type_los_mortality.p', 'rb') as file:
    adm_info = pickle.load(file)

In [None]:
# Admission ids in table order.
adm_id = [entry[0] for entry in adm_info]

# Ids of the admissions whose field 2 is at least 48; only these are kept.
adm_id_needed = [
    entry[0]
    for entry in adm_info
    if entry[2] >= 48
]

In [None]:
# Map every admission id to the vitals record stored at the same position.
vitals_dict = {adm_id[i]: vitals[i] for i in range(len(adm_id))}

In [None]:
# Keep only the selected admissions, in `adm_id_needed` order.
vitals = [vitals_dict[x] for x in adm_id_needed]

# Resolve labels (field 3 of each admission record) through a dict instead of
# rescanning `adm_info` once per kept id. This is linear rather than quadratic
# and always yields exactly one label per kept admission, so `label` stays
# aligned with `vitals` even if an id were to appear twice in the table.
label_by_adm = {rec[0]: rec[3] for rec in adm_info}
label = [label_by_adm[x] for x in adm_id_needed]

# Trimming to the first 48 hours

In [None]:
# Original code: https://github.com/mlds-lab/interp-net/blob/master/src/mimic_preprocessing.py#L25
#
# For every admission this cell produces:
#   * timestamps[i]       - sorted, unique observation times that fall within
#                           `hours_from_adm` hours of the earliest observation
#   * vitals_new[i, j, k] - reading of channel j at timestamps[i][k], or -100
#                           when channel j has no usable reading at that time
# NOTE: the merges and deletions below edit `vitals` in place, so this cell
# must not be re-run without reloading `vitals` first.

hours_from_adm = 48  # Hours of record to look at

num_features = 12  # final features (excluding EtCO2)
# maximum length of time stamp
# NOTE(review): 2881 == 48*60 + 1, i.e. presumably one slot per minute; more
# distinct time stamps than this in one window would raise IndexError.
max_length = 2881
vitals_new = np.zeros((len(vitals), num_features, max_length))
timestamps = []

# Text readings and how they are encoded numerically.
normal_readings = ('Normal <3 secs', 'Normal <3 Seconds', 'Brisk')           # -> 1
abnormal_readings = ('Abnormal >3 secs', 'Abnormal >3 Seconds', 'Delayed')  # -> 2
ignored_readings = ('Other/Remarks', 'Comment')                             # -> missing

for i in range(len(vitals)):
    channels = vitals[i]  # alias only; vitals[i] itself is modified below

    # Fold channel 7 into channel 6 after a Celsius -> Fahrenheit
    # conversion (x * 1.8 + 32); None readings are skipped.
    for elem in channels[7]:
        if elem[1] is not None:
            channels[6].append((elem[0], elem[1]*1.8 + 32))

    # Fold channels 10 and 11 into channel 9 unchanged.
    channels[9].extend(channels[10])
    channels[9].extend(channels[11])

    # removing duplicates and EtCO2
    # Indices shift after each del: in original numbering these remove
    # channels 5, 7, 10 and 11.
    del channels[5]
    del channels[6]
    del channels[8]
    del channels[8]

    # taking union of all time stamps (sorted, unique),
    # we don't actually need this for our model
    all_times = sorted({obs[0]
                        for j in range(num_features)
                        for obs in channels[j]})

    # extracting first 48hr vitals, measured from the earliest time stamp
    TS = [t for t in all_times
          if (t - all_times[0]).total_seconds()/3600 <= hours_from_adm]
    timestamps.append(TS)

    for j in range(num_features):
        # First non-None reading recorded at each time stamp. A lookup is used
        # instead of a single forward-moving cursor because the merges above
        # append readings out of time order (and may repeat a time stamp);
        # a cursor stalls on those and silently marks every later reading of
        # the channel as missing.
        reading_at = {}
        for obs in channels[j]:
            if reading_at.get(obs[0]) is None:
                reading_at[obs[0]] = obs[1]

        for k, t in enumerate(TS):
            value = reading_at.get(t)
            if value is None:
                vitals_new[i, j, k] = -100  # missing vitals
            elif value in normal_readings:
                vitals_new[i, j, k] = 1
            elif value in abnormal_readings:
                vitals_new[i, j, k] = 2
            elif value in ignored_readings:
                vitals_new[i, j, k] = -100  # missing vitals
            else:
                vitals_new[i, j, k] = value

# Fixing input format

Return the input in the proper format

* x: observed values
* M: masking, 0 indicates missing values
* delta: time points of observation

In [None]:
timestamp = 200  # maximum number of time stamps kept per admission
num_features = 12

# Cut every over-long time-stamp list down to its first `timestamp` entries;
# shorter lists are left as they are.
for idx, stamps in enumerate(timestamps):
    if len(stamps) > timestamp:
        timestamps[idx] = stamps[:timestamp]

In [None]:
# Trim the value array to the truncated time axis, then allocate the all-zero
# mask (M) and time-stamp (delta) arrays with the same shape and dtype.
vitals_new = vitals_new[..., :timestamp]
M, delta = np.zeros_like(vitals_new), np.zeros_like(vitals_new)
print(vitals_new.shape, len(timestamps))

In [None]:
# Re-express each list of time stamps as hours elapsed since its first entry
# (the first entry itself becomes 0). Empty lists are skipped.
for stamps in timestamps:
    if not stamps:
        continue
    start = stamps[0]
    stamps[1:] = [(s - start).total_seconds()/3600.0 for s in stamps[1:]]
    stamps[0] = 0

# Readings above 500 (outliers) or below 0 (which includes the -100
# placeholder) count as missing: their mask is 0 and their value is reset to 0,
# so the placeholder is no longer needed once the mask exists.
# M = 0 -> missing, M = 1 -> observed. Only strictly positive readings are
# marked observed; entries equal to 0 keep the mask value they already had.
rejected = (vitals_new > 500) | (vitals_new < 0)
M[rejected] = 0
vitals_new[rejected] = 0.0
M[vitals_new > 0] = 1

In [None]:
# All channels of one admission share the same time axis, so write that
# admission's relative time stamps into the first `num_features` rows of
# delta with one broadcast assignment instead of an element-by-element
# triple loop. Positions past the end of the list stay 0 (padding).
for j in range(vitals_new.shape[0]):
    stamps = timestamps[j]
    delta[j, :num_features, :len(stamps)] = stamps

print(len(vitals_new))
print(len(M))
print(len(delta))

# Mean imputation

In [None]:
def mean_imputation(vitals, mask):
    """Seed every entirely unobserved series with its channel's global mean.

    For the time series missing entirely, our interpolation network
    assigns the starting point (time t=0) value of the time series to
    the global mean before applying the two-layer interpolation network.
    In such cases, the first interpolation layer just outputs the global
    mean for that channel, but the second interpolation layer performs
    a more meaningful interpolation using the learned correlations from
    other channels.

    Both arrays are modified in place.

    Args:
        vitals: array indexed as (case, channel, time step) holding the
            readings.
        mask: array of the same shape; a series whose mask sums to 0 is
            treated as entirely unobserved.

    Returns:
        None.
    """
    # Per-channel mean over the observed entries of all cases. A channel that
    # is never observed in any case gets mean 0.0 rather than the NaN that a
    # 0/0 division would write into the data.
    counts = mask.sum(axis=(0, 2))
    totals = (vitals * mask).sum(axis=(0, 2))
    mean_values = np.zeros(counts.shape, dtype=float)
    np.divide(totals, counts, out=mean_values, where=counts > 0)

    # (case, channel) pairs with no observation at all: mark step 0 as
    # observed and store the channel mean there.
    cases, channels = np.nonzero(mask.sum(axis=2) == 0)
    mask[cases, channels, 0] = 1
    vitals[cases, channels, 0] = mean_values[channels]
    return


# Works in place: vitals_new and M are updated directly, nothing is returned.
mean_imputation(vitals_new, M)

In [None]:
def hold_out(mask, perc=0.2):
    """Return a copy of ``mask`` with a random share of observed points cleared.

    To implement the autoencoder component of the loss, we introduce a set
    of masking variables mr (and mr1) for each data point. If drop_mask = 0,
    then we remove the data point as an input to the interpolation network,
    and include the predicted value at this time point when assessing
    the autoencoder loss. In practice, we randomly select 20% of the
    observed data points to hold out from every input time series.

    Args:
        mask: array indexed as (case, channel, time step); entries > 0 mark
            observed points.
        perc: fraction of each series' observed points to hold out.

    Returns:
        An array shaped like ``mask`` that equals ``mask`` except that the
        held-out points are set to 0. ``mask`` itself is not modified.
    """
    drop_mask = mask.copy()
    for i in range(mask.shape[0]):
        for j in range(mask.shape[1]):
            observed = np.flatnonzero(mask[i, j] > 0)
            n_held_out = int(perc * observed.size)
            # Series that would lose fewer than two points are left untouched.
            if n_held_out > 1:
                picked = np.random.choice(observed.size, n_held_out,
                                          replace=False)
                drop_mask[i, j, observed[picked]] = 0
    return drop_mask

# Stack readings, observation mask, observation times and hold-out mask along
# the channel axis. `delta` holds the observation times and `M` is the mask.
x = np.concatenate((vitals_new, M, delta, hold_out(M)), axis=1)  # input format

In [None]:
# Four arrays are stacked on axis 1, so the middle dimension should be four
# times the per-array channel count.
print(x.shape)

In [None]:
# Labels as an array, one entry per kept admission.
y = np.asarray(label)
print(y.shape)

In [None]:
# Persist inputs and labels in one archive; read them back with
# np.load('preprocessed_data.npz') under the keys 'array1' (x) and 'array2' (y).
np.savez('preprocessed_data.npz', array1=x, array2=y)

# Acknowledgement

* https://github.com/mlds-lab/interp-net/blob/master/src/mimic_preprocessing.py#L25