This notebook documents the procedure for extracting ventilation and vasopressor treatment data for training. The files used are (1) pivoted_vital.csv, (2) sepsis3.csv, (3) ICUSTAYS.csv, (4) vasopressor_durations.csv, (5) ventilation_classification.csv, (6) ventilation_durations.csv, and (7) pivoted_lab.csv.

(3) is used to extract the indices of the sepsis3 patients. In (5), only patients who received mechanical ventilation are considered. The covariates and outcomes are taken from (1), and the static features from (2). (4) and (6) hold the vasopressor and ventilation treatments, respectively. (7) stores the lab results, which are used as covariates of the patients.

# Step 1 get sepsis3 cohort patient index

In [1]:
import csv

# Two lookup tables built from ICUSTAYS.csv.
# Only icustay_id is unique per row; subject and admission ids repeat.
sub_adm_icu = {}  # SUBJECT -> {ADMISSION -> [ICUSTAY, ...]}
icu_adm_sub = {}  # ICUSTAY -> {ADMISSION: SUBJECT}

with open('./ICUSTAYS.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    header = next(reader, None)  # first line holds the column names
    for row in reader:
        sub, adm, icu = row[1], row[2], row[3]
        # Create the nested containers on first sight of a subject / admission.
        stays = sub_adm_icu.setdefault(sub, {}).setdefault(adm, [])
        if icu not in stays:
            stays.append(icu)
        icu_adm_sub[icu] = {adm: sub}

In [7]:
# Spot check: show the admissions and ICU stays recorded for subject '7'.
sub_adm_icu['7']

{'118037': ['278444', '236754']}

In [21]:
# One list of subject ids per sepsis definition; the trailing number is the
# sepsis3.csv column holding that definition's flag.
angus = []     # 31
martin = []    # 32
nqf = []       # 36
cdc = []       # 37
explicit = []  # 33, 34, 35

with open('./sepsis3.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    header = next(reader, None)
    for row in reader:
        # row[0] = icustay_id, row[1] = hadm_id -> subject id
        p = icu_adm_sub[row[0]][row[1]]
        for column, cohort in ((32, martin), (31, angus), (36, nqf), (37, cdc)):
            if row[column] == '1':
                cohort.append(p)
        # "explicit" is satisfied by any of its three flag columns
        if '1' in (row[33], row[34], row[35]):
            explicit.append(p)

# Number of distinct subjects matching each definition.
for cohort in (angus, martin, nqf, cdc, explicit):
    print(len(set(cohort)))

12612
6742
5010
11307
3711


In [19]:
# Debug peek: `row` is whatever the most recently executed CSV loop left behind.
type(row)

list

# Step 2 get cohort static features

In [3]:
'''
icustay_id = 0
hadm_id = 1
admission time = 3
age = 13
gender = 14
ethinity = 16
metastatic_cancer = 21
diabetes = 22
height = 23
weight = 24
bmi = 25
sepsis_cdc = 37

Male = 1, Female = 2

WHITE = 1
BLACK = 2
HISPANIC = 3
ASIAN = 4
OTHER = 5


use cdc definition of sepsis to select patients (total = 11307)
'''
from sklearn.linear_model import LinearRegression
import numpy as np
import json

X_id = []
x = {}  # subject id -> static features; 'weight' and 'bmi' are dicts keyed by str((icustay, hadm))
weight_sex = []    # (weight, sex) for every selected row with a recorded weight
weight_sex_h = []  # (weight, sex, height) for rows that also have a height

# newline='' is the mode the csv module documents for reader input files.
with open('./sepsis3.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    header = next(reader, None)
    for row in reader:
        if row[37] == '1': # select cohort by cdc definition
            icustay = row[0]
            hadm = row[1]
            sub = icu_adm_sub[icustay][hadm]
            if sub not in x:
                x[sub] = {}
            # Scalar features are overwritten by each later row of the same subject.
            x[sub]['age'] = float(row[13])
            gender = 1 if row[14] == 'M' else 2
            x[sub]['sex'] = gender
            race = row[16]
            if 'WHITE' in race:
                ethinity = 1
            elif 'BLACK' in race:
                ethinity = 2
            elif 'HISPANIC' in race:
                ethinity = 3
            elif 'ASIAN' in race:
                ethinity = 4
            else:
                ethinity = 5
            x[sub]['race'] = ethinity
            x[sub]['metastatic_cancer'] = 1 if row[21] == '1' else 2
            x[sub]['diabetes'] = 1 if row[22] == '1' else 2
            if 'weight' not in x[sub]:
                x[sub]['weight'] = {}
            if 'bmi' not in x[sub]:
                x[sub]['bmi'] = {}
            # Height is stored once per subject: the first non-empty value seen wins.
            if 'height' not in x[sub]:
                h = row[23]
                if h == '':
                    x[sub]['height'] = ''
                else:
                    x[sub]['height'] = float(h)
            if x[sub]['height'] == '' and row[23] != '':
                x[sub]['height'] = float(row[23])
            w = '' if row[24] == '' else float(row[24])
            bmi = '' if row[25] == '' else float(row[25])
            if w != '':
                weight_sex.append((w, gender))
                if row[23] != '':
                    weight_sex_h.append((w, gender, float(row[23])))

            # Weight and BMI are kept per ICU stay. Later cells recover the
            # icustay id from this string key with idx[2:8].
            index = str((icustay, hadm))
            x[sub]['weight'][index] = w
            x[sub]['bmi'][index] = bmi


# data cleaning
# Cohort-wide mean weight: the fallback for subjects with no recorded weight.
mean_w = 0
for i in weight_sex:
    mean_w += i[0]
mean_w = mean_w / len(weight_sex)

# train linear regression model height = a*weight + b*gender + c
h = []
w_s = []
for i in weight_sex_h:
    h.append(i[2])
    w_s.append([i[0],i[1]])

reg = LinearRegression().fit(np.array(w_s), np.array(h))

# impute weight: reuse the subject's first recorded weight, else the cohort mean
for i in x:
    weight = x[i]['weight']
    values = list(weight.values())
    impute_w = ''
    for j in values:
        if j != '':
            impute_w = j
            break
    for j in weight:
        if weight[j] == '':
            x[i]['weight'][j] = impute_w if impute_w != '' else mean_w
# impute height and bmi
for i in x:
    if x[i]['height'] == '':
        # Predict the missing height from the subject's mean weight and sex.
        weights = list(x[i]['weight'].values())
        avg_w = sum(weights) / len(weights)
        sex = x[i]['sex']
        x[i]['height'] = reg.predict(np.array([[avg_w, sex]])).tolist()[0]
for i in x:
    for j in x[i]['bmi']:
        if x[i]['bmi'][j] == '':
            w = x[i]['weight'][j]
            # weight / (height / 100)^2 -- presumably kg and cm; confirm units
            x[i]['bmi'][j] = w / (x[i]['height']/100)**2

# save to JSON; the context manager closes the file even if json.dump raises
with open('./static_features.json', 'w') as f:
    json.dump(x, f)

'''
x['93535'] = {'age': 1,
 'sex': 2,
 'race': 1,
 'metastatic_cancer': 2,
 'diabetes': 2,
 'weight': {('200011', '121562'): 101.4},
 'bmi': {('200011', '121562'): ''},
 'height': ''}
'''
    

"\nx['93535'] = {'age': 1,\n 'sex': 2,\n 'race': 1,\n 'metastatic_cancer': 2,\n 'diabetes': 2,\n 'weight': {('200011', '121562'): 101.4},\n 'bmi': {('200011', '121562'): ''},\n 'height': ''}\n"

Select the cohort from (1) and (7) and impute the missing values. Output the derived tables.

In [1]:
import numpy as np
from datetime import datetime

def check_none(row):
    """Split the positions of ``row`` into missing and present entries.

    row: a sequence (e.g. a np.array column) in which a missing value is ''.
    Returns ``[is_none, missing, present]`` where ``is_none`` is 1 if at least
    one entry is '' and 0 otherwise, ``missing`` is a np.array with the
    positions of the '' entries and ``present`` a np.array with the positions
    of all other entries.
    """
    missing, present = [], []
    for pos, value in enumerate(row):
        (missing if value == '' else present).append(pos)
    return [int(bool(missing)), np.array(missing), np.array(present)]

def impute_table(table):
    """Fill the '' cells of every data column with the nearest recorded value.

    table: 2-D np.array of strings. Column 0 holds timestamps formatted as
        '%Y-%m-%d %H:%M:%S'; every other column holds measurements, with ''
        marking a missing value. The array is modified in place.
    Returns the imputed table as a list of lists (``table.tolist()``).
    Raises ValueError if a timestamp does not match the expected format.

    Filling works in passes. In each pass, every missing cell that sits
    directly next to a filled cell copies that neighbour. When both
    neighbours are filled, the one closer in time is used (a tie goes to the
    later row). Passes repeat until the column has no '' left.

    A column with no recorded value at all has nothing to copy from. It is
    left untouched, where the previous implementation looped forever.
    """
    attr_len = len(table[0])
    time = [datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S') for stamp in table[:, 0]]
    for i in range(1, attr_len):
        column = table[:, i]  # a view: writes below land in `table`
        while True:
            blank = column == ''
            none_idx = np.flatnonzero(blank)
            if none_idx.size == 0:
                break  # column is complete
            valid = np.flatnonzero(~blank)
            if valid.size == 0:
                break  # nothing to copy from; avoid an endless loop
            # `valid` is fixed for the whole pass, so cells filled during this
            # pass only become sources in the next one.
            for j in none_idx:
                neigh = valid[np.abs(valid - j) == 1]
                if len(neigh) == 1:
                    column[j] = column[neigh[0]]
                elif len(neigh) == 2:
                    before, after = neigh
                    closer_to_before = time[j] - time[before] < time[after] - time[j]
                    column[j] = column[before if closer_to_before else after]
    return table.tolist()

def no_data(table):
    """Tell whether some data column of ``table`` contains nothing but ''.

    table: 2-D np.array whose first column is the time index; only the
        columns after it are inspected.
    Returns 1 if at least one inspected column is '' in every row, else 0.
    """
    n_columns = len(table[0])
    has_empty_column = any(all(table[:, c] == '') for c in range(1, n_columns))
    return 1 if has_empty_column else 0

def reformat(table):
    """Re-key every per-stay row list by its first element, in place.

    table: {icu: [[t1, v1, v2, ...], [t2, ...], ...]}
    Returns the same dict object, now shaped as
    {icu: {t1: [v1, v2, ...], t2: [...], ...}}. A repeated first element
    keeps the values of the last row carrying it.
    """
    for key in table:
        table[key] = {record[0]: record[1:] for record in table[key]}
    return table

In [3]:
import csv  # needed by csv.reader below; this cell did not import it before
import json
import numpy as np


p_vital = './pivoted_vital.csv'

## process pivoted_vital
# Column layout of pivoted_vital.csv used below (row[1] is taken as the time
# column when the table is imputed):
#   heart rate = 2
#   systolic bp = 3
#   dias. bp = 4
#   mean bp = 5
#   respiratory rate = 6
#   temperature = 7
#   spo2 = 8

# get cohort icustay_id
cohort_icustay = set()
with open('./static_features.json') as file:
    data = json.load(file)
for key in data:
    w = data[key]['weight']
    for idx in w:
        # Keys look like "('200011', '121562')"; characters 2..7 are the icustay id.
        icu = idx[2:8] # length of id = 6
        cohort_icustay.add(icu)

select_vital = {}

with open(p_vital, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    header = next(reader, None)
    for row in reader:
        idx = row[0]
        if idx in cohort_icustay:
            if idx not in select_vital:
                select_vital[idx] = []
            select_vital[idx].append(row[1:9])  # time column + the 7 vitals

to_be_delete = [] # keys to be deleted
count = 0
max_row = 1900 # typically consider first 30 hours and vital is sample every minute
for i in select_vital:
    table = np.array(select_vital[i][:max_row])
    if len(table) <= 1:
        to_be_delete.append(i) # delete tables only have one time step
        continue
    if no_data(table):
        to_be_delete.append(i) # delete tables that at least one attribute has no value at all time
        continue
    select_vital[i] = impute_table(table)
    count += 1
    if count % 100 == 0:
        print('Table: '+str(count))

for i in to_be_delete:
    del select_vital[i]

#select_vital = reformat(select_vital)

# The context manager flushes and closes the file. Leaving the handle open can
# leave the JSON truncated on disk, which is the likely cause of the
# incomplete-file problem that used to need a manual check.
with open('./selected_vital.json', 'w') as file:
    json.dump(select_vital, file)

Table: 100
Table: 200
Table: 300
Table: 400
Table: 500
Table: 600
Table: 700
Table: 800
Table: 900
Table: 1000
Table: 1100
Table: 1200
Table: 1300
Table: 1400
Table: 1500
Table: 1600
Table: 1700
Table: 1800
Table: 1900
Table: 2000
Table: 2100
Table: 2200
Table: 2300
Table: 2400
Table: 2500
Table: 2600
Table: 2700
Table: 2800
Table: 2900
Table: 3000
Table: 3100
Table: 3200
Table: 3300
Table: 3400
Table: 3500
Table: 3600
Table: 3700
Table: 3800
Table: 3900
Table: 4000
Table: 4100
Table: 4200
Table: 4300
Table: 4400
Table: 4500
Table: 4600
Table: 4700
Table: 4800
Table: 4900
Table: 5000
Table: 5100
Table: 5200
Table: 5300
Table: 5400
Table: 5500
Table: 5600
Table: 5700
Table: 5800
Table: 5900
Table: 6000
Table: 6100
Table: 6200
Table: 6300
Table: 6400
Table: 6500
Table: 6600
Table: 6700
Table: 6800
Table: 6900
Table: 7000
Table: 7100
Table: 7200
Table: 7300
Table: 7400
Table: 7500
Table: 7600
Table: 7700
Table: 7800
Table: 7900
Table: 8000
Table: 8100
Table: 8200
Table: 8300
Table: 8400
T

In [3]:
import json
import csv
import numpy as np  # needed by np.array below; this cell did not import it before
p_lab = './pivoted_lab.csv'

## process lab

# Column layout of pivoted_lab.csv used below:
#   icustay_id = 0
#   charttime = 3
#   aniongap = 4
#   albumin = 5
#   bands = 6
#   bicarbonate = 7
#   bilirubin = 8
#   creatinine = 9
#   chloride = 10
#   glucose = 11
#   hematocrit = 12
#   hemoglobin = 13
#   lactate = 14
#   platelet = 15
#   potassium = 16
#   ptt = 17
#   inr = 18
#   pt = 19
#   sodium = 20
#   bun = 21
#   wbc = 22

cohort_icustay = set()
with open('./static_features.json') as file:
    data = json.load(file)
for key in data:
    w = data[key]['weight']
    for idx in w:
        # Keys look like "('200011', '121562')"; characters 2..7 are the icustay id.
        icu = idx[2:8] # length of id = 6
        cohort_icustay.add(icu)

select_lab = {}
with open(p_lab, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    header = next(reader, None)
    for row in reader:
        idx = row[0]
        if idx in cohort_icustay:
            if idx not in select_lab:
                select_lab[idx] = []
            r1 = [row[3]]          # charttime first ...
            r1.extend(row[4:23])   # ... followed by the lab columns
            select_lab[idx].append(r1)

to_be_delete = [] # keys to be deleted
count = 0
max_row = 1900 # keep at most this many lab rows per stay
for i in select_lab:
    table = np.array(select_lab[i][:max_row])
    if len(table) <= 1:
        to_be_delete.append(i) # delete tables only have one time step
        continue
    if no_data(table):
        to_be_delete.append(i) # delete tables that at least one attribute has no value at all time
        continue
    select_lab[i] = impute_table(table)
    count += 1
    if count % 100 == 0:
        print('Table: '+str(count))

for i in to_be_delete:
    del select_lab[i]
print(len(select_lab))
#select_lab = reformat(select_lab)

# The context manager flushes and closes the file. Leaving the handle open can
# leave the JSON truncated on disk, which is the likely cause of the
# incomplete-file problem that used to need a manual check.
with open('./selected_lab.json', 'w') as file:
    json.dump(select_lab, file)

Table: 100
Table: 200
Table: 300
Table: 400
Table: 500
Table: 600
Table: 700
Table: 800
Table: 900
Table: 1000
Table: 1100
Table: 1200
Table: 1300
Table: 1400
Table: 1500
Table: 1600
Table: 1700
Table: 1800
Table: 1900
Table: 2000
Table: 2100
Table: 2200
Table: 2300
Table: 2400
Table: 2500
Table: 2600
Table: 2700
Table: 2800
Table: 2900
Table: 3000
3042


In [1]:
# generate data for training and testing
import json
import csv
from datetime import datetime
import numpy as np
from bisect import bisect_right
from bisect import bisect_left

def read_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is opened in a context manager so the handle is closed as soon
    as parsing finishes, including when ``json.load`` raises.
    """
    with open(path) as file:
        return json.load(file)

def str2date(string):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a datetime object."""
    pattern = '%Y-%m-%d %H:%M:%S'
    parsed = datetime.strptime(string, pattern)
    return parsed

def read_duration(path, dataset='ventilation'):
    """Read a treatment-duration CSV into a dict keyed by its first column.

    path: CSV file of durations; the first line is treated as a header.
    dataset: 'ventilation' or 'vasopressor', matched case-insensitively. The
        historical misspelling 'vassopressor' is still accepted so existing
        callers keep working.

    Returns
        ventilation: {icu: [starttime, hours]} where starttime is the raw
            string from column 2 and hours is float(column 4). A later row
            for the same id replaces the earlier one.
        vasopressor: {icu: [stime1, etime1, stime2, etime2, ...]} holding the
            raw strings from columns 2 and 3 in file order.
    Raises Exception('Unknown dataset') for any other ``dataset``.
    """
    kind = dataset.lower()
    data = {}
    with open(path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        header = next(reader, None)
        if kind == 'ventilation':
            for row in reader:
                data[row[0]] = [row[2], float(row[4])]
            return data
        elif kind in ('vasopressor', 'vassopressor'):
            for row in reader:
                data.setdefault(row[0], []).extend([row[2], row[3]])
            return data
        else:
            raise Exception('Unknown dataset')

def is_match(idx, tables):
    """Return 1 when ``idx`` is contained in every table of ``tables``, else 0."""
    return int(all(idx in table for table in tables))

def read_classification(path):
    """Group the rows of a classification CSV by their first column.

    path: CSV file; the first line is skipped as a header.
    Returns {key: [[col1, col2, col3], ...]} with the raw string values of
    columns 1 to 3 of every row sharing that key, in file order.
    """
    grouped = {}
    with open(path, newline='') as csvfile:
        records = csv.reader(csvfile, delimiter=',')
        next(records, None)  # discard the header line
        for record in records:
            key = record[0]
            entry = [record[1]] + record[2:4]
            grouped.setdefault(key, []).append(entry)
    return grouped

def get_indexed(table, icu):
    """Return the rows stored under ``icu`` as a numeric np.array.

    table: {icu: [[time1, x1, x2, ...], ...]} with time strings formatted as
        '%Y-%m-%d %H:%M:%S'.
    Each output row is [timestamp, float(x1), float(x2), ...], where
    timestamp is the value of ``datetime.timestamp()`` (unit: second) for the
    parsed time string.
    """
    rows = np.array(table[icu])
    converted = [
        [datetime.strptime(rec[0], '%Y-%m-%d %H:%M:%S').timestamp()]
        + [float(value) for value in rec[1:]]
        for rec in rows
    ]
    return np.array(converted)


def get_weights(start, step, end):
    """Inverse-distance weights for interpolating at ``step`` between two points.

    Callers pass the sample times on either side of ``step``, so the expected
    ordering is start <= step <= end.
    Returns (w1, w2): w1 is the weight of the value at ``start``, w2 the
    weight of the value at ``end``. If ``step`` coincides with one end point,
    that end gets weight 1.0; if all three coincide, both get 0.5.
    """
    at_start = start == step
    at_end = step == end
    if at_start and at_end:
        return 0.5, 0.5
    if at_start:
        return 1.0, 0.0
    if at_end:
        return 0.0, 1.0
    inv_before = 1 / (step - start)
    inv_after = 1 / (end - step)
    norm = inv_before + inv_after
    return inv_before / norm, inv_after / norm

def extract(duration, select_lab, selected_vital, icu, total_duration, treatment_inter, dataset, classification=None):
    """Build covariate / outcome / treatment samples for one ICU stay.

    duration: result of read_duration; duration[icu][0] is the first
        treatment start time as a '%Y-%m-%d %H:%M:%S' string.
    select_lab, selected_vital: {icu: [[time, x1, x2, ...], ...]}.
    icu: ICU stay id, the key used in all of the tables.
    total_duration: number of hours to cover after the start time.
    treatment_inter: spacing between time steps, in hours.
    dataset: 'ventilation' or 'vasopressor' (case-insensitive; the
        misspelling 'vassopressor' is accepted as well).
    classification: {icu: [[time, vent, oxy], ...]}; required for the
        ventilation dataset only.

    Returns {'x': [...], 'y': [...], 'T': [...]} with one entry per usable
    time step, or 0 when the stay has no usable start time or no usable step.
    Raises Exception('Error: unknown dataset') for any other ``dataset``.
    """
    try:
        starttime = str2date(duration[icu][0]).timestamp()
    except (LookupError, TypeError, ValueError):
        # No duration record for this stay, or an unparsable start time.
        return 0

    lab = get_indexed(select_lab, icu)
    vital = get_indexed(selected_vital, icu)
    y = []
    x = []
    T = []

    dataset_key = dataset.lower()
    if dataset_key == 'ventilation':
        classi = get_indexed(classification, icu)
        length = [len(lab), len(vital), len(classi)]
        for i in range(0, total_duration, treatment_inter):
            time_setp = starttime + i*3600
            bisect_num = [bisect_right(lab[:,0], time_setp), bisect_right(vital[:, 0], time_setp), bisect_right(classi[:, 0], time_setp)]
            if 0 in bisect_num:
                continue  # some table has no row at or before this time step yet
            # True when any bisect index equals its table length, i.e. the
            # time step lies past the last row of that table.
            if  np.any(np.diff(np.dstack((bisect_num,length)))==0):
                break

            ## the treatment time step is 5 min after x_time, the treatment is indicated by the classification
            ## to the left of the t_time.
            t_time = time_setp + 300 # 5 min = 300 sec
            t_idx = bisect_right(classi[:, 0], t_time)
            if t_idx == length[2] - 1:
                break
            mchv = classi[t_idx-1][1]  # mechanical ventilator
            oxyth = classi[t_idx-1][2]  # oxygentherapy

            if oxyth == 1.0:
                break

            x_idx = bisect_right(vital[:, 0], time_setp)
            if x_idx == length[1] - 1:
                break # out of range
            w1, w2 = get_weights(vital[x_idx-1][0], time_setp, vital[x_idx][0])
            x_temp = (vital[x_idx - 1][1:] * w1 + vital[x_idx][1:] * w2).tolist() # interpolate vital

            x_idx = bisect_right(lab[:, 0], time_setp)
            if x_idx == length[0] - 1:
                break # out of range
            w1, w2 = get_weights(lab[x_idx-1][0], time_setp, lab[x_idx][0])
            x_temp1 = (lab[x_idx - 1][1:] * w1 + lab[x_idx][1:] * w2).tolist() # interpolate lab
            x_temp.extend(x_temp1)

            y_timestep = t_time + 300 # 5 min after the treatment
            y_idx = bisect_right(vital[:, 0], y_timestep)
            # y_timestep is later than time_setp, so y_idx can reach len(vital);
            # `>=` also stops there instead of indexing past the last row.
            if y_idx >= length[1] - 1:
                break
            w1, w2 = get_weights(vital[y_idx-1][0], y_timestep, vital[y_idx][0])
            # Outcome = last vital column, interpolated.
            # NOTE(review): w1 + w2 already sum to 1, so `/ 2` halves the
            # interpolated value -- confirm this is intended.
            y.append((vital[y_idx-1][-1] * w1 + vital[y_idx][-1] * w2) / 2)

            x.append(x_temp)
            T.append(mchv)

        return 0 if x == [] or y == [] or T == [] else {'x': x, 'y':y, 'T':T}

    elif dataset_key in ('vasopressor', 'vassopressor'):
        length = [len(lab), len(vital)]
        # Alternating start / end times; bisect below assumes they are in
        # ascending order -- TODO confirm against the durations file.
        treatment_period = []
        for i in duration[icu]:
            treatment_period.append(str2date(i).timestamp())
        for i in range(0, total_duration, treatment_inter):
            time_setp = starttime + i*3600
            bisect_num = [bisect_right(lab[:,0], time_setp), bisect_right(vital[:, 0], time_setp)]
            if 0 in bisect_num:
                continue
            outcome_time = time_setp + treatment_inter * 3600 / 2 # half hour between T and Y
            if bisect_right(vital[:, 0], outcome_time) == length[1] or np.any(np.diff(np.dstack((bisect_num,length)))==0):
                break

            t_idx = bisect_right(treatment_period, time_setp)
            x_time = treatment_period[t_idx - 1]
            x_idx = bisect_right(vital[:, 0], x_time)
            if x_idx == 0:
                # x_time precedes the first vital row; index -1 would silently
                # wrap around to the last row.
                continue
            if x_idx == length[1] - 1:
                break # out of range
            w1, w2 = get_weights(vital[x_idx-1][0], x_time, vital[x_idx][0])
            x_temp = (vital[x_idx - 1][1:] * w1 + vital[x_idx][1:] * w2).tolist() # interpolate vital
            # Each deletion shortens the list, so the three calls remove
            # consecutive original columns.
            del x_temp[-6] # sysbp
            del x_temp[-5] # diabp
            del x_temp[-4] # meanbp

            x_idx = bisect_right(lab[:, 0], x_time)
            if x_idx == 0:
                continue  # same wrap-around guard for the lab table
            if x_idx == length[0] - 1:
                break # out of range
            w1, w2 = get_weights(lab[x_idx-1][0], x_time, lab[x_idx][0])
            x_temp1 = (lab[x_idx - 1][1:] * w1 + lab[x_idx][1:] * w2).tolist() # interpolate lab
            x_temp.extend(x_temp1)

            y_idx = bisect_right(vital[:, 0], outcome_time)
            if y_idx == length[1] - 1:
                break
            w1, w2 = get_weights(vital[y_idx-1][0], outcome_time, vital[y_idx][0])
            # NOTE(review): same `/ 2` as in the ventilation branch -- confirm.
            y.append((vital[y_idx-1][-1] * w1 + vital[y_idx][-1] * w2) / 2)

            if  t_idx % 2 == 0:
                T.append(0) # even = no treatment, odd = treatment
            else:
                T.append(1)
            x.append(x_temp)
        return 0 if x == [] or y == [] or T == [] else {'x': x, 'y':y, 'T':T}
    else:
        raise Exception('Error: unknown dataset')

    
    


In [2]:
dataset = 'ventilation'  # 'ventilation' or 'vasopressor'
total_duration = 30  # hours
treatment_inter = 1  # hour

# Paths get their own names so the loaded tables below do not overwrite them
# and the cell can be re-run.
lab_path = './selected_lab.json'
vital_path = './selected_vital.json'
static = './static_features.json'

# read in data
static_features = read_json(static)
select_lab = read_json(lab_path) # {icu:[[time1, x1,x2,...],...}
selected_vital = read_json(vital_path) # {icu:[[time1, x1,x2,...],...} NO glucose !!! as that has been contained in lab

dataset_key = dataset.lower()
if dataset_key == 'ventilation':
    duration = read_duration('./ventilation_durations.csv')
    classification = read_classification('./ventilation_classification.csv') # {icu:[[time, vent, oxy],...]}
elif dataset_key in ('vasopressor', 'vassopressor'):
    # read_duration names this table 'vassopressor' (sic); pass that literal
    # so either spelling of `dataset` loads the file.
    duration = read_duration('./vasopressor_durations.csv', dataset='vassopressor')
    classification = None
else:
    # Fail here rather than with a NameError on `duration` in a later cell.
    raise ValueError('Unknown dataset: ' + str(dataset))

In [3]:
'''
    
dataset = 'ventilation' # ventilation or vassopressor
total_duration = 30 # hours
treatment_inter = 1 # hour
select_lab = './selected_lab.json'
selected_vital = './selected_vital.json'
static = './static_features.json'

# read in data
static_features = read_json(static)
select_lab = read_json(select_lab) # {icu:[[time1, x1,x2,...],...}
selected_vital = read_json(selected_vital) # {icu:[[time1, x1,x2,...],...}
'''
    

                             
# Assemble the export: {subject: {'static': [...], 'dynamic': {icu: {'x','y','T'}}}}
data = {}
for i in static_features:
    w = static_features[i]['weight']
    dynamic = {}
    for idx in w:
        # Keys look like "('200011', '121562')"; characters 2..7 are the icustay id.
        icu = idx[2:8] # length of id = 6
        if not is_match(icu, [select_lab, selected_vital]):
            continue  # the stay must have both lab and vital tables
        # NOTE(review): per-stay weight and bmi exist in static_features but
        # are not exported here -- confirm that is intended.
        extract_val = extract(duration, select_lab, selected_vital, icu, total_duration, treatment_inter, dataset, classification=classification)
        if extract_val:
            dynamic[icu] = extract_val

    if dynamic != {}:
        attribute = static_features[i]
        # Fresh name so the `static` path variable is not overwritten.
        static_values = [attribute[j] for j in ['age', 'sex', 'race', "metastatic_cancer", "diabetes", "height"]]
        data[i] = {'static': static_values, 'dynamic': dynamic}

filename = './' + dataset + '_' + 'duration_' + str(total_duration) + '.json'
# The context manager closes the file even if json.dump raises.
with open(filename, 'w') as file:
    json.dump(data, file)

In [19]:
# Debug peek: show one key of `duration` (an ICU stay id) to check the key format.
list(duration.keys())[2
                     ]

'200025'

In [10]:
# Number of subjects that ended up in the exported dataset.
len(data.keys())

2055

2013
