Download the eICU Collaborative Research Database (v2.0) from https://physionet.org/content/eicu-crd/2.0/. The task concerns vasopressor treatment. Four files are used: (1) patient.csv, (2) pivoted_treatment_vasopressor.csv, (3) pivoted_lab.csv, and (4) pivoted_vital.csv. Files (2) to (4) can be generated by running the scripts from https://github.com/mit-lcp/eicu-code. Place all four files in the same folder as this notebook.

# Step 1 Extract Static Features

In [1]:
import csv
from sklearn.linear_model import LinearRegression
import numpy as np

# features include patientUnitStayID, hospital ID, age, gender, race, weight, and height
path = './patient.csv'

'''
Gender: M = 1, F = 2
Race: Asian=1, Caucasian=2, African American=3, Native American=4, Hispanic=5, Other/Unknown=6, NULL not considered
'''
def getGender(sex):
    """Encode a gender label as an integer code.

    'Male' maps to 1 and 'Female' maps to 2; any other value yields None.
    """
    for label, code in (('Male', 1), ('Female', 2)):
        if sex == label:
            return code
    return None
    
def getAge(age):
    """Convert an age string to a float.

    Any string containing '>' is mapped to 90.0, an empty string yields None,
    and every other value is parsed with float().
    """
    if '>' in age:
        return 90.0
    if age == '':
        return None
    return float(age)

def getRace(race):
    """Encode a race label as an integer code.

    Codes are assigned in this order, starting at 1: Asian, Caucasian,
    African American, Native American, Hispanic, Other/Unknown.
    Any other value (including the empty string) yields None.
    """
    categories = (
        'Asian',
        'Caucasian',
        'African American',
        'Native American',
        'Hispanic',
        'Other/Unknown',
    )
    for code, label in enumerate(categories, start=1):
        if race == label:
            return code
    return None
    
def getHeight(h):
    """Parse a height string as a float; an empty string yields None."""
    return None if h == '' else float(h)

# Build the static-feature cohort, keyed by patientUnitStayID (row[0]).
# A stay is kept only if gender, age and race are recognised and both the
# height and the discharge weight are present.
# Each entry is {'static': [hospitalid, age, sex, race, height,
#                           initial weight, discharge weight]}.
# (The original cell also sketched a regression-based weight imputation in
# commented-out code; it was never active and is not used here.)
cohort_eicu = {}
with open(path, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    header = next(reader, None)  # skip the header row
    for row in reader:
        patientUnitStayID = row[0]
        sex = getGender(row[2])
        if sex is None:
            continue
        age = getAge(row[3])
        if age is None:
            continue
        race = getRace(row[4])
        if race is None:
            continue
        hospitalid = row[5]
        height = getHeight(row[8])
        if height is None:
            continue
        weight1 = row[22]  # initial weight
        weight2 = row[23]  # discharge weight
        if weight2 == '':
            continue
        weight2 = float(weight2)
        # A missing initial weight is imputed with the discharge weight.
        weight1 = weight2 if weight1 == '' else float(weight1)
        cohort_eicu[patientUnitStayID] = {
            'static': [hospitalid, age, sex, race, height, weight1, weight2]
        }

In [2]:
# Report the cohort size, then show the first three entries.
print(len(cohort_eicu))
for stay_id in list(cohort_eicu)[:3]:
    print(cohort_eicu[stay_id])

106238
{'static': ['59', 70.0, 2, 2, 152.4, 84.3, 85.8]}
{'static': ['60', 52.0, 2, 2, 162.6, 54.4, 54.4]}
{'static': ['60', 52.0, 2, 2, 162.6, 60.4, 60.4]}


Prepare tables for extracting treatment, covariates, and outcome

In [4]:
# Show the first treatment series only.
# NOTE(review): treatment_series is assigned in a later cell of this notebook,
# so this cell fails on a fresh kernel unless that cell has been run first.
for series in treatment_series.values():
    print(series)
    break

[107, '240', '903']


In [4]:
import numpy as np

def check_none(row):
    """Locate the missing entries of a sequence, where missing means ''.

    row: an iterable (e.g. a np.array) of values.
    Returns [is_none, missing_idx, present_idx]:
      is_none     -- 1 if at least one entry equals '', otherwise 0
      missing_idx -- np.array with the positions of the '' entries
      present_idx -- np.array with the positions of all other entries
    """
    missing, present = [], []
    for pos, value in enumerate(row):
        (missing if value == '' else present).append(pos)
    return [1 if missing else 0, np.array(missing), np.array(present)]


def impute_table(table):
    """Fill missing cells ('') of every non-time column from an adjacent filled row.

    table: a 2-D np.array whose first column is an integer-like time index.
           The array is modified in place.
    Returns the table as a list of lists.

    Filling proceeds in rounds. In each round a missing cell whose row
    neighbour (index +/- 1) was filled at the start of the round copies that
    neighbour. When both neighbours are filled, the one closer in time is
    used, and the later row wins a tie. Rounds repeat until the column has no
    missing cell left.

    A column with no filled cell at all is left untouched, because there is
    nothing to copy from (the original loop never terminated in that case).
    An empty table is returned as an empty list.
    """
    if len(table) == 0:
        return table.tolist()

    attr_len = len(table[0])
    time = [int(stamp) for stamp in table[:, 0]]
    for col in range(1, attr_len):
        is_none, none_idx, valid = check_none(table[:, col])
        # Stop as soon as the column is complete, or when it has no filled
        # cell to propagate from.
        while is_none == 1 and len(valid) > 0:
            for j in none_idx:
                # Filled rows directly above/below row j; `valid` is in
                # ascending order, so neigh[0] is the earlier row.
                neigh = [valid[k] for k, gap in enumerate(abs(valid - j)) if gap == 1]
                if len(neigh) == 1:
                    table[j, col] = table[neigh[0], col]
                elif len(neigh) == 2:
                    refer1_t = time[neigh[0]]
                    refer2_t = time[neigh[1]]
                    none_t = time[j]
                    nearest = neigh[0] if none_t - refer1_t < refer2_t - none_t else neigh[1]
                    table[j, col] = table[nearest, col]
                # No filled neighbour yet: a later round reaches this cell.
            is_none, none_idx, valid = check_none(table[:, col])
    return table.tolist()

def no_data(table):
    """Report whether any non-time column is entirely missing.

    table: a np.array whose first column is the time index.
    Returns 1 if at least one of the remaining columns contains only '',
    otherwise 0.
    """
    width = len(table[0])
    has_empty_column = any(all(table[:, c] == '') for c in range(1, width))
    return 1 if has_empty_column else 0

In [13]:
import csv

# select lab

path = './pivoted_lab.csv'

# Collect the lab rows of every stay in the cohort, keyed by the first CSV column.
select_lab = {}
with open(path, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    header = next(reader, None)  # skip the header row
    for row in reader:
        idx = row[0]
        if idx in cohort_eicu:
            r1 = row[1:]  # chartime, covariates...
            # Delete the higher index first so the lower one is not shifted.
            del r1[19]  # delete band
            del r1[9]   # delete co2
            select_lab.setdefault(idx, []).append(r1)

to_be_delete = []  # keys to be deleted
count = 0
# Only the first max_row rows of each stay are kept and imputed.
# (Original note: "typically consider first 30 hours and vital is sample
# every minute" -- TODO confirm that this cap is also meant for labs.)
max_row = 1900
for i in select_lab:
    table = np.array(select_lab[i][:max_row])
    if len(table) <= 1:
        to_be_delete.append(i)  # delete tables that only have one time step
        continue
    if no_data(table):
        to_be_delete.append(i)  # delete tables where an attribute has no value at any time
        continue
    select_lab[i] = impute_table(table)
    count += 1
    if count % 100 == 0:
        print('Table: ' + str(count))

for i in to_be_delete:
    del select_lab[i]
print(len(select_lab))

# Write through a context manager so the output is flushed and the handle is
# closed; the original left the file open.
with open('./selected_lab.txt', 'w') as file:
    file.write(str(select_lab))

Table: 100
Table: 200
Table: 300
Table: 400
Table: 500
Table: 600
Table: 700
Table: 800
Table: 900
Table: 1000
Table: 1100
Table: 1200
Table: 1300
Table: 1400
Table: 1500
Table: 1600
Table: 1700
Table: 1800
Table: 1900
Table: 2000
Table: 2100
Table: 2200
Table: 2300
Table: 2400
Table: 2500
Table: 2600
Table: 2700
Table: 2800
Table: 2900
Table: 3000
Table: 3100
Table: 3200
Table: 3300
Table: 3400
Table: 3500
Table: 3600
Table: 3700
Table: 3800
Table: 3900
Table: 4000
Table: 4100
Table: 4200
Table: 4300
Table: 4400
Table: 4500
Table: 4600
Table: 4700
Table: 4800
Table: 4900
Table: 5000
Table: 5100
Table: 5200
Table: 5300
Table: 5400
Table: 5500
Table: 5600
Table: 5700
Table: 5800
Table: 5900
Table: 6000
Table: 6100
Table: 6200
Table: 6300
Table: 6400
Table: 6500
Table: 6600
Table: 6700
Table: 6800
Table: 6900
Table: 7000
Table: 7100
Table: 7200
Table: 7300
Table: 7400
Table: 7500
Table: 7600
Table: 7700
Table: 7800
Table: 7900
Table: 8000
Table: 8100
Table: 8200
Table: 8300
Table: 8400
T

228404312

In [7]:
import csv

# select vital

path = './pivoted_vital.csv'

# Column positions read from pivoted_vital.csv:
#   icu_id = 0
#   charttime = 1
#   heartrate = 3
#   respir = 4
#   spo2 = 5
#   temp = 9
#   ibp_mean = -1  (outcome)

# Collect [time, hr, resp, spo2, temp, bp] rows for every stay in the cohort.
select_vital = {}
with open(path, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    header = next(reader, None)  # skip the header row
    for row in reader:
        idx = row[0]
        if idx in cohort_eicu:
            time, hr, resp, spo2, temp, bp = row[1], row[3], row[4], row[5], row[9], row[-1]
            select_vital.setdefault(idx, []).append([time, hr, resp, spo2, temp, bp])

to_be_delete = []  # keys to be deleted
count = 0
max_row = 1900  # typically consider first 30 hours and vital is sample every minute
for i in select_vital:
    table = np.array(select_vital[i][:max_row])
    # `<= 1` matches the filter used by the lab-selection cell.
    if len(table) <= 1:
        to_be_delete.append(i)  # delete tables that only have one time step
        continue
    if no_data(table):
        to_be_delete.append(i)  # delete tables where an attribute has no value at any time
        continue
    select_vital[i] = impute_table(table)
    count += 1
    if count % 200 == 0:
        print('Table: ' + str(count))

for i in to_be_delete:
    del select_vital[i]

# Write through a context manager so the output is flushed and the handle is
# closed; the original left the file open.
with open('./selected_vital.txt', 'w') as file:
    file.write(str(select_vital))

Table: 200
Table: 400
Table: 600
Table: 800
Table: 1000
Table: 1200
Table: 1400
Table: 1600
Table: 1800
Table: 2000
Table: 2200
Table: 2400
Table: 2600
Table: 2800
Table: 3000
Table: 3200
Table: 3400
Table: 3600
Table: 3800
Table: 4000
Table: 4200
Table: 4400
Table: 4600
Table: 4800
Table: 5000
Table: 5200
Table: 5400
Table: 5600
Table: 5800
Table: 6000
Table: 6200
Table: 6400
Table: 6600
Table: 6800
Table: 7000
Table: 7200
Table: 7400
Table: 7600
Table: 7800
Table: 8000
Table: 8200
Table: 8400
Table: 8600
Table: 8800
Table: 9000
Table: 9200
Table: 9400
Table: 9600
Table: 9800
Table: 10000
Table: 10200
Table: 10400
Table: 10600
Table: 10800
Table: 11000
Table: 11200
Table: 11400
Table: 11600
Table: 11800
Table: 12000
Table: 12200
Table: 12400
Table: 12600
Table: 12800
Table: 13000
Table: 13200
Table: 13400
Table: 13600
Table: 13800
Table: 14000
Table: 14200
Table: 14400
Table: 14600
Table: 14800
Table: 15000
Table: 15200
Table: 15400
Table: 15600
Table: 15800
Table: 16000
Table: 16200


783781985

Integrate Data

In [1]:
from bisect import bisect_right
from bisect import bisect_left
import numpy as np

def strToint(seq):
    """Convert every element of seq with int() and return them sorted ascending.

    seq: an iterable of values accepted by int() (e.g. a mix of str and int).
    """
    return sorted(int(item) for item in seq)

def getEffect(inte, bounds):
    """Turn treatment times into (lower, upper) effective-period bounds.

    inte:   iterable of treatment times; each is converted with int().
    bounds: sequence whose first two items are the offsets added to a
            treatment time to obtain the lower and upper bound.
    Returns a list of (time + bounds[0], time + bounds[1]) tuples, one per
    treatment time, in input order.
    """
    return [(onset + bounds[0], onset + bounds[1]) for onset in map(int, inte)]

def reformat(table):
    """Index a table by its time column.

    table: list of rows; row[0] is converted with int() and becomes the key,
           the remaining entries are converted with float().
    Returns {time: [float values]}; a repeated time keeps the last row seen.
    """
    return {int(row[0]): [float(cell) for cell in row[1:]] for row in table}

def getTreatment(periods, idx):
    """Return 1 if idx lies inside any period (bounds inclusive), otherwise 0.

    periods: list of tuples whose first two items are the lower and upper bound
    idx:     a time stamp
    """
    return 1 if any(span[0] <= idx <= span[1] for span in periods) else 0

def t_identify(t, x):
    """Flag the time stamps in x that directly follow a treatment time.

    t: list of treatment times
    x: sorted list of time stamps
    Returns a dict with one key per element of x; the value is 1 for the first
    time stamp at or after a treatment time and 0 otherwise.

    A treatment time is ignored when it is already a key of the result, when
    it is not later than x[0], or when it is later than x[-1].
    """
    flags = {}
    last = len(x)
    for stamp in t:
        if stamp in flags:
            continue
        pos = bisect_left(x, stamp)
        if pos in (0, last):
            continue
        # The preceding time stamp is recorded as untreated unless already set.
        flags.setdefault(x[pos - 1], 0)
        flags[x[pos]] = 1
    # Every remaining time stamp is untreated.
    for stamp in x:
        flags.setdefault(stamp, 0)
    return flags
        

def extract(duration, select_lab, selected_vital, icu, treatment_inter, effect = 5):
    """Build the per-time-step samples (covariates, outcome, treatment) of one ICU stay.

    duration:        dict {icu id: list of treatment times}
    select_lab:      dict {icu id: lab table, rows of [time, values...]}
    selected_vital:  dict {icu id: vital table, rows of [time, values...]};
                     the last column of a row is used as the outcome
    icu:             the icu id to extract
    treatment_inter: not used by this function (kept for existing callers)
    effect:          not used by this function (kept for existing callers)

    Returns 0 when the stay is missing from any input, has a single vital row,
    or produces no sample. Otherwise returns {'x': X, 'y': y, 'T': T} with one
    entry per pair of consecutive vital time stamps (now, next):
      x -- lab values at `now`, followed by the vitals at `now` without their
           last column
      y -- last vital column at `next`
      T -- treatment flag of `next` as computed by t_identify

    Lab values at `now` come from the first/last lab row when `now` lies
    outside the lab time range; otherwise they are an inverse-distance
    weighted mix of the two surrounding lab rows.

    Fixes relative to the previous version:
      * the two interpolation weights are normalised by key1 + key2
        (it used key1 + key1, so the weights did not sum to 1);
      * the lab row is copied before the vitals are appended (the stored row
        was extended in place, so later samples reusing it kept growing);
      * t_identify receives the sorted integer treatment times, so the flags
        no longer depend on input order or on str/int element types.
    """
    if not (icu in duration and icu in select_lab and icu in selected_vital):
        return 0
    if len(selected_vital[icu]) == 1:
        return 0

    treatment_times = strToint(duration[icu])
    lab = reformat(select_lab[icu])
    vital = reformat(selected_vital[icu])
    lab_idx = sorted(lab)
    treatment_indicate = t_identify(treatment_times, sorted(vital))
    vital_idx = sorted(treatment_indicate)

    X, y, T = [], [], []
    n = len(lab_idx)
    for now, nxt in zip(vital_idx, vital_idx[1:]):
        insert = bisect_right(lab_idx, now)
        if insert == 0:
            # `now` precedes every lab time: use a copy of the first lab row.
            interp_lab = list(lab[lab_idx[0]])
        elif insert == n:
            # `now` is at or after the last lab time: use a copy of the last row.
            interp_lab = list(lab[lab_idx[-1]])
        else:
            before, after = lab_idx[insert - 1], lab_idx[insert]
            # Inverse-distance weights; the epsilon avoids division by zero
            # when `now` coincides with a lab time.
            key1 = 1 / (abs(now - before) + 0.0000001)
            key2 = 1 / (abs(now - after) + 0.0000001)
            total = key1 + key2
            w1 = key1 / total
            w2 = key2 / total
            interp_lab = [w1 * a + w2 * b for a, b in zip(lab[before], lab[after])]
        interp_lab.extend(vital[now][:-1])
        X.append(interp_lab)
        y.append(vital[nxt][-1])
        T.append(treatment_indicate[nxt])
    return 0 if X == [] or y == [] or T == [] else {'x': X, 'y':y, 'T':T}
    
    
    
    


In [2]:
# load selected lab and vital

import ast


def load_txt_to_dict(path):
    """Load a Python literal from the first line of a text file.

    path: path of a file whose first line is the text of a Python literal,
          e.g. one written with file.write(str(some_dict)).
    Returns the object produced by ast.literal_eval on that line.
    Raises IndexError if the file is empty.

    The file is opened in a with-block so the handle is closed once the line
    has been read (the previous version never closed it).
    """
    with open(path) as file:
        txt = file.readlines()[0]
    return ast.literal_eval(txt)

# NOTE(review): the recorded run of this cell ended with a MemoryError. The
# traceback is not shown; parsing the two very large text dumps is the likely
# cause -- TODO confirm.
select_lab = load_txt_to_dict('./selected_lab.txt')
select_vital = load_txt_to_dict('./selected_vital.txt')
import csv

path = './pivoted_treatment_vasopressor.csv'

# Treatment times per ICU stay: {icu id (row[0]): [int(row[1]), ...]}.
treatment_series = {}
with open(path, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    header = next(reader, None)  # skip the header row
    for row in reader:
        icustay = row[0]
        treatment_series.setdefault(icustay, []).append(int(row[1]))

# Rebuild the static-feature cohort, keyed by patientUnitStayID (row[0]).
# NOTE(review): getGender, getAge, getRace and getHeight are defined in the
# Step 1 cell, not here; that cell must be run in the same kernel first.
# A stay is kept only if gender, age and race are recognised and both the
# height and the discharge weight are present.
cohort_eicu = {}
with open('./patient.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    header = next(reader, None)  # skip the header row
    for row in reader:
        patientUnitStayID = row[0]
        sex = getGender(row[2])
        if sex is None:
            continue
        age = getAge(row[3])
        if age is None:
            continue
        race = getRace(row[4])
        if race is None:
            continue
        hospitalid = row[5]
        height = getHeight(row[8])
        if height is None:
            continue
        weight1 = row[22]  # initial weight
        weight2 = row[23]  # discharge weight
        if weight2 == '':
            continue
        weight2 = float(weight2)
        # A missing initial weight is imputed with the discharge weight.
        weight1 = weight2 if weight1 == '' else float(weight1)
        cohort_eicu[patientUnitStayID] = {
            'static': [hospitalid, age, sex, race, height, weight1, weight2]
        }

MemoryError: 

In [71]:

# Assemble the per-stay dataset: {icu id: {'static': [...], 'dynamic': {...}}}.
# The previous version printed the first extract() result and stopped, which
# left `data` empty for the cells below; the intended loop was commented out
# and is restored here.
data = {}
treatment_inter = [5, 60] # in mins, the effect of treatment
for icu in list(cohort_eicu.keys()):
    extract_val = extract(treatment_series, select_lab, select_vital, icu, treatment_inter)
    if not extract_val:
        # extract() returns 0 when the stay yields no usable sample.
        continue
    static = cohort_eicu[icu]['static']
    data[icu] = {'static': static, 'dynamic': extract_val}


In [74]:
# Group the samples by hospital and count stays / treated / untreated samples.
data_sort = {}   # {hosp1: [[covariates, T, y], ...], ...}
statistic = {}   # {hospital_id: icu counts}
non_t = 0
t = 0
for stay in data.values():
    hosp = stay['static'][0]
    static = stay['static'][1:]
    statistic[hosp] = statistic[hosp] + 1 if hosp in statistic else 1
    hospital_rows = data_sort.setdefault(hosp, [])
    x = stay['dynamic']['x']
    y = stay['dynamic']['y']
    T = stay['dynamic']['T']
    for j, covariates in enumerate(x):
        if T[j]:
            t += 1
        else:
            non_t += 1
        # One output row: static features, dynamic covariates, treatment, outcome.
        hospital_rows.append([*static, *covariates, T[j], y[j]])
out_file = open('./vasopressor_hospital.txt', 'w')
out_file.write(str(data_sort))
out_file.close()

In [75]:
# ICU counts per hospital, then the number of untreated and treated samples.
for summary in (statistic, non_t, t):
    print(summary)

{'79': 93, '92': 14, '93': 1, '83': 3, '85': 1, '95': 2, '108': 5, '102': 6, '84': 1, '142': 16, '141': 24, '144': 1, '140': 5, '171': 66, '148': 107, '154': 100, '152': 73, '167': 273, '176': 226, '165': 144, '157': 104, '146': 35, '175': 7, '155': 37, '158': 1, '164': 1, '183': 68, '181': 35, '184': 12, '180': 5, '182': 2, '188': 66, '194': 3, '199': 42, '202': 8, '206': 17, '220': 1, '217': 6, '215': 2, '224': 1, '227': 49, '226': 14, '243': 143, '244': 9, '253': 27, '248': 47, '252': 192, '249': 2, '264': 20, '259': 12, '256': 3, '266': 1, '275': 20, '272': 3, '281': 23, '271': 17, '282': 3, '312': 7, '307': 18, '303': 3, '300': 12, '336': 11, '338': 65, '345': 8, '337': 1, '331': 6, '352': 2, '394': 38, '417': 26, '420': 224, '419': 28, '440': 26, '435': 66, '429': 5, '445': 10, '449': 100, '444': 3, '459': 26, '458': 208}
1104992
15039


In [1]:
# ICU counts per hospital, copied from the output of the previous cell.
x = {'79': 93, '92': 14, '93': 1, '83': 3, '85': 1, '95': 2, '108': 5, '102': 6, '84': 1, '142': 16, '141': 24, '144': 1, '140': 5, '171': 66, '148': 107, '154': 100, '152': 73, '167': 273, '176': 226, '165': 144, '157': 104, '146': 35, '175': 7, '155': 37, '158': 1, '164': 1, '183': 68, '181': 35, '184': 12, '180': 5, '182': 2, '188': 66, '194': 3, '199': 42, '202': 8, '206': 17, '220': 1, '217': 6, '215': 2, '224': 1, '227': 49, '226': 14, '243': 143, '244': 9, '253': 27, '248': 47, '252': 192, '249': 2, '264': 20, '259': 12, '256': 3, '266': 1, '275': 20, '272': 3, '281': 23, '271': 17, '282': 3, '312': 7, '307': 18, '303': 3, '300': 12, '336': 11, '338': 65, '345': 8, '337': 1, '331': 6, '352': 2, '394': 38, '417': 26, '420': 224, '419': 28, '440': 26, '435': 66, '429': 5, '445': 10, '449': 100, '444': 3, '459': 26, '458': 208}
# Total number of ICU stays across all hospitals, and the ratio 600 / total.
tot = sum(x.values())
print(tot)
print(600/tot)

3092
0.19404915912031048
