### Data Prep

In [1]:
import sparse
import numpy as np
import pandas as pd

In [3]:
# Shock: binary label, 1 when the recorded onset hour is <= 4.
shock_df = (
    pd.read_csv('data/Shock.csv')
    .rename(columns={'ID': 'ICUSTAY_ID'})
    .assign(Shock_LABEL=lambda frame: (frame['Shock_ONSET_HOUR'] <= 4).astype(int))
    [['ICUSTAY_ID', 'Shock_LABEL']]
)

# ARF: binary label, 1 when the recorded onset hour is <= 12.
arf_df = (
    pd.read_csv('data/ARF.csv')
    .rename(columns={'ID': 'ICUSTAY_ID'})
    .assign(ARF_LABEL=lambda frame: (frame['ARF_ONSET_HOUR'] <= 12).astype(int))
    [['ICUSTAY_ID', 'ARF_LABEL']]
)

# Mortality labels: keep everything except the 'stay' and 'y_true' columns.
mort_labels = (
    pd.read_csv('data/mortality_48.0h.csv')
    .rename(columns={'ID': 'ICUSTAY_ID'})
    .drop(columns=['stay', 'y_true'])
)

In [4]:
# Restrict both onset tables to stays that also appear in the mortality table,
# ordered by stay id with a fresh 0..n-1 index.
shock_df = (
    shock_df.loc[shock_df['ICUSTAY_ID'].isin(mort_labels['ICUSTAY_ID'])]
    .sort_values('ICUSTAY_ID')
    .reset_index(drop=True)
)
arf_df = (
    arf_df.loc[arf_df['ICUSTAY_ID'].isin(mort_labels['ICUSTAY_ID'])]
    .sort_values('ICUSTAY_ID')
    .reset_index(drop=True)
)

# One row per stay present in all three tables; columns put in a fixed order.
labels_df = (
    shock_df
    .merge(arf_df, on="ICUSTAY_ID", how="inner")
    .merge(mort_labels, on="ICUSTAY_ID", how="inner")
    [['ICUSTAY_ID', 'partition', 'Shock_LABEL', 'ARF_LABEL', 'mortality_LABEL']]
)

Unnamed: 0,ICUSTAY_ID,partition,Shock_LABEL,ARF_LABEL,mortality_LABEL
0,200001,train,0,0,0
1,200028,train,1,1,0
2,200033,train,0,0,1
3,200034,train,0,1,0
4,200053,train,0,1,0
...,...,...,...,...,...
8572,299909,val,0,0,0
8573,299913,train,0,1,0
8574,299949,val,0,1,0
8575,299950,val,0,0,0


In [5]:
# ICU stay table with admission/discharge timestamps parsed, ordered by patient.
icus = (
    pd.read_csv('data/ICUSTAYS.csv', parse_dates=['INTIME', 'OUTTIME'])
    .sort_values('SUBJECT_ID')
    .reset_index(drop=True)
)
# Sanity check: number of labelled stays that have a row in the ICU stay table.
labels_df['ICUSTAY_ID'].isin(icus['ICUSTAY_ID']).sum()

8577

In [6]:
# Truncate every ICD-9 code to its first three characters.
diagnoses = pd.read_csv('data/DIAGNOSES_ICD.csv')
diagnoses['ICD9_CODE'] = diagnoses['ICD9_CODE'].str[:3]
# Number of distinct truncated codes across the whole diagnoses table (not only the cohort).
print(diagnoses['ICD9_CODE'].nunique())

# Attach diagnoses to labelled stays through the hospital admission id.
codes = (
    icus.loc[icus['ICUSTAY_ID'].isin(labels_df['ICUSTAY_ID']), ['HADM_ID', 'ICUSTAY_ID']]
    .merge(diagnoses, on='HADM_ID')
    [['ICUSTAY_ID', 'ICD9_CODE']]
)
# One indicator column per code, summed so each stay becomes a single row of counts.
codes = (
    pd.get_dummies(codes.set_index('ICUSTAY_ID')['ICD9_CODE'])
    .groupby(level=0)
    .sum()
)

942


Unnamed: 0_level_0,003,005,007,008,009,011,013,014,018,027,...,V69,V70,V74,V81,V84,V85,V86,V87,V88,V90
ICUSTAY_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200033,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200034,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
200053,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299909,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
299913,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
299949,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
299950,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Load the note events and convert the three timestamp columns to datetimes.
notes = pd.read_feather('data/NOTEEVENTS.feather')
for time_col in ('CHARTDATE', 'CHARTTIME', 'STORETIME'):
    notes[time_col] = pd.to_datetime(notes[time_col])

In [8]:
# Pair every note with the ICU stay(s) of the same patient and admission,
# then drop the join keys that are no longer needed.
df = (
    icus[['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID', 'INTIME']]
    .merge(notes, on=['SUBJECT_ID', 'HADM_ID'], how='inner')
    .drop(columns=['SUBJECT_ID', 'HADM_ID'])
)

# Left-join onto the labelled stays so only cohort stays remain.
df = labels_df[['ICUSTAY_ID']].merge(df, on='ICUSTAY_ID', how='left')

# Keep rows whose ISERROR is null and that carry a chart time; this also removes
# the all-null rows the left join produces for stays without any note.
df = df[df['ISERROR'].isnull() & df['CHARTTIME'].notnull()]

In [9]:
# Hours elapsed between ICU admission and the note's chart time.
df['TIME'] = (df['CHARTTIME'] - df['INTIME']).dt.total_seconds() / 3600
# Keep notes charted within the first 12 hours of the stay (both ends inclusive).
df = df[df['TIME'].between(0.0, 12.0)]

In [10]:
# Collapse to one row per stay: all of its note texts joined with single spaces.
# Notes are joined in their current row order; no sort by chart time is applied here.
df = (
    df.groupby('ICUSTAY_ID')['TEXT']
    .apply(' '.join)
    .reset_index()
)

In [11]:
import re
# Cleaning Text
def prep(x):
    """Remove de-identification placeholders and boilerplate from a note string.

    Steps, applied in order:
      1. drop every ``[...]`` span (non-greedy, up to the first ``]``);
      2. drop digit runs followed by a period (e.g. list numbering ``1.``);
      3. rewrite ``dr.`` as ``doctor`` and ``m.d.`` as ``md``;
      4. drop the ``admission date:`` / ``discharge date:`` headers;
      5. drop ``--``, ``__`` and ``==`` separator pairs.

    The word patterns in steps 3-4 are matched case-insensitively: the text
    passed in is not lower-cased beforehand, so capitalised forms such as
    ``Dr.`` or ``Admission Date:`` would otherwise be left in place.

    Args:
        x: the note text.

    Returns:
        The cleaned text. Whitespace left behind by removals is not collapsed.
    """
    # Raw strings avoid the invalid "\." escape sequences in plain literals.
    y = re.sub(r'\[(.*?)\]', '', x)
    y = re.sub(r'[0-9]+\.', '', y)
    y = re.sub(r'dr\.', 'doctor', y, flags=re.IGNORECASE)
    y = re.sub(r'm\.d\.', 'md', y, flags=re.IGNORECASE)
    y = re.sub(r'admission date:', '', y, flags=re.IGNORECASE)
    y = re.sub(r'discharge date:', '', y, flags=re.IGNORECASE)
    y = re.sub(r'--|__|==', '', y)
    return y

# Normalise the note text: fill missing values, flatten line breaks to spaces,
# trim the ends, then run the regex clean-up in `prep`.
df['TEXT'] = (
    df['TEXT']
    .fillna(' ')
    .str.replace('\n', ' ')
    .str.replace('\r', ' ')
    .apply(str.strip)
    .apply(prep)
)

In [12]:
from pandarallel import pandarallel
from transformers import BertTokenizer

# Start the pandarallel workers (no progress bar) and load the uncased BERT vocabulary.
pandarallel.initialize(progress_bar=False)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode each stay's text in parallel, padded to 'max_length' with truncation enabled.
df['FEATS'] = df['TEXT'].parallel_apply(
    lambda text: tokenizer.encode_plus(
        text,
        padding='max_length',
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
    )
)
df = df.drop(columns='TEXT')

# Unpack the three encoder inputs into their own columns, then drop the raw encoding.
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    df[field] = df['FEATS'].apply(lambda feats: feats[field])
df = df.drop(columns='FEATS')

INFO: Pandarallel will run on 128 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [13]:
# Persist the tokenised frame; feather is written from a copy with a fresh default index.
df.reset_index(drop=True).to_feather('prepped_text.feather')

In [2]:
import pandas as pd
# Reload the tokenised notes from 'prepped_text.feather'.
df = pd.read_feather('prepped_text.feather')

In [3]:
# Keep only the stay id and the three tokenizer outputs used as model inputs.
df = df[['ICUSTAY_ID', 'input_ids', 'token_type_ids', 'attention_mask']]

In [4]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertModel

In [5]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build an encoder from the default BertConfig and overlay the checkpoint weights on it.
bert = BertModel(config=BertConfig())
model_dict = bert.state_dict()

# map_location='cpu' lets the checkpoint deserialize regardless of the device it
# was saved from; the model is moved to `device` once the weights are in place.
checkpoint = torch.load(
    'ClinicalBERT_checkpoint/ClinicalBERT_pretraining_pytorch_checkpoint/pytorch_model.bin',
    map_location='cpu',
)

# Keep only tensors whose name exists in this model (extra heads are ignored).
# NOTE(review): checkpoints saved from a wrapper model commonly prefix encoder
# weights with 'bert.'; such keys are mapped onto the bare encoder names here.
# Confirm against the actual key names in this checkpoint.
pretrained_dict = {}
for key, value in checkpoint.items():
    if key not in model_dict and key.startswith('bert.'):
        key = key[len('bert.'):]
    if key in model_dict:
        pretrained_dict[key] = value

# With zero matching keys the encoder would silently keep its random
# initialisation and every downstream embedding would be meaningless.
if not pretrained_dict:
    raise RuntimeError(
        'No checkpoint tensor matched the BertModel parameter names; '
        'the pretrained weights were not loaded.'
    )

model_dict.update(pretrained_dict)
bert.load_state_dict(model_dict)

# A freshly constructed module is in training mode; eval() disables dropout so
# the extracted embeddings are deterministic.
bert = bert.to(device).eval()

In [7]:
class BertDataset(Dataset):
    """Dataset over a frame holding tokenised BERT inputs, one row per item.

    The frame must provide `input_ids`, `token_type_ids` and `attention_mask`
    columns; other columns are ignored.
    """

    def __init__(self, df):
        super().__init__()
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return `(input_ids, token_type_ids, attention_mask)` tensors for row `index` (positional)."""
        row = self.df.iloc[index, :]
        fields = (row.input_ids, row.token_type_ids, row.attention_mask)
        return tuple(torch.tensor(values) for values in fields)

class Bert(nn.Module):
    """Thin wrapper that returns the second output of the wrapped encoder, squeezed."""

    def __init__(self, bert):
        super().__init__()
        self.bert = bert

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Run the encoder and return element [1] of its output with size-1 dims removed.

        NOTE(review): for a Hugging Face BertModel element [1] is presumably the
        pooled output; confirm for the model actually passed in.
        `squeeze()` drops every dimension of size 1, so a batch of one loses its
        batch dimension.
        """
        outputs = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        second = outputs[1]
        return second.squeeze()

In [8]:
# Wrap the tokenised frame in a Dataset and iterate it one row per batch.
data = BertDataset(df)
# batch_size=1: each step feeds a single stay's inputs to the model.
dl = DataLoader(data, batch_size=1)
# Wrapper module whose forward returns the squeezed second output of `bert`.
bert_model = Bert(bert)

In [9]:
# no_grad() only turns off gradient tracking; dropout is controlled by the
# module's train/eval flag, so switch to eval mode to get deterministic embeddings.
bert_model.eval()
with torch.no_grad():
    res = []
    for input_ids, token_type_ids, attention_mask in dl:
        # Move each batch to the model's device and collect its output.
        res.append(bert_model(input_ids.to(device), token_type_ids.to(device), attention_mask.to(device)))

    # Stack the per-batch outputs into one tensor (one entry per batch) and persist it.
    res = torch.stack(res)
    torch.save(res, 'tensors.pt')

In [18]:
# Attach each stay's embedding as a plain Python list and keep only id + embedding.
df = df.assign(embed=res.cpu().numpy().tolist())[['ICUSTAY_ID', 'embed']]

In [20]:
# Persist the (ICUSTAY_ID, embed) frame.
df.to_feather('bert_embedding.feather')

In [9]:
# Reload the per-stay embeddings from 'bert_embedding.feather'.
df = pd.read_feather('bert_embedding.feather')

In [10]:
# Keep only labelled stays that have an embedding. reset_index() (without drop)
# preserves the previous row labels in a new 'index' column.
labels_df = (
    labels_df.loc[labels_df['ICUSTAY_ID'].isin(df['ICUSTAY_ID'].unique())]
    .reset_index()
)

In [11]:
# Align the diagnosis indicators to the labels by stay id, not by row position.
# Positional lookup is only correct if `codes` has exactly one row for every
# labelled stay in the same order; a stay with no diagnosis row would silently
# shift every later stay's codes onto the wrong label row.
if codes.index.name != 'ICUSTAY_ID':
    # After a previous run of this cell `codes` no longer carries the stay id as
    # its index, and re-aligning would yield all-zero rows.
    raise ValueError("`codes` must be indexed by ICUSTAY_ID; rebuild it before re-running this cell.")

# Stays without any diagnosis row get an all-zero indicator vector.
codes = codes.reindex(labels_df['ICUSTAY_ID'], fill_value=0).reset_index(drop=True)

# Column-wise concat on the shared 0..n-1 index. The trailing reset_index() adds
# a leading 'level_0' column because an 'index' column already exists.
labels_df = pd.concat([labels_df, codes], axis=1).reset_index()

In [13]:
# Vocabulary of "<task>:<value>" tokens for labels + task embeddings.
# Every column from position 4 onward contributes two entries (value 0 and value 1),
# numbered in insertion order; '<START>' takes the next free id at the end.
label_index = {}
for task in labels_df.columns[4:]:
    for option in (0, 1):
        label_index[f"{task}:{option}"] = len(label_index)
label_index['<START>'] = len(label_index)

In [14]:
# Columns from position 4 onward are the task/code columns.
label_cols = labels_df.columns[4:]

# Cap every value at 1 so summed code counts become binary indicators.
labels_df[label_cols] = labels_df[label_cols].clip(upper=1)

# For each row, look up the id of "<column>:<value>" for every task/code column.
labels_df['label_vector'] = labels_df.apply(
    lambda row: [label_index[f"{col}:{int(row[col])}"] for col in label_cols],
    axis=1,
)

  labels_df['label_vector'] = labels_df.apply(lambda row: [label_index[f"{col}:{int(row[col])}"] for col in labels_df.columns[4:]], axis=1)


Unnamed: 0,level_0,index,ICUSTAY_ID,partition,Shock_LABEL,ARF_LABEL,mortality_LABEL,003,005,007,...,V70,V74,V81,V84,V85,V86,V87,V88,V90,label_vector
0,0,0,200001,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24..."
1,1,1,200028,train,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[1, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24..."
2,2,3,200034,train,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24..."
3,3,4,200053,train,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24..."
4,4,5,200061,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6553,6553,8569,299889,train,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 3, 5, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24..."
6554,6554,8571,299904,train,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24..."
6555,6555,8573,299913,train,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 3, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24..."
6556,6556,8575,299950,val,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24..."


In [16]:
# Persist the labels, code indicators and label_vector column.
labels_df.to_feather('labels.feather')