### Preprocessing of the MIMIC anemia dataset

In [None]:
import pandas as pd
import sys
sys.path.append("..")

In [None]:
import glob

# Resolve the cleaned discharge-summary CSV; glob returns an empty list when nothing matches.
summary_pattern = "../mimic-data/cleaned_discharge_summaries.csv"
files = glob.glob(summary_pattern)
# Index -> path mapping of the matches (shown as the cell output in a notebook).
{position: path for position, path in enumerate(files)}



In [None]:
# Read the first matched CSV and drop every row that has a missing value in any column.
source_csv = files[0]
codes = pd.read_csv(source_csv).dropna()

def has_label(x, positive_icd9):
    """Return True if any ';'-separated code in ``x`` contains ``positive_icd9``.

    The check is a substring match, not an equality test, so '285.2' also
    matches a longer code such as '285.21'.
    """
    for code in x.split(';'):
        if positive_icd9 in code:
            return True
    return False

# Flag every row by whether its ICD9_CODE string contains '285.1' / '285.2'.
icd9_column = codes['ICD9_CODE']
codes['has_c1'] = icd9_column.apply(lambda joined: has_label(joined, '285.1'))
codes['has_c2'] = icd9_column.apply(lambda joined: has_label(joined, '285.2'))

# Row counts per (has_c1, has_c2) combination; the result is not stored.
codes.groupby(['has_c1', 'has_c2']).size()

# Keep only rows that carry exactly one of the two codes.
only_c1 = codes['has_c1'] & ~codes['has_c2']
only_c2 = codes['has_c2'] & ~codes['has_c1']
data_for_pos_label = codes[only_c1]
data_for_neg_label = codes[only_c2]

# Negatives first, then positives, on a fresh 0..n-1 index.
data = pd.concat([data_for_neg_label, data_for_pos_label]).reset_index(drop=True)
# target is 1 for rows with the 285.1 flag, 0 for rows with the 285.2 flag.
data['target'] = data['has_c1'].apply(lambda flag: int(bool(flag)))

In [None]:
from sklearn.model_selection import train_test_split
# First hold out 20% of the rows as the test split, stratified on the target.
remaining_idx, test_idx = train_test_split(
    data.index,
    stratify=data['target'],
    test_size=0.2,
    random_state=12939,
)
# Then carve 15% of what is left into the dev split, again stratified.
train_idx, dev_idx = train_test_split(
    remaining_idx,
    stratify=data.loc[remaining_idx, 'target'].tolist(),
    test_size=0.15,
    random_state=13448,
)
idxs = {'train': train_idx, 'test': test_idx, 'dev': dev_idx}

In [None]:
keys = ['train', 'dev', 'test']
import numpy as np

# Collect the documents and targets of each split, keyed by split name.
# Rows are selected with a membership mask, so they keep the order of `data`.
texts, labels = {}, {}
for split_name in keys:
    in_split = data.index.isin(idxs[split_name])
    split_rows = data[in_split]
    texts[split_name] = list(split_rows['TEXT'])
    labels[split_name] = list(split_rows['target'])

In [None]:
# Flatten the per-split lists into three parallel columns (train, dev, test order).
df_texts = [doc for k in keys for doc in texts[k]]
df_labels = [target for k in keys for target in labels[k]]
df_exp_split = [k for k in keys for _ in texts[k]]

# One row per document, tagged with its split, written out without the index.
df = pd.DataFrame({'text': df_texts, 'label': df_labels, 'exp_split': df_exp_split})
df.to_csv('mimic_anemia_dataset.csv', index=False)

In [None]:
from preprocess_bc import extract_vocabulary_

# The vocabulary is built while df["text"] still holds raw strings.
word_to_ix = extract_vocabulary_(min_df=5, dataframe=df)


def _with_boundary_tokens(document):
    """Surround ``document`` with <SOS>/<EOS> markers and split it on whitespace."""
    return ("<SOS> " + document + " <EOS>").split()


df["text"] = df["text"].apply(_with_boundary_tokens)

# Inverse lookup: index -> token.
ix_to_word = {index: token for token, index in word_to_ix.items()}

In [None]:
from gensim.models import KeyedVectors

# Load the saved word vectors from disk.
embeddings_path = "../mimic-data/mimic-embeds.wv"
model = KeyedVectors.load(embeddings_path)

In [None]:
# Build the (vocab_size x word_dim) embedding matrix. Words present in the
# loaded vectors get their pretrained row; all other words get Gaussian noise.
# The noise comes from a locally seeded generator so that repeated runs produce
# the same matrix (the train/dev/test splits are seeded as well).
EMBEDDING_SEED = 12939  # arbitrary fixed seed, only needed for reproducibility
word_dim, vocab_size = model.vector_size, len(word_to_ix)
pretrained = np.zeros([vocab_size, word_dim])
rng = np.random.RandomState(EMBEDDING_SEED)
in_pre = 0
# Iterating in sorted index order keeps the random draws tied to fixed rows.
# NOTE(review): assumes vocabulary indices are exactly 0..vocab_size-1 - confirm.
for i, word in sorted(ix_to_word.items()):
    if word in model:
        pretrained[i] = model[word]
        in_pre += 1
    else:
        pretrained[i] = rng.randn(word_dim)

# Row 0 is forced to all zeros regardless of what the loop wrote there.
# NOTE(review): presumably index 0 is the padding token - confirm in preprocess_bc.
pretrained[0] = np.zeros(word_dim)

print(f"Found {in_pre} words in model out of {len(ix_to_word)}")

In [None]:
from preprocess_bc import text_to_seq

def _encode_split(split_name):
    """Run text_to_seq on the (text, label) rows of one split of ``df``."""
    rows = df.loc[df.exp_split == split_name, ["text", "label"]].values
    return text_to_seq(rows, word_to_ix)


train_ix = _encode_split("train")
dev_ix = _encode_split("dev")
test_ix = _encode_split("test")

In [None]:
from preprocess_bc import DataHolder_BC

# Bundle the encoded splits, the vocabulary and the embedding matrix.
# This rebinds `data`, which earlier in this file named the labelled DataFrame.
data = DataHolder_BC(
    train_ix,
    dev_ix,
    test_ix,
    word_to_ix,
    embeds=pretrained,
)

In [None]:
import pickle

# Serialize the bundled dataset to data.p. The context manager guarantees the
# file is flushed and closed even if pickling raises part-way through.
with open("data.p", "wb") as handle:
    pickle.dump(data, handle)