In [1]:
import pandas as pd
import numpy as np
import pickle as pk
import os 
import re

from tqdm import tqdm
from collections import Counter, OrderedDict
from nltk import sent_tokenize, word_tokenize

import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer 
# from options import args

from gensim.models import KeyedVectors

In [2]:
# the following re patterns and cleaning processes are adapted from the biowordvec repo
# ==============================================================================
SECTION_TITLES = re.compile(
    r'('
    r'ABDOMEN AND PELVIS|CLINICAL HISTORY|CLINICAL INDICATION|COMPARISON|COMPARISON STUDY DATE'
    r'|EXAM|EXAMINATION|FINDINGS|HISTORY|IMPRESSION|INDICATION'
    r'|MEDICAL CONDITION|PROCEDURE|REASON FOR EXAM|REASON FOR STUDY|REASON FOR THIS EXAMINATION'
    r'|TECHNIQUE'
    r'):|FINAL REPORT',
    re.I | re.M)


def pattern_repl(matchobj):
    """
    Return a replacement string to be used for match object
    """
    return ' '.rjust(len(matchobj.group(0)))


def find_end(text):
    """Find the end of the report."""
    ends = [len(text)]
    patterns = [
        re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
        re.compile(r'\n {3,}DR.', re.I),
        re.compile(r'[ ]{1,}RADLINE ', re.I),
        re.compile(r'.*electronically signed on', re.I),
        re.compile(r'M\[0KM\[0KM')
    ]
    for pattern in patterns:
        matchobj = pattern.search(text)
        if matchobj:
            ends.append(matchobj.start())
    return min(ends)


def split_heading(text):
    """Split the report into sections"""
    start = 0
    for matcher in SECTION_TITLES.finditer(text):
        # add last
        end = matcher.start()
        if end != start:
            section = text[start:end].strip()
            if section:
                yield section

        # add title
        start = end
        end = matcher.end()
        if end != start:
            section = text[start:end].strip()
            if section:
                yield section

        start = end

    # add last piece
    end = len(text)
    if start < end:
        section = text[start:end].strip()
        if section:
            yield section

# PREPROCESAMIENTO
# Token
# Spell checker
# Lematización Inglés             
# Ingles Lemas -> Vector
# Traducción Lemas ingles-español
# Guardar ese embedding Español -> Vector
# Guardar ese embedding Ingles -> Vector
        
# PRODUCCIÓN
# Texto Español
# Token -> Spell Español -> Lematización -> Vector  
            

# ENTRENAMIENTO MIMIC III v1.4
# Filtrar DRGS 
# Entrenar Modelo 
# Ingles -> Vector -> Modelo -> DRG                  
            


def clean_text(text):
    """
    Clean text
    """

    # Replace [**Patterns**] with spaces.
    text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
    # Replace `_` with spaces.
    text = re.sub(r'_', ' ', text)

    start = 0
    end = find_end(text)
    new_text = ''
    if start > 0:
        new_text += ' ' * start
    new_text = text[start:end]

    # make sure the new text has the same length of old text.
    if len(text) - end > 0:
        new_text += ' ' * (len(text) - end)
    return new_text

def preprocess_mimic(text):
    """
    Preprocess reports in MIMIC-III.
    1. remove [**Patterns**] and signature
    2. split the report into sections
    3. tokenize sentences and words
    4. lowercase
    """
    tokens = []
    for sec in split_heading(clean_text(text)):
        for sent in sent_tokenize(sec):
            tokens.extend([w.lower() for w in word_tokenize(sent)])
    return tokens
# ==============================================================================


def get_stay_tokens(file, text_dir, keeptime=False):
    """
        input: path
        output: tokens in order for all texts representing the stay
    """
    stay_df = pd.read_pickle(os.path.join(text_dir, file))
    note_dict = OrderedDict()
    for _, row in stay_df.iterrows():
        diff = row['DIFFTIME']
        if diff < hour:
            text = preprocess_mimic(row['TEXT'])
            note_dict[diff] = text

    if keeptime:
        return note_dict
    else:
        tokens = [t for note in note_dict.values() for t in note]
        return tokens

def get_common(files, text_dir, output_dir):
    all_tokens=[]
    for file in tqdm(files):
        all_tokens.extend(get_stay_tokens(file, text_dir))
    token_count = Counter(all_tokens)

    common = [w for (w,c) in token_count.most_common() if c >= args.word_min_freq]  
    print("{} tokens in text, {} unique, and {} of them appeared at least three times".format(len(all_tokens), len(token_count),len(common)))
    with open(os.path.join(output_dir, 'unique_common.txt'), 'w') as f:
        for w in common:
            f.write(w+'\n')
    return common

def get_embeddings(words, output_dir):
    print("loading biovec...")
    model = KeyedVectors.load_word2vec_format(os.path.join(args.pretrained_embed_dir, 'BioWordVec_PubMed_MIMICIII_d200.vec.bin'), binary=True)
    print("loaded, start to get embed for tokens")

    model_vocab = set(model.index_to_key)

    valid_words = []
    oov = []
    for w in words:
        if w in model_vocab:
            valid_words.append(w)
        else:
            oov.append(w)
    print("oov", oov)

    # vocab dicts
    token2id = {}
    token2id['<pad>'] = 0
    for word in valid_words:
        token2id[word] = len(token2id)
    token2id['<unk>'] = len(token2id)

    # get embeddings; pad initiliazed as zero, unk as random
    dim = model.vectors.shape[1]
    embedding = np.zeros( (len(valid_words)+2, dim), dtype=np.float32)
    embedding[0] = np.zeros(dim,)
    embedding[-1] = np.random.randn(dim,)
    print("embed shape", embedding.shape)
    for i, w in enumerate(valid_words):
        embedding[i+1] = model[w]

    # save them
    t2i_path = os.path.join(output_dir, 'token2id.dict')
    with open(t2i_path, 'wb') as f:
        pk.dump(token2id, f)

    embed_path = os.path.join(output_dir, 'embedding.npy')
    np.save(embed_path, embedding)

    return token2id

def save2id(file, token2id, text_dir, output_dir):
    output_path = os.path.join(output_dir, file.replace('pk','dict'))
    note_dict = get_stay_tokens(file, text_dir, keeptime=True)

    out_dict = OrderedDict()
    for key, tokens in note_dict.items():
        out_dict[key] = [token2id[w] if w in token2id else token2id['<unk>'] for w in tokens]

    with open(output_path, 'wb') as f:
        pk.dump(out_dict, f)

In [5]:
DATA_PATH = 'data'
MIMIC_PATH = 'D:\\codes\\mimic-iii-clinical-database-1.4'
EMBED_PATH = 'embedding'
CHECKPOINT_PATH_ms_model1 = 'checkpoints/ms_model1'

class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


args = {
    'eval_model': CHECKPOINT_PATH_ms_model1,
    'cohort': 'ms',
    'data_dir': DATA_PATH,
    'model': 'CAML',
    'dropout_rate': 0.3,
    'rule': 13,
    'rule_dir': 'rules',
    'max_seq_length': 2000,
    'cnn_filter_maps': 256,
    'single_kernel_size': 5,
    'batch_size': 32,
    # 'target': 'rw',
    'target': 'drg',
    'device': 'cuda',
    'Y': 738,
    'threshold': 48,
    'word_min_freq': 3,
    'pretrained_embed_dir': 'embedding'
    }


args = dotdict(args)

hour=args.threshold
data_dir = '%s/%s' % (args.data_dir, args.cohort)
text_dir = '%s/text_raw' % data_dir
output_dir = '%s/text_embed' % data_dir
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
files = [f for f in os.listdir(text_dir) if f.endswith('pk')]

# get vocab and save embeddings
words = get_common(files, text_dir, output_dir)
token2id = get_embeddings(words, output_dir)

# text to ids
for file in tqdm(files):
    save2id(file, token2id, text_dir, output_dir)

  0%|          | 0/19792 [00:00<?, ?it/s]

100%|██████████| 19792/19792 [31:14<00:00, 10.56it/s] 


39386136 tokens in text, 241276 unique, and 82086 of them appeared at least three times
loading biovec...
loaded, start to get embed for tokens
oov ['....', 'presyncope.', 'yfgga', "w'", '.....', 'q\\a7/', 'nu3fy', 'dhuorq', 'dxfdl', 'gg4g~', 's|f', 'w\\r', 'v2j/d', '=srgb', 'fevers/chills.', 'bipap.', 'etoh.', 'brbpr.', 'rvr.', 'egd.', 'normalized.', 'worsened.', '.......', 'fff.ff6', 'sbps.', 'neuromonitoring.', 'manegment.', 'west-inr', '108/50.', 'asymptomatic.', 'hypoxic.', 'workup.', '60/p.', 'diaphoresis.', '170s.', 'cc7.', '100s.', 'w/dr.', '3l.', 'prbcs.', 'dnr/dni.', 'lightheadedness.', '5.41.', 'urinal.', 'rigors.', '7.45/36/109.', 'ivfs.', '3hrs/day.', 'levaquin.', 'moved.', 'a1c:5', 'hematocrits.', 'unrevealing.', 'piv.', 'full.', 'milrinone.', 'illicits.', 'worsening.', 'arf.', '7.33/85/68/47.', 'ivx1.', 'daughters.', 'fmask.', 'vigorous.', 'vna.', 'sunday.', 'stairs.', 'trached.', 'initially.', 'catherization.', 'pivs.', 'tte/pfts/dental', 'habitus.', 'worsen.', '.17=cjp

 28%|██▊       | 5475/19792 [09:04<31:14,  7.64it/s]  