In [70]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from nltk.tag import StanfordPOSTagger
from nltk import word_tokenize
from nltk.internals import find_jars_within_path
#import nltk
#nltk.download('punkt')

In [78]:
root = "/Users/ashleyroakes/Desktop/"
mim_root = root + "mimic-iii-clinical-database-1.4/"

# Data Pre-processing
## Read in Discharge Notes

In [71]:
notes = mim_root + "NOTEEVENTS.csv.gz"

notes_df = pd.read_csv(notes, compression='gzip', error_bad_lines=False)\
                .query("CATEGORY == 'Discharge summary'")\
                .query("DESCRIPTION == 'Report'")
notes_df = notes_df[['SUBJECT_ID', 'HADM_ID', 'TEXT']]

# Should be 55,177 records
print("Number of discharge summaries: ", + len(notes_df))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Number of discharge summaries:  55177


## Read in Patient Diagnoses

In [53]:
diag = mim_root + "DIAGNOSES_ICD.csv.gz"

diag_df = pd.read_csv(diag, compression='gzip', error_bad_lines=False)\
                    .dropna()\
                    .groupby('HADM_ID')['ICD9_CODE']\
                    .unique()\
                    .reset_index()
diag_df.head()

Unnamed: 0,HADM_ID,ICD9_CODE
0,100001,"[25013, 3371, 5849, 5780, V5867, 25063, 5363, ..."
1,100003,"[53100, 2851, 07054, 5715, 45621, 53789, 4019,..."
2,100006,"[49320, 51881, 486, 20300, 2761, 7850, 3090, V..."
3,100007,"[56081, 5570, 9973, 486, 4019]"
4,100009,"[41401, 99604, 4142, 25000, 27800, V8535, 4148..."


## Read in ICD9 Dictionary

In [54]:
icd = mim_root + "D_ICD_DIAGNOSES.csv.gz"

icd_df = pd.read_csv(icd, compression='gzip', error_bad_lines=False)

icd_df.head()

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,1166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,1170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,1171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,1172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,1173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."


## Merge datasets by HADM_ID

In [56]:
df = pd.merge(diag_df, notes_df, on='HADM_ID', how='inner')

# Should be 55177-5 = 55172
len(df)

55172

## Split data in train, valid, and test sets
### training (38,588 records, 69.9%), validation (5536 records, 10.0%) and testing (11,048 records, 20.0%) folds

In [None]:
# Random split
#X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size = .75)

## Tokenize documents using Stanford Tokenizer

In [83]:
# Add the jar and model via their path (instead of setting environment variables):
jar = root + 'stanford-postagger-full-2020-11-17/stanford-postagger.jar'
model = root + 'stanford-postagger-full-2020-11-17/models/english-left3words-distsim.tagger'

pos_tagger = StanfordPOSTagger(model, jar, encoding='utf8')

# Add other jars from Stanford directory
stanford_dir = pos_tagger._stanford_jar.rpartition('/')[0]
stanford_jars = find_jars_within_path(stanford_dir)
pos_tagger._stanford_jar = ':'.join(stanford_jars)

df['tokens'] = df["TEXT"].map(lambda t: pos_tagger.tag(word_tokenize(t)))

KeyboardInterrupt: 

## Substitute special sequences
### These special sequences were identified and replaced by the first token in the sequence, e.g. "[**Hospital1 18**]" was replaced by ‘Hospital1’.

In [None]:
df["tokens"].str.replace('<[^>]*>', '')\
            .str.lower()\
            .str.replace('[\W]+', ' ')\
            .str.split())



In [115]:
s = df["TEXT"][1]

def process_notes(st):
    s = "History of Present Illness:"
    st = st.split(s,1)[1]

    e = "\n\n\n"
    n = st.split(e, 1)[0]

    tn = word_tokenize(n)
    
    return tn

p = re.compile(r"\[\*\*")
p.findall(s)


['[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**',
 '[**']

## Filter HoPI sections
### Remove records without "History of Present Illness" or empty HoPI sections

In [None]:
# Detect history of present illness in text (n = 2641 records without HoPI data)


## Truncate at 500 tokens

In [None]:
# Truncate records with more than 500 tokes (n = 1143)


## Plot a histogram of the Number of tokens in each HoPI document, after data preprocessing.

## Count number of tokens in the training dataset (n = ~92,468 tokens)

In [None]:
# Count occurence of tokens that are in the training dataset

# Tokens that occur >= 5 times are in the study vocabulary

# Assign a unique integer ID for each token in the study vocabulary 

# Convert each HoPI document to a 1D array of integers using this index


# Document representation

In [None]:
# Represent clinical notes documnets as TF-IDF representation

In [None]:
# Mean embedding representation

In [None]:
# GRU representation

# Label representation