In [1]:
import pandas as pd

Find ICD-9 Codes for Each Hospital Admission
======================================

In [2]:
def normalise_icd9_code(code) :
    """Insert the decimal point into a compact ICD-9 diagnosis code.

    The input is a code string written without a decimal point
    (e.g. '40301'). The point is placed after the category part of the
    code: four characters for E-codes ('E8120' -> 'E812.0'), three for
    everything else ('40301' -> '403.01', 'V3001' -> 'V30.01').

    A code that consists of the category only (e.g. '486' or 'E849') is
    returned unchanged, so no dangling '.' is appended.

    Parameters
    ----------
    code : str
        Compact ICD-9 code.

    Returns
    -------
    str
        The code with the decimal point inserted where applicable.
    """
    # E-codes have a four-character category; all other codes have three.
    split_at = 4 if code.startswith('E') else 3
    if len(code) <= split_at :
        # Nothing after the category -> there is no decimal part to separate.
        return code
    return code[:split_at] + '.' + code[split_at:]

def combine_icd9_codes(groupframe) :
    """Collapse the diagnosis rows of one group into a single ';'-joined string.

    The rows are ordered by their 'SEQ_NUM' column, each 'ICD9_CODE' value is
    passed through normalise_icd9_code, and the results are concatenated with
    ';' as separator.

    Returns a one-element pandas Series keyed by 'ICD9_CODE'.
    """
    ordered = groupframe.sort_values(by='SEQ_NUM')
    normalised = map(normalise_icd9_code, ordered.ICD9_CODE)
    return pd.Series({'ICD9_CODE' : ';'.join(normalised)})

# Replace the path with the location of the DIAGNOSES_ICD.csv(.gz) file on your machine.
# ICD9_CODE is forced to str: left to type inference, numeric-looking codes can be
# parsed as integers, which drops leading zeros and breaks the string slicing done
# in normalise_icd9_code.
df_icd9_codes = pd.read_csv('DIAGNOSES_ICD.csv.gz', dtype={'ICD9_CODE' : str}).dropna()
# One row per (SUBJECT_ID, HADM_ID) holding all of its codes as a single string.
df_icd9_codes = df_icd9_codes.groupby(['SUBJECT_ID', 'HADM_ID']).apply(combine_icd9_codes)
df_icd9_codes = pd.DataFrame(df_icd9_codes).reset_index()

Clean Discharge Summaries
=========================

In [3]:
# Replace the path with the location of the NOTEEVENTS.csv(.gz) file on your machine.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning.
# NOTE(review): this raises peak memory while parsing — confirm it fits on the target machine.
df_notes = pd.read_csv('NOTEEVENTS.csv.gz', low_memory=False)

columns_to_keep = ['SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'DESCRIPTION', 'TEXT']
# Keep discharge summaries whose ISERROR field is empty.
is_discharge_summary = df_notes.CATEGORY == 'Discharge summary'
has_no_error_flag = pd.isnull(df_notes.ISERROR)
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of df_notes (SettingWithCopyWarning).
df_notes_discharge = df_notes.loc[is_discharge_summary & has_no_error_flag, columns_to_keep].copy()
# Encode the description numerically so that sorting on it puts 'Report' (0)
# before 'Addendum' (1).
df_notes_discharge['DESCRIPTION'] = df_notes_discharge['DESCRIPTION'].replace({'Report' : 0, 'Addendum' : 1})

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
def group_text_reports(groupframe) :
    """Merge all notes of one group into a single text.

    Rows are ordered by 'DESCRIPTION' first and 'CHARTDATE' second, their
    'TEXT' values are joined with a single space, and surrounding whitespace
    is stripped from the result.

    Returns a one-element pandas Series keyed by 'TEXT'.
    """
    ordered = groupframe.sort_values(by=['DESCRIPTION', 'CHARTDATE'])
    pieces = ordered['TEXT']
    merged = " ".join(pieces)
    return pd.Series({'TEXT' : merged.strip()})

# Build one row per (SUBJECT_ID, HADM_ID) whose TEXT is the concatenation
# produced by group_text_reports, then turn the group keys back into columns.
notes_by_admission = df_notes_discharge.groupby(['SUBJECT_ID', 'HADM_ID'])
df_notes_discharge_combined = pd.DataFrame(
    notes_by_admission.apply(group_text_reports)
).reset_index()

In [5]:
# Display the column index of the combined discharge-summary frame.
df_notes_discharge_combined.columns

Index(['SUBJECT_ID', 'HADM_ID', 'TEXT'], dtype='object')

In [6]:
def cleaner_mimic(text, spacy=True) :
    """Normalise one free-text note into a lower-cased, space-separated string.

    Steps, in order:
      1. collapse every whitespace run into a single space;
      2. tokenise, either with the module-level `nlp` pipeline (spacy=True) or
         by plain whitespace splitting (spacy=False), and lower-case each token;
      3. replace bracketed '[** ... **]' spans with the placeholder ' <DE> ';
      4. collapse a repeated non-alphanumeric character (optionally separated
         by whitespace) into a single occurrence followed by a space;
      5. collapse whitespace again;
      6. replace every space-separated word that contains a digit with 'qqq'.

    Parameters
    ----------
    text : str
        Raw note text.
    spacy : bool
        Whether to tokenise with `nlp` (True) or `str.split` (False).

    Returns
    -------
    str
        The cleaned text, words separated by single spaces.
    """
    collapsed = re.sub(r'\s+', ' ', text.strip())
    if spacy :
        tokens = [tok.text.lower() for tok in nlp(collapsed)]
    else :
        tokens = [piece.lower() for piece in collapsed.split()]

    cleaned = " ".join(tokens)
    # The placeholder is inserted after lower-casing, so it stays upper-case.
    cleaned = re.sub(r'\[\s*\*\s*\*(.*?)\*\s*\*\s*\]', ' <DE> ', cleaned)
    cleaned = re.sub(r'([^a-zA-Z0-9])(\s*\1\s*)+', r'\1 ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned.strip())

    masked = []
    for word in cleaned.split(' ') :
        if any(ch.isdigit() for ch in word) :
            masked.append('qqq')
        else :
            masked.append(word)
    return " ".join(masked)


In [7]:
from tqdm import tqdm

In [8]:
%%capture

texts = list(df_notes_discharge_combined['TEXT'])
import re, spacy

nlp = spacy.load("en", disable=["parser", "tagger", "ner"])

from multiprocessing import Pool
with Pool(4) as p :
    cleaned_texts = list(tqdm(p.imap(cleaner_mimic, texts), total=len(texts)))

In [9]:
# Display the column index of the per-admission ICD-9 code frame.
df_icd9_codes.columns

Index(['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE'], dtype='object')

In [10]:
# Swap the raw note text for its cleaned counterpart (cleaned_texts follows the
# frame's row order), then attach each admission's ICD-9 codes by joining on
# SUBJECT_ID and HADM_ID.
df_notes_discharge_combined['TEXT'] = cleaned_texts
df_notes_discharge_combined = pd.merge(
    df_notes_discharge_combined,
    df_icd9_codes,
    on=['SUBJECT_ID', 'HADM_ID'],
)

In [11]:
# Display the column index after merging in the ICD-9 codes.
df_notes_discharge_combined.columns

Index(['SUBJECT_ID', 'HADM_ID', 'TEXT', 'ICD9_CODE'], dtype='object')

In [12]:
# Write the cleaned notes together with their ICD-9 codes to disk, without the row index.
df_notes_discharge_combined.to_csv(
    path_or_buf='cleaned_discharge_summaries.csv',
    index=False,
)

Train Word2Vec on discharge summaries
=====================================

In [13]:
from gensim.models import Word2Vec as w2v

# Dimensionality of the word vectors to train.
vec_length = 300
# cleaner_mimic joins words with single spaces, so splitting on ' ' recovers
# the token list of each note.
sentences = [note.split(' ') for note in cleaned_texts]
embeds = w2v(
    sentences,
    size=vec_length,
    window=10,
    min_count=2,
    workers=10,
)

In [14]:
# Persist the trained embeddings to disk. `embeds` is the Word2Vec model object
# itself, so the file holds the whole model despite its ".wv" extension.
# NOTE(review): presumably reloaded with the matching Word2Vec load call — confirm
# against the gensim version in use.
embeds.save("mimic-embeds.wv")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
