In [1]:
import numpy as np
import pandas as pd

from pyspark.ml import Pipeline

In [3]:
# Load the notes CSV; only its free-text 'TEXT' column is used below.
# NOTE(review): absolute, user-specific path — consider a configurable data dir.
df = pd.read_csv('/home/s2264951/SDDM/data/mimiciii.csv')

# Drop rows whose note is missing: pandas represents them as float NaN, and
# this list is later turned into a StringType Spark column, where a NaN is
# not meaningful text.
clinical_notes_list = df['TEXT'].dropna().tolist()

In [4]:
# Peek at the first note to see the raw text layout (newline-separated lines,
# bracketed [** ... **] placeholders).
# NOTE(review): the saved output contains raw note text — confirm it may be
# shared before committing this notebook.
clinical_notes_list[0]

"#1Resp\nBaby remains [**Name2 (NI) 2816**] [**Name2 (NI) 2807**] cannula 1l at 40-45% FIO2.\nRR 40-60 with ic/sc retractions. No spells but occ drift in\nsat to 80's. Lungs clear.\nA. [**Name2 (NI) **] NC\nP. Cont to monitor O2 requirement\n#2FEN\nWt 2.375 up 15g. TF at 140cc/kg or 55cc q4. Baby receiving\nBM32 with bene. Feed given over 1 hour. Abd soft, active\nbowel sounds. Void and trace stool. Min asp.\nA. [**Name2 (NI) **] feed\nP. Cont to monitor [**Name2 (NI) **] to feeds as well as weight gain\n#4Dev\nTemp stable swaddled in an open crib. Calm disposition.\nAwake and [**Name2 (NI) **] with cares but tires and falls back to\nsleep.\n#5Parent\nno contact so far this shift\n\n\n"

In [5]:
import sparknlp

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql.types import StringType

In [6]:
# Start a Spark session through Spark NLP's helper; the Spark cells below all
# use this `spark` handle.

spark = sparknlp.start()

In [7]:
# Turn the Python list of notes into a one-column Spark DataFrame and name
# that column 'text' (the input column the pipeline's first stage reads).
data = spark.createDataFrame(clinical_notes_list, StringType()).toDF('text')

In [8]:
# Display the first two rows with the full, untruncated cell contents.
data.show(2, truncate= False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                                                                                                                                                                                                                                                                               

In [9]:
# Stage 1: wrap the raw 'text' column into a 'document' annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Stage 2: split documents into sentences using ONLY newlines as boundaries,
# and explode so that every sentence becomes its own DataFrame row.
sentenceDetector = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
    .setCustomBounds(['\n'])
    .setUseCustomBoundsOnly(True)
    .setExplodeSentences(True)
)

# Stage 3: tokenize each sentence.
tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
)

# Stage 4: strip the listed punctuation characters from tokens, keeping case.
# The pattern is a raw string so the backslashes reach the regex engine
# unchanged without relying on Python's deprecated invalid-escape fallback;
# the resulting pattern text is identical to the previous non-raw literal.
# Note that '/' is among the removed characters, so e.g. 'ic/sc' comes out
# as 'icsc' in the normalized tokens.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normal')
    .setLowercase(False)
    .setCleanupPatterns([r'[\!"\[\]$&\*+,/:;=?@^_`()|~=]'])
)

# Stage 5: convert the 'sentence' and 'normal' annotations into plain
# 'finished_*' columns (with metadata) that can be used outside Spark NLP.
finisher = (
    Finisher()
    .setInputCols(["sentence", "normal"])
    .setIncludeMetadata(True)
)

In [10]:
# Chain the stages in dependency order: each stage consumes the output column
# of the one before it (text -> document -> sentence -> token -> normal),
# and the finisher runs last to produce the plain 'finished_*' columns.
pipeline = Pipeline(stages=[
    document_assembler,
    sentenceDetector,
    tokenizer,
    normalizer,
    finisher
])

In [11]:
# Fit the pipeline on the notes DataFrame to obtain a reusable PipelineModel.
pipelineModel = pipeline.fit(data)

In [12]:
# Apply the fitted pipeline; `result` has one row per detected sentence
# because the sentence detector explodes sentences.
result = pipelineModel.transform(data)

In [13]:
# Inspect the normalized token lists for the first ten sentence rows.
result.select('finished_normal').show(10, truncate=False)

+-----------------------------------------------------------------------------------+
|finished_normal                                                                    |
+-----------------------------------------------------------------------------------+
|[#1Resp]                                                                           |
|[Baby, remains, Name2, NI, 2816, Name2, NI, 2807, cannula, 1l, at, 40-45%, FIO2, .]|
|[RR, 40-60, with, icsc, retractions, ., No, spells, but, occ, drift, in]           |
|[sat, to, 80's, ., Lungs, clear, .]                                                |
|[A, ., Name2, NI, NC]                                                              |
|[P, ., Cont, to, monitor, O2, requirement]                                         |
|[#2FEN]                                                                            |
|[Wt, 2.375, up, 15g, ., TF, at, 140cckg, or, 55cc, q4, ., Baby, receiving]         |
|[BM32, with, bene, ., Feed, given, over, 1, hour, ., 

In [14]:
# Number of rows in `result` (one per exploded sentence), not a full shape.
# count() is a Spark action, so this runs the pipeline over the whole corpus.
result.count()

79740187

In [None]:
# Number of sentence rows pulled back to the driver as Python lists.
# Kept at 10 because collecting without a limit raised an error for the
# author (presumably driver memory / result-size limits on ~80M rows —
# TODO confirm). A sample this small is only good for smoke-testing the
# code path; raise it to train meaningful embeddings.
MAX_SENTENCES = 10

# Each collected Row holds one list of tokens, so the result is a list of
# token lists.
sentence_list = [
    row['finished_normal']
    for row in result.select('finished_normal').limit(MAX_SENTENCES).collect()
]

In [16]:
# Preview the collected sentences. This format (a list of sentences, each a
# list of word strings) is what the Gensim training code below expects.

sentence_list[:10]

[['#1Resp'],
 ['Baby',
  'remains',
  'Name2',
  'NI',
  '2816',
  'Name2',
  'NI',
  '2807',
  'cannula',
  '1l',
  'at',
  '40-45%',
  'FIO2',
  '.'],
 ['RR',
  '40-60',
  'with',
  'icsc',
  'retractions',
  '.',
  'No',
  'spells',
  'but',
  'occ',
  'drift',
  'in'],
 ['sat', 'to', "80's", '.', 'Lungs', 'clear', '.'],
 ['A', '.', 'Name2', 'NI', 'NC'],
 ['P', '.', 'Cont', 'to', 'monitor', 'O2', 'requirement'],
 ['#2FEN'],
 ['Wt',
  '2.375',
  'up',
  '15g',
  '.',
  'TF',
  'at',
  '140cckg',
  'or',
  '55cc',
  'q4',
  '.',
  'Baby',
  'receiving'],
 ['BM32',
  'with',
  'bene',
  '.',
  'Feed',
  'given',
  'over',
  '1',
  'hour',
  '.',
  'Abd',
  'soft',
  'active'],
 ['bowel',
  'sounds',
  '.',
  'Void',
  'and',
  'trace',
  'stool',
  '.',
  'Min',
  'asp',
  '.']]

### Training embeddings with Gensim

In [17]:
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases

In [35]:
# Phrase detection: learn which adjacent tokens co-occur often enough to be
# treated as a single phrase.
# The common terms listed here are ignored by phrase detection.
# NOTE(review): `common_terms` looks like the gensim 3.x keyword name —
# confirm the pinned gensim version before upgrading.

common_terms = ["of", "with", "without", "and", "or", "the", "a", "at", "to"]

phrases = Phrases(sentence_list, common_terms = common_terms)

In [36]:
# Build a Phraser from the fitted Phrases model; `bigram[...]` is then used
# to apply the learned phrases to sentences.
bigram = Phraser(phrases)

In [37]:
# Apply the phraser to every sentence and materialize the result as a list,
# which is the training corpus passed to Word2Vec.
phrased_sentence_list = list(bigram[sentence_list])

In [38]:
# Spot-check the phraser on the second sentence to see whether any tokens
# were merged into phrases.
print(bigram[sentence_list[1]])

['Baby', 'remains', 'Name2', 'NI', '2816', 'Name2', 'NI', '2807', 'cannula', '1l', 'at', '40-45%', 'FIO2', '.']


### Train model

In [39]:
# Minimum number of occurrences a token needs to enter the vocabulary.
# With the 10-sentence sample collected above, a threshold of 3 keeps only
# '.', 'Name2' and 'NI', so the most_similar() lookups for 'Baby' and
# 'trace' further down fail with KeyError. The threshold of 1 keeps every
# token of the sample; raise it (e.g. back to 3) once the model is trained
# on a realistically sized corpus.
MIN_COUNT = 1

# NOTE(review): `size` looks like the gensim 3.x parameter name — confirm the
# pinned gensim version before upgrading.
model = Word2Vec(
    phrased_sentence_list,
    min_count=MIN_COUNT,  # the minimum count of words to consider when training the model
    size=200,             # the number of dimensions of the embedding
    workers=32,           # the number of threads to use while training
    window=7,             # context window for words
)

In [40]:
# Number of distinct tokens that made it into the trained vocabulary.
len(model.wv.vocab)

65

In [41]:
# Show the vocabulary mapping (token -> Vocab entry) to see which tokens
# were kept.
model.wv.vocab

{'#1Resp': <gensim.models.keyedvectors.Vocab at 0x7f310c043710>,
 'Baby': <gensim.models.keyedvectors.Vocab at 0x7f310c043650>,
 'remains': <gensim.models.keyedvectors.Vocab at 0x7f310c043750>,
 'Name2': <gensim.models.keyedvectors.Vocab at 0x7f310c043790>,
 'NI': <gensim.models.keyedvectors.Vocab at 0x7f310c043810>,
 '2816': <gensim.models.keyedvectors.Vocab at 0x7f310c043890>,
 '2807': <gensim.models.keyedvectors.Vocab at 0x7f310c043910>,
 'cannula': <gensim.models.keyedvectors.Vocab at 0x7f310c043950>,
 '1l': <gensim.models.keyedvectors.Vocab at 0x7f310c043550>,
 'at': <gensim.models.keyedvectors.Vocab at 0x7f310c043850>,
 '40-45%': <gensim.models.keyedvectors.Vocab at 0x7f310c043990>,
 'FIO2': <gensim.models.keyedvectors.Vocab at 0x7f310c0438d0>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7f30fd29cd50>,
 'RR': <gensim.models.keyedvectors.Vocab at 0x7f30fd29ce10>,
 '40-60': <gensim.models.keyedvectors.Vocab at 0x7f30fd29cd10>,
 'with': <gensim.models.keyedvectors.Vocab at 0x7f30fd

In [42]:
# Number of sentences the model was trained on.
len(phrased_sentence_list)

10

In [43]:
# Tokens whose vectors are closest to 'Baby', with their similarity scores.
# Raises KeyError if 'Baby' is not in the trained vocabulary.
model.wv.most_similar('Baby')

[('over', 0.17315153777599335),
 ('40-45%', 0.155145525932312),
 ('active', 0.138882577419281),
 ('to', 0.12271511554718018),
 ('sat', 0.1222916916012764),
 ('monitor', 0.11807230114936829),
 ('retractions', 0.11444057524204254),
 ('140cckg', 0.11009781807661057),
 ('drift', 0.10835620015859604),
 ('1l', 0.1074826717376709)]

In [45]:
# Tokens whose vectors are closest to 'trace', with their similarity scores.
# Raises KeyError if 'trace' is not in the trained vocabulary.
model.wv.most_similar('trace')

[('Min', 0.17005720734596252),
 ('40-60', 0.13682407140731812),
 ('A', 0.1233462318778038),
 ('Lungs', 0.11467423290014267),
 ('BM32', 0.11341571062803268),
 ('but', 0.10447914898395538),
 ('O2', 0.09901955723762512),
 ('FIO2', 0.09467938542366028),
 ('q4', 0.09222345054149628),
 ('receiving', 0.0826098620891571)]

### Save model

In [None]:
# Export the word vectors in word2vec format. binary=True makes the file
# contents match its '.bin' name; the default (binary=False) writes a text
# file. Load it back with load_word2vec_format(..., binary=True).
model.wv.save_word2vec_format('model_embeddings.bin', binary=True)

In [None]:
# Shut down the Spark session now that all Spark work is finished.
spark.stop()