In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
import pyspark.sql.functions as F

import sparknlp
from sparknlp.annotator import * 
from sparknlp.base import * 
from sparknlp.common import * 

In [2]:
def start(gpu = False):
    builder = SparkSession.builder\
        .appName('Spark NLP')\
        .master('local[*]')\
        .config('spark.driver.memory', '8G')\
        .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')\
        .config('spark.kryoserializer.buffer.max', '1000M')
    if gpu:
        builder.config('spark.jars.packages', 'com.johnsnowlabs.nlp:spark-nlp-gpu_2.11:2.4.3')
    else:
        builder.config('spark.jars.packages', 'com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.3')
    
    return builder.getOrCreate()

In [3]:
spark = start()

In [4]:
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.4.3
Apache Spark version:  2.4.5


### PREDICTION PIPELINE

In [5]:
document = DocumentAssembler()\
        .setInputCol('text')\
        .setOutputCol('document')

sentence = SentenceDetector()\
        .setInputCols(['document'])\
        .setOutputCol('sentence')

token = Tokenizer()\
        .setInputCols(['sentence'])\
        .setOutputCol('token')

bioembeddings = WordEmbeddings()\
        .setInputCols(['sentence', 'token'])\
        .setOutputCol('biowordvec')\
        .setStoragePath('/home/aminmoradi/sddm_project/embeddings/BioWordVec_PubMed_MIMICIII_d200.vec.bin', 'BINARY')\
        .setDimension(200)\
        .setCaseSensitive(False)

model = NerDLModel.load('NER_DL_BioVec_200d_20')\
        .setInputCols(['sentence', 'token', 'biowordvec'])\
        .setOutputCol('ner')

converter = NerConverter()\
        .setInputCols(['document', 'token', 'ner'])\
        .setOutputCol('ner_span')

ner_pipeline = Pipeline(
            stages = [
                document,
                sentence,
                token,
                bioembeddings,
                model,
                converter])

In [6]:
text = '''Current medications:
   Acetaminophen, Aspirin EC, Ascorbic Acid, Docusate Sodium, Insulin,
   Morphine Sulfate, Omeprazole, Oxycodone-Acetaminophen, Pantoprazole,
   Vancomycin
   24 Hour Events:
   Received from OR, requiring fluid for hypotension, on/off neo and nitro
   Sheath rt groin removed
   Weaned and extubated'''

In [7]:
empty_data = spark.createDataFrame([['']]).toDF('text')

In [8]:
prediction_model = ner_pipeline.fit(empty_data)

In [9]:
prediction_data = spark.createDataFrame([[text]]).toDF('text')

In [10]:
lp = LightPipeline(prediction_model)

In [11]:
result = lp.annotate(text)

In [12]:
for e in list(zip(result['token'], result['ner'])):
    print(e)

('[**Last', 'O')
('Name', 'O')
('(', 'O')
('un', 'O')
(')', 'O')
('**', 'O')
(']:Lisinopril', 'O')
('40/D,Hctz', 'O')
('25/D,Coumadin', 'O')
('5mg/D(LD', 'O')
('[**1-30**]),ToprolXL', 'O')
('100/D,Glargine', 'O')
('35u', 'O')
('dinner,Humalog', 'O')
('SSI(bfst,lunch,HS),VitC,Fish', 'O')
('oil', 'O')
('.', 'O')
('Current', 'B-treatment')
('medications', 'I-treatment')
(':', 'O')
('Acetaminophen', 'B-treatment')
(',', 'O')
('Aspirin', 'B-treatment')
('EC', 'I-treatment')
(',', 'O')
('Ascorbic', 'B-treatment')
('Acid', 'I-treatment')
(',', 'O')
('Docusate', 'B-treatment')
('Sodium', 'I-treatment')
(',', 'O')
('Insulin', 'B-treatment')
(',', 'O')
('Morphine', 'B-treatment')
('Sulfate', 'I-treatment')
(',', 'O')
('Omeprazole', 'B-treatment')
(',', 'O')
('Oxycodone-Acetaminophen', 'B-treatment')
(',', 'O')
('Pantoprazole', 'B-treatment')
(',', 'O')
('Vancomycin', 'B-treatment')
('24', 'O')
('Hour', 'O')
('Events', 'O')
(':', 'O')
('Received', 'O')
('from', 'O')
('OR', 'O')
(',', 'O')
('requi

In [13]:
prediction = prediction_model.transform(prediction_data)

IllegalArgumentException: 'requirement failed: Found input column with storage metadata. But such ref does not match to the ref this annotator requires. Make sure you are loading the annotator with ref: WordEmbeddings_2d3346ee489c'

In [None]:
prediction.show()

In [None]:
prediction.select(F.explode(F.arrays_zip('ner_span.result', 'ner_span.metadata')).alias('entities'))\
        .select(F.expr('entities["0"]').alias('chunk'),
                F.expr('entities["1"].entity').alias('entity'))\
        .show(10, truncate=False)

In [None]:
spark.stop()