In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
import pyspark.sql.functions as F

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sparknlp.training import CoNLL
from sparknlp.pretrained import PretrainedPipeline

In [2]:
def start(gpu=False):
    builder = SparkSession.builder \
        .appName("Spark NLP - Date Extraction") \
        .master("local[*]") \
        .config("spark.driver.memory", "10G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")\
        .config("spark.kryoserializer.buffer.max", "1000M")
    if gpu:
        builder.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp-gpu_2.11:2.4.3")
    else:
        builder.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.3")

    return builder.getOrCreate()

In [3]:
import os
os.environ['PYSPARK_PYTHON'] = '/home/francesco/anaconda3/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/home/francesco/anaconda3/bin/python'

In [4]:
spark = start(gpu=False)

In [5]:
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.4.3
Apache Spark version:  2.4.5


In [4]:
empty_data = spark.createDataFrame([['']]).toDF('text')

### DATE EXTRACTION PIPELINE

In [21]:
# date extraction pipeline
document = DocumentAssembler()\
        .setInputCol('text')\
        .setOutputCol('document')

sentence = SentenceDetector()\
        .setInputCols(['document'])\
        .setOutputCol('sentence')

token = Tokenizer()\
        .setInputCols(['sentence'])\
        .setOutputCol('token')

embeddings = WordEmbeddingsModel.pretrained()\
        .setInputCols('sentence', 'token')\
        .setOutputCol('embeddings')\
        .setCaseSensitive(False)

#bioVec_embeddings = WordEmbeddingsModel.load('./embeddings/BioVec_model')\
       # .setInputCols(['sentence', 'token'])\
       # .setOutputCol('biowordvec')

nerTagger = NerDLModel.pretrained('onto_300')\
        .setInputCols(['sentence', 'token', 'embeddings'])\
        .setOutputCol('ner')

nerConverter = NerConverter()\
        .setInputCols(['sentence', 'token', 'ner'])\
        .setOutputCol('ner_span')

date_pipeline = Pipeline(
        stages=[
            document,
            sentence,
            token,
            embeddings,
            nerTagger,
            nerConverter
        ])

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
onto_300 download started this may take some time.
Approximate size to download 14.1 MB
[OK!]


In [22]:
date_model = date_pipeline.fit(empty_data)

In [12]:
text = ''' The patient was prescribed 1 capsule of Advil for 5 days . 
He was seen by the endocrinology service and she was discharged on 40 units of insulin glargine at night , 12 units of insulin lispro with meals ,
and metformin 1000 mg two times a day . 
It was determined that all SGLT2 inhibitors should be discontinued indefinitely for 3 months .'''

In [13]:
prediction_data = spark.createDataFrame([[text]]).toDF('text')

In [None]:
results = date_model.transform(prediction_data)

In [None]:
results.show()

In [6]:
testData = spark.createDataFrame([["I would like to come over and see you in 01/02/2019."],
                                ["Donald John Trump (born June 14, 1946) is the 45th and current president of the United States"]]).toDF("text")

In [7]:
pipeline = PretrainedPipeline("match_datetime", lang="en")

match_datetime download started this may take some time.
Approx size to download 12.9 KB
[OK!]


In [17]:
pipeline = PretrainedPipeline("onto_recognize_entities_lg", lang="en")

onto_recognize_entities_lg download started this may take some time.
Approx size to download 2.3 GB
[OK!]


In [18]:
annotation = pipeline.transform(prediction_data)

In [19]:
annotation.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|            entities|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| The patient was ...|[[document, 0, 34...|[[document, 1, 58...|[[token, 1, 3, Th...|[[word_embeddings...|[[named_entity, 1...|[[chunk, 28, 28, ...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [21]:
annotation.select("entities.result").show(truncate = False)

+------------------------------------------------------+
|result                                                |
+------------------------------------------------------+
|[1, Advil, 5 days, 40, 12, 1000, two, SGLT2, 3 months]|
+------------------------------------------------------+



In [11]:
spark.stop()