In [11]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
import pyspark.sql.functions as F

import pandas as pd
import numpy as np

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sparknlp.training import CoNLL
from sparknlp.pretrained import PretrainedPipeline

In [2]:
def start(gpu=False):
    """Build a local Spark session with the Spark NLP jar requested via Maven.

    The Maven coordinate is derived from ``sparknlp.version()`` so the JVM
    artifact always matches the installed Python package. A hard-coded version
    drifts silently when the Python package is upgraded, leaving the Python
    wrappers talking to an older jar.

    Args:
        gpu: When True, request the ``spark-nlp-gpu`` artifact; otherwise the
            CPU ``spark-nlp`` artifact.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.
    """
    artifact = "spark-nlp-gpu_2.11" if gpu else "spark-nlp_2.11"
    package = "com.johnsnowlabs.nlp:{}:{}".format(artifact, sparknlp.version())

    builder = SparkSession.builder \
        .appName("Spark NLP - Date Extraction onto 100") \
        .master("local[*]") \
        .config("spark.driver.memory", "10G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")\
        .config("spark.kryoserializer.buffer.max", "1000M") \
        .config("spark.jars.packages", package)

    return builder.getOrCreate()

In [3]:
import os
import sys

# Point both the PySpark driver and its Python workers at the interpreter that
# is running this kernel. Using sys.executable instead of a machine-specific
# absolute path keeps the notebook runnable on other machines and guarantees
# driver and workers use the same Python.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [4]:
# Create the Spark session; gpu=False selects the CPU (non-GPU) Spark NLP artifact.
spark = start(gpu=False)

In [5]:
# Report the library versions in use so the run can be reproduced later.
spark_nlp_version = sparknlp.version()
apache_spark_version = spark.version
print("Spark NLP version: ", spark_nlp_version)
print("Apache Spark version: ", apache_spark_version)

Spark NLP version:  2.5.3
Apache Spark version:  2.4.5


In [None]:
# Placeholder frame: a single row whose only column, `text`, holds an empty string.
empty_df = spark.createDataFrame([['']], schema=['text'])

In [29]:
# Date extraction pipeline
#
# Stages, in order: raw text -> document -> tokens -> GloVe 100d embeddings
# -> NER tags from the pretrained `onto_100` model -> merged entity chunks.

# Wraps the raw `text` column into a `document` annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Splits each document into `token` annotations.
tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Pretrained English `glove_100d` word embeddings, written to `embeddings`.
embeddings = (
    WordEmbeddingsModel.pretrained('glove_100d', 'en')
    .setInputCols(["document", 'token'])
    .setOutputCol("embeddings")
)

# Pretrained English `onto_100` NER model; emits one tag per token in `ner`.
ner_model = (
    NerDLModel.pretrained('onto_100', 'en')
    .setInputCols(['document', 'token', 'embeddings'])
    .setOutputCol('ner')
)

# Converts the per-token tags in `ner` into entity chunks in `ner_chunk`.
ner_converter = (
    NerConverter()
    .setInputCols(['document', 'token', 'ner'])
    .setOutputCol('ner_chunk')
)

pipeline_stages = [
    documentAssembler,
    tokenizer,
    embeddings,
    ner_model,
    ner_converter,
]
nlp_pipeline = Pipeline(stages=pipeline_stages)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
onto_100 download started this may take some time.
Approximate size to download 13.5 MB
[OK!]


In [41]:
# Sample input: one discharge-summary note. The string deliberately starts and
# ends with a newline, exactly as the triple-quoted literal is laid out.
text_list = [
    """
Admission Date : 2015-08-10. Discharge Date : 2015-08-15. The patient was taken to the Operating Room the day of admission. He underwent a left bronchoscopy and lower lobectomy of the left side. The patient was then transferred to the CSRU, intubated in stable condition. The patient remained to be stable on the floor with subsequent monitoring of hematocrit within a normal stable range. He is successfully extubated on 08-12 and hypertension transferred to the floor in stable condition where the Pain Service is managing his epidural with very good effect and he is tolerating a regular p.o. diet and is making adequate amount of urine. His recovery was essentially unremarkable. His epidural was successfully discontinued and he is discharged to home on 08-15 with instructions to follow-up with Dr. Rodriguez in the office within the next one to two weeks. He is discharged to home with pain medication which is percocet.
"""
]

In [42]:
# Wrap the sample notes in a one-column (`text`) Spark DataFrame.
df = spark.createDataFrame(pd.DataFrame(text_list, columns=['text']))

# Fit the pipeline on the placeholder frame, then annotate the sample notes.
pipeline_model = nlp_pipeline.fit(empty_df)
result = pipeline_model.transform(df)

In [43]:
# Pair each chunk's text with its metadata, flatten to one row per chunk, and
# display only the chunks labelled DATE or CARDINAL.
(
    result
    .select(
        F.explode(
            F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias('chunk'),
        F.expr("cols['1']['entity']").alias('ner_label'),
    )
    .filter("ner_label = 'DATE' or ner_label = 'CARDINAL'")
    .show(truncate=False)
)

+-------------------------+---------+
|chunk                    |ner_label|
+-------------------------+---------+
|2015-08-10               |CARDINAL |
|2015-08-15               |CARDINAL |
|the day                  |DATE     |
|08-12                    |CARDINAL |
|08-15                    |DATE     |
|the next one to two weeks|DATE     |
+-------------------------+---------+



In [44]:
# Stop the Spark session held in `spark`; cells that use it cannot be re-run
# afterwards until the session is started again.
spark.stop()