In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline, PipelineModel
import pyspark.sql.functions as F

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sparknlp.training import CoNLL

In [2]:
def start(gpu=False):
    """Build (or reuse) a local SparkSession with the Spark NLP jar requested.

    Args:
        gpu: when True, request the GPU artifact (``spark-nlp-gpu``) instead of
            the CPU one.

    Returns:
        The session returned by ``SparkSession.builder.getOrCreate()``.
    """
    # Derive the jar version from the installed Python package so both sides
    # stay in step: the notebook previously pinned 2.4.3 while the installed
    # package reports 2.5.3.
    # NOTE(review): the "_2.11" Scala suffix is kept from the original
    # coordinates; confirm it still applies if sparknlp is upgraded.
    artifact = "spark-nlp-gpu_2.11" if gpu else "spark-nlp_2.11"
    package = "com.johnsnowlabs.nlp:{}:{}".format(artifact, sparknlp.version())

    # Single chain: no reliance on builder.config() mutating the builder in place.
    builder = (
        SparkSession.builder
        .appName("Spark NLP - NERDL training")
        .master("local[*]")
        .config("spark.driver.memory", "10G")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "1000M")
        .config("spark.jars.packages", package)
    )
    return builder.getOrCreate()


spark = start(gpu=False)

In [3]:
# Report the library versions this run is using.
for component, installed in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(component, installed)

Spark NLP version:  2.5.3
Apache Spark version:  2.4.5


In [4]:
# read CoNLL train file
# The data directory defaults to the original location but can be overridden
# with CLINICAL_NER_DIR, so the notebook is not tied to one user's home directory.
data_dir = os.environ.get('CLINICAL_NER_DIR', '/home/francesco/clinical_ner')
training_data = CoNLL().readDataset(spark, os.path.join(data_dir, 'conll.train'))

In [5]:
# Row count of the parsed training DataFrame (sanity check on the load).
training_data.count()

16311

In [5]:
# read CoNLL test file
# The data directory defaults to the original location but can be overridden
# with CLINICAL_NER_DIR, so the notebook is not tied to one user's home directory.
data_dir = os.environ.get('CLINICAL_NER_DIR', '/home/francesco/clinical_ner')
test_data = CoNLL().readDataset(spark, os.path.join(data_dir, 'conll.test'))

In [7]:
# Row count of the parsed test DataFrame (sanity check on the load).
test_data.count()

27568

In [6]:
# Load the saved word-embeddings model and wire its columns:
# it reads 'sentence' and 'token' and writes its vectors to 'biowordvec'.
BioVec_embeddings = (
    WordEmbeddingsModel.load('./embeddings/BioVec_model')
    .setInputCols(['sentence', 'token'])
    .setOutputCol('biowordvec')
)

In [None]:
# Add the 'biowordvec' embeddings column to the test split and persist it as
# parquet; the NER trainer later reads this file through setTestDataset.
test_data_bio = BioVec_embeddings.transform(test_data)
# Overwrite explicitly: the default save mode raises if the path already
# exists, which made this cell fail on every re-run of the notebook.
test_data_bio.write.mode("overwrite").parquet("test_withBioVecEmbeds.parquet")

In [7]:
# Folder handed to setGraphFolder. Defaults to the original location but can be
# overridden with NER_GRAPH_FOLDER, so the notebook is not tied to one user's
# home directory.
graph_folder = os.environ.get('NER_GRAPH_FOLDER', '/home/francesco/graph/tensorflow_graph')

# NER trainer: consumes 'sentence', 'token' and the 'biowordvec' embeddings,
# learns from the 'label' column and writes its predictions to 'ner'.
nerTagger = (
    NerDLApproach()
    .setInputCols(['sentence', 'token', 'biowordvec'])
    .setLabelColumn('label')
    .setOutputCol('ner')
    .setMaxEpochs(20)
    .setLr(0.001)
    .setPo(0.005)
    .setBatchSize(8)
    .setRandomSeed(0)
    .setVerbose(2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    # Same file name the test-embeddings cell writes, so that cell must have
    # been run before fitting.
    .setTestDataset('test_withBioVecEmbeds.parquet')
    .setGraphFolder(graph_folder)
)

# Two-stage estimator: embeddings lookup first, then the NER trainer.
pipeline_stages = [BioVec_embeddings, nerTagger]
pipeline = Pipeline().setStages(pipeline_stages)

In [8]:
# Fit the two-stage pipeline (embeddings, then NerDLApproach) on the training split.
ner_model = pipeline.fit(training_data)

In [9]:
# Run the fitted pipeline over the test split and preview the first three rows.
predictions = ner_model.transform(test_data)
predictions.show(n=3)

+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|            text|            document|            sentence|               token|                 pos|               label|          biowordvec|                 ner|
+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|admission date :|[[document, 0, 15...|[[document, 0, 15...|[[token, 0, 8, ad...|[[pos, 0, 8, NN, ...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|      2012-10-31|[[document, 0, 9,...|[[document, 0, 9,...|[[token, 0, 9, 20...|[[pos, 0, 9, NN, ...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|discharge date :|[[document, 0, 15...|[[document, 0, 15...|[[token, 0, 8, di...|[[pos, 0, 8, NN, ...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
+---

In [16]:
# Zip the per-row token / gold-label / predicted-label arrays together and
# explode them so each output row is a single token with both tags.
zipped_annotations = F.explode(
    F.arrays_zip('token.result', 'label.result', 'ner.result')
).alias("cols")

token_level = predictions.select(zipped_annotations).select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("ground_truth"),
    F.expr("cols['2']").alias("prediction"),
)
token_level.show(70, truncate=False)

+------------------------+------------+-----------+
|token                   |ground_truth|prediction |
+------------------------+------------+-----------+
|admission               |O           |O          |
|date                    |O           |O          |
|:                       |O           |O          |
|2012-10-31              |O           |O          |
|discharge               |O           |O          |
|date                    |O           |O          |
|:                       |O           |O          |
|2012-11-07              |O           |O          |
|date                    |O           |O          |
|of                      |O           |O          |
|birth                   |O           |O          |
|:                       |O           |O          |
|1941-03-23              |O           |O          |
|sex                     |O           |O          |
|:                       |O           |O          |
|m                       |O           |O          |
|service    

In [11]:
# List the fitted pipeline stages to confirm their order before saving them by index.
ner_model.stages

[WORD_EMBEDDINGS_MODEL_ce73dc8c7bc4, NerDLModel_ba52c8d1f745]

In [12]:
# Persist the fitted NER stage (index 1 of the pipeline), replacing any earlier save.
trained_ner_stage = ner_model.stages[1]
trained_ner_stage.write().overwrite().save('./models/NER_DL_trained_model')

In [15]:
# Persist the embeddings stage (index 0 of the pipeline), replacing any earlier save.
embeddings_stage = ner_model.stages[0]
embeddings_stage.write().overwrite().save('./models/WordEmbeddingsModel_BioVec_200d_20')

In [None]:
# Stop the Spark session created by start() at the top of the notebook.
spark.stop()