[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/jupyter/enterprise/healthcare/colab/ChunkMergeClinicalMultiple.ipynb)

In [4]:
import json

# Read the license/credential bundle from the local keys.json file.
with open('keys.json') as keys_file:
    license_keys = json.load(keys_file)

# Show only the key names, never the values.
license_keys.keys()


dict_keys(['secret', 'SPARK_NLP_LICENSE', 'JSL_OCR_LICENSE', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'JSL_OCR_SECRET'])

In [26]:
import os

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4

secret = license_keys['secret']
os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['JSL_OCR_LICENSE'] = license_keys['JSL_OCR_LICENSE']
os.environ['AWS_ACCESS_KEY_ID']= license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']

! python -m pip install --upgrade spark-nlp-jsl==2.5.0  --extra-index-url https://pypi.johnsnowlabs.com/$secret

# Install Spark NLP
! pip install --ignore-installed -q spark-nlp==2.5

import sparknlp

print (sparknlp.version())

import json
import os
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession


from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl



def start(secret):
    """Build a local Spark session configured for licensed Spark NLP 2.5.0.

    Args:
        secret: String spliced into the pypi.johnsnowlabs.com URL from which
            the spark-nlp-jsl 2.5.0 jar is loaded.

    Returns:
        The result of ``getOrCreate()`` on the configured session builder.
    """
    session_builder = (
        SparkSession.builder
        .appName("Spark NLP Licensed")
        .master("local[*]")
        .config("spark.driver.memory", "16G")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "2000M")
        # Open-source Spark NLP is resolved as a package coordinate ...
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.0")
        # ... while the licensed jar is given as a URL containing the secret.
        .config("spark.jars", "https://pypi.johnsnowlabs.com/" + secret + "/spark-nlp-jsl-2.5.0.jar")
    )

    return session_builder.getOrCreate()


# Start the session through the custom start() function defined in this
# notebook, so the custom params it sets are applied.
# Alternative using the library's own starter:
# sparknlp_jsl.start(secret)
spark = start(secret)

In [12]:
# Sample data: a single-row DataFrame holding an id and one clinical note.
data_chunk_merge = spark.createDataFrame(
    [
        (
            1,
            """A 63-year-old man presents to the hospital with a history of recurrent infections that include cellulitis, pneumonias, and upper respiratory tract infections. He reports subjective fevers at home along with unintentional weight loss and occasional night sweats. The patient has a remote history of arthritis, which was diagnosed approximately 20 years ago and treated intermittently with methotrexate (MTX) and prednisone. On physical exam, he is found to be febrile at 102°F, rather cachectic, pale, and have hepatosplenomegaly. Several swollen joints that are tender to palpation and have decreased range of motion are also present. His laboratory values show pancytopenia with the most severe deficiency in neutrophils.
""",
        )
    ]
).toDF("id", "text")

In [13]:
# Pretrained NER model name -> column name used for that model's output.
# Insertion order matters: the merge chain is built in this order.
ners_to_merge = {
    "ner_deid_large": "deid",
    "ner_bionlp": "bio",
    "ner_jsl": "jsl",
}

In [15]:
# Preprocessing pipeline: text -> document -> sentence -> token -> word embeddings.
da = DocumentAssembler().setInputCol("text").setOutputCol("document")
sd = SentenceDetector().setInputCols("document").setOutputCol("sentence")
tk = Tokenizer().setInputCols("sentence").setOutputCol("token")
# Set the embeddings' input columns explicitly. Otherwise the stage uses
# whatever columns the pretrained model was saved with, which may not be the
# "sentence"/"token" columns that the NER stages downstream are wired to.
emb = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embs")
)


embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_large download started this may take some time.
Approximate size to download 14 MB
[OK!]
ner_bionlp download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]


In [21]:

# For every configured NER model add a tagger and a chunk converter. From the
# second model on, also add a merger that folds the previous model's chunk
# column into the current one, so the last "<col>_chunk" column carries the
# merged chunks of all models.
ner_pl = []
prev = None
for model_name, col in ners_to_merge.items():
    chunk_col = col + "_chunk"

    tagger = NerDLModel.pretrained(model_name, "en", "clinical/models")
    ner_pl.append(tagger.setInputCols("sentence", "token", "embs").setOutputCol(col))

    converter = NerConverter()
    ner_pl.append(converter.setInputCols("sentence", "token", col).setOutputCol(chunk_col))

    if prev is not None:
        # The merger writes back to the current chunk column name.
        merger = ChunkMergeApproach()
        ner_pl.append(merger.setInputCols(prev + "_chunk", chunk_col).setOutputCol(chunk_col))

    prev = col


# Full pipeline: preprocessing stages followed by the NER/merge chain.
all_stages = [da, sd, tk, emb] + ner_pl
pl = Pipeline().setStages(all_stages)

ner_deid_large download started this may take some time.
Approximate size to download 14 MB
[OK!]
ner_bionlp download started this may take some time.
Approximate size to download 13.9 MB
[OK!]
ner_jsl download started this may take some time.
Approximate size to download 14 MB
[OK!]


In [22]:
# Fit the pipeline on the sample data, apply it to the same data, and cache
# the annotated result.
merged_data = (
    pl.fit(data_chunk_merge)
    .transform(data_chunk_merge)
    .cache()
)

In [25]:
# One row per chunk in jsl_chunk: zip each chunk's text with its metadata,
# explode the pairs, then show the chunk text and its entity label.
(
    merged_data
    .selectExpr("id", "explode(arrays_zip(jsl_chunk.result, jsl_chunk.metadata)) as a")
    .selectExpr("id", "a['0'] as chunk", "a['1'].entity as entity")
    .show(100, False)
)

+---+----------------------------------+----------------------+
|id |chunk                             |entity                |
+---+----------------------------------+----------------------+
|1  |63-year-old                       |Age                   |
|1  |man                               |Organism              |
|1  |recurrent                         |Modifier              |
|1  |cellulitis                        |Diagnosis             |
|1  |pneumonias                        |Diagnosis             |
|1  |upper respiratory tract infections|Diagnosis             |
|1  |He                                |Gender                |
|1  |subjective                        |Modifier              |
|1  |fevers                            |Symptom_Name          |
|1  |occasional                        |Modifier              |
|1  |night sweats                      |Symptom_Name          |
|1  |patient                           |Organism              |
|1  |arthritis                         |