![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Setup

## Authentication-Cell 
You must re-run this cell each time you restart the kernel so that the environment variables are set again.

In [None]:
import json
import os
import re

# Upload your spark_nlp_for_healthcare.json to the default directory and then run this cell to set env variables.
# Values are written directly to os.environ rather than through `%set_env`, because the magic
# echoes every `env: KEY=VALUE` pair (license keys and secrets included) into the notebook output.
with open('spark_nlp_for_healthcare.json', 'r', encoding='utf-8') as f:
    for k, v in json.load(f).items():
        # Environment values must be strings; str() mirrors the `$v` expansion of the magic.
        os.environ[k] = str(v)

# Upload your spark_ocr.json to the default directory and then run this cell to set env variables.
# Some OCR credentials are exported under a second name as well as their original key.
ocr_env_aliases = {
    'SPARK_OCR_LICENSE': 'JSL_OCR_LICENSE',
    'JSL_OCR_SECRET': 'SPARK_OCR_SECRET',
}
# Captures the base version ("x.y.z") and the Spark flavour ("sparkNN") from OCR_VERSION.
ocr_version_pattern = re.compile(r"(\d+\.\d+\.\d+).*?(spark\d+)")
with open('spark_ocr.json', 'r', encoding='utf-8') as f:
    for k, v in json.load(f).items():
        v = str(v)
        os.environ[k] = v
        alias = ocr_env_aliases.get(k)
        if alias is not None:
            os.environ[alias] = v
        if k == 'OCR_VERSION':
            match = ocr_version_pattern.search(v)
            if match:
                # Kept as notebook globals: the Spark session cell builds the OCR jar name from them.
                ocr_base_version, ocr_spark_version = match.groups()
                os.environ['OCR_BASE_VERSION'] = ocr_base_version
                os.environ['OCR_SPARK_VERSION'] = ocr_spark_version

# Set Spark Version here:
os.environ['PYSPARK'] = '3.0.2'

The initial setup only needs to be run once.

In [None]:
# Run the macOS Catalina setup script located in the current directory through bash.
!bash ./spark-nlp-setup-macOS-catalina.sh

### Please restart the kernel after the script finishes. 
You will need to re-run the **authentication cell** each time you start this notebook.

## Starting the Spark session

In [None]:
import sparknlp
import sparknlp_jsl
import sparkocr
# Create or get Spark Session
from pyspark.sql import SparkSession

# Working directory (used to locate the local jars) plus the secret and versions
# read from the environment variables.
PWD = os.getcwd()
SECRET = os.environ["SECRET"]
PUBLIC_VERSION = os.environ["PUBLIC_VERSION"]
JSL_VERSION = os.environ["JSL_VERSION"]
# Built from the notebook variables ocr_base_version and ocr_spark_version.
OCR_VERSION = f"{ocr_base_version}-{ocr_spark_version}"

# Local Spark session: the public Spark NLP package is resolved by Maven coordinates,
# while the licensed Healthcare and OCR jars are loaded from files in the working directory.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[*]")
    .config("spark.driver.memory", "12G")
    .config("spark.driver.maxResultSize", "2G")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "800M")
    .config("spark.sql.legacy.allowUntypedScalaUDF", True)
    .config("spark.jars.packages", f"com.johnsnowlabs.nlp:spark-nlp_2.12:{PUBLIC_VERSION}")
    .config("spark.jars", f"file:///{PWD}/spark-nlp-jsl-{JSL_VERSION}.jar,file:///{PWD}/spark-ocr-assembly-{OCR_VERSION}.jar")
    .getOrCreate()
)

# Report the versions of Spark and of each John Snow Labs library that was loaded.
print("spark.version", spark.version)
print("sparknlp.version()", sparknlp.version())
print("sparkocr.version()", sparkocr.version())
print("sparknlp_jsl.version()", sparknlp_jsl.version())

# Spark NLP

In [None]:
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import *
from sparknlp.annotator import *

In [None]:
# Load the pretrained English "recognize_entities_dl" pipeline and annotate one sentence.
pipeline = PretrainedPipeline("recognize_entities_dl", lang="en")
result = pipeline.annotate("Harry Potter is a great movie")
# Leave the annotation result as the cell's last expression so the notebook displays it.
result

# Spark NLP for Healthcare

In [None]:
import sparknlp
import sparknlp_jsl
from sparknlp.base import *
from sparknlp.util import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.pretrained import ResourceDownloader

from pyspark.sql import functions as F
from pyspark.ml import Pipeline, PipelineModel

# Stage 1: turn the raw "text" column into a "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: split each document into sentences with the pretrained healthcare
# sentence detector. (A plain SentenceDetector() with the same input/output
# columns was the previously commented-out alternative.)
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: tokenize every sentence.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: clinical word embeddings (trained on the PubMED dataset).
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Stage 5: NER model trained on the i2b2 (sampled from MIMIC) dataset.
clinical_ner = (
    MedicalNerModel.pretrained("ner_clinical_large", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Stage 6: convert the token-level NER output into "ner_chunk" annotations.
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Assemble the stages in dependency order.
nlpPipeline = Pipeline(stages=[
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter,
])

# Single-row DataFrame holding the sample abstract in a "text" column.
data = spark.createDataFrame([["""The human KCNJ9 (Kir 3.3, GIRK3) is a member of the G-protein-activated inwardly rectifying potassium (GIRK) channel family. Here we describe the genomicorganization of the KCNJ9 locus on chromosome 1q21-23 as a candidate gene forType II diabetes mellitus in the Pima Indian population. The gene spansapproximately 7.6 kb and contains one noncoding and two coding exons separated byapproximately 2.2 and approximately 2.6 kb introns, respectively. We identified14 single nucleotide polymorphisms (SNPs), including one that predicts aVal366Ala substitution, and an 8 base-pair (bp) insertion/deletion. Ourexpression studies revealed the presence of the transcript in various humantissues including pancreas, and two major insulin-responsive tissues: fat andskeletal muscle. The characterization of the KCNJ9 gene should facilitate furtherstudies on the function of the KCNJ9 protein and allow evaluation of thepotential role of the locus in Type II diabetes."
"""]]).toDF("text")

# Fit the pipeline on the sample, run it, and show the "ner_chunk" column.
model = nlpPipeline.fit(data)
result = model.transform(data)
result.select("ner_chunk").show()

# Spark OCR

In [None]:
# Download the sample PDF with curl, following redirects (-L), and save it as sample_doc.pdf.
!curl -L "http://www.asx.com.au/asxpdf/20171103/pdf/43nyyw9r820c6r.pdf" --output sample_doc.pdf 

In [None]:
from sparkocr.transformers import *
from sparkocr.transformers import *
from pyspark.ml import PipelineModel
from sparkocr.utils import display_image
from sparkocr.metrics import score
def pipeline():
    """Build the two-stage Spark OCR pipeline used in this notebook.

    Returns:
        A PipelineModel whose stages are, in order, a PdfToImage transformer
        and an ImageToText transformer.
    """
    # Transform the PDF document in the "content" column to images per page.
    page_renderer = (
        PdfToImage()
        .setInputCol("content")
        .setOutputCol("image")
    )

    # Run OCR on each page image, writing the output to the "text" column,
    # with the confidence threshold set to 65.
    text_recognizer = (
        ImageToText()
        .setInputCol("image")
        .setOutputCol("text")
        .setConfidenceThreshold(65)
    )

    return PipelineModel(stages=[page_renderer, text_recognizer])

In [None]:
# Load the downloaded PDF through Spark's "binaryFile" source and cache the DataFrame.
pdf = 'sample_doc.pdf'
pdf_example_df = (
    spark.read
    .format("binaryFile")
    .load(pdf)
    .cache()
)

In [None]:
# Apply the OCR pipeline to the PDF DataFrame and cache the transformed result.
result = (
    pipeline()
    .transform(pdf_example_df)
    .cache()
)

In [None]:
# Display the "pagenum", "text" and "confidence" columns of the OCR result.
result.select("pagenum","text", "confidence").show()

In [None]:
# Collect the "text" column of the OCR result to the driver so the notebook displays it.
result.select("text").collect()