![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/4.Clinical_DeIdentificiation.ipynb)

# Clinical Deidentification

## Colab Setup

In [1]:
import json

from google.colab import files

license_keys = files.upload()

with open(list(license_keys.keys())[0]) as f:
    license_keys = json.load(f)

license_keys.keys()

Saving workshop_license_keys2.json to workshop_license_keys2 (1).json


dict_keys(['SPARK_NLP_LICENSE', 'SECRET', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'PUBLIC_VERSION', 'JSL_VERSION', 'JSL_OCR_LICENSE'])

In [2]:
license_keys['JSL_VERSION']

'2.7.1'

In [3]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

secret = license_keys['SECRET']

os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']
os.environ['AWS_ACCESS_KEY_ID']= license_keys['AWS_ACCESS_KEY_ID']
os.environ['AWS_SECRET_ACCESS_KEY'] = license_keys['AWS_SECRET_ACCESS_KEY']
jsl_version = license_keys['JSL_VERSION']
version = license_keys['PUBLIC_VERSION']

! pip install --ignore-installed -q pyspark==2.4.4

! python -m pip install --upgrade spark-nlp-jsl==$jsl_version  --extra-index-url https://pypi.johnsnowlabs.com/$secret

! pip install --ignore-installed -q spark-nlp==$version

import sparknlp

print (sparknlp.version())

import json
import os
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession


from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl

spark = sparknlp_jsl.start(secret)

openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)
Looking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/2.7.1-c069474a59bb52cf25c5ed6e7beb05b04c42e7ca
Requirement already up-to-date: spark-nlp-jsl==2.7.1 in /usr/local/lib/python3.6/dist-packages (2.7.1)
2.6.4


In [4]:
# if you want to start the session with custom params as in start function above
def start(secret):
    builder = SparkSession.builder \
        .appName("Spark NLP Licensed") \
        .master("local[*]") \
        .config("spark.driver.memory", "16G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.kryoserializer.buffer.max", "2000M") \
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:"+version) \
        .config("spark.jars", "https://pypi.johnsnowlabs.com/"+secret+"/spark-nlp-jsl-"+jsl_version+".jar")
      
    return builder.getOrCreate()

#spark = start(secret)

In [5]:
spark

# Deidentification Model

Protected Health Information: 
- individual’s past, present, or future physical or mental health or condition
- provision of health care to the individual
- past, present, or future payment for the health care 

Protected health information includes many common identifiers (e.g., name, address, birth date, Social Security Number) when they can be associated with the health information.

Load NER pipeline to isentify protected entities:

In [6]:
import pandas as pd

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline, PipelineModel
import pyspark.sql.functions as F
import string
import numpy as np
import sparknlp
from sparknlp.util import *
from sparknlp.pretrained import ResourceDownloader
from pyspark.sql import functions as F
from sparknlp_jsl.annotator import *

In [7]:
from sparknlp_jsl.annotator import *

documentAssembler = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

# Sentence Detector annotator, processes various sentences per line

sentenceDetector = SentenceDetector()\
  .setInputCols(["document"])\
  .setOutputCol("sentence")

# Tokenizer splits words in a relevant format for NLP

tokenizer = Tokenizer()\
  .setInputCols(["sentence"])\
  .setOutputCol("token")

# Clinical word embeddings trained on PubMED dataset

word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
  .setInputCols(["sentence", "token"])\
  .setOutputCol("embeddings")

# NER model trained on n2c2 (de-identification and Heart Disease Risk Factors Challenge) datasets)

clinical_ner = NerDLModel.pretrained("ner_deid_large", "en", "clinical/models") \
  .setInputCols(["sentence", "token", "embeddings"]) \
  .setOutputCol("ner")

ner_converter = NerConverterInternal()\
  .setInputCols(["sentence", "token", "ner"])\
  .setOutputCol("ner_chunk")

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter])

empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_large download started this may take some time.
Approximate size to download 13.9 MB
[OK!]


### Pretrained NER models extracts:

- Name
- Profession
- Age
- Date
- Contact(Telephone numbers, FAX numbers, Email addresses)
- Location (Address, City, Postal code, Hospital Name, Employment information)
- Id (Social Security numbers, Medical record numbers, Internet protocol addresses)

In [8]:
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 month years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street
'''

In [9]:
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [10]:
result_df = result.select(F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"))

In [11]:
result_df.select("token", "ner_label").groupBy('ner_label').count().orderBy('count', ascending=False).show(truncate=False)

+----------+-----+
|ner_label |count|
+----------+-----+
|O         |28   |
|I-LOCATION|5    |
|B-DATE    |3    |
|I-NAME    |3    |
|B-NAME    |3    |
|B-LOCATION|2    |
|B-ID      |1    |
|B-AGE     |1    |
+----------+-----+



### Check extracted sensetive entities

In [12]:
result.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+-----------------------------+---------+
|chunk                        |ner_label|
+-----------------------------+---------+
|2093-01-13                   |DATE     |
|David Hale                   |NAME     |
|Hendrickson , Ora            |NAME     |
|7194334                      |ID       |
|01/13/93                     |DATE     |
|Oliveira                     |NAME     |
|25                           |AGE      |
|2079-11-09                   |DATE     |
|Cocke County Baptist Hospital|LOCATION |
|0295 Keats Street            |LOCATION |
+-----------------------------+---------+



We can find the cases, where the model will skip some important entities, for example:

In [13]:
text ='''
Patient AIQING, 25 years-old , born in Beijing, was transfered to the The Johns Hopkins Hospital. Phone number: (541) 754-3010. MSW 100009632582
'''

In [14]:
result2 = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [15]:
result_df = result2.select(F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"))

result2.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+--------------------------+---------+
|chunk                     |ner_label|
+--------------------------+---------+
|25                        |AGE      |
|Beijing                   |LOCATION |
|The Johns Hopkins Hospital|LOCATION |
|(541) 754-3010            |CONTACT  |
|100009632582              |ID       |
+--------------------------+---------+



For these entities we can add a dictionary to the pipeline, by using **NerOverwriter()**:

In [16]:
neroverwriter = NerOverwriter() \
    .setInputCols(["ner"]) \
    .setOutputCol("ner_overwrited") \
    .setStopWords(['AIQING']) \
    .setNewResult("I-NAME")

ner_converter = NerConverterInternal()\
  .setInputCols(["sentence", "token", "ner_overwrited"])\
  .setOutputCol("ner_chunk")

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    neroverwriter,
    ner_converter])

empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

Let's test the model after modification:

In [17]:
result2 = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [18]:
result_df2 = result.select(F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"))

result_df2.show()

+-----------+---------+
|      token|ner_label|
+-----------+---------+
|          A|        O|
|          .|        O|
|     Record|        O|
|       date|        O|
|          :|        O|
| 2093-01-13|   B-DATE|
|          ,|        O|
|      David|   B-NAME|
|       Hale|   I-NAME|
|          ,|        O|
|        M.D|        O|
|          .|        O|
|          ,|        O|
|       Name|        O|
|          :|        O|
|Hendrickson|   B-NAME|
|          ,|   I-NAME|
|        Ora|   I-NAME|
|         MR|        O|
|          .|        O|
+-----------+---------+
only showing top 20 rows



In [19]:
result2.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+--------------------------+---------+
|chunk                     |ner_label|
+--------------------------+---------+
|AIQING                    |NAME     |
|25                        |AGE      |
|Beijing                   |LOCATION |
|The Johns Hopkins Hospital|LOCATION |
|(541) 754-3010            |CONTACT  |
|100009632582              |ID       |
+--------------------------+---------+



As we can see, now name **AIQING** was identified correctly

### Excluding entities from deidentification

Sometimes we need to leave some entities in the text, for example, if we want to analyze the frequency of the disease by the hospital. In this case, we need to use parameter **setWhiteList()** to modify NerChunk output. This parameter having using a list of entities type to deidentify as an input. So, if we want to leave the location in the list we need to remove this tag from the list:

In [20]:
ner_converter = NerConverterInternal()\
  .setInputCols(["sentence", "token", "ner_overwrited"])\
  .setOutputCol("ner_chunk") \
  .setWhiteList(['NAME', 'PROFESSION', 'ID', 'AGE',
               'DATE', 'CONTACT'])

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    neroverwriter,
    ner_converter])

empty_data = spark.createDataFrame([[""]]).toDF("text")

model_with_white_list = nlpPipeline.fit(empty_data)

In [21]:
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 month years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street
'''

result_with_white_list = model_with_white_list.transform(spark.createDataFrame([[text]]).toDF("text"))

In [22]:
result_df = result.select(F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"))

result_with_white_list.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+-----------------+---------+
|chunk            |ner_label|
+-----------------+---------+
|2093-01-13       |DATE     |
|David Hale       |NAME     |
|Hendrickson , Ora|NAME     |
|7194334          |ID       |
|01/13/93         |DATE     |
|Oliveira         |NAME     |
|25               |AGE      |
|2079-11-09       |DATE     |
+-----------------+---------+



## Masking and Obfuscation

### Replace this enitites with Tags

In [23]:
deidentification = DeIdentificationModel.pretrained("deidentify_large", "en", "clinical/models") \
      .setInputCols(["sentence", "token", "ner_chunk"]) \
      .setOutputCol("deidentified") \
      .setMode("mask")

deidentify_large download started this may take some time.
Approximate size to download 188.1 KB
[OK!]


In [24]:
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street
'''
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [25]:
result.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+-----------------------------+---------+
|chunk                        |ner_label|
+-----------------------------+---------+
|2093-01-13                   |DATE     |
|David Hale                   |NAME     |
|Hendrickson , Ora            |NAME     |
|7194334                      |ID       |
|01/13/93                     |DATE     |
|Oliveira                     |NAME     |
|25                           |AGE      |
|2079-11-09                   |DATE     |
|Cocke County Baptist Hospital|LOCATION |
|0295 Keats Street            |LOCATION |
+-----------------------------+---------+



In [26]:
deid_text = deidentification.transform(result)

In [27]:
deid_text.select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")).toPandas()

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : <DATE> , <NAME> , M.D ."
2,", Name : Hendrickson , Ora MR .",", Name : <NAME> MR ."
3,"# 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 .","# <ID> Date : <DATE> PCP : <NAME> , <AGE> years-old , Record date : <DATE> ."
4,Cocke County Baptist Hospital .,<LOCATION> .
5,0295 Keats Street,<LOCATION>


### Use obfuscation mode

In the obfuscation mode **DeIdentificationModel** will replace sensetive entities with random values of the same type. 

Will be replaced: 
- Name
- Location
- Contacts
- Profession

Will be tagged:
- Age
- Date
- ID

In [28]:
obfuscation = DeIdentificationModel.pretrained("deidentify_large", "en", "clinical/models") \
      .setInputCols(["sentence", "token", "ner_chunk"]) \
      .setOutputCol("deidentified") \
      .setMode("obfuscate")\
      .setObfuscateDate(True)

        #.setObfuscateRefFile('obfuscation.txt') \
        #.setRefSep('#') \
        #.setRegexPatternsDictionary('regex_dict.txt', 'TEXT')
      

deidentify_large download started this may take some time.
Approximate size to download 188.1 KB
[OK!]


sample obfuscation.txt
```
Marvin MARSHALL#DOCTOR
Hubert GROGAN#DOCTOR
ALTHEA COLBURN#DOCTOR
Kalil AMIN#DOCTOR
Inci FOUNTAIN#DOCTOR
Ekaterina Rosa#DOCTOR
Rudiger Chao#DOCTOR
COLLETTE KOHLER#DOCTOR
Mufi HIGGS#DOCTOR
```

sample regex_dict.txt
```
AGE \d{1,2}\s(?:months|weeks|mos)
MEDICALRECORD (?<=MRN\s)(\w+
```

In [29]:
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|      ner_overwrited|           ner_chunk|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|
A . Record date ...|[[document, 0, 21...|[[document, 1, 3,...|[[token, 1, 1, A,...|[[word_embeddings...|[[named_entity, 1...|[[named_entity, 1...|[[chunk, 19, 28, ...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [30]:
obfusated_text = obfuscation.transform(result)

In [31]:
obfusated_text.select('ner.result').take(1)

[Row(result=['O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'B-NAME', 'I-NAME', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME', 'I-NAME', 'I-NAME', 'O', 'O', 'O', 'B-ID', 'O', 'O', 'B-DATE', 'O', 'O', 'B-NAME', 'O', 'B-AGE', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION'])]

In [32]:
obfusated_text.select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")).toPandas()

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : 2093-02-17 , FAWNIA , M.D ."
2,", Name : Hendrickson , Ora MR .",", Name : Doll MR ."
3,"# 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 .","# <ID> Date : <DATE> PCP : Charlene , <AGE> years-old , Record date : 2080-01-07 ."
4,Cocke County Baptist Hospital .,Delphos .
5,0295 Keats Street,Elk Garden


Obfuscate date

To mask the Date we need to setting up some properties in the DeIdentification module.This properties are setting up using the following functions.
* setObfuscateDate(True): Select if we going to obfuscate the dates
* setDateTag("DATE"): Tag representing dates
* setDays(5):  Number of days to obfuscate the dates by displacement. If not provided a random integer between 1 and 60 will be used
* setDateToYear(False): True if dates must be converted to years, false otherwise
* setDateFormats(["DD/MM/YYY"]): Format of dates to displace

In [33]:
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|      ner_overwrited|           ner_chunk|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|
A . Record date ...|[[document, 0, 21...|[[document, 1, 3,...|[[token, 1, 1, A,...|[[word_embeddings...|[[named_entity, 1...|[[named_entity, 1...|[[chunk, 19, 28, ...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [34]:
obfuscation = DeIdentificationModel.pretrained("deidentify_large", "en", "clinical/models") \
      .setInputCols(["sentence", "token", "ner_chunk"]) \
      .setOutputCol("deidentified") \
      .setMode("obfuscate")\
      .setObfuscateDate(True)\
      .setDateTag("DATE")\
      .setDays(5)\
      .setDateFormats(["MM/dd/yy","yyyy-MM-dd"])


deidentify_large download started this may take some time.
Approximate size to download 188.1 KB
[OK!]


In [35]:
obfusated_text = obfuscation.transform(result)

In [36]:
obfusated_text.select('ner.result')

DataFrame[result: array<string>]

In [37]:
obfusated_text.select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")).toPandas()

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : 2093-01-18 , SOPHIE , M.D ."
2,", Name : Hendrickson , Ora MR .",", Name : TORIE MR ."
3,"# 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 .","# <ID> Date : 01/18/93 PCP : DAVE , <AGE> years-old , Record date : 2079-11-14 ."
4,Cocke County Baptist Hospital .,Graford .
5,0295 Keats Street,Graford


## Use full pipeline in the Light model

In [38]:
finisher = Finisher() \
    .setInputCols("deidentified")

ner_converter = NerConverterInternal()\
  .setInputCols(["sentence", "token", "ner_overwrited"])\
  .setOutputCol("ner_chunk") 

In [39]:
pipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    neroverwriter,
    ner_converter,
    obfuscation])

In [40]:

empty_data = spark.createDataFrame([[""]]).toDF("text")

model = pipeline.fit(empty_data)

In [41]:
light_model = LightPipeline(model)

In [42]:
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01-13-1993 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street
'''

In [43]:
annotated_text = light_model.annotate(text)
annotated_text['deidentified']

['A .',
 'Record date : 2093-01-18 , Rosabelle , M.D .',
 ', Name : BRANDIE MR .',
 '# <ID> Date : <DATE> PCP : SANDOR , <AGE> years-old , Record date : 2079-11-14 .',
 'Lyford .',
 'Clairton']

In [44]:
list(zip(annotated_text['token'], annotated_text['ner']))

[('A', 'O'),
 ('.', 'O'),
 ('Record', 'O'),
 ('date', 'O'),
 (':', 'O'),
 ('2093-01-13', 'B-DATE'),
 (',', 'O'),
 ('David', 'B-NAME'),
 ('Hale', 'I-NAME'),
 (',', 'O'),
 ('M.D', 'O'),
 ('.', 'O'),
 (',', 'O'),
 ('Name', 'O'),
 (':', 'O'),
 ('Hendrickson', 'B-NAME'),
 (',', 'I-NAME'),
 ('Ora', 'I-NAME'),
 ('MR', 'O'),
 ('.', 'O'),
 ('#', 'O'),
 ('7194334', 'B-ID'),
 ('Date', 'O'),
 (':', 'O'),
 ('01-13-1993', 'B-DATE'),
 ('PCP', 'O'),
 (':', 'O'),
 ('Oliveira', 'B-NAME'),
 (',', 'O'),
 ('25', 'B-AGE'),
 ('years-old', 'O'),
 (',', 'O'),
 ('Record', 'O'),
 ('date', 'O'),
 (':', 'O'),
 ('2079-11-09', 'B-DATE'),
 ('.', 'O'),
 ('Cocke', 'B-LOCATION'),
 ('County', 'I-LOCATION'),
 ('Baptist', 'I-LOCATION'),
 ('Hospital', 'I-LOCATION'),
 ('.', 'O'),
 ('0295', 'B-LOCATION'),
 ('Keats', 'I-LOCATION'),
 ('Street', 'I-LOCATION')]

In [45]:
annotated_text['ner_chunk']

['2093-01-13',
 'David Hale',
 'Hendrickson , Ora',
 '7194334',
 '01-13-1993',
 'Oliveira',
 '25',
 '2079-11-09',
 'Cocke County Baptist Hospital',
 '0295 Keats Street']

In [46]:
source_text = '''Record date : 2093-01-13, David Hale, M.D. is manager, 
Name: Hendrickson, Ora MR. # 7194334 Date: 01-13-1993 PCP: Oliveira.
Record date: 2079-11-09. Cocke County Baptist Hospital. 0295 Keats Street.
This 17-yr-old male, presented with chest heaviness that started during a pick-up basketball game. His past medical history was unremarkable. He denied prior cardiac symptoms and suffered no chest trauma during the game. His father had suffered an acute myocardial infarction at age 38. The patient was a nonsmoker, did not drink alcohol, and denied recreational drug use. He swallowed a tablet of aspirin before coming to the emergency room. His blood pressure was 160/90 mm Hg, and his heart rate was 80 bpm. Physical examination revealed no stigmata of Marfan syndrome. The rest of his physical examination was normal.'''

annotated_text = light_model.annotate(source_text)
annotated_text['deidentified']

['Record date : 2093-01-18, Rosabelle, M.D. is Fish and Game Wardens, \nName: BRANDIE MR. # <ID> Date: (408)535-0622 PCP: SANDOR.',
 'Record date: 2079-11-14.',
 'Lyford.',
 'Clairton.',
 'This <AGE> male, presented with chest heaviness that started during a pick-up basketball game.',
 'His past medical history was unremarkable.',
 'He denied prior cardiac symptoms and suffered no chest trauma during the game.',
 'His father had suffered an acute myocardial infarction at age',
 '<AGE>. The patient was a nonsmoker, did not drink alcohol, and denied recreational drug use.',
 'He swallowed a tablet of aspirin before coming to the emergency room.',
 'His blood pressure was 160/90 mm Hg, and his heart rate was 80 bpm.',
 'Physical examination revealed no stigmata of Marfan syndrome.',
 'The rest of his physical examination was normal.']

## Train custom Model


In [47]:
glove_embeddings = WordEmbeddingsModel.pretrained('glove_100d')\
          .setInputCols(["document", "token"])\
          .setOutputCol("embeddings")

public_ner = NerDLModel.pretrained("ner_dl", 'en') \
          .setInputCols(["document", "token", "embeddings"]) \
          .setOutputCol("ner")

ner_converter = NerConverterInternal()\
  .setInputCols(["sentence", "token", "ner"])\
  .setOutputCol("ner_chunk")


glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


In [48]:

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    glove_embeddings,
    public_ner,
    ner_converter])

empty_data = spark.createDataFrame([[""]]).toDF("text")

custom_model = nlpPipeline.fit(result)

In [49]:
source_text = '''Record date : 2093-01-13, David Hale lives in New York'''

In [50]:
result = custom_model.transform(spark.createDataFrame([[source_text]]).toDF("text"))

#### We got new entities in the model 'PER' and 'ORG'

In [51]:
result.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+----------+---------+
|chunk     |ner_label|
+----------+---------+
|David Hale|PER      |
|New York  |LOC      |
+----------+---------+



### Mask custom entities

In [52]:
deidentification = DeIdentification() \
      .setInputCols(["sentence", "token", "ner_chunk"]) \
      .setOutputCol("deidentified") \
      .setMode("mask")

In [53]:
deid_model = deidentification.fit(result)
deid_text = deid_model.transform(result)

In [54]:
deid_text.select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")).toPandas()

Unnamed: 0,sentence,deidentified
0,"Record date : 2093-01-13, David Hale lives in New York","Record date : 2093-01-13, <PER> lives in <LOC>"


In [55]:
deid_model.write().overwrite().save('custom_deid_masker_model')

 To be able to obfuscate new entities we have to create custom Deidentification Model with new dictionary

In [56]:
obfuscation_list = ['Marvin MARSHALL#PER',
      'Hubert GROGAN#PER',
      'ALTHEA COLBURN#PER',
      'Kalil AMIN#PER',
      'Inci FOUNTAIN#PER',
      'Surrey#ORG',
      'Warwickshire#ORG',
      'Derbyshire#ORG',
      'Leicestershire#ORG',
      'Glamorgan#ORG',
      'Durham#ORG',
      'Los Angeles#LOC']

with open('obfuscation.txt', 'a') as the_file:
    for line in obfuscation_list:
      the_file.write(line + '\n')

In [57]:
obfuscator = DeIdentification() \
    .setInputCols(["sentence", "token", "ner_chunk"]) \
    .setOutputCol("deidentified") \
    .setMode('obfuscate') \
    .setRefSep('#') \
    .setObfuscateRefFile('obfuscation.txt') \
    .setObfuscateDate(True)


In [58]:
obfuscator_model = obfuscator.fit(result)

In [59]:
obfuscator_model.write().overwrite().save('custom_obfuscator_model')

In [60]:
obfusc_text = obfuscator_model.transform(result)

In [61]:
obfusc_text.select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")).toPandas()

Unnamed: 0,sentence,deidentified
0,"Record date : 2093-01-13, David Hale lives in New York","Record date : 2093-01-13, Kalil AMIN lives in Los Angeles"
