## Text Preprocessing with Spark NLP

In [None]:
#data
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/annotation/english/spark-nlp-basics/sample-sentences-en.txt

In [None]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-09-27 19:54:55--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2021-09-27 19:54:55--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-09-27 19:54:55--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:44

In [None]:
import sparknlp
# Start a Spark session through Spark NLP's helper; `spark` is the handle used by the cells below.
spark= sparknlp.start()

In [None]:
text = "Galatasaray is the firts club that has won the UEFA cup in Turkey"

# One row with one field: the nested list becomes a single-row DataFrame,
# and toDF names its only column "text".
single_row = [[text]]
spark_df = spark.createDataFrame(single_row).toDF("text")
spark_df.show(truncate=False)

+-----------------------------------------------------------------+
|text                                                             |
+-----------------------------------------------------------------+
|Galatasaray is the firts club that has won the UEFA cup in Turkey|
+-----------------------------------------------------------------+



In [None]:
# to create sparkDF from a list of string
from pyspark.sql.types import StringType, IntegerType

In [None]:
# Build a single-column DataFrame directly from a Python list of strings:
# each list element becomes one row of the "text" column.
text_list = [
    'Peter Parker is a nice guy and lives in New York.',
    'Bruce Wayne is also a nice guy and lives in Gotham City.',
]
text_list_df = spark.createDataFrame(text_list, StringType()).toDF("text")
text_list_df.show(truncate=False)

+--------------------------------------------------------+
|text                                                    |
+--------------------------------------------------------+
|Peter Parker is a nice guy and lives in New York.       |
|Bruce Wayne is also a nice guy and lives in Gotham City.|
+--------------------------------------------------------+



In [None]:
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/annotation/english/spark-nlp-basics/sample-sentences-en.txt

In [None]:
# Print the raw file contents. The path is relative because wget saved the file
# into the current working directory (which is /content on Colab), and the
# encoding is explicit so the read does not depend on the platform default.
with open("sample-sentences-en.txt", encoding="utf-8") as sample_file:
  print(sample_file.read())

Peter is a very good person.
My life in Russia is very interesting.
John and Peter are brothers. However they don't support each other that much.
Lucas Nogal Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!


In [None]:
# Load the file with one DataFrame row per line of text and name the column "text".
# The path is relative to the working directory, where wget saved the file,
# rather than hard-coded to Colab's /content folder.
df = spark.read.text("sample-sentences-en.txt").toDF("text")
df.show(truncate=False)

+-----------------------------------------------------------------------------+
|text                                                                         |
+-----------------------------------------------------------------------------+
|Peter is a very good person.                                                 |
|My life in Russia is very interesting.                                       |
|John and Peter are brothers. However they don't support each other that much.|
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.         |
|Europe is very culture rich. There are huge churches! and big houses!        |
+-----------------------------------------------------------------------------+



In [None]:
# Read every .txt file in the working directory as a whole: each record is a
# (file path, full file content) pair, requested with at least 2 partitions.
text_files = spark.sparkContext.wholeTextFiles("./*.txt", minPartitions=2)

# Name the two fields so the pairs become a two-column DataFrame.
text_folder_df = text_files.toDF(schema=["path", "text"])
text_folder_df.show(truncate=20)

+--------------------+--------------------+
|                path|                text|
+--------------------+--------------------+
|file:/content/sam...|Peter is a very g...|
+--------------------+--------------------+



In [None]:
# Look at the first row of the "text" column: it holds the entire file, newlines included.
text_folder_df.select("text").take(1)

[Row(text="Peter is a very good person.\nMy life in Russia is very interesting.\nJohn and Peter are brothers. However they don't support each other that much.\nLucas Nogal Dunbercker is no longer happy. He has a good car though.\nEurope is very culture rich. There are huge churches! and big houses!")]

In [None]:
# Bring every row of the "text" column back as a Python list of Row objects.
text_folder_df.select("text").collect()

[Row(text="Peter is a very good person.\nMy life in Russia is very interesting.\nJohn and Peter are brothers. However they don't support each other that much.\nLucas Nogal Dunbercker is no longer happy. He has a good car though.\nEurope is very culture rich. There are huge churches! and big houses!")]

In [None]:
# Default show(): long values are cut off with "..." (compare with truncate=False above).
df.show()

+--------------------+
|                text|
+--------------------+
|Peter is a very g...|
|My life in Russia...|
|John and Peter ar...|
|Lucas Nogal Dunbe...|
|Europe is very cu...|
+--------------------+



In [None]:
from sparknlp.base import * 

In [None]:
# DocumentAssembler turns the raw "text" column into a "document" annotation column.
# The "shrink" cleanup mode is applied while assembling (see the DocumentAssembler
# documentation for exactly what it normalises).
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

# DocumentAssembler needs no fitting, so it can transform the DataFrame directly.
doc_df = documentAssembler.transform(df)

In [None]:
# Show the original text next to its document annotation, cutting each cell at 60 characters.
doc_df.show(truncate=60)

+------------------------------------------------------------+------------------------------------------------------------+
|                                                        text|                                                    document|
+------------------------------------------------------------+------------------------------------------------------------+
|                                Peter is a very good person.|[[document, 0, 27, Peter is a very good person., [sentenc...|
|                      My life in Russia is very interesting.|[[document, 0, 37, My life in Russia is very interesting....|
|John and Peter are brothers. However they don't support e...|[[document, 0, 76, John and Peter are brothers. However t...|
|Lucas Nogal Dunbercker is no longer happy. He has a good ...|[[document, 0, 67, Lucas Nogal Dunbercker is no longer ha...|
|Europe is very culture rich. There are huge churches! and...|[[document, 0, 68, Europe is very culture rich. There are...|
+-------

In [None]:
# Inspect the structure of the "document" annotation column.
doc_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)



In [None]:
# Pull individual annotation fields out of the "document" column: the text plus its begin/end offsets.
doc_df.select("document.result", "document.begin", "document.end").show(truncate=40)

+----------------------------------------+-----+----+
|                                  result|begin| end|
+----------------------------------------+-----+----+
|          [Peter is a very good person.]|  [0]|[27]|
|[My life in Russia is very interesting.]|  [0]|[37]|
|[John and Peter are brothers. However...|  [0]|[76]|
|[Lucas Nogal Dunbercker is no longer ...|  [0]|[67]|
|[Europe is very culture rich. There a...|  [0]|[68]|
+----------------------------------------+-----+----+



In [None]:
# First row only: "document.result" is an array, so the text comes back wrapped in a list.
doc_df.select("document.result").take(1)

[Row(result=['Peter is a very good person.'])]

In [None]:
from sparknlp.annotator import * 

In [None]:
# Sentence detector: reads the "document" annotations and writes the detected
# sentences to a new "sentence" column.
sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

sent_df = sentence.transform(doc_df)

In [None]:
# Compare the document column with the new sentence column, cutting each cell at 40 characters.
sent_df.show(truncate=40)

+----------------------------------------+----------------------------------------+----------------------------------------+
|                                    text|                                document|                                sentence|
+----------------------------------------+----------------------------------------+----------------------------------------+
|            Peter is a very good person.|[[document, 0, 27, Peter is a very go...|[[document, 0, 27, Peter is a very go...|
|  My life in Russia is very interesting.|[[document, 0, 37, My life in Russia ...|[[document, 0, 37, My life in Russia ...|
|John and Peter are brothers. However ...|[[document, 0, 76, John and Peter are...|[[document, 0, 27, John and Peter are...|
|Lucas Nogal Dunbercker is no longer h...|[[document, 0, 67, Lucas Nogal Dunber...|[[document, 0, 41, Lucas Nogal Dunber...|
|Europe is very culture rich. There ar...|[[document, 0, 68, Europe is very cul...|[[document, 0, 27, Europe is very cul...|


In [None]:
# List the sentence detector's parameters together with their current values.
sentence.extractParamMap()

{Param(parent='SentenceDetector_fb7d225876cf', name='customBounds', doc='characters used to explicitly mark sentence bounds'): [],
 Param(parent='SentenceDetector_fb7d225876cf', name='detectLists', doc='whether detect lists during sentence detection'): True,
 Param(parent='SentenceDetector_fb7d225876cf', name='explodeSentences', doc='whether to explode each sentence into a different row, for better parallelization. Defaults to false.'): False,
 Param(parent='SentenceDetector_fb7d225876cf', name='inputCols', doc='previous annotations columns, if renamed'): ['document'],
 Param(parent='SentenceDetector_fb7d225876cf', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='SentenceDetector_fb7d225876cf', name='maxLength', doc='Set the maximum allowed length for each sentence'): 99999,
 Param(parent='SentenceDetector_fb7d225876cf', name='minLength', doc='Set the minimum allowed length for each sentence.'): 0,
 Param(parent='Sentenc

In [None]:
# Show only the sentence texts: one array of sentences per input row.
sent_df.select("sentence.result").show(truncate=40)

+----------------------------------------+
|                                  result|
+----------------------------------------+
|          [Peter is a very good person.]|
|[My life in Russia is very interesting.]|
|[John and Peter are brothers., Howeve...|
|[Lucas Nogal Dunbercker is no longer ...|
|[Europe is very culture rich., There ...|
+----------------------------------------+



In [None]:
# Deep-learning sentence detector: load the pretrained English model and wire it
# to the "document" column. pretrained() is called on the class itself, so no
# throwaway SentenceDetectorDLModel instance is constructed just to reach it.
sent_dl = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl", "en")
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

sentence_detector_dl download started this may take some time.
Approximate size to download 354.6 KB
[OK!]


In [None]:
# The pretrained model is already fitted, so it can transform the documents directly.
sent_dl_df = sent_dl.transform(doc_df)

# Show only the detected sentence texts, cutting each cell at 40 characters.
dl_sentence_texts = sent_dl_df.select("sentences.result")
dl_sentence_texts.show(truncate=40)

+----------------------------------------+
|                                  result|
+----------------------------------------+
|          [Peter is a very good person.]|
|[My life in Russia is very interesting.]|
|[John and Peter are brothers., Howeve...|
|[Lucas Nogal Dunbercker is no longer ...|
|[Europe is very culture rich., There ...|
+----------------------------------------+



In [None]:
from pyspark.sql import functions as F

In [None]:
# explode() gives every sentence in the array its own row (the output column is named "col").
sent_dl_df.select(F.explode("sentences.result")).show(truncate=40)

+----------------------------------------+
|                                     col|
+----------------------------------------+
|            Peter is a very good person.|
|  My life in Russia is very interesting.|
|            John and Peter are brothers.|
|However they don't support each other...|
|Lucas Nogal Dunbercker is no longer h...|
|               He has a good car though.|
|            Europe is very culture rich.|
|There are huge churches! and big houses!|
+----------------------------------------+



In [None]:
# Tokenizer with default settings: reads "document" annotations and writes tokens to "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

In [None]:
# List the tokenizer's parameters together with their current values.
tokenizer.extractParamMap()

{Param(parent='Tokenizer_f40954ec1d53', name='caseSensitiveExceptions', doc='Whether to care for case sensitiveness in exceptions'): True,
 Param(parent='Tokenizer_f40954ec1d53', name='contextChars', doc='character list used to separate from token boundaries'): ['.',
  ',',
  ';',
  ':',
  '!',
  '?',
  '*',
  '-',
  '(',
  ')',
  '"',
  "'"],
 Param(parent='Tokenizer_f40954ec1d53', name='inputCols', doc='previous annotations columns, if renamed'): ['document'],
 Param(parent='Tokenizer_f40954ec1d53', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='Tokenizer_f40954ec1d53', name='maxLength', doc='Set the maximum allowed legth for each token'): 99999,
 Param(parent='Tokenizer_f40954ec1d53', name='minLength', doc='Set the minimum allowed legth for each token'): 0,
 Param(parent='Tokenizer_f40954ec1d53', name='outputCol', doc='output annotation column. can be left default.'): 'token',
 Param(parent='Tokenizer_f40954ec1d53',

In [None]:
# A sentence with parentheses, a multi-word name and a hyphenated word to exercise the tokenizer.
text = 'Peter Parker (Spiderman) is a nice guy and lives in New York but has no e-mail!'

# Single-row, single-column DataFrame with the column named "text".
one_row = [[text]]
spark_df = spark.createDataFrame(one_row).toDF("text")

In [None]:
# Assemble the new sentence into a document annotation.
doc_df = documentAssembler.transform(spark_df)

# Unlike the previous annotators, Tokenizer is fitted first and the fitted model does the transform.
tokenizer_model = tokenizer.fit(doc_df)
token_df = tokenizer_model.transform(doc_df)
token_df.show(truncate=40)

+----------------------------------------+----------------------------------------+----------------------------------------+
|                                    text|                                document|                                   token|
+----------------------------------------+----------------------------------------+----------------------------------------+
|Peter Parker (Spiderman) is a nice gu...|[[document, 0, 78, Peter Parker (Spid...|[[token, 0, 4, Peter, [sentence -> 0]...|
+----------------------------------------+----------------------------------------+----------------------------------------+



In [None]:
# Tokens from the default tokenizer: the parentheses and "!" are separate tokens and "e-mail" stays whole.
token_df.select("token.result").take(1)

[Row(result=['Peter', 'Parker', '(', 'Spiderman', ')', 'is', 'a', 'nice', 'guy', 'and', 'lives', 'in', 'New', 'York', 'but', 'has', 'no', 'e-mail', '!'])]

In [None]:
# Customised tokenizer:
#   - only "?" and "!" are treated as context characters,
#   - tokens are additionally split on "-",
#   - "New York" is registered as an exception so it stays a single token.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
    .setContextChars(["?", "!"])
    .setSplitChars(["-"])
    .addException("New York")
)

custom_tokenizer_model = tokenizer.fit(doc_df)
token_df = custom_tokenizer_model.transform(doc_df)
token_df.show(truncate=40)

+----------------------------------------+----------------------------------------+----------------------------------------+
|                                    text|                                document|                                   token|
+----------------------------------------+----------------------------------------+----------------------------------------+
|Peter Parker (Spiderman) is a nice gu...|[[document, 0, 78, Peter Parker (Spid...|[[token, 0, 4, Peter, [sentence -> 0]...|
+----------------------------------------+----------------------------------------+----------------------------------------+



In [None]:
# Tokens from the customised tokenizer: "(Spiderman)" and "New York" stay whole, "e-mail" is split.
token_df.select("token.result").show(truncate=False)

+--------------------------------------------------------------------------------------------------+
|result                                                                                            |
+--------------------------------------------------------------------------------------------------+
|[Peter, Parker, (Spiderman), is, a, nice, guy, and, lives, in, New York, but, has, no, e, mail, !]|
+--------------------------------------------------------------------------------------------------+



## Spark ML Pipeline

In [None]:
from pyspark.ml import Pipeline

In [None]:
# Stage 1: raw text -> document annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: document -> sentences.
sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: sentences -> tokens, splitting on "-" and using only "?" and "!" as context characters.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    .setSplitChars(["-"])
    .setContextChars(["?", "!"])
)

# Chain the stages so each one consumes the column produced by the previous one.
pipeline_stages = [documentAssembler, sentencer, tokenizer]
nlp_pipeline = Pipeline(stages=pipeline_stages)

In [None]:
# Re-read the sample sentences (one row per line). The path is relative to the
# working directory, where wget saved the file, instead of Colab's /content folder.
df = spark.read.text("sample-sentences-en.txt").toDF("text")

# Placeholder one-row DataFrame with the same "text" column, used only for fit().
empty_df = spark.createDataFrame([[" "]]).toDF("text")

# Fit on the placeholder, then run the fitted pipeline over the real data.
pipeline_model = nlp_pipeline.fit(empty_df)
result = pipeline_model.transform(df)

In [None]:
# Every pipeline stage has added its own column: document, sentence and token.
result.show()

+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|
+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[[document, 0, 27...|[[document, 0, 27...|[[token, 0, 4, Pe...|
|My life in Russia...|[[document, 0, 37...|[[document, 0, 37...|[[token, 0, 1, My...|
|John and Peter ar...|[[document, 0, 76...|[[document, 0, 27...|[[token, 0, 3, Jo...|
|Lucas Nogal Dunbe...|[[document, 0, 67...|[[document, 0, 41...|[[token, 0, 4, Lu...|
|Europe is very cu...|[[document, 0, 68...|[[document, 0, 27...|[[token, 0, 5, Eu...|
+--------------------+--------------------+--------------------+--------------------+



In [None]:
# Inspect the schema of the pipeline output; the annotation columns share the same struct layout.
result.printSchema()

root
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true

In [None]:
# Show the full sentence texts detected for each input row.
result.select("sentence.result").show(truncate=False)

+--------------------------------------------------------------------------------+
|result                                                                          |
+--------------------------------------------------------------------------------+
|[Peter is a very good person.]                                                  |
|[My life in Russia is very interesting.]                                        |
|[John and Peter are brothers., However they don't support each other that much.]|
|[Lucas Nogal Dunbercker is no longer happy., He has a good car though.]         |
|[Europe is very culture rich., There are huge churches!, and big houses!]       |
+--------------------------------------------------------------------------------+



In [None]:
# Pipeline: text -> document -> sentence -> token -> normalized token.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Lowercase every token and strip characters matching the cleanup pattern
# (anything that is not a word character, digit or whitespace).
# The pattern is a raw string: "\w", "\d" and "\s" are regex classes, not Python
# escapes, and a non-raw literal triggers an invalid-escape-sequence warning.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
    .setCleanupPatterns([r"[^\w\d\s]"])
)

nlpPipeline = Pipeline(stages=[
    documentAssembler,
    sentencer,
    tokenizer,
    normalizer,
])

# Placeholder one-row DataFrame used only for fit(); the real text goes through transform().
empty_df = spark.createDataFrame([[" "]]).toDF("text")

pipeline_model = nlpPipeline.fit(empty_df)

result = pipeline_model.transform(df)

In [None]:
# Inspect the schema of the output of the pipeline that includes the normalizer.
result.printSchema()

root
 |-- text: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true

In [None]:
# The "normalized" column now sits next to "token"; its values are lowercased.
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          normalized|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[[document, 0, 27...|[[document, 0, 27...|[[token, 0, 4, Pe...|[[token, 0, 4, pe...|
|My life in Russia...|[[document, 0, 37...|[[document, 0, 37...|[[token, 0, 1, My...|[[token, 0, 1, my...|
|John and Peter ar...|[[document, 0, 76...|[[document, 0, 27...|[[token, 0, 3, Jo...|[[token, 0, 3, jo...|
|Lucas Nogal Dunbe...|[[document, 0, 67...|[[document, 0, 41...|[[token, 0, 4, Lu...|[[token, 0, 4, lu...|
|Europe is very cu...|[[document, 0, 68...|[[document, 0, 27...|[[token, 0, 5, Eu...|[[token, 0, 5, eu...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



In [None]:
# First two rows of normalized tokens: lowercased, with the punctuation tokens gone.
result.select("normalized.result").take(2)

[Row(result=['peter', 'is', 'a', 'very', 'good', 'person']),
 Row(result=['my', 'life', 'in', 'russia', 'is', 'very', 'interesting'])]

In [None]:
# Stop word remover: reads "token", writes "cleaned", and matches case-insensitively.
# No custom list is set, so the annotator's default stop words are used; to supply
# your own, add .setStopWords(["no", "without"]) (e.g. read a list of words from a txt).
# The chain is wrapped in parentheses so that no line-continuation backslash is
# left dangling in front of a comment.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["token"])
    .setOutputCol("cleaned")
    .setCaseSensitive(False)
)

# Display the stop word list currently in effect.
stopwords_cleaner.getStopWords()

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [None]:
# Pipeline: text -> document -> sentence -> token -> tokens with stop words removed.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# stopwords_cleaner is the annotator configured in the previous cell (token -> cleaned).
stopword_stages = [documentAssembler, sentencer, tokenizer, stopwords_cleaner]
nlpPipeline = Pipeline(stages=stopword_stages)

# Placeholder one-row DataFrame used only for fit().
empty_df = spark.createDataFrame([[" "]]).toDF("text")
pipeline_model = nlpPipeline.fit(empty_df)

In [None]:
# Run the fitted stop word pipeline over the sample sentences.
result = pipeline_model.transform(df)

# Show all columns side by side, cutting each cell at 40 characters.
result.show(truncate=40)

+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+
|                                    text|                                document|                                sentence|                                   token|                                 cleaned|
+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+
|            Peter is a very good person.|[[document, 0, 27, Peter is a very go...|[[document, 0, 27, Peter is a very go...|[[token, 0, 4, Peter, [sentence -> 0]...|[[token, 0, 4, Peter, [sentence -> 0]...|
|  My life in Russia is very interesting.|[[document, 0, 37, My life in Russia ...|[[document, 0, 37, My life in Russia ...|[[token, 0, 1, My, [sentence -> 0], [...|[[token

In [None]:
# Tokens left after stop word removal; punctuation is still present because no normalizer ran here.
result.select("cleaned.result").show(truncate=False)

+------------------------------------------------------------------+
|result                                                            |
+------------------------------------------------------------------+
|[Peter, good, person, .]                                          |
|[life, Russia, interesting, .]                                    |
|[John, Peter, brothers, ., However, support, much, .]             |
|[Lucas, Nogal, Dunbercker, longer, happy, ., good, car, though, .]|
|[Europe, culture, rich, ., huge, churches, !, big, houses, !]     |
+------------------------------------------------------------------+



In [None]:
# Pipeline: text -> document -> sentences -> token -> normalized -> cleanTokens -> assembled.
# Every chain is wrapped in parentheses: the original trailing backslashes before
# blank lines only worked by accident and would swallow any line added after them.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentencer = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("token")
)

# Keep the original casing; the normalizer's cleanup patterns are left at their defaults.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(False)
)

# Input columns are passed as a list, consistent with every other annotator in this notebook.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Join the surviving tokens of each sentence back into text.
token_assembler = (
    TokenAssembler()
    .setInputCols(["sentences", "cleanTokens"])
    .setOutputCol("assembled")
)

nlpPipeline = Pipeline(stages=[
    documentAssembler,
    sentencer,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    token_assembler,
])

# Placeholder one-row DataFrame used only for fit(); the real text goes through transform().
empty_df = spark.createDataFrame([[" "]]).toDF("text")
pipeline_model = nlpPipeline.fit(empty_df)

result = pipeline_model.transform(df)

In [None]:
# Show every stage's column, from the raw text through to "assembled", cutting each cell at 50 characters.
result.show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+
|                                              text|                                          document|                                         sentences|                                             token|                                        normalized|                                       cleanTokens|                                         assembled|
+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------------------------------+--------------------------

In [None]:
# If we use TokenAssembler().setPreservePosition(True), the original token boundaries are preserved (dropped and unwanted characters are replaced by spaces).


In [None]:
# Full annotation for the first row: the reassembled sentence with its offsets and sentence index.
result.select("assembled").take(1)

[Row(assembled=[Row(annotatorType='document', begin=0, end=16, result='Peter good person', metadata={'sentence': '0'}, embeddings=[])])]

In [None]:
# One output row per reassembled sentence, shown next to the original text it came from.
result.select("text", F.explode("assembled.result").alias("cleaned_text")).show(truncate=False)

+-----------------------------------------------------------------------------+-----------------------------------+
|text                                                                         |cleaned_text                       |
+-----------------------------------------------------------------------------+-----------------------------------+
|Peter is a very good person.                                                 |Peter good person                  |
|My life in Russia is very interesting.                                       |life Russia interesting            |
|John and Peter are brothers. However they don't support each other that much.|John Peter brothers                |
|John and Peter are brothers. However they don't support each other that much.|However dont support much          |
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.         |Lucas Nogal Dunbercker longer happy|
|Lucas Nogal Dunbercker is no longer happy. He has a good car though.   

In [None]:
import pandas as pd
# Collect the exploded (text, cleaned_text) pairs into a pandas DataFrame for display.
result.select("text", F.explode("assembled.result").alias("cleaned_text")).toPandas()

Unnamed: 0,text,cleaned_text
0,Peter is a very good person.,Peter good person
1,My life in Russia is very interesting.,life Russia interesting
2,John and Peter are brothers. However they don'...,John Peter brothers
3,John and Peter are brothers. However they don'...,However dont support much
4,Lucas Nogal Dunbercker is no longer happy. He ...,Lucas Nogal Dunbercker longer happy
5,Lucas Nogal Dunbercker is no longer happy. He ...,good car though
6,Europe is very culture rich. There are huge ch...,Europe culture rich
7,Europe is very culture rich. There are huge ch...,huge churches
8,Europe is very culture rich. There are huge ch...,big houses


In [None]:
# Flatten every `assembled` annotation into its own row, then keep only the
# character offsets, the cleaned text and the sentence index from the metadata.
(
    result
    .withColumn("tmp", F.explode("assembled"))
    .select("tmp.*")
    .select("begin", "end", "result", "metadata.sentence")
    .show(truncate=False)
)

+-----+---+-----------------------------------+--------+
|begin|end|result                             |sentence|
+-----+---+-----------------------------------+--------+
|0    |16 |Peter good person                  |0       |
|0    |22 |life Russia interesting            |0       |
|0    |18 |John Peter brothers                |0       |
|29   |53 |However dont support much          |1       |
|0    |34 |Lucas Nogal Dunbercker longer happy|0       |
|43   |57 |good car though                    |1       |
|0    |18 |Europe culture rich                |0       |
|29   |41 |huge churches                      |1       |
|54   |63 |big houses                         |2       |
+-----+---+-----------------------------------+--------+



+-------------+-----+---+--------------------+---------------+----------+
|annotatorType|begin|end|              result|       metadata|embeddings|
+-------------+-----+---+--------------------+---------------+----------+
|     document|    0| 16|   Peter good person|[sentence -> 0]|        []|
|     document|    0| 22|life Russia inter...|[sentence -> 0]|        []|
|     document|    0| 18| John Peter brothers|[sentence -> 0]|        []|
|     document|   29| 53|However dont supp...|[sentence -> 1]|        []|
|     document|    0| 34|Lucas Nogal Dunbe...|[sentence -> 0]|        []|
|     document|   43| 57|     good car though|[sentence -> 1]|        []|
|     document|    0| 18| Europe culture rich|[sentence -> 0]|        []|
|     document|   29| 41|       huge churches|[sentence -> 1]|        []|
|     document|   54| 63|          big houses|[sentence -> 2]|        []|
+-------------+-----+---+--------------------+---------------+----------+



In [None]:
# Download the AntBNC lemma list that the Lemmatizer stage loads as its dictionary.
!wget -q https://raw.githubusercontent.com/mahavivo/vocabulary/master/lemmas/AntBNC_lemmas_ver_001.txt


In [None]:
# Stage definitions: raw text -> document -> tokens, with a stemmer and a
# dictionary-based lemmatizer both reading from the same `token` column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

stemmer = (
    Stemmer()
    .setInputCols(["token"])
    .setOutputCol("stem")
)

# key_delimiter / value_delimiter tell the Lemmatizer how to split each line
# of the downloaded dictionary file.
lemmatizer = (
    Lemmatizer()
    .setInputCols(["token"])
    .setOutputCol("lemma")
    .setDictionary("./AntBNC_lemmas_ver_001.txt", value_delimiter="\t", key_delimiter="->")
)

In [None]:
# Chain the stages in the order their input columns become available.
nlpPipeline = Pipeline(
    stages=[documentAssembler, tokenizer, stemmer, lemmatizer]
)

# Fit on a single blank-text row to obtain a PipelineModel, then apply it to `df`.
empty_df = spark.createDataFrame([[" "]]).toDF("text")
pipeline_model = nlpPipeline.fit(empty_df)

result = pipeline_model.transform(df)

In [None]:
# Preview all annotation columns produced by the pipeline.
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|                stem|               lemma|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter is a very g...|[[document, 0, 27...|[[token, 0, 4, Pe...|[[token, 0, 4, pe...|[[token, 0, 4, Pe...|
|My life in Russia...|[[document, 0, 37...|[[token, 0, 1, My...|[[token, 0, 1, my...|[[token, 0, 1, My...|
|John and Peter ar...|[[document, 0, 76...|[[token, 0, 3, Jo...|[[token, 0, 3, jo...|[[token, 0, 3, Jo...|
|Lucas Nogal Dunbe...|[[document, 0, 67...|[[token, 0, 4, Lu...|[[token, 0, 4, lu...|[[token, 0, 4, Lu...|
|Europe is very cu...|[[document, 0, 68...|[[token, 0, 5, Eu...|[[token, 0, 5, eu...|[[token, 0, 5, Eu...|
+--------------------+--------------------+--------------------+--------------------+--------------------+



In [None]:
# Zip the token, stem and lemma arrays element-wise, explode to one row per
# token, and expose the three zipped fields as named pandas columns.
result_df = (
    result
    .select(
        F.explode(
            F.arrays_zip("token.result", "stem.result", "lemma.result")
        ).alias("col")
    )
    .select(
        F.expr("col['0']").alias("token"),
        F.expr("col['1']").alias("stem"),
        F.expr("col['2']").alias("lemma"),
    )
    .toPandas()
)

result_df.head(10)

Unnamed: 0,token,stem,lemma
0,Peter,peter,Peter
1,is,i,be
2,a,a,a
3,very,veri,very
4,good,good,good
5,person,person,person
6,.,.,.
7,My,my,My
8,life,life,life
9,in,in,in


## NGram Generator 

In [None]:
from sparknlp.base import *
from sparknlp.annotator import *
import pyspark.sql.functions as F

In [None]:
# Sentence-level pipeline: document -> sentences -> tokens -> stems -> n-grams.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentencer = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentences')
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("token")
)

stemmer = (
    Stemmer()
    .setInputCols(["token"])
    .setOutputCol("stem")
)

# N-grams are built from the stems and joined with "_". N is 3 and cumulative
# mode is enabled (the saved result also lists unigrams and bigrams).
ngram = (
    NGramGenerator()
    .setInputCols(["stem"])
    .setOutputCol("ngram")
    .setN(3)
    .setDelimiter("_")
    .setEnableCumulative(True)
)

nlpPipeline = Pipeline(
    stages=[documentAssembler, sentencer, tokenizer, stemmer, ngram]
)

# Fit on a single blank-text row to obtain a PipelineModel, then apply it to `df`.
empty_df = spark.createDataFrame([[" "]]).toDF("text")
pipeline_model = nlpPipeline.fit(empty_df)

result = pipeline_model.transform(df)

In [None]:
# Show the generated n-gram strings, cutting each cell at 100 characters.
result.select("ngram.result").show(truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                              result|
+----------------------------------------------------------------------------------------------------+
|[peter, i, a, veri, good, person, ., peter_i, i_a, a_veri, veri_good, good_person, person_., pete...|
|[my, life, in, russia, i, veri, interest, ., my_life, life_in, in_russia, russia_i, i_veri, veri_...|
|[john, and, peter, ar, brother, ., john_and, and_peter, peter_ar, ar_brother, brother_., john_and...|
|[luca, nogal, dunberck, i, no, longer, happi, ., luca_nogal, nogal_dunberck, dunberck_i, i_no, no...|
|[europ, i, veri, cultur, rich, ., europ_i, i_veri, veri_cultur, cultur_rich, rich_., europ_i_veri...|
+----------------------------------------------------------------------------------------------------+



### Text Matcher (Entity Extractor)

In [None]:
# A bare TextMatcher wired to the document and token columns.
entity_extractor = (
    TextMatcher()
    .setInputCols(["document", "token"])
    .setOutputCol("matched_entities")
)

# Display the parameters (explicitly set and defaulted) of this annotator.
entity_extractor.extractParamMap()

{Param(parent='TextMatcher_23f4f952ec5f', name='caseSensitive', doc='whether to match regardless of case. Defaults true'): True,
 Param(parent='TextMatcher_23f4f952ec5f', name='inputCols', doc='previous annotations columns, if renamed'): ['document',
  'token'],
 Param(parent='TextMatcher_23f4f952ec5f', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='TextMatcher_23f4f952ec5f', name='mergeOverlapping', doc='whether to merge overlapping matched chunks. Defaults false'): False,
 Param(parent='TextMatcher_23f4f952ec5f', name='outputCol', doc='output annotation column. can be left default.'): 'matched_entities'}

In [None]:
# Download the news category training CSV into the current working directory.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv

In [None]:
# Load the news CSV (first line is the header) into a Spark DataFrame.
# The file is read from the working directory, where wget saved it, rather
# than from a Colab-only absolute "/content/..." path, so the cell also works
# when the notebook runs outside Colab.
news_df = (
    spark.read
    .option("header", True)
    .csv("news_category_train.csv")
)

# Preview the first five rows, cutting long descriptions at 50 characters.
news_df.show(5, truncate=50)

+--------+--------------------------------------------------+
|category|                                       description|
+--------+--------------------------------------------------+
|Business| Short sellers, Wall Street's dwindling band of...|
|Business| Private investment firm Carlyle Group, which h...|
|Business| Soaring crude prices plus worries about the ec...|
|Business| Authorities have halted oil export flows from ...|
|Business| Tearaway world oil prices, toppling records an...|
+--------+--------------------------------------------------+
only showing top 5 rows



In [None]:
#writing the target entities to the txt file
# Financial vocabulary: one entity per line in financial_entities.txt.
entities = ['Wall Street', 'USD', 'stock', 'NYSE']
with open("financial_entities.txt", "w") as entity_file:
    entity_file.writelines(entity + "\n" for entity in entities)

# Sport vocabulary: same layout, written to sport_entities.txt.
entities = ['soccer', 'world cup', 'Messi', 'FC Barcelona']
with open("sport_entities.txt", "w") as entity_file:
    entity_file.writelines(entity + "\n" for entity in entities)


In [None]:
# The news text lives in the `description` column, so that is the input here.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Two independent matchers, each loading its own entity list and writing to
# its own output column; both are configured as case-insensitive.
financial_entity_extractor = (
    TextMatcher()
    .setInputCols(["document", "token"])
    .setOutputCol("financial_entities")
    .setCaseSensitive(False)
    .setEntities("financial_entities.txt")
    .setEntityValue("financial_entity")
)

sport_entity_extractor = (
    TextMatcher()
    .setInputCols(["document", "token"])
    .setOutputCol("sport_entities")
    .setCaseSensitive(False)
    .setEntities("sport_entities.txt")
    .setEntityValue("sport_entity")
)

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        financial_entity_extractor,
        sport_entity_extractor,
    ]
)

# Fit on a single blank `description` row to obtain a PipelineModel.
empty_df = spark.createDataFrame([[" "]]).toDF("description")
pipeline_model = nlpPipeline.fit(empty_df)

In [None]:
# Run the fitted entity-matching pipeline over the news DataFrame.
result= pipeline_model.transform(news_df)


In [None]:
# Preview the first five annotated rows, cutting each cell at 40 characters.
result.show(5, truncate=40)

+--------+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+--------------+
|category|                             description|                                document|                                   token|                      financial_entities|sport_entities|
+--------+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+--------------+
|Business| Short sellers, Wall Street's dwindli...|[[document, 0, 84,  Short sellers, Wa...|[[token, 1, 5, Short, [sentence -> 0]...|                                      []|            []|
|Business| Private investment firm Carlyle Grou...|[[document, 0, 204,  Private investme...|[[token, 1, 7, Private, [sentence -> ...|                                      []|            []|
|Business| Soaring crude prices plus worries ab...

In [None]:
# Fetch the matched strings of both matchers for the first two rows.
result.select("financial_entities.result", "sport_entities.result").take(2)

[Row(result=[], result=[]), Row(result=[], result=[])]

In [None]:
# Rename the two match arrays, then keep only the rows where at least one of
# the matchers produced more than one hit.
(
    result
    .select("description", "financial_entities.result", "sport_entities.result")
    .toDF("description", "financial_matches", "sport_matches")
    .filter(
        (F.size("financial_matches") > 1) | (F.size("sport_matches") > 1)
    )
    .show(truncate=70)
)

+----------------------------------------------------------------------+----------------------------------+-------------------+
|                                                           description|                 financial_matches|      sport_matches|
+----------------------------------------------------------------------+----------------------------------+-------------------+
|"Company launched the biggest electronic auction of stock in Wall S...|              [stock, Wall Street]|                 []|
|Google, Inc. significantly cut the expected share price for its ini...|                    [stock, stock]|                 []|
|Google, Inc. significantly cut the expected share price this mornin...|                    [stock, stock]|                 []|
| Shares of Air Canada  (AC.TO) fell by more than half on Wednesday,...|                    [Stock, stock]|                 []|
|Stock prices are lower in moderate trading. The Dow Jones Industria...|                    [Stock, Stoc

In [None]:
# Zip each financial match with its begin/end offsets, explode to one row per
# match, and expose the zipped fields as named pandas columns.
result_df = (
    result
    .select(
        F.explode(
            F.arrays_zip(
                "financial_entities.result",
                "financial_entities.begin",
                "financial_entities.end",
            )
        ).alias("col")
    )
    .select(
        F.expr("col['0']").alias("financial_chunk"),
        F.expr("col['1']").alias("begin"),
        F.expr("col['2']").alias("end"),
    )
    .toPandas()
)
result_df.head()

Unnamed: 0,financial_chunk,begin,end
0,stock,112,116
1,stock,114,118
2,stock,45,49
3,stock,126,130
4,stock,188,192


In [None]:
# Download the PubMed sample CSV into the current working directory.
! wget -q	https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/pubmed/pubmed-sample.csv


In [None]:
# Load the PubMed sample, keep only the rows that have an abstract (`AB`),
# expose that abstract as `text` and drop the title column (`TI`).
# The file is read from the working directory, where wget saved it, rather
# than from a Colab-only absolute "/content/..." path.
pubMedDf = (
    spark.read
    .option("header", True)
    .csv("pubmed-sample.csv")
    .filter("AB IS NOT null")
    .withColumnRenamed("AB", "text")
    .drop("TI")
)

pubMedDf.show(truncate=50)

+--------------------------------------------------+
|                                              text|
+--------------------------------------------------+
|The human KCNJ9 (Kir 3.3, GIRK3) is a member of...|
|BACKGROUND: At present, it is one of the most i...|
|OBJECTIVE: To investigate the relationship betw...|
|Combined EEG/fMRI recording has been used to lo...|
|Kohlschutter syndrome is a rare neurodegenerati...|
|Statistical analysis of neuroimages is commonly...|
|The synthetic DOX-LNA conjugate was characteriz...|
|Our objective was to compare three different me...|
|We conducted a phase II study to assess the eff...|
|"Monomeric sarcosine oxidase (MSOX) is a flavoe...|
|We presented the tachinid fly Exorista japonica...|
|The literature dealing with the water conductin...|
|A novel approach to synthesize chitosan-O-isopr...|
|An HPLC-ESI-MS-MS method has been developed for...|
|The localizing and lateralizing values of eye a...|
|OBJECTIVE: To evaluate the effectiveness and 

In [None]:
# Each line is "<regex>, <label>"; the comma is the delimiter given to the
# RegexMatcher when the file is loaded.
# The literal must be a raw string: in a normal string Python interprets "\b"
# as a backspace character (0x08), which silently removes the word boundary
# from the "ending with 'ly'" rule. The other escapes (\s, \w, \S, \d) are
# unaffected but would otherwise raise invalid-escape warnings.
rules = r'''
renal\s\w+, started with 'renal'
cardiac\s\w+, started with 'cardiac'
\w*ly\b, ending with 'ly'
\S*\d+\S*, match any word that contains numbers
(\d+).?(\d*)\s*(mg|ml|g), match medication metrics
'''

# Persist the rules to the working directory so the matcher can load them.
with open("regex_rules", "w") as f:
  f.write(rules)
  

In [None]:
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Load the rules from the same relative path they were written to
# ("regex_rules" in the working directory) instead of a Colab-only
# "/content/..." path; each rule line is split on "," into regex and label.
regex_matcher = (
    RegexMatcher()
    .setInputCols(["document"])
    .setStrategy("MATCH_ALL")
    .setOutputCol("matched_regex")
    .setExternalRules("regex_rules", delimiter=",")
)

nlpPipeline = Pipeline(stages=[documentAssembler, regex_matcher])

# Fit on a single blank-text row to obtain a PipelineModel, then annotate the
# PubMed abstracts.
empty_df = spark.createDataFrame([[" "]]).toDF("text")
pipeline_model = nlpPipeline.fit(empty_df)
result = pipeline_model.transform(pubMedDf)


In [None]:
# Preview the text, document and matched_regex columns.
result.show()

+--------------------+--------------------+--------------------+
|                text|            document|       matched_regex|
+--------------------+--------------------+--------------------+
|The human KCNJ9 (...|[[document, 0, 95...|[[chunk, 72, 79, ...|
|BACKGROUND: At pr...|[[document, 0, 14...|[[chunk, 143, 152...|
|OBJECTIVE: To inv...|[[document, 0, 15...|[[chunk, 805, 817...|
|Combined EEG/fMRI...|[[document, 0, 16...|[[chunk, 335, 342...|
|Kohlschutter synd...|[[document, 0, 25...|[[chunk, 220, 225...|
|Statistical analy...|[[document, 0, 10...|[[chunk, 12, 16, ...|
|The synthetic DOX...|[[document, 0, 57...|[[chunk, 150, 157...|
|Our objective was...|[[document, 0, 24...|[[chunk, 397, 401...|
|We conducted a ph...|[[document, 0, 14...|[[chunk, 855, 859...|
|"Monomeric sarcos...|[[document, 0, 14...|[[chunk, 58, 63, ...|
|We presented the ...|[[document, 0, 12...|[[chunk, 26, 28, ...|
|The literature de...|[[document, 0, 16...|[[chunk, 427, 435...|
|A novel approach ...|[[d

In [None]:
# List the abstracts that produced more than one regex match, next to the matched strings.
# NOTE(review): the saved output of this cell is a NameError; the cell relies on
# `result` and `F` from earlier cells -- confirm it passes on a fresh top-to-bottom run.
result.select("text", "matched_regex.result")\
    .toDF("text", "regex").filter(F.size("regex")>1)\
    .show(truncate=80)


NameError: ignored

In [None]:
# Show the default parameters of a freshly constructed MultiDateMatcher.
MultiDateMatcher().extractParamMap()

{Param(parent='MultiDateMatcher_54f0349cc579', name='dateFormat', doc='desired format for dates extracted'): 'yyyy/MM/dd',
 Param(parent='MultiDateMatcher_54f0349cc579', name='defaultDayWhenMissing', doc='which day to set when it is missing from parsed input'): 1,
 Param(parent='MultiDateMatcher_54f0349cc579', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='MultiDateMatcher_54f0349cc579', name='readMonthFirst', doc='Whether to parse july 07/05/2015 or as 05/07/2015'): True}

In [None]:
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Date pattern letters are case-sensitive: "MM" is month-of-year, whereas
# lowercase "mm" is minute-of-hour and would place the current minute in the
# month position of every extracted date.
date_matcher = (
    MultiDateMatcher()
    .setInputCols(["document"])
    .setOutputCol("dates")
    .setDateFormat("yyyy/MM/dd")
)

date_pipeline = Pipeline(stages=[documentAssembler, date_matcher])

# A sentence containing two relative date expressions ("yesterday", "next week").
date_df = spark.createDataFrame([['I saw him yesterday and he told me that he will visit us next week']]).toDF("text")
pipeline_model = date_pipeline.fit(date_df)
result = pipeline_model.transform(date_df)


In [None]:
# Preview the text, document and dates columns.
result.show()

+--------------------+--------------------+--------------------+
|                text|            document|               dates|
+--------------------+--------------------+--------------------+
|I saw him yesterd...|[[document, 0, 65...|[[date, 57, 65, 2...|
+--------------------+--------------------+--------------------+



In [None]:
# Show only the formatted date strings that were extracted.
result.select("dates.result").show(truncate=False)

+------------------------+
|result                  |
+------------------------+
|[2021/08/04, 2021/08/26]|
+------------------------+



In [None]:
# Put the document text next to the extracted dates, under friendlier column names.
result.select("document.result", "dates.result")\
    .toDF("text", "dates").show(truncate=False)

+--------------------------------------------------------------------+------------------------+
|text                                                                |dates                   |
+--------------------------------------------------------------------+------------------------+
|[I saw him yesterday and he told me that he will visit us next week]|[2021/10/04, 2021/10/26]|
+--------------------------------------------------------------------+------------------------+



In [None]:
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Date pattern letters are case-sensitive: "MM" is month-of-year, whereas
# lowercase "mm" is minute-of-hour and would place the current minute in the
# month position of every extracted date.
date_matcher = (
    MultiDateMatcher()
    .setInputCols(["document"])
    .setOutputCol("dates")
    .setDateFormat("yyyy/MM/dd")
)

date_pipeline = Pipeline(stages=[documentAssembler, date_matcher])

# Same sentence as the earlier date example, this time ending with a period.
date_df = spark.createDataFrame([['I saw him yesterday and he told me that he will visit us next week.']]).toDF("text")
pipeline_model = date_pipeline.fit(date_df)
result = pipeline_model.transform(date_df)


In [None]:
# Show only the formatted date strings that were extracted.
result.select("dates.result").show(truncate=False)

+------------------------+
|result                  |
+------------------------+
|[2021/17/04, 2021/17/26]|
+------------------------+

