In [2]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-04-04 20:13:55--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.26
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.26|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-04-04 20:13:55--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1593 (1.6K) [text/plain]
Saving to: ‘STDOUT’


2021-04-04 20:13:55 (21.7 MB/s) - written to stdout [1593/1593]

setup Colab for PySpark 3.1.1 and Spark NLP 3.0.1
[K     |████████████████████████████████|

In [6]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *


In [7]:
# Create the Spark session through Spark NLP's start() helper; every later
# cell uses this `spark` handle.
spark = sparknlp.start()

In [8]:
# One-row demo DataFrame with a single 'text' column. The misspellings
# ("godo", "Germny") give the spell-checking stage something to correct.
data = spark.createDataFrame(
    [
        ['Peter is a godo person living in Germny. Paula is also a good person. She lives in London.'],
    ]
).toDF('text')

In [9]:
# Print the input row with the full text visible (no column truncation).
data.show(truncate=False)

+------------------------------------------------------------------------------------------+
|text                                                                                      |
+------------------------------------------------------------------------------------------+
|Peter is a godo person living in Germny. Paula is also a good person. She lives in London.|
+------------------------------------------------------------------------------------------+



In [10]:
# First stage: reads the raw 'text' column and writes a 'document' annotation
# column, using the 'shrink' cleanup mode.
document = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
    .setCleanupMode('shrink')
)

In [12]:
# Sentence splitter: consumes 'document', produces 'sentence'.
sentence = (
    SentenceDetector()
    .setInputCols('document')
    .setOutputCol('sentence')
)

In [13]:
# Explode sentences so the transformed DataFrame has one row per detected
# sentence (three rows for the sample text) rather than one row per document.
# The setter's return value is the cell's displayed output.
sentence.setExplodeSentences(True)

SentenceDetector_912b0b16f735

In [16]:
# Tokenizer: consumes 'sentence', produces 'token'.
tokenizer = (
    Tokenizer()
    .setInputCols('sentence')
    .setOutputCol('token')
)

In [17]:
# Register 'e-mail' as a tokenizer exception.
# NOTE(review): presumably this keeps 'e-mail' as a single token instead of
# splitting on the hyphen; the sample text does not exercise it, so confirm
# against the Tokenizer documentation.
tokenizer.setExceptions(['e-mail'])

Tokenizer_7111ed85f6ac

In [18]:
# Spell checker: loads the default pretrained Norvig-Sweeting model and writes
# corrected tokens to 'checked' (same token count as 'token').
checker = (
    NorvigSweetingModel.pretrained()
    .setInputCols(['token'])
    .setOutputCol('checked')
)

spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[OK!]


In [22]:
# Word embeddings from the default pretrained model.
# The embeddings are computed over the spell-corrected 'checked' tokens, which
# is the same token column the NER model and the NER converter consume.
# Embedding the raw 'token' column instead would look up the misspelled words
# ("godo", "Germny"), so the spell checker's corrections would never reach the
# features the NER model is scored on.
embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(['sentence', 'checked'])
    .setOutputCol('embeddings')
)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [24]:
# Named-entity tagger: default pretrained NerDL model reading the sentence,
# the spell-checked tokens and the embeddings; writes IOB tags to 'ner'.
ner = (
    NerDLModel.pretrained()
    .setInputCols(['sentence', 'checked', 'embeddings'])
    .setOutputCol('ner')
)

ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


In [25]:
# Groups the per-token tags in 'ner' into entity chunks written to 'chunk'.
converter = (
    NerConverter()
    .setInputCols(['sentence', 'checked', 'ner'])
    .setOutputCol('chunk')
)

In [28]:
from pyspark.ml import Pipeline


# Assemble the stages in dependency order: each stage reads columns produced
# by the stages listed before it.
pipeline = Pipeline().setStages([
    document,
    sentence,
    tokenizer,
    checker,
    embeddings,
    ner,
    converter,
])

In [29]:
# Fit the pipeline on the demo DataFrame to obtain a PipelineModel.
model = pipeline.fit(data)

In [30]:
# Run the fitted pipeline over the same data; `result` carries one annotation
# column per stage (document, sentence, token, checked, embeddings, ner, chunk).
result = model.transform(data)

In [32]:
# Overview of all annotation columns (values truncated by default).
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|             checked|          embeddings|                 ner|               chunk|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Peter is a godo p...|[{document, 0, 89...|[{document, 0, 39...|[{token, 0, 4, Pe...|[{token, 0, 4, Pe...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 0, 4, Pe...|
|Peter is a godo p...|[{document, 0, 89...|[{document, 41, 6...|[{token, 41, 45, ...|[{token, 41, 45, ...|[{word_embeddings...|[{named_entity, 4...|[{chunk, 41, 45, ...|
|Peter is a godo p...|[{document, 0, 89...|[{document, 70, 8...|[{token, 70, 72, ...|[{token, 70, 72, ...|[{word_embeddings...|[{named_entity, 7...|[{

In [33]:
# Text of each detected sentence, one row per sentence.
(
    result
    .select('sentence.result')
    .show(truncate=False)
)

+------------------------------------------+
|result                                    |
+------------------------------------------+
|[Peter is a godo person living in Germny.]|
|[Paula is also a good person.]            |
|[She lives in London.]                    |
+------------------------------------------+



In [34]:
# Tokens after spell checking.
(
    result
    .select('checked.result')
    .show(truncate=False)
)

+---------------------------------------------------+
|result                                             |
+---------------------------------------------------+
|[Peter, is, a, god, person, living, in, Germany, .]|
|[Paula, is, also, a, good, person, .]              |
|[She, lives, in, London, .]                        |
+---------------------------------------------------+



In [35]:
# Per-token entity tags produced by the NER model.
(
    result
    .select('ner.result')
    .show(truncate=False)
)

+-----------------------------------+
|result                             |
+-----------------------------------+
|[B-PER, O, O, O, O, O, O, B-LOC, O]|
|[B-PER, O, O, O, O, O, O]          |
|[O, O, O, B-LOC, O]                |
+-----------------------------------+



In [37]:
# Character offsets (begin / end) of every tagged token.
(
    result
    .select('ner.begin', 'ner.end')
    .show(truncate=False)
)

+---------------------------------+---------------------------------+
|begin                            |end                              |
+---------------------------------+---------------------------------+
|[0, 6, 9, 11, 16, 23, 30, 33, 39]|[4, 7, 9, 14, 21, 28, 31, 38, 39]|
|[41, 47, 50, 55, 57, 62, 68]     |[45, 48, 53, 55, 60, 67, 68]     |
|[70, 74, 80, 83, 89]             |[72, 78, 81, 88, 89]             |
+---------------------------------+---------------------------------+



In [38]:
# Extracted entity chunks together with their character offsets.
(
    result
    .select('chunk.result', 'chunk.begin', 'chunk.end')
    .show(truncate=False)
)

+---------------+-------+-------+
|result         |begin  |end    |
+---------------+-------+-------+
|[Peter, Germny]|[0, 33]|[4, 38]|
|[Paula]        |[41]   |[45]   |
|[London]       |[83]   |[88]   |
+---------------+-------+-------+



In [39]:
# Wrap the fitted model in a LightPipeline so plain Python strings can be
# annotated directly (see the annotate() call in the following cell).
light = LightPipeline(model)

In [40]:
# Annotate a single string; the result is a dict keyed by output column name
# with a list of result strings per stage.
light.annotate('Bruno is living in Italy, and he is doing well')

{'checked': ['Bruno',
  'is',
  'living',
  'in',
  'Italy',
  ',',
  'and',
  'he',
  'is',
  'doing',
  'well'],
 'chunk': ['Bruno', 'Italy'],
 'document': ['Bruno is living in Italy, and he is doing well'],
 'embeddings': ['Bruno',
  'is',
  'living',
  'in',
  'Italy',
  ',',
  'and',
  'he',
  'is',
  'doing',
  'well'],
 'ner': ['B-PER', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O'],
 'sentence': ['Bruno is living in Italy, and he is doing well'],
 'token': ['Bruno',
  'is',
  'living',
  'in',
  'Italy',
  ',',
  'and',
  'he',
  'is',
  'doing',
  'well']}