In [None]:
! pip install -q pyspark==3.1.2 spark-nlp

[K     |████████████████████████████████| 212.4 MB 68 kB/s 
[K     |████████████████████████████████| 130 kB 56.3 MB/s 
[K     |████████████████████████████████| 198 kB 60.1 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:


import sparknlp

# Start a Spark session through Spark NLP's helper.
spark = sparknlp.start()
# start() takes optional flags; the defaults are gpu=False and spark23=False
# (spark23=True asks for the Spark 2.3 build).


# Record the library versions used for this run.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)



Spark NLP version 3.3.2
Apache Spark version: 3.1.2


In [None]:
# List the Spark NLP jar(s) in the local Ivy cache, most recently modified first.
! cd ~/.ivy2/cache/com.johnsnowlabs.nlp/spark-nlp_2.12/jars && ls -lt

total 38928
-rw-r--r-- 1 root root 39858655 Nov  3 13:50 spark-nlp_2.12-3.3.2.jar


In [None]:
# https://github.com/JohnSnowLabs/spark-nlp/blob/master/python/sparknlp/__init__.py

from pyspark.sql import SparkSession

def start(gpu=False, spark23=False):
    """Return a local SparkSession configured with a Spark NLP Maven package.

    The package coordinate is chosen from the two flags and is always built
    with the version string pinned inside this function ("2.5.4").

    NOTE(review): that pinned version differs from the 3.3.2 reported by
    ``sparknlp.version()`` earlier in this notebook -- presumably this is an
    illustrative copy of the upstream helper; confirm before calling it.

    Args:
        gpu: When truthy, select a GPU artifact.
        spark23: When truthy, select a Spark 2.3 artifact.

    Returns:
        Whatever ``SparkSession.builder...getOrCreate()`` returns.
    """
    current_version = "2.5.4"

    # Coordinate templates keyed by (gpu, spark23).
    coordinate_templates = {
        (False, False): "com.johnsnowlabs.nlp:spark-nlp_2.11:{}",
        (True, False): "com.johnsnowlabs.nlp:spark-nlp-gpu_2.11:{}",
        (False, True): "com.johnsnowlabs.nlp:spark-nlp-spark23_2.11:{}",
        (True, True): "com.johnsnowlabs.nlp:spark-nlp-gpu-spark23_2.11:{}",
    }
    selected_package = coordinate_templates[(bool(gpu), bool(spark23))].format(current_version)

    # Applied in this order; the package coordinate goes last.
    session_options = [
        ("spark.driver.memory", "16G"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.kryoserializer.buffer.max", "1000M"),
        ("spark.driver.maxResultSize", "0"),
        ("spark.jars.packages", selected_package),
    ]

    builder = SparkSession.builder.appName("Spark NLP").master("local[*]")
    for option_key, option_value in session_options:
        builder = builder.config(option_key, option_value)

    return builder.getOrCreate()

In [None]:
from sparknlp.pretrained import PretrainedPipeline

In [None]:
# Sample text fed to the pretrained pipelines below.
# NOTE(review): the misspellings ("persn", "intersting", "brthers") are presumably
# deliberate so the spell-checker stage has something to correct -- do not "fix" them.
testDoc = '''
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
'''

# Explain Document ML

### Stages
- DocumentAssembler
- SentenceDetector
- Tokenizer
- Lemmatizer
- Stemmer
- Part of Speech
- SpellChecker (Norvig)

In [None]:
# Fetch the pretrained English 'explain_document_ml' pipeline by name.
pipeline = PretrainedPipeline('explain_document_ml', lang='en')


explain_document_ml download started this may take some time.
Approx size to download 9.1 MB
[OK!]


In [None]:
# Inspect the stages of the pipeline's underlying model, in execution order.
pipeline.model.stages


[document_811d40a38b24,
 SENTENCE_ce56851acebe,
 REGEX_TOKENIZER_78daa3b4692f,
 SPELL_79c88338ef12,
 LEMMATIZER_c62ad8f355f9,
 STEMMER_caf11d1f4d0e,
 POS_dbb704204f6f]

In [None]:
# Load the pretrained pipeline from local disk instead of downloading it again.
# The cached folder name embeds a model version and a timestamp, so it is looked
# up with a glob under ~/cache_pretrained rather than hard-coded to one exact
# name that only exists on the machine/run that produced it.
import glob
import os

cached_pipeline_dirs = sorted(
    path
    for path in glob.glob(os.path.expanduser('~/cache_pretrained/explain_document_ml_en_*'))
    if os.path.isdir(path)
)
if not cached_pipeline_dirs:
    raise FileNotFoundError(
        "No cached 'explain_document_ml' pipeline found under ~/cache_pretrained; "
        "run PretrainedPipeline('explain_document_ml', lang='en') first to download it."
    )

# If several versions are cached, take the last one in sorted order.
pipeline_local = PretrainedPipeline.from_disk(cached_pipeline_dirs[-1])

In [None]:


%%time

# Annotate the raw string with the pipeline loaded from disk; %%time reports the cost of this call.
result = pipeline_local.annotate(testDoc)



CPU times: user 32.3 ms, sys: 8.68 ms, total: 41 ms
Wall time: 1.85 s


In [None]:
# annotate() returned a dict-like result; list which annotation outputs it holds.
result.keys()

dict_keys(['document', 'spell', 'pos', 'lemmas', 'token', 'stems', 'sentence'])

In [None]:
# Sentences found in testDoc. Note that "and big houses!" comes back as its own
# sentence because the preceding "!" ends one.
result['sentence']

['Peter is a very good persn.',
 'My life in Russia is very intersting.',
 'John and Peter are brthers.',
 "However they don't support each other that much.",
 'Lucas Dunbercker is no longer happy.',
 'He has a good car though.',
 'Europe is very culture rich.',
 'There are huge churches!',
 'and big houses!']

In [None]:
# Flat list of tokens across all sentences; punctuation marks are separate tokens.
result['token']

['Peter',
 'is',
 'a',
 'very',
 'good',
 'persn',
 '.',
 'My',
 'life',
 'in',
 'Russia',
 'is',
 'very',
 'intersting',
 '.',
 'John',
 'and',
 'Peter',
 'are',
 'brthers',
 '.',
 'However',
 'they',
 "don't",
 'support',
 'each',
 'other',
 'that',
 'much',
 '.',
 'Lucas',
 'Dunbercker',
 'is',
 'no',
 'longer',
 'happy',
 '.',
 'He',
 'has',
 'a',
 'good',
 'car',
 'though',
 '.',
 'Europe',
 'is',
 'very',
 'culture',
 'rich',
 '.',
 'There',
 'are',
 'huge',
 'churches',
 '!',
 'and',
 'big',
 'houses',
 '!']

In [None]:
# Pair every token with the part-of-speech tag at the same position.
[(token, pos_tag) for token, pos_tag in zip(result['token'], result['pos'])]

[('Peter', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('persn', 'NN'),
 ('.', '.'),
 ('My', 'PRP$'),
 ('life', 'NN'),
 ('in', 'IN'),
 ('Russia', 'NNP'),
 ('is', 'VBZ'),
 ('very', 'RB'),
 ('intersting', 'JJ'),
 ('.', '.'),
 ('John', 'NNP'),
 ('and', 'CC'),
 ('Peter', 'NNP'),
 ('are', 'VBP'),
 ('brthers', 'NNS'),
 ('.', '.'),
 ('However', 'RB'),
 ('they', 'PRP'),
 ("don't", 'VBP'),
 ('support', 'VB'),
 ('each', 'DT'),
 ('other', 'JJ'),
 ('that', 'IN'),
 ('much', 'JJ'),
 ('.', '.'),
 ('Lucas', 'NNP'),
 ('Dunbercker', 'NNP'),
 ('is', 'VBZ'),
 ('no', 'DT'),
 ('longer', 'RB'),
 ('happy', 'JJ'),
 ('.', '.'),
 ('He', 'PRP'),
 ('has', 'VBZ'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('car', 'NN'),
 ('though', 'IN'),
 ('.', '.'),
 ('Europe', 'NNP'),
 ('is', 'VBZ'),
 ('very', 'RB'),
 ('culture', 'RB'),
 ('rich', 'JJ'),
 ('.', '.'),
 ('There', 'EX'),
 ('are', 'VBP'),
 ('huge', 'JJ'),
 ('churches', 'NNS'),
 ('!', '.'),
 ('and', 'CC'),
 ('big', 'JJ'),
 ('houses', 'NNS'),
 ('!', 

In [None]:
import pandas as pd

# Allow up to 100 rows to be rendered before pandas elides the middle of the table.
pd.set_option("display.max_rows", 100)

# Side-by-side view of the explain_document_ml outputs: each DataFrame column
# (left name) is filled from one key of the annotate() result (right name).
df = pd.DataFrame({
    column_name: result[result_key]
    for column_name, result_key in [
        ('token', 'token'),
        ('corrected', 'spell'),
        ('POS', 'pos'),
        ('lemmas', 'lemmas'),
        ('stems', 'stems'),
    ]
})
df

Unnamed: 0,token,corrected,POS,lemmas,stems
0,Peter,Peter,NNP,Peter,peter
1,is,is,VBZ,be,i
2,a,a,DT,a,a
3,very,very,RB,very,veri
4,good,good,JJ,good,good
5,persn,person,NN,person,person
6,.,.,.,.,.
7,My,My,PRP$,My,my
8,life,life,NN,life,life
9,in,in,IN,in,in


# Explain Document DL

In [None]:
# Fetch the pretrained English 'explain_document_dl' pipeline by name.
pipeline_dl = PretrainedPipeline('explain_document_dl', lang='en')

explain_document_dl download started this may take some time.
Approx size to download 169.4 MB
[OK!]


In [None]:
# Inspect the stages of the DL pipeline's underlying model, in execution order.
pipeline_dl.model.stages

[document_7939d5bf1083,
 SENTENCE_05265b07c745,
 REGEX_TOKENIZER_c5c312143f63,
 SPELL_e4ea67180337,
 LEMMATIZER_c62ad8f355f9,
 STEMMER_ba49f7631065,
 POS_d01c734956fe,
 WORD_EMBEDDINGS_MODEL_48cffc8b9a76,
 NerDLModel_d4424c9af5f4,
 NER_CONVERTER_a81db9af2d23]

In [None]:
# stages[-2] is the second-to-last stage (the NerDLModel in the listing above);
# ask it for its storage reference.
# NOTE(review): presumably this names the embeddings the NER model expects -- confirm.
pipeline_dl.model.stages[-2].getStorageRef()

'glove_100d'

In [None]:
# Labels the same second-to-last stage (the NER model) can emit.
pipeline_dl.model.stages[-2].getClasses()

['O', 'B-ORG', 'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC', 'I-LOC', 'I-MISC']

In [None]:
%%time

result = pipeline_dl.annotate(testDoc)

result.keys()



CPU times: user 40.4 ms, sys: 9.91 ms, total: 50.4 ms
Wall time: 1.39 s


In [None]:
# Outputs of the DL pipeline. The key names differ from explain_document_ml:
# 'checked' / 'lemma' / 'stem' here versus 'spell' / 'lemmas' / 'stems' there.
result.keys()

dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])

In [None]:
# Recognised entity chunks; a multi-token name such as 'Lucas Dunbercker' is one entry.
result['entities']

['Peter', 'Russia', 'John', 'Peter', 'Lucas Dunbercker', 'Europe']

In [None]:
# Side-by-side view of the explain_document_dl outputs: each DataFrame column
# (left name) is filled from one key of the annotate() result (right name).
df = pd.DataFrame({
    column_name: result[result_key]
    for column_name, result_key in [
        ('token', 'token'),
        ('ner_label', 'ner'),
        ('spell_corrected', 'checked'),
        ('POS', 'pos'),
        ('lemmas', 'lemma'),
        ('stems', 'stem'),
    ]
})

df

Unnamed: 0,token,ner_label,spell_corrected,POS,lemmas,stems
0,Peter,B-PER,Peter,NNP,Peter,peter
1,is,O,is,VBZ,be,i
2,a,O,a,DT,a,a
3,very,O,very,RB,very,veri
4,good,O,good,JJ,good,good
5,persn,O,person,NN,person,person
6,.,O,.,.,.,.
7,My,O,My,PRP$,My,my
8,life,O,life,NN,life,life
9,in,O,in,IN,in,in


# Recognize Entities

In [None]:
# Fetch the pretrained English 'recognize_entities_dl' pipeline by name.
recognize_entities = PretrainedPipeline('recognize_entities_dl', lang='en')

recognize_entities_dl download started this may take some time.
Approx size to download 160.1 MB
[OK!]


In [None]:
# Inspect the stages of the NER-only pipeline's underlying model, in execution order.
recognize_entities.model.stages

[document_1c58bc1aca5d,
 SENTENCE_328d8a47c1a8,
 REGEX_TOKENIZER_b6c4cbc5a4ea,
 WORD_EMBEDDINGS_MODEL_48cffc8b9a76,
 NerDLModel_d4424c9af5f4,
 NER_CONVERTER_389b80afbf7d]

In [None]:
# Index 3 is the word-embeddings stage in this pipeline's stage list; ask it for
# its storage reference.
recognize_entities.model.stages[3].getStorageRef()

'glove_100d'

In [None]:
# Index 4 is the NER model stage in this pipeline's stage list; list the labels it can emit.
recognize_entities.model.stages[4].getClasses()

['O', 'B-ORG', 'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC', 'I-LOC', 'I-MISC']

In [None]:
# Same sample text as used for the earlier pipelines, repeated here so this
# section can be read on its own.
testDoc = '''
Peter is a very good persn.
My life in Russia is very intersting.
John and Peter are brthers. However they don't support each other that much.
Lucas Dunbercker is no longer happy. He has a good car though.
Europe is very culture rich. There are huge churches! and big houses!
'''

# Run the NER-only pipeline on the text.
result = recognize_entities.annotate(testDoc)

# Pair every token with the NER label at the same position.
[(token, ner_label) for token, ner_label in zip(result['token'], result['ner'])]

[('Peter', 'B-PER'),
 ('is', 'O'),
 ('a', 'O'),
 ('very', 'O'),
 ('good', 'O'),
 ('persn', 'O'),
 ('.', 'O'),
 ('My', 'O'),
 ('life', 'O'),
 ('in', 'O'),
 ('Russia', 'B-LOC'),
 ('is', 'O'),
 ('very', 'O'),
 ('intersting', 'O'),
 ('.', 'O'),
 ('John', 'B-PER'),
 ('and', 'O'),
 ('Peter', 'B-PER'),
 ('are', 'O'),
 ('brthers', 'O'),
 ('.', 'O'),
 ('However', 'O'),
 ('they', 'O'),
 ("don't", 'O'),
 ('support', 'O'),
 ('each', 'O'),
 ('other', 'O'),
 ('that', 'O'),
 ('much', 'O'),
 ('.', 'O'),
 ('Lucas', 'B-PER'),
 ('Dunbercker', 'I-PER'),
 ('is', 'O'),
 ('no', 'O'),
 ('longer', 'O'),
 ('happy', 'O'),
 ('.', 'O'),
 ('He', 'O'),
 ('has', 'O'),
 ('a', 'O'),
 ('good', 'O'),
 ('car', 'O'),
 ('though', 'O'),
 ('.', 'O'),
 ('Europe', 'B-LOC'),
 ('is', 'O'),
 ('very', 'O'),
 ('culture', 'O'),
 ('rich', 'O'),
 ('.', 'O'),
 ('There', 'O'),
 ('are', 'O'),
 ('huge', 'O'),
 ('churches', 'O'),
 ('!', 'O'),
 ('and', 'O'),
 ('big', 'O'),
 ('houses', 'O'),
 ('!', 'O')]