# Spark NLP

In [None]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-09-25 09:55:03--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2021-09-25 09:55:03--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-09-25 09:55:04--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

In [None]:
import sparknlp

# Start the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the library versions in use.
print(f"Spark NLP version: {sparknlp.version()}")
print(f"Apache Spark version: {spark.version}")

Spark NLP version: 3.2.3
Apache Spark version: 3.0.3


In [None]:
from pyspark.sql import SparkSession

In [None]:
# Obtain a session handle through the standard PySpark builder API.
# NOTE(review): `sparknlp.start()` ran earlier in this notebook, so `getOrCreate()`
# presumably returns that same session rather than a new one — confirm.
ss = (
    SparkSession.builder
    .appName("enter_draft")
    .getOrCreate()
)

## Pretrained Pipelines

In [None]:
from sparknlp.pretrained import PretrainedPipeline

In [None]:
# Sample text fed to every pipeline below. It contains misspelled words
# ("persn", "intersting", "brthers"), names, and multi-sentence lines.
# The value starts and ends with a newline.
testDoc = (
    "\n"
    "Peter is a very good persn.\n"
    "My life in Russia is very intersting.\n"
    "John and Peter are brthers. However they don't support each other that much.\n"
    "Lucas Dunbercker is no longer happy. He has a good car though.\n"
    "Europe is very culture rich. There are huge churches! and big houses!\n"
)

In [None]:
# Load the English "explain_document_ml" pretrained pipeline.
pipeline = PretrainedPipeline("explain_document_ml", lang="en")

explain_document_ml download started this may take some time.
Approx size to download 9.1 MB
[OK!]


In [None]:
# List the stages of the loaded pipeline's underlying model.
pipeline.model.stages

[document_811d40a38b24,
 SENTENCE_ce56851acebe,
 REGEX_TOKENIZER_78daa3b4692f,
 SPELL_79c88338ef12,
 LEMMATIZER_c62ad8f355f9,
 STEMMER_caf11d1f4d0e,
 POS_dbb704204f6f]

In [None]:
%%time
result= pipeline.annotate(testDoc)

CPU times: user 20.3 ms, sys: 2 ms, total: 22.3 ms
Wall time: 150 ms


In [None]:
# Show which annotation keys the result contains.
result.keys()

dict_keys(['document', 'spell', 'pos', 'lemmas', 'token', 'stems', 'sentence'])

In [None]:
# Show the sentences stored under the "sentence" key.
result["sentence"]

['Peter is a very good persn.',
 'My life in Russia is very intersting.',
 'John and Peter are brthers.',
 "However they don't support each other that much.",
 'Lucas Dunbercker is no longer happy.',
 'He has a good car though.',
 'Europe is very culture rich.',
 'There are huge churches!',
 'and big houses!']

In [None]:
# Pair up the token, lemma and stem lists position by position.
list(
    zip(
        result["token"],
        result["lemmas"],
        result["stems"],
    )
)

In [None]:
import pandas as pd

In [None]:
# Tabulate each token next to its "pos" and "spell" annotations.
df = pd.DataFrame({key: result[key] for key in ("token", "pos", "spell")})
df.head(10)

Unnamed: 0,token,pos,spell
0,Peter,NNP,Peter
1,is,VBZ,is
2,a,DT,a
3,very,RB,very
4,good,JJ,good
5,persn,NN,person
6,.,.,.
7,My,PRP$,My
8,life,NN,life
9,in,IN,in


In [None]:
# Load the English "explain_document_dl" pretrained pipeline.
pipeline_dl = PretrainedPipeline("explain_document_dl", lang="en")

explain_document_dl download started this may take some time.
Approx size to download 169.4 MB
[OK!]


In [None]:
# List the stages of the "explain_document_dl" pipeline's underlying model.
pipeline_dl.model.stages

[document_7939d5bf1083,
 SENTENCE_05265b07c745,
 REGEX_TOKENIZER_c5c312143f63,
 SPELL_e4ea67180337,
 LEMMATIZER_c62ad8f355f9,
 STEMMER_ba49f7631065,
 POS_d01c734956fe,
 WORD_EMBEDDINGS_MODEL_48cffc8b9a76,
 NerDLModel_d4424c9af5f4,
 NER_CONVERTER_a81db9af2d23]

In [None]:
# stages[-2] is the second-to-last stage (the NerDLModel entry in the stage listing);
# show the classes it reports.
pipeline_dl.model.stages[-2].getClasses()

['O', 'B-ORG', 'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC', 'I-LOC', 'I-MISC']

In [None]:
%%time
result= pipeline_dl.annotate(testDoc)

CPU times: user 30.8 ms, sys: 7.33 ms, total: 38.2 ms
Wall time: 929 ms


In [None]:
# Display the pipeline's stage list again, this time after annotating.
pipeline_dl.model.stages

[document_7939d5bf1083,
 SENTENCE_05265b07c745,
 REGEX_TOKENIZER_c5c312143f63,
 SPELL_e4ea67180337,
 LEMMATIZER_c62ad8f355f9,
 STEMMER_ba49f7631065,
 POS_d01c734956fe,
 WORD_EMBEDDINGS_MODEL_48cffc8b9a76,
 NerDLModel_d4424c9af5f4,
 NER_CONVERTER_a81db9af2d23]

In [None]:
# Show which annotation keys the result contains.
result.keys()

dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])

In [None]:
# Show the values stored under the "entities" key.
result["entities"]

['Peter', 'Russia', 'John', 'Peter', 'Lucas Dunbercker', 'Europe']

In [None]:
# Build a per-token table; each pair maps an output column name to the
# annotation key in `result` that supplies its values.
df_dl = pd.DataFrame(
    {
        column: result[key]
        for column, key in [
            ("token", "token"),
            ("ner_label", "ner"),
            ("spell_corrected", "checked"),
            ("pos", "pos"),
            ("lemmas", "lemma"),
            ("stem", "stem"),
        ]
    }
)
df_dl.head(10)

Unnamed: 0,token,ner_label,spell_corrected,pos,lemmas,stem
0,Peter,B-PER,Peter,NNP,Peter,peter
1,is,O,is,VBZ,be,i
2,a,O,a,DT,a,a
3,very,O,very,RB,very,veri
4,good,O,good,JJ,good,good
5,persn,O,person,NN,person,person
6,.,O,.,.,.,.
7,My,O,My,PRP$,My,my
8,life,O,life,NN,life,life
9,in,O,in,IN,in,in


In [None]:
# Load the English "clean_stop" pretrained pipeline.
# Note that this rebinds `pipeline`, which previously held "explain_document_ml".
pipeline = PretrainedPipeline("clean_stop", lang="en")

clean_stop download started this may take some time.
Approx size to download 22.8 KB
[OK!]


In [None]:
# List the stages of the loaded pipeline's underlying model.
pipeline.model.stages

[document_90b4be8a6e0b,
 SENTENCE_8ba1e4f73af0,
 REGEX_TOKENIZER_fb4f98b445ce,
 STOPWORDS_CLEANER_b5d381c851f5]

In [None]:
%%time
result= pipeline.annotate(testDoc)


CPU times: user 12.8 ms, sys: 207 µs, total: 13 ms
Wall time: 62.5 ms


In [None]:
# Show which annotation keys the result contains.
result.keys()

dict_keys(['document', 'sentence', 'token', 'cleanTokens'])

In [None]:
# Join the values under "cleanTokens" back into one space-separated string.
" ".join(result["cleanTokens"])

"Peter good persn . life Russia intersting . John Peter brthers . don't support . Lucas Dunbercker longer happy . good car . Europe culture rich . huge churches ! big houses !"

In [None]:
# Stop the session obtained through SparkSession.builder.
ss.stop()

In [None]:
# Stop the session returned by sparknlp.start().
# NOTE(review): `ss` was obtained via getOrCreate() after this session already existed, so both
# names presumably refer to the same session — confirm whether both stop calls are needed.
spark.stop()