# Spark NLP: aprenda a otimizar a linguagem natural com eficácia

[Spark NLP: aprenda a otimizar a linguagem natural com eficácia](https://www.alura.com.br/artigos/spark-nlp-linguagem-natural-forma-otimizada)

## Preparando o ambiente

In [5]:
# Start Spark through Spark NLP's helper. The returned object (`spark`) is what
# later cells use to create the lyrics DataFrame, and it is stopped in the last cell.
import sparknlp
spark = sparknlp.start()

In [6]:
# Sample corpus: one (artist, lyrics excerpt) tuple per song.
# Adjacent string literals are joined at compile time, so each excerpt is a
# single string with the pieces separated by the trailing spaces.
# NOTE(review): "soul" and "gound" look like typos of "souls"/"ground"; they are
# kept verbatim because the recorded cell outputs contain the same spellings.
data = [
    (
        "The Beatles",
        "There are places I'll remember "
        "All my life though some have changed "
        "Some forever, not for better "
        "Some have gone and some remain",
    ),
    (
        "Oasis",
        "So I start a revolution from my bed "
        "Cause you said the brains I had went to my head "
        "Step outside, summertime's in bloom "
        "Stand up beside the fireplace",
    ),
    (
        "Pink Floyd",
        "How I wish you were here "
        "We're just two lost soul "
        "Swimming in a fish bowl year after year "
        "Running over the same old gound",
    ),
]
# Load the tuples into a Spark DataFrame whose columns are named
# "artista" (artist) and "letra" (lyrics).
df_musica = spark.createDataFrame(
    data,
    schema=["artista", "letra"],
)

In [7]:
from sparknlp.annotator import LemmatizerModel, Stemmer, Tokenizer, StopWordsCleaner
from sparknlp.base import DocumentAssembler

## DocumentAssembler

In [8]:
# Configure the assembler to read the raw "letra" column and write its
# annotations to a new "document" column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("letra")
    .setOutputCol("document")
)

# The assembler is applied with transform() directly; no fit() call is made.
doc_df = document_assembler.transform(df_musica)

# Print only the text held in each document annotation, without truncation.
doc_df.select("document.result").show(truncate=False)

                                                                                

+-------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
|[There are places I'll remember All my life though some have changed Some forever, not for better Some have gone and some remain]                      |
|[So I start a revolution from my bed Cause you said the brains I had went to my head Step outside, summertime's in bloom Stand up beside the fireplace]|
|[How I wish you were here We're just two lost soul Swimming in a fish bowl year after year Running over the same old gound]                            |
+-------------------------------------------------------------------------------------------------------------------------------------------------------+

## Tokenizer

In [12]:
# Configure the tokenizer to read the "document" annotations and write the
# resulting tokens to a "token" column.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Unlike the assembler, the tokenizer is fitted first and the fitted model is
# then applied to the same DataFrame.
token_df = tokenizer.fit(doc_df).transform(doc_df)

# Print the token strings for each song, without truncation.
token_df.select("token.result").show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                               |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[There, are, places, I'll, remember, All, my, life, though, some, have, changed, Some, forever, ,, not, for, better, Some, have, gone, and, some, remain]                            |
|[So, I, start, a, revolution, from, my, bed, Cause, you, said, the, brains, I, had, went, to, my, head, Step, outside, ,, summertime's, in, bloom, Stand, up, beside, the, fireplace]|
|[How, I, wish, you, were, here, We're, just, two, lost, soul, Swimming, in, a, 

## Removendo stop words

In [13]:
# Configure the stop-word filter to read the "token" column and write the
# surviving tokens to "clean_tokens". The input columns are passed as a list,
# matching how every other annotator in this notebook calls setInputCols.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["token"])
    .setOutputCol("clean_tokens")
)

# The cleaner is applied with transform() directly; no fit() call is made.
token_df_clean = stopwords_cleaner.transform(token_df)

# Print the remaining tokens for each song, without truncation.
token_df_clean.select("clean_tokens.result").show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                    |
+--------------------------------------------------------------------------------------------------------------------------+
|[places, remember, life, though, changed, forever, ,, better, gone, remain]                                               |
|[start, revolution, bed, Cause, said, brains, went, head, Step, outside, ,, summertime's, bloom, Stand, beside, fireplace]|
|[wish, two, lost, soul, Swimming, fish, bowl, year, year, Running, old, gound]                                            |
+--------------------------------------------------------------------------------------------------------------------------+



In [11]:
# Inspect the stop-word list the cleaner is using; as the last expression of
# the cell, the returned list is echoed by the notebook.
stopwords_cleaner.getStopWords()

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 ...]

## Stemmer

In [14]:
# Configure the stemmer to read the stop-word-filtered tokens and write the
# stems to a "stem" column.
stemmer = (
    Stemmer()
    .setInputCols(["clean_tokens"])
    .setOutputCol("stem")
)

# The stemmer is applied with transform() directly; no fit() call is made.
stem = stemmer.transform(token_df_clean)

# Print the stems for each song, without truncation.
stem.select("stem.result").show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------+
|result                                                                                                           |
+-----------------------------------------------------------------------------------------------------------------+
|[place, rememb, life, though, chang, forev, ,, better, gone, remain]                                             |
|[start, revolut, bed, caus, said, brain, went, head, step, outsid, ,, summertime', bloom, stand, besid, fireplac]|
|[wish, two, lost, soul, swim, fish, bowl, year, year, run, old, gound]                                           |
+-----------------------------------------------------------------------------------------------------------------+



## Lemmatizer

In [15]:
# Load the default pretrained lemmatizer (the recorded output shows it
# resolving to "lemma_antbnc") and point it at the stop-word-filtered tokens.
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(["clean_tokens"])
    .setOutputCol("lemma")
)

# The lemmatizer is run on `stem`, which still carries the "clean_tokens"
# column it reads; lemmas are derived from the cleaned tokens, not the stems.
result = lemmatizer.transform(stem)

# Print the lemmas for each song, without truncation.
result.select("lemma.result").show(truncate=False)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ — ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ \ ]Download done! Loading the resource.
[OK!]
+----------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                |
+----------------------------------------------------------------------------------------------------------------------+
|[place, remember, life, though, change, forever, ,, well, go, remain]                                                 |
|[start, revolution, bed, Cause, say, brain, go, head, Step, outside, ,, summertime's, bloom, Stand, beside, fireplace]|
|[wish, two, lose, soul, Swimming, fish, bowl, year, year, Running, old, gound]                                        |
+----------------------------------------------------------------------------------------------------------------------+

In [16]:
# Shut down the Spark object started in the first cell now that the
# walkthrough is finished.
spark.stop()