## PySpark - Natural Language Processing

In [None]:
# Notebook shell escape: install the pyspark package with pip before importing it.
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every cell below, under the app name "NLP".
spark = (
    SparkSession.builder
    .appName("NLP")
    .getOrCreate()
)

## Tokenizer

_**Documentación Tokenizer:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.Tokenizer.html_

_**Documentación RegexTokenizer:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.RegexTokenizer.html_

In [None]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import RegexTokenizer

from pyspark.sql.functions import col
from pyspark.sql.functions import udf

from pyspark.sql.types import IntegerType

In [None]:
# Sample corpus as (id, sentence) rows; ids are assigned by enumerate, starting at 0.
# The last sentence separates its words with commas instead of spaces.
data = spark.createDataFrame(
    data=list(enumerate([
        "Hola mundo me gusta python y pyspark",
        "PySpark no me convence mucho",
        "Logistic,regression,models,decision,trees,gradient,boosting,classifier",
    ])),
    schema=["id", "sentence"],
)

In [None]:
# Plain tokenizer: reads the "sentence" column and writes its tokens to "words".
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Regex-driven tokenizer over the same columns. "\\W" is the regex class for
# non-word characters; with RegexTokenizer's default settings it is presumably
# used as the delimiter between tokens (confirm against the RegexTokenizer docs).
regex_tokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="sentence",
    outputCol="words",
)

# UDF that returns the number of elements in a token array as an integer.
count_token = udf(lambda tokens: len(tokens), IntegerType())

In [None]:
# Apply the plain Tokenizer to the sample sentences.
tokenized = tokenizer.transform(data)

# Show each sentence next to its tokens and the token count, without truncation.
(
    tokenized
    .select("sentence", "words")
    .withColumn("tokens", count_token(col("words")))
    .show(truncate=False)
)

In [None]:
# Apply the RegexTokenizer to the same sentences for comparison.
regex_tokenized = regex_tokenizer.transform(data)

# Show each sentence next to its tokens and the token count, without truncation.
(
    regex_tokenized
    .select("sentence", "words")
    .withColumn("tokens", count_token(col("words")))
    .show(truncate=False)
)

### StopWords
_**Documentación:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.StopWordsRemover.html_

In [None]:
from pyspark.ml.feature import StopWordsRemover

# Stop-word filter: reads the "words" column and writes the result to "filtered".
# NOTE(review): no stopWords list is passed, so the remover falls back to its
# default list; the sample sentences are mostly Spanish, so confirm that default
# is the intended one (StopWordsRemover.loadDefaultStopWords may be relevant).
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# Compare the regex-tokenized words with what remains after filtering.
(
    remover
    .transform(regex_tokenized)
    .select("words", "filtered")
    .show(truncate=False)
)

## n-grams
_**Documentación:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.NGram.html_

In [None]:
from pyspark.ml.feature import NGram

# NGram transformer with n=2: reads "words" and writes to the "n_gram" column.
n_gram = NGram(n=2, inputCol="words", outputCol="n_gram")

# Build the n-grams from the regex-tokenized sentences.
n_gram_data = n_gram.transform(regex_tokenized)

# Display only the generated n-gram column, without truncation.
(
    n_gram_data
    .select("n_gram")
    .show(truncate=False)
)

## TF-IDF
_**Documentación HashingTF:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.HashingTF.html_

_**Documentación IDF:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.IDF.html_

In [None]:
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF

# Rebind `tokenizer` to a fresh Tokenizer over "sentence" -> "words" and
# tokenize the sample sentences as input for the TF-IDF steps.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
words_data = tokenizer.transform(data)

# Display the tokenized rows without truncating long cells.
words_data.show(truncate=False)

In [None]:
# HashingTF configured with numFeatures=20: reads "words", writes "raw_features".
hashingTF = HashingTF(
    numFeatures=20,
    inputCol="words",
    outputCol="raw_features",
)

# Add the "raw_features" column to the tokenized sentences.
featurized_data = hashingTF.transform(words_data)

# Show the tokens next to their hashed feature vectors, without truncation.
(
    featurized_data
    .select("words", "raw_features")
    .show(truncate=False)
)

In [None]:
# IDF estimator: reads "raw_features" and writes the rescaled vectors to "features".
idf = IDF(inputCol="raw_features", outputCol="features")

# Fit the IDF model on the featurized data, then transform that same data.
rescaled_data = (
    idf
    .fit(featurized_data)
    .transform(featurized_data)
)

# Compare the raw vectors with the rescaled ones, without truncation.
(
    rescaled_data
    .select("raw_features", "features")
    .show(truncate=False)
)

## CountVectorizer

_**Documentación:** https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.feature.CountVectorizer.html_

In [None]:
from pyspark.ml.feature import CountVectorizer

# Two small documents, already tokenized, as (id, words) rows.
df = spark.createDataFrame(
    data=[
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
    ],
    schema=["id", "words"],
)

# CountVectorizer over "words" -> "features" with vocabSize=3 and minDF=2.0.
# NOTE(review): minDF=2.0 presumably requires a term to appear in at least two
# documents to enter the vocabulary; confirm against the CountVectorizer docs.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)

# Fit on the toy documents, then transform the same documents.
model = cv.fit(df)
result = model.transform(df)

# Display the resulting rows without truncation.
result.show(truncate=False)

In [None]:
################################################################################################################################