In [2]:
import os

import findspark

# Locate the Spark installation. A non-empty SPARK_HOME environment variable
# takes precedence so the notebook is not tied to one machine's directory
# layout; the original install path is kept as the fallback.
findspark.init(
    os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-3.2.0-bin-hadoop3.2')
# Relative path to the data directory for this section.
DATA_PATH = '../data/Spark_for_Machine_Learning/Natural_Language_Processing/'

In [4]:
from pyspark.sql import SparkSession

# Reuse an existing session if one is available, otherwise create a new one
# for an application named 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/30 14:12:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [7]:
# Toy corpus of (id, sentence) rows. Row 3 is separated by commas rather than
# spaces, which is what distinguishes the two tokenizers applied below.
sentences = spark.createDataFrame(
    data=[
        (0, 'I like burritos.'),
        (1, 'She prefers tacos.'),
        (2, 'We can go and get both at the restaurant.'),
        (3, 'They,serve,pozole,tacos,burritos,and,ceviche.'),
        (4, 'Mezcal and Tequila, too.'),
    ],
    schema=['id', 'sentence'],
)

In [8]:
# Preview the five example sentences.
sentences.show()

[Stage 0:>                                                          (0 + 1) / 1]

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|    I like burritos.|
|  1|  She prefers tacos.|
|  2|We can go and get...|
|  3|They,serve,pozole...|
|  4|Mezcal and Tequil...|
+---+--------------------+



                                                                                

## Tokenizing

In [9]:
# Basic tokenizer: reads 'sentence' and writes the token array to 'words'.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)
# Regex tokenizer over the same columns, configured with the non-word
# character class as its pattern.
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [10]:
from pyspark.sql.functions import size


def count_tokens(words):
    """Count the tokens in an array column.

    Uses Spark's built-in ``size`` instead of a Python ``udf`` wrapping
    ``len``, so the count is evaluated natively by Spark rather than by
    shipping every row to a Python worker.

    Args:
        words: Array column of tokens, given as a ``Column`` or a column name.

    Returns:
        An integer ``Column`` holding the number of elements in each row's array.
    """
    return size(words)

In [15]:
# The basic tokenizer needs whitespace between words, so the comma-separated
# sentence ends up as a single token.
tokenized = tokenizer.transform(sentences)
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|    I like burritos.|[i, like, burritos.]|     3|
|  1|  She prefers tacos.|[she, prefers, ta...|     3|
|  2|We can go and get...|[we, can, go, and...|     9|
|  3|They,serve,pozole...|[they,serve,pozol...|     1|
|  4|Mezcal and Tequil...|[mezcal, and, teq...|     4|
+---+--------------------+--------------------+------+



In [21]:
# Tokenize with the regex tokenizer, which also splits the comma-separated
# sentence, and attach the per-row token count in a single expression.
rg_tokenized = (
    regex_tokenizer
    .transform(sentences)
    .withColumn('tokens', count_tokens(col('words')))
)

## Removing stop words

In [16]:
from pyspark.ml.feature import StopWordsRemover

In [24]:
# Stop-word filter: reads the 'words' column and writes the result to 'filtered'.
# No stop-word list is passed, so the transformer's default list applies.
remover = StopWordsRemover(inputCol='words', outputCol='filtered')

In [26]:
# Apply the filter to the regex-tokenized sentences and show only the filtered tokens.
remover.transform(rg_tokenized).select('filtered').show()

+--------------------+
|            filtered|
+--------------------+
|    [like, burritos]|
|    [prefers, tacos]|
|[go, get, restaur...|
|[serve, pozole, t...|
|   [mezcal, tequila]|
+--------------------+



## N-grams

In [27]:
from pyspark.ml.feature import NGram

In [28]:
# Bigram (n=2) generator: reads the 'words' column and writes to 'grams'.
ngram = NGram(n=2, inputCol='words',outputCol='grams')

In [30]:
# truncate=False prints each bigram list in full instead of clipping it.
ngram.transform(rg_tokenized).select('grams').show(truncate=False)

+-----------------------------------------------------------------------------------+
|grams                                                                              |
+-----------------------------------------------------------------------------------+
|[i like, like burritos]                                                            |
|[she prefers, prefers tacos]                                                       |
|[we can, can go, go and, and get, get both, both at, at the, the restaurant]       |
|[they serve, serve pozole, pozole tacos, tacos burritos, burritos and, and ceviche]|
|[mezcal and, and tequila, tequila too]                                             |
+-----------------------------------------------------------------------------------+



## TF-IDF
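Term frequency–inverse document frequency (TF-IDF): a term's weight rises with how often it occurs in a document and falls with how many documents in the corpus contain it.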

In [31]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [37]:
# Term frequencies: turn each row's 'words' array into a 'rawFeatures' vector.
hashing_tf = HashingTF(
    inputCol='words',
    outputCol='rawFeatures',
)
featurized_data = hashing_tf.transform(rg_tokenized)
(
    featurized_data
    .select('sentence', 'rawFeatures')
    .show(truncate=False)
)

+---------------------------------------------+-------------------------------------------------------------------------------------------------------------+
|sentence                                     |rawFeatures                                                                                                  |
+---------------------------------------------+-------------------------------------------------------------------------------------------------------------+
|I like burritos.                             |(262144,[19036,28128,208258],[1.0,1.0,1.0])                                                                  |
|She prefers tacos.                           |(262144,[54216,89302,211980],[1.0,1.0,1.0])                                                                  |
|We can go and get both at the restaurant.    |(262144,[30497,95889,108437,148675,156084,162111,187114,219915,252722],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|They,serve,pozole,tacos,burritos,and,ceviche.|(2621

In [39]:
# Inverse document frequency: fit the estimator on the 'rawFeatures' vectors;
# the fitted model writes its output to 'features'.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)
idf_fit = idf.fit(featurized_data)

In [42]:
# Apply the fitted IDF model to the term-frequency vectors and print the result in full.
idf_fit.transform(featurized_data).select('features').show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                                                                                            |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|(262144,[19036,28128,208258],[1.0986122886681098,0.6931471805599453,1.0986122886681098])                                                                                                                                                            |
|(262144,[54

21/12/30 14:32:12 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
