In [1]:
import numpy as np
from pyspark.sql import SparkSession

In [3]:
# Get the active Spark session, or create one, under the app name "nlp".
spark = (
    SparkSession.builder
    .appName("nlp")
    .getOrCreate()
)

## Tokenizer

In [4]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, size, udf
from pyspark.sql.types import IntegerType

In [15]:
# Toy corpus: three space-separated sentences plus one string (id 3) whose
# words are separated only by commas, used to contrast the two tokenizers.
sent_df = spark.createDataFrame(
    data=[
        (0, "Hello there general kenobi."),
        (1, "I enjoy my good fellas."),
        (2, "I would like to be happier."),
        (3, "here,is,a,list,of,words"),
    ],
    schema=["id", "sentence"],
)

In [16]:
# Preview the corpus; with default arguments long strings are truncated.
sent_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hello there gener...|
|  1|I enjoy my good f...|
|  2|I would like to b...|
|  3|here,is,a,list,of...|
+---+--------------------+



In [17]:
# Plain tokenizer: reads `sentence`, writes the token array to `words`.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Regex tokenizer: same columns, but the pattern "\W" (any non-word character)
# is what separates tokens, so punctuation such as commas also splits words.
regextokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")


def countTokens(words):
    """Return a Column holding the number of elements in an array column.

    Uses Spark's built-in ``size`` instead of a Python UDF wrapping ``len``,
    so rows are not shipped to a Python worker just to be counted, and a null
    array no longer raises ``TypeError`` inside the UDF.

    Parameters
    ----------
    words : Column or str
        The array column (or its name) to measure, e.g. ``col("words")``.

    Returns
    -------
    Column
        Integer column with the element count of each row's array.
    """
    return size(words)

In [18]:
# Run the plain Tokenizer over the corpus; this adds the `words` array column.
tokenized = tokenizer.transform(sent_df)

In [19]:
# Inspect the result: id, original sentence, and the new `words` column.
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hello there gener...|[hello, there, ge...|
|  1|I enjoy my good f...|[i, enjoy, my, go...|
|  2|I would like to b...|[i, would, like, ...|
|  3|here,is,a,list,of...|[here,is,a,list,o...|
+---+--------------------+--------------------+



In [20]:
# Token counts from the plain Tokenizer, shown next to each sentence.
(
    tokenized
    .select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show()
)

+--------------------+--------------------+------+
|            sentence|               words|tokens|
+--------------------+--------------------+------+
|Hello there gener...|[hello, there, ge...|     4|
|I enjoy my good f...|[i, enjoy, my, go...|     5|
|I would like to b...|[i, would, like, ...|     6|
|here,is,a,list,of...|[here,is,a,list,o...|     1|
+--------------------+--------------------+------+



In [21]:
# Tokenize the same corpus with the regex tokenizer, then display the
# per-sentence token counts for comparison with the plain Tokenizer.
regextokenized = regextokenizer.transform(sent_df)
(
    regextokenized
    .select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
    .show()
)

+--------------------+--------------------+------+
|            sentence|               words|tokens|
+--------------------+--------------------+------+
|Hello there gener...|[hello, there, ge...|     4|
|I enjoy my good f...|[i, enjoy, my, go...|     5|
|I would like to b...|[i, would, like, ...|     6|
|here,is,a,list,of...|[here, is, a, lis...|     6|
+--------------------+--------------------+------+



In [26]:
# Keep the regex-tokenized sentences together with their token counts so the
# later sections can reuse them as input.
sent_df_token = (
    regextokenized
    .select("sentence", "words")
    .withColumn("tokens", countTokens(col("words")))
)

## Stop Word Removal

In [22]:
from pyspark.ml.feature import StopWordsRemover

In [23]:
# Show the raw sentences again, this time without truncating long strings.
sent_df.show(truncate=False)

+---+---------------------------+
|id |sentence                   |
+---+---------------------------+
|0  |Hello there general kenobi.|
|1  |I enjoy my good fellas.    |
|2  |I would like to be happier.|
|3  |here,is,a,list,of,words    |
+---+---------------------------+



In [27]:
# Stop-word filter: reads the `words` array and writes the surviving tokens to
# `cleaned`. No stop-word list is passed, so the transformer's default applies.
remover = StopWordsRemover(inputCol="words", outputCol="cleaned")

In [28]:
# Apply the stop-word filter to the tokenized sentences and print every column
# in full so `words` and `cleaned` can be compared side by side.
(
    remover
    .transform(sent_df_token)
    .show(truncate=False)
)

+---------------------------+---------------------------------+------+------------------------+
|sentence                   |words                            |tokens|cleaned                 |
+---------------------------+---------------------------------+------+------------------------+
|Hello there general kenobi.|[hello, there, general, kenobi]  |4     |[hello, general, kenobi]|
|I enjoy my good fellas.    |[i, enjoy, my, good, fellas]     |5     |[enjoy, good, fellas]   |
|I would like to be happier.|[i, would, like, to, be, happier]|6     |[like, happier]         |
|here,is,a,list,of,words    |[here, is, a, list, of, words]   |6     |[list, words]           |
+---------------------------+---------------------------------+------+------------------------+



## N-grams

In [29]:
from pyspark.ml.feature import NGram

In [32]:
# Recap the tokenized frame (sentence, words, tokens) without truncation.
sent_df_token.show(truncate=False)

+---------------------------+---------------------------------+------+
|sentence                   |words                            |tokens|
+---------------------------+---------------------------------+------+
|Hello there general kenobi.|[hello, there, general, kenobi]  |4     |
|I enjoy my good fellas.    |[i, enjoy, my, good, fellas]     |5     |
|I would like to be happier.|[i, would, like, to, be, happier]|6     |
|here,is,a,list,of,words    |[here, is, a, list, of, words]   |6     |
+---------------------------+---------------------------------+------+



In [33]:
# NGram transformer with n=2: reads the `words` array and writes the resulting
# two-word sequences to a new `bigrams` column.
bigrams = NGram(n=2, inputCol="words", outputCol="bigrams")

In [34]:
# Apply the bigram transformer to the tokenized sentences.
bigrams_df = bigrams.transform(sent_df_token)

In [35]:
# Display only the generated bigrams, untruncated.
(
    bigrams_df
    .select("bigrams")
    .show(truncate=False)
)

+-------------------------------------------------+
|bigrams                                          |
+-------------------------------------------------+
|[hello there, there general, general kenobi]     |
|[i enjoy, enjoy my, my good, good fellas]        |
|[i would, would like, like to, to be, be happier]|
|[here is, is a, a list, list of, of words]       |
+-------------------------------------------------+



## Term Frequency-Inverse Document Frequency (TF-IDF)