In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used by every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName('tools')
    .getOrCreate()
)

## Tokenizer and tokenization

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [4]:
# Sample sentences: two separated by spaces and one separated by commas,
# so that whitespace-based and regex-based tokenization can be compared.
sen_df = spark.createDataFrame(
    data=[
        (0, 'Hi I heard about Spark'),
        (1, 'I wish java could use case classes'),
        (2, 'Logistic,regression,model,are,neat'),
    ],
    schema=['id', 'sentence'],
)

In [5]:
# Preview the input rows (show() defaults spelled out: up to 20 rows, long cells truncated).
sen_df.show(n=20, truncate=True)

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [6]:
# Basic tokenizer: reads the 'sentence' column and writes the token array to 'words'.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [34]:
# Regex-based tokenizer: `pattern` is the regular expression that matches the
# separators between tokens (here: any non-word character).
regex_tokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [36]:
# udf = user-defined function: wraps a plain Python callable so it can be
# applied to a DataFrame column.
# count_tokens returns the number of elements in an array column as an integer.
# A null array yields null instead of raising TypeError (len(None)) inside the
# Python worker, which would otherwise fail the whole job.
count_tokens = udf(
    lambda words: len(words) if words is not None else None,
    IntegerType(),
)

In [37]:
# Apply the basic tokenizer; transform() returns a new DataFrame with a 'words' column.
tokenized = tokenizer.transform(sen_df)
tokenized.show()

# Add a 'tokens' column holding the number of words per row and display it.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [58]:
# Apply the regex tokenizer to the same sentences for comparison.
regex_tokenized = regex_tokenizer.transform(sen_df)
regex_tokenized.show()

# Count the tokens per row produced by the regex tokenizer and display them.
(
    regex_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic, regres...|
+---+--------------------+--------------------+

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



## Stop word removal (stop words are common words in a language, e.g. a, an, the, is, ...)

In [59]:
from pyspark.ml.feature import StopWordsRemover

In [60]:
# Two pre-tokenized sentences to demonstrate stop-word removal.
sentenceDF = spark.createDataFrame(
    data=[
        (0, ['I', 'saw', 'the', 'green', 'horse']),
        (1, ['Mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)

In [61]:
# No stop-word list is passed, so the remover uses its default (English) list.
# Lists for other languages can be supplied through the `stopWords` parameter,
# e.g. StopWordsRemover.loadDefaultStopWords(<language>).
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

In [62]:
# Drop the stop words from 'tokens' and show the result in the 'filtered' column.
(
    remover
    .transform(sentenceDF)
    .show()
)

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



## Creating n-grams

In [63]:
from pyspark.ml.feature import NGram

In [64]:
# Keep only the row id and the regex-tokenized words as input for the n-gram step.
wordDF = regex_tokenized.select(['id', 'words'])
wordDF.show(n=20, truncate=True)

+---+--------------------+
| id|               words|
+---+--------------------+
|  0|[hi, i, heard, ab...|
|  1|[i, wish, java, c...|
|  2|[logistic, regres...|
+---+--------------------+



In [69]:
# Build trigrams (n=3) from the 'words' array and write them to a 'grams' column.
ngram = NGram(
    n=3,
    inputCol='words',
    outputCol='grams',
)

In [70]:
# Note: transform() returns a new DataFrame with the 'grams' column added;
# it does not modify its input, so wordDF itself is unchanged.
(
    ngram
    .transform(wordDF)
    .show()
)

+---+--------------------+--------------------+
| id|               words|               grams|
+---+--------------------+--------------------+
|  0|[hi, i, heard, ab...|[hi i heard, i he...|
|  1|[i, wish, java, c...|[i wish java, wis...|
|  2|[logistic, regres...|[logistic regress...|
+---+--------------------+--------------------+



In [71]:
# Show only the generated n-grams, untruncated so every trigram is visible.
(
    ngram
    .transform(wordDF)
    .select('grams')
    .show(truncate=False)
)

+--------------------------------------------------------------------------------+
|grams                                                                           |
+--------------------------------------------------------------------------------+
|[hi i heard, i heard about, heard about spark]                                  |
|[i wish java, wish java could, java could use, could use case, use case classes]|
|[logistic regression model, regression model are, model are neat]               |
+--------------------------------------------------------------------------------+

