In [4]:
sen_df = spark.createDataFrame([
        (0,'Gi I heard about spark'),
        (1,'I wish java could use case classes'),
        (2, 'Logistic, regression, models, are, neat')
        ],['id','sentence'])

In [5]:
sen_df.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Gi I heard about ...|
|  1|I wish java could...|
|  2|Logistic, regress...|
+---+--------------------+



In [6]:
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')

In [8]:
regex_tokenizer = RegexTokenizer(inputCol='sentence', outputCol='words', pattern='\\W')

In [9]:
count_tokens = udf(lambda words:len(words), IntegerType())

In [10]:
tokenized = tokenizer.transform(sen_df)

In [12]:
tokenized.withColumn('tokens', count_tokens(col('words'))).show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Gi I heard about ...|[gi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic, regress...|[logistic,, regre...|     5|
+---+--------------------+--------------------+------+



In [13]:
rg_tokenized = regex_tokenizer.transform(sen_df)

In [14]:
rg_tokenized.withColumn('tokens',count_tokens(col('words'))).show()

+---+--------------------+--------------------+------+
| id|            sentence|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Gi I heard about ...|[gi, i, heard, ab...|     5|
|  1|I wish java could...|[i, wish, java, c...|     7|
|  2|Logistic, regress...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [15]:
from pyspark.ml.feature import StopWordsRemover

In [16]:
sentenceDataFrame = spark.createDataFrame([
    (0,['I','saw','the','green','horse']),
    (1,['Mary','had','ad','little','lamb'])
],['id','tokens'])

In [17]:
sentenceDataFrame.show()

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Mary, had, ad, l...|
+---+--------------------+



In [18]:
remover = StopWordsRemover(inputCol='tokens', outputCol='filtered')

In [19]:
remover.transform(sentenceDataFrame).show()

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, ad, l...|[Mary, ad, little...|
+---+--------------------+--------------------+



In [20]:
from pyspark.ml.feature import NGram

In [28]:
wordDataFrame = spark.createDataFrame([
        (0, ['Hi,"I", "heard", "about", "spark']),
        (1,['I', 'wish', 'java', 'could', 'use', 'case', 'classes']),
        (2, ["Logistic","regression", "models", "are", "neat"])
],["id", "words"])

In [29]:
ngram = NGram(n=2, inputCol='words', outputCol='grams')

In [30]:
ngram.transform(wordDataFrame).show()

+---+--------------------+--------------------+
| id|               words|               grams|
+---+--------------------+--------------------+
|  0|[Hi,"I", "heard",...|                  []|
|  1|[I, wish, java, c...|[I wish, wish jav...|
|  2|[Logistic, regres...|[Logistic regress...|
+---+--------------------+--------------------+



In [31]:
ngram.transform(wordDataFrame).select('grams').show(truncate=False)

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[]                                                                |
|[I wish, wish java, java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+

