In [1]:
from pyspark.sql import SparkSession
# Start a Spark session named 'tools_2', or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('tools_2')
    .getOrCreate()
)

In [10]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql.functions import col, size, udf
from pyspark.sql.types import IntegerType

In [39]:
# Toy corpus: three labelled sentences that feed the tokenizer and TF-IDF cells.
sentenceData = spark.createDataFrame(
    data=[
        (0.0, 'Hi I heard about Spark'),
        (0.0, 'I wish Java could use case classes'),
        (1.0, 'Logistic regression models are neat'),
    ],
    schema=['label', 'sentence'],
)

In [40]:
# Preview the corpus. `truncate=True` clips long strings to 20 characters,
# which is why the sentences end in "..." in the output.
sentenceData.show(n=20, truncate=True)

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish Java could...|
|  1.0|Logistic regressi...|
+-----+--------------------+



## First tokenize the DataFrame

In [41]:
# Tokenizer lowercases each sentence and splits it into an array of words.
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')


def count_tokens(words_col):
    """Return an integer Column with the number of elements in an array column.

    Uses Spark's built-in `size` instead of a Python UDF wrapping `len`, so
    the count is computed inside the JVM without shipping every row to a
    Python worker.

    Args:
        words_col: Column (or column name) holding an array of tokens.

    Returns:
        Column with the integer length of each array.

    Note:
        For a null array `size` yields -1 or NULL (depending on Spark
        configuration) rather than raising as `len(None)` would.
    """
    return size(words_col)

In [42]:
# Append a `words` array column holding the tokens of each sentence.
words_data = tokenizer.transform(dataset=sentenceData)

In [43]:
# Display-only: add a per-row token count next to the words.
# `words_data` itself is left unchanged for the following cells.
(
    words_data
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+-----+--------------------+--------------------+------+
|label|            sentence|               words|tokens|
+-----+--------------------+--------------------+------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  0.0|I wish Java could...|[i, wish, java, c...|     7|
|  1.0|Logistic regressi...|[logistic, regres...|     5|
+-----+--------------------+--------------------+------+



## TF-IDF (term frequency-inverse document frequency)

In [46]:
# Term-frequency stage: maps each token in `words` to a hashed bucket index
# and counts occurrences per bucket, producing a sparse `rawFeatures` vector.
hashing_tf = HashingTF(
    inputCol='words',
    outputCol='rawFeatures',
)

In [57]:
# Compute raw term frequencies for every sentence, then preview them.
# The vectors shown below have 262144 dimensions (see the output).
featurized_data = hashing_tf.transform(dataset=words_data)
featurized_data.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|
+-----+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[24417,49...|
|  0.0|I wish Java could...|[i, wish, java, c...|(262144,[20719,24...|
|  1.0|Logistic regressi...|[logistic, regres...|(262144,[13671,91...|
+-----+--------------------+--------------------+--------------------+



In [58]:
# Inverse-document-frequency estimator: once fitted, it rescales the
# `rawFeatures` counts into the `features` column.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)

In [59]:
# Fit the IDF weights on the term-frequency vectors of the whole corpus.
idf_model = idf.fit(dataset=featurized_data)

In [60]:
# Apply the fitted IDF weights, adding the `features` column.
rescaled_data = idf_model.transform(dataset=featurized_data)

In [63]:
# Preview raw counts (`rawFeatures`) next to the IDF-weighted `features`.
rescaled_data.show(n=20, truncate=True)

+-----+--------------------+--------------------+--------------------+--------------------+
|label|            sentence|               words|         rawFeatures|            features|
+-----+--------------------+--------------------+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[24417,49...|(262144,[24417,49...|
|  0.0|I wish Java could...|[i, wish, java, c...|(262144,[20719,24...|(262144,[20719,24...|
|  1.0|Logistic regressi...|[logistic, regres...|(262144,[13671,91...|(262144,[13671,91...|
+-----+--------------------+--------------------+--------------------+--------------------+



In [64]:
# Fetch the first row to inspect the full, untruncated sparse vectors.
rescaled_data.take(num=1)

[Row(label=0.0, sentence='Hi I heard about Spark', words=['hi', 'i', 'heard', 'about', 'spark'], rawFeatures=SparseVector(262144, {24417: 1.0, 49304: 1.0, 73197: 1.0, 91137: 1.0, 234657: 1.0}), features=SparseVector(262144, {24417: 0.2877, 49304: 0.6931, 73197: 0.6931, 91137: 0.6931, 234657: 0.6931}))]

## CountVectorizer

In [65]:
from pyspark.ml.feature import CountVectorizer

In [110]:
# Two pre-tokenized documents for the CountVectorizer example.
df = spark.createDataFrame(
    data=[
        (0, 'a b b c'.split(' ')),
        (1, 'a b b b c a d'.split(' ')),
    ],
    schema=['id', 'words'],
)

In [111]:
# Show the documents without clipping, so the full token arrays are visible.
df.show(n=20, truncate=False)

+---+---------------------+
|id |words                |
+---+---------------------+
|0  |[a, b, b, c]         |
|1  |[a, b, b, b, c, a, d]|
+---+---------------------+



In [112]:
# Vocabulary is capped at 3 terms, and a term must occur in at least
# 2 documents to be kept (so 'd', present in one document only, is dropped).
countVec = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=3,
    minDF=2,
)

In [113]:
# Learn the vocabulary from the two documents.
model = countVec.fit(dataset=df)

In [114]:
# Turn each document into a sparse vector of term counts over that vocabulary.
result = model.transform(dataset=df)

In [115]:
# Show the count vectors in full (no clipping).
result.show(n=20, truncate=False)

+---+---------------------+-------------------------+
|id |words                |features                 |
+---+---------------------+-------------------------+
|0  |[a, b, b, c]         |(3,[0,1,2],[2.0,1.0,1.0])|
|1  |[a, b, b, b, c, a, d]|(3,[0,1,2],[3.0,2.0,1.0])|
+---+---------------------+-------------------------+

