In [1]:
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook is not tied to a
# single machine's directory layout; fall back to the original install path
# when the variable is unset or empty.
findspark.init(os.environ.get('SPARK_HOME') or '/home/kant/spark-2.4.4-bin-hadoop2.7')

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the Spark session for this notebook under the application name 'TF-IDF'.
spark = (
    SparkSession.builder
    .appName('TF-IDF')
    .getOrCreate()
)

In [4]:
from pyspark.ml.feature import HashingTF,IDF,Tokenizer

In [5]:
# Small example corpus: one (id, sentence) pair per row.
sentence_dataframe = spark.createDataFrame(
    data=[
        (0, "I love my Data"),
        (1, "I know about the Spark"),
        (2, "I wish java could use case classes"),
        (3, "Logistis,regression, models are neat"),
    ],
    schema=['id', 'sentences'],
)

In [6]:
# Tokenizer that reads the 'sentences' column and writes its tokens to 'words'.
tokenizer = Tokenizer(
    outputCol='words',
    inputCol='sentences',
)

In [7]:
# Run the tokenizer over the example sentences; the result carries the
# additional 'words' column.
word_data = tokenizer.transform(dataset=sentence_dataframe)

In [8]:
# Print the tokenized rows without shortening long cell values.
word_data.show(n=20, truncate=False)

+---+------------------------------------+------------------------------------------+
|id |sentences                           |words                                     |
+---+------------------------------------+------------------------------------------+
|0  |I love my Data                      |[i, love, my, data]                       |
|1  |I know about the Spark              |[i, know, about, the, spark]              |
|2  |I wish java could use case classes  |[i, wish, java, could, use, case, classes]|
|3  |Logistis,regression, models are neat|[logistis,regression,, models, are, neat] |
+---+------------------------------------+------------------------------------------+



In [9]:
# Term-frequency stage: hashes the 'words' tokens into 'rawfeatures'.
# No numFeatures is passed, so the library default vector size applies.
hashing_tf = HashingTF(
    outputCol='rawfeatures',
    inputCol='words',
)

In [10]:
# Attach the hashed term-frequency vectors ('rawfeatures') to the tokenized rows.
featured_data = hashing_tf.transform(dataset=word_data)

In [11]:
# IDF stage: consumes 'rawfeatures' and emits the rescaled 'features' column.
idf = IDF(
    outputCol='features',
    inputCol='rawfeatures',
)

In [12]:
# Fit the IDF stage on the term-frequency vectors of the whole example corpus.
idf_model = idf.fit(dataset=featured_data)

In [13]:
# Apply the fitted IDF model to the same rows to produce the 'features' column.
rescaled_data = idf_model.transform(dataset=featured_data)

In [15]:
# Show only the row id and its TF-IDF vector, with no truncation of the
# (long) sparse-vector text.
(
    rescaled_data
    .select('id', 'features')
    .show(n=20, truncate=False)
)

+---+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                                                                                        |
+---+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0  |(262144,[24417,37852,160735,186480],[0.22314355131420976,0.9162907318741551,0.9162907318741551,0.9162907318741551])                                                                             |
|1  |(262144,[24417,91137,103838,140931,234657],[0.22314355131420976,0.9162907318741551,0.9162907318741551,0.9162907318741551,0.9162907318741551])                                                   |
|2  |

In [16]:
from pyspark.ml.feature import CountVectorizer

In [33]:
# Two pre-tokenized documents for the CountVectorizer example; each string is
# split on single spaces into a list of one-letter tokens.
df = spark.createDataFrame(
    data=[
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" ")),
    ],
    schema=["id", "words"],
)

In [34]:
# Print the example documents using show()'s default settings, spelled out.
df.show(n=20, truncate=True)

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [35]:
# Count-based vectorizer over 'words', writing to 'features'.
# vocabSize=3 caps the vocabulary at three terms; minDF=2.0 is the
# document-frequency threshold for keeping a term (see the CountVectorizer
# API docs for the exact semantics of values >= 1.0).
cv = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=3,
    minDF=2.0,
)

In [36]:
# Fit the vectorizer on the two example documents.
model = cv.fit(dataset=df)

In [37]:
# Vectorize the same documents with the fitted model, adding the 'features' column.
result = model.transform(dataset=df)

In [38]:
# Print the per-document count vectors without truncating the vector text.
result.show(n=20, truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

