In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session that every later cell goes through.
spark = (
    SparkSession.builder
    .appName("Natural Language Processing")
    .getOrCreate()
)

# TF-IDF

In [2]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

In [3]:
# Toy corpus of three (id, sentence) rows for the tokenizer / TF-IDF cells.
# Note: the third sentence carries commas and a trailing space; the tokenized
# output shows those commas staying attached to the words (e.g. "logistic,").
sentencedata = spark.createDataFrame(
    data=[
        (0.0, 'Hi I heard about Spark'),
        (1.0, 'I wish java could use case classes'),
        (2.0, 'Logistic, regression, models, are, neat '),
    ],
    schema=['id', 'sentence'],
)

In [4]:
# Tokenizer reading the 'sentence' column and writing the token list to 'words'
# (the displayed output shows the tokens lower-cased).
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

In [5]:
# Apply the tokenizer: adds the 'words' column next to 'id' and 'sentence'.
tokenized = tokenizer.transform(sentencedata)

In [6]:
# Preview the tokenized rows (long cells are truncated by the default show()).
tokenized.show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|0.0|Hi I heard about ...|[hi, i, heard, ab...|
|1.0|I wish java could...|[i, wish, java, c...|
|2.0|Logistic, regress...|[logistic,, regre...|
+---+--------------------+--------------------+



In [7]:
# Term-frequency stage: hashes the token lists in 'words' into sparse vectors
# stored in 'rawFeatures'. numFeatures is not set here; the displayed vectors
# have size 262144.
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")

In [8]:
# Compute the hashed term-frequency vectors ('rawFeatures') for every row.
# The variable name is spelled "featureized" and is referenced by later cells.
featureizedData = hashingTF.transform(tokenized)

In [9]:
# IDF estimator: reads the raw term frequencies from 'rawFeatures' and will
# write the rescaled vectors to 'features' once fitted.
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [10]:
# Fit the IDF estimator on the term-frequency vectors to obtain the model.
idfModel = idf.fit(featureizedData)

In [11]:
# Apply the fitted IDF model: adds the 'features' column alongside 'rawFeatures'.
rescaledData = idfModel.transform(featureizedData)

In [12]:
# Preview all columns; the vector cells are truncated by the default show().
rescaledData.show()

+---+--------------------+--------------------+--------------------+--------------------+
| id|            sentence|               words|         rawFeatures|            features|
+---+--------------------+--------------------+--------------------+--------------------+
|0.0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[18700,19...|(262144,[18700,19...|
|1.0|I wish java could...|[i, wish, java, c...|(262144,[19036,20...|(262144,[19036,20...|
|2.0|Logistic, regress...|[logistic,, regre...|(262144,[91006,14...|(262144,[91006,14...|
+---+--------------------+--------------------+--------------------+--------------------+



In [13]:
# Print only the id and the TF-IDF vector, untruncated, so the full sparse
# vectors (size, indices, weights) are visible.
(
    rescaledData
    .select('id', 'features')
    .show(truncate=False)
)

+---+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                                                                                      |
+---+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0|(262144,[18700,19036,33808,66273,173558],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                   |
|1.0|(262144,[19036,20719,55551,58672,98717,109547,192310],[0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])|
|2.0|(262144,[9

# CountVectorizer

In [14]:
from pyspark.ml.feature import CountVectorizer

In [15]:
# Input data: each row pairs an ID with a bag of words.
df = spark.createDataFrame(
    data=[
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
    ],
    schema=["id", "words"],
)

In [16]:
# Fit a CountVectorizerModel on the corpus. The vocabulary is capped at three
# terms (vocabSize=3) and minDF=2.0 sets the minimum number of documents a term
# must appear in to be kept.
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)
model = cv.fit(df)

In [17]:
# Vectorize the corpus with the fitted model and print the per-row term counts
# in full (truncate=False keeps the whole sparse vector visible).
result = model.transform(df)
result.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+



# Word2Vec

In [18]:
from pyspark.ml.feature import Word2Vec

In [19]:
# Input data: one single-column row per document, each holding that
# document's words as a list (case is kept as written).
documentDF = spark.createDataFrame(
    data=[
        (["Hi", "I", "heard", "about", "Spark"],),
        (["I", "wish", "Java", "could", "use", "case", "classes"],),
        (["Logistic", "regression", "models", "are", "neat"],),
    ],
    schema=["text"],
)

In [20]:
# Learn a mapping from words to vectors.
#   vectorSize=3 -> three-dimensional embeddings (small, for display purposes)
#   minCount=0   -> no word is excluded for being infrequent
#   seed=42      -> Word2Vec training is randomised; pinning the seed keeps the
#                   learned vectors stable across "Restart & Run All". The
#                   vectors will differ from outputs captured before the seed
#                   was fixed.
word2Vec = Word2Vec(
    vectorSize=3,
    minCount=0,
    seed=42,
    inputCol="text",
    outputCol="result",
)
model = word2Vec.fit(documentDF)

In [21]:
# Add the 'result' vector column to every document, then pull the rows to the
# driver and print each word list next to its vector.
result = model.transform(documentDF)
for row in result.collect():
    # Each row has exactly two fields: the word list and its vector.
    text, vector = row
    # %s already applies str() to the vector, so no explicit conversion is needed.
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), vector))

Text: [Hi, I, heard, about, Spark] => 
Vector: [-0.029054476600140336,-0.01294696480035782,0.0056851662695407874]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [0.0259857994637319,-0.04903493928057807,0.07110526519162313]

Text: [Logistic, regression, models, are, neat] => 
Vector: [0.05493941362947226,0.043282629922032356,0.0456749327480793]



# FeatureHasher

In [22]:
from pyspark.ml.feature import FeatureHasher

In [23]:
# Mixed-type sample rows: a float, a boolean, a numeric-looking string and a
# plain string per row, used as input for the feature hasher.
dataset = spark.createDataFrame(
    data=[
        (2.2, True, "1", "foo"),
        (3.3, False, "2", "bar"),
        (4.4, False, "3", "baz"),
        (5.5, False, "4", "foo"),
    ],
    schema=["real", "bool", "stringNum", "string"],
)

In [24]:
# Hash all four columns into one sparse 'features' vector per row. In the
# displayed output the 'real' column contributes its numeric value, while the
# boolean and string columns each contribute an entry of 1.0.
hasher = FeatureHasher(
    inputCols=["real", "bool", "stringNum", "string"],
    outputCol="features",
)

# Apply the hasher and print the untruncated result.
featurized = hasher.transform(dataset)
featurized.show(truncate=False)

+----+-----+---------+------+--------------------------------------------------------+
|real|bool |stringNum|string|features                                                |
+----+-----+---------+------+--------------------------------------------------------+
|2.2 |true |1        |foo   |(262144,[174475,247670,257907,262126],[2.2,1.0,1.0,1.0])|
|3.3 |false|2        |bar   |(262144,[70644,89673,173866,174475],[1.0,1.0,1.0,3.3])  |
|4.4 |false|3        |baz   |(262144,[22406,70644,174475,187923],[1.0,1.0,4.4,1.0])  |
|5.5 |false|4        |foo   |(262144,[70644,101499,174475,257907],[1.0,1.0,5.5,1.0]) |
+----+-----+---------+------+--------------------------------------------------------+

