In [1]:
from pyspark.ml.feature import Word2Vec
import time
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover

In [2]:
# Read the raw review dump (JSON) and redistribute it over 150 partitions,
# then show the inferred schema.
dat = (
    spark.read
    .json('/hduser1/review.json')
    .repartition(150)
)
dat.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [6]:
# Number of rows in the DataFrame (count() is a Spark action, so this runs a job).
dat.count()

4153150

In [3]:
# Keep only the review identifier and the raw review text.
dat = dat.select(
    'review_id',
    'text',
)

I would like to do a simple word2vec transformation of the review texts. In other words, I would like to convert the text of each review into a vector that represents that review. Several steps are needed to achieve this goal:

1. Tokenize the text (split each review into a list of lower-cased words)
2. Remove all the stop words
3. Feed the cleaned tokens into a word2vec model, which will then create a vector (the average of the review's word vectors) representing the "orientation" of the words in each review

In [7]:
# Time the text-to-vector pipeline: tokenise -> remove stop words -> word2vec.
start = time.time()

# Step 1 - tokenizer: turns the "text" string column into a list-of-words
# column "words" (Tokenizer lower-cases the text and splits on whitespace).
# NOTE: despite its name, DTMmatrix holds token lists, not a document-term
# matrix; the name is kept so any later cells that use it still work.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
DTMmatrix = tokenizer.transform(dat)

# Step 2 - stop word removal: filters the remover's default stop-word list
# out of "words" and writes the remaining tokens to "cleaned".
stopremove = StopWordsRemover(inputCol='words', outputCol='cleaned')
dat3 = stopremove.transform(DTMmatrix)

# Step 3 - fit a word2vec model on the cleaned tokens.
#   vectorSize=15     -> 15-dimensional embeddings
#   minCount=0        -> keep every token, however rare
#   numPartitions=150 -> matches the repartition(150) used when loading
#   seed=42           -> fixed seed so the randomly initialised embeddings are
#                        repeatable between runs (exact repeatability also
#                        depends on the data being partitioned the same way)
word2Vec = Word2Vec(
    vectorSize=15,
    minCount=0,
    numPartitions=150,
    seed=42,
    inputCol="cleaned",
    outputCol="word2vec",
)
model = word2Vec.fit(dat3)

# transform() is lazy: it only declares the new "word2vec" column. No review
# vectors are computed until an action is run on `result`, so the elapsed time
# below covers the model fit but not the per-review vectorisation.
result = model.transform(dat3)

end = time.time()


In [8]:
# Elapsed wall-clock time, in seconds, between the two time.time() readings.
elapsed = end - start
print(elapsed)

609.8282635211945
