In [1]:
from pyspark.ml.feature import Word2Vec
import time
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover

In [3]:
# Load the review dump and spread it over a fixed number of partitions
# before inspecting its schema.
REVIEW_JSON_PATH = '/hduser1/review.json'
N_INPUT_PARTITIONS = 150

raw_reviews = spark.read.json(REVIEW_JSON_PATH)
dat = raw_reviews.repartition(N_INPUT_PARTITIONS)
dat.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- text: string (nullable = true)
 |-- type: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [4]:
# Number of rows in the loaded review DataFrame (triggers a Spark job).
dat.count()

4153150

In [5]:
# Keep only the review identifier and its text; all other columns are dropped.
columns_to_keep = ('review_id', 'text')
dat = dat.select(*columns_to_keep)

I would like to do a simple word2vec transformation of the review texts. In other words, I would like to convert each review's text into a vector that represents that review. Several steps are needed to achieve this goal.

1. Tokenize the text (lowercase it and split it on whitespace into a list of words)
2. Remove all the stop words
3. Feed the cleaned tokens into a word2vec model, which learns a vector for every word and then represents each review by the average of its word vectors (the "orientation" of the words in that review)

In [6]:
# Tokenize the reviews, strip stop words, then fit a Word2Vec model and use it
# to embed every review. Produces `model` (fitted Word2VecModel) and `result`
# (input rows plus a "word2vec" vector column); `start`/`end` bracket the fit
# so the elapsed time can be reported afterwards.

# Word2Vec hyper-parameters, gathered here so they are easy to tune.
W2V_VECTOR_SIZE = 15
# Spark's default. The previous minCount=0 kept every token ever seen (typos,
# words with punctuation attached, ...) in the vocabulary.
W2V_MIN_COUNT = 5
# Spark's Word2Vec documentation advises a *small* number of partitions for
# accuracy: each partition trains separately and the results are merged on the
# driver. 150 was far from small; raise this only if training is too slow.
W2V_NUM_PARTITIONS = 10
# Fixed seed so repeated runs give the same embedding.
W2V_SEED = 42

start = time.time()

# `text` is nullable in the schema; Tokenizer is not expected to cope with a
# null string (TODO confirm for this Spark version), so drop such rows first.
reviews_with_text = dat.na.drop(subset=["text"])

# tokenizer: lowercases the text and splits it on whitespace.
# NOTE: despite its name, `DTMmatrix` is not a document-term matrix; it is the
# input DataFrame with an extra "words" column (array of tokens). The name is
# kept so that other cells referring to it keep working.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
DTMmatrix = tokenizer.transform(reviews_with_text)

# Stop word removal
stopremove = StopWordsRemover(inputCol='words', outputCol='cleaned')
dat3 = stopremove.transform(DTMmatrix)

# fit a word2vec model
word2Vec = Word2Vec(
    vectorSize=W2V_VECTOR_SIZE,
    minCount=W2V_MIN_COUNT,
    numPartitions=W2V_NUM_PARTITIONS,
    seed=W2V_SEED,
    inputCol="cleaned",
    outputCol="word2vec",
)
try:
    model = word2Vec.fit(dat3)
finally:
    # Record the stop time even when fit() fails or is cancelled, so a later
    # `end - start` never silently reports a stale value from an earlier run.
    end = time.time()

# transform() is lazy: nothing is computed until an action is run on `result`,
# which is why it is not part of the timed section.
result = model.transform(dat3)


Py4JJavaError: An error occurred while calling o75.fit.
: org.apache.spark.SparkException: Job 4 cancelled 
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:1439)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1686)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:936)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:935)
	at org.apache.spark.mllib.feature.Word2Vec$$anonfun$doFit$1.apply$mcVI$sp(Word2Vec.scala:438)
	at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:160)
	at org.apache.spark.mllib.feature.Word2Vec.doFit(Word2Vec.scala:358)
	at org.apache.spark.mllib.feature.Word2Vec.fit(Word2Vec.scala:319)
	at org.apache.spark.ml.feature.Word2Vec.fit(Word2Vec.scala:187)
	at org.apache.spark.ml.feature.Word2Vec.fit(Word2Vec.scala:127)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)


In [14]:
# Wall-clock seconds between the `start` and `end` timestamps taken earlier.
elapsed_seconds = end - start
print(elapsed_seconds)

849.2570693492889
