First, we process business data

In [1]:
// Locations used by the business-data cells: the raw JSON input and the
// directory the processed business data is written to.
val bizJSONPath: String = "bdad_proj/yelp_academic_dataset_business.json"
val bizDataPath: String = "bdad_proj/business_data"

In [2]:
// Load the raw business JSON into a DataFrame and mark it for caching,
// because the following cells read it repeatedly.
val businessDF = spark.read.json(bizJSONPath).cache()

In [3]:
// Print the schema Spark inferred for the business JSON.
businessDF.printSchema()

In [4]:
// Count businesses per state and list the states in ascending order of count.
val bizCountDF = businessDF
  .groupBy("state")
  .count()
  .orderBy("count")
bizCountDF.show(50)

In [5]:
// Preview the first rows of the raw business data.
businessDF.show()

In [6]:

// Normalise the business records:
//  - cast the numeric / flag columns to explicit types;
//  - turn the comma-separated `categories` string into an array, splitting on the
//    comma together with any surrounding whitespace so that "A, B" becomes
//    ["A", "B"] rather than ["A", " B"] (trim alone only cleans the two ends
//    of the whole string);
//  - drop every row that contains a null in any column;
//  - keep only rows with non-empty key fields, a 2-character state and a
//    5-character postal code.
val changedBizDF = businessDF.withColumn("stars", businessDF("stars").cast("double"))
      .withColumn("review_count", businessDF("review_count").cast("long"))
      .withColumn("is_open", businessDF("is_open").cast("boolean"))
      .withColumn("latitude", businessDF("latitude").cast("double"))
      .withColumn("longitude", businessDF("longitude").cast("double"))
      .withColumn("categories", split(trim(businessDF("categories")), "\\s*,\\s*"))
      .na.drop("any")
      .filter(
        $"address" =!= "" && $"business_id" =!= "" && $"city" =!= "" && $"name" =!= "" &&
        length($"state") === 2 && length($"postal_code") === 5) //filter out non-american postal code and empty strings

In [7]:
// Preview the cleaned business data.
changedBizDF.show()

In [8]:
// Save the cleaned business data as Parquet, replacing any previous output.
changedBizDF.write
  .mode("overwrite")
  .format("parquet")
  .save(bizDataPath)

Now we process the tips data

In [10]:
// Read the raw tip records, mark them for caching, and preview the first rows.
val tipDF = spark.read.json("bdad_proj/yelp_academic_dataset_tip.json")
tipDF.cache()
tipDF.show()

In [11]:
tipDF.printSchema
// Build the training frame from the raw tips:
//  - `date` is parsed into a timestamp and `compliment_count` cast to long;
//  - `cleanse_text` is `text` with every non-word character removed, except
//    whitespace and a literal '+' (both are excluded by the character class);
//  - `words` tokenises `cleanse_text` on runs of whitespace after stripping
//    leading/trailing whitespace. Splitting on a single " " would emit an
//    empty-string token for every repeated space (frequent once punctuation
//    is removed) and would not split on tabs or newlines.
val tipTrainDF = tipDF.withColumn("date", to_timestamp(tipDF("date"), "yyyy-MM-dd HH:mm:ss"))
      .withColumn("compliment_count", tipDF("compliment_count").cast("long"))
      .withColumn("cleanse_text", regexp_replace($"text", "[\\W&&[^\\s+]]", ""))
      .withColumn("words", split(regexp_replace($"cleanse_text", "^\\s+|\\s+$", ""), "\\s+"))
tipTrainDF.show
tipTrainDF.printSchema

In [12]:
// Show the raw tip text without truncating long values.
tipDF.select("text").show(truncate = false)

In [13]:
// Save the prepared tip data as Parquet, replacing any previous output.
tipTrainDF.write
  .mode("overwrite")
  .format("parquet")
  .save("bdad_proj/tip_data")

Train the Word2Vec model on the tip data

In [15]:
// Reload the prepared tip data from its Parquet directory.
val tipTrainDF = spark.read.format("parquet").load("bdad_proj/tip_data")

In [16]:
import org.apache.spark.ml.feature.{Word2Vec, Word2VecModel}
// Word2Vec estimator: reads the token arrays in "words" and writes
// 100-dimensional vectors to "features"; the minimum token count is set to 0.
val w2v = new Word2Vec()
  .setInputCol("words")
  .setOutputCol("features")
  .setVectorSize(100)
  .setMinCount(0)

In [17]:
// Fit the Word2Vec estimator on the prepared tips.
val model: Word2VecModel = w2v.fit(tipTrainDF)

Load a previously saved model from HDFS (requires the model to have been persisted by the last cell)

In [19]:
import org.apache.spark.ml.feature.{Word2Vec, Word2VecModel}
// Restore a fitted Word2Vec model from its saved location.
val model: Word2VecModel = Word2VecModel.load("bdad_proj/tip_word2vec_model")

In [20]:
// Preview the learned word vectors.
model.getVectors.show()

In [21]:
// List the 5 words closest to "car" in the model, without truncating the output.
model.findSynonyms("car", 5).show(truncate = false)

Persist the model

In [23]:
// Save the model, overwriting any copy already stored at this path.
model.write.overwrite().save("bdad_proj/tip_word2vec_model")