In [0]:
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS

In [1]:
// Load the preprocessed review data from Parquet.
// NOTE(review): the later cells read user_index, business_index and stars from this
// frame, so it presumably already carries the numeric indices — confirm against the
// preprocessing job.
val rating_review = spark.read.parquet("yelp_preprocess/yelp_review_with_ub_index")
// rating_review.show()

In [2]:
// 80/20 train/test split. The seed is fixed so that the split, and therefore the
// evaluation metric computed from it, is the same from one run to the next.
// NOTE(review): this assumes rating_review is read with a stable partitioning/order
// between runs — confirm if exact reproducibility matters.
val Array(training, test) = rating_review.randomSplit(Array(0.8, 0.2), seed = 42L)

In [3]:
// Configure the ALS estimator: column bindings first, then hyperparameters.
// NOTE(review): a checkpoint interval is set, but no checkpoint directory is configured
// in the visible cells — confirm one is set on the SparkContext elsewhere.
val als = new ALS()
        .setUserCol("user_index")
        .setItemCol("business_index")
        .setRatingCol("stars")
        .setRank(15)
        .setMaxIter(20)
        .setRegParam(0.5)
        .setCheckpointInterval(5)

// Fit the factorization model on the training split.
val model = als.fit(training)

In [4]:
// Regression evaluator that scores the "prediction" column against the "stars" label
// using the "r2" metric.
val evaluator = new RegressionEvaluator()
        .setLabelCol("stars")
        .setPredictionCol("prediction")
        .setMetricName("r2")

In [5]:
// Set the fitted model's cold-start strategy to "drop" (the setter mutates `model`
// and returns it), then score the held-out split.
val pred = model
        .setColdStartStrategy("drop")
        .transform(test)

In [6]:
// Evaluate the predictions with the configured evaluator and report the score.
val r2: Double = evaluator.evaluate(pred)
println(s"R2 = $r2")

In [7]:
// Ask the model for the top 5 recommended items for every user.
val userRecs = model.recommendForAllUsers(numItems = 5)

In [8]:
// Print the recommendations without truncating long cell values.
userRecs.show(truncate = false)

In [9]:
// Flatten the `recommendations` array into one column per rank: array position i
// supplies column `item<i+1>_id`, taken from that entry's `business_index` field.
// Five positions are read, matching the 5 items requested from recommendForAllUsers.
// The array column itself is dropped once the ids have been extracted.
val user_rec_items = (0 until 5)
        .foldLeft(userRecs) { (acc, idx) =>
            acc.withColumn(s"item${idx + 1}_id", $"recommendations".getItem(idx).getItem("business_index"))
        }
        .drop("recommendations")

In [10]:
// Print the flattened per-user item ids without truncating cell values.
user_rec_items.show(truncate = false)

In [11]:
// Read the raw Yelp business JSON and keep only the id, name and categories columns.
val yelp_business = spark.read
        .json("yelp_origin/yelp_academic_dataset_business.json")
        .select("business_id", "name", "categories")
yelp_business.show(truncate = false)

In [12]:
// Read the raw Yelp user JSON with all of its columns.
val yelp_user = spark.read.json("yelp_origin/yelp_academic_dataset_user.json")

In [13]:
// Narrow the user data to the id and display name, then preview it.
val yelp_user_v2 = yelp_user.select("user_id", "name")
yelp_user_v2.show()

In [14]:
// Load the "yelp" Parquet dataset and keep the four columns that describe a rating.
val df = spark.read.parquet("yelp")
val ratings = df.select("user_id", "business_id", "stars", "timestamp")

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{StructField,StructType,IntegerType, LongType}
import org.apache.spark.sql.Row

/**
 * Builds a lookup table that pairs every distinct value of one column with a
 * zero-based Long index.
 *
 * The distinct values of `col` are numbered with `RDD.zipWithIndex`, and the result
 * is rebuilt as a DataFrame through the ambient `spark` session.
 *
 * NOTE(review): the numbering follows whatever order the distinct rows come out in,
 * which Spark does not guarantee to be stable across evaluations. Confirm that the
 * indices produced here agree with those stored in the preprocessed data before
 * using them to map indices back to ids.
 *
 * @param df           DataFrame containing the column to index
 * @param col          name of the column whose distinct values are numbered
 * @param new_col_name name of the appended, non-nullable Long index column
 * @return a two-column DataFrame: the distinct values of `col` plus `new_col_name`
 */
def generate_index(df: DataFrame, col: String, new_col_name: String): DataFrame = {
    val indexField = StructField(new_col_name, LongType, nullable = false)
    val distinctValues = df.select(col).distinct()
    // Append each row's position to its existing fields.
    val indexedRows = distinctValues.rdd.zipWithIndex().map { case (values, position) =>
        Row.fromSeq(values.toSeq :+ position)
    }
    spark.createDataFrame(indexedRows, StructType(distinctValues.schema.fields :+ indexField))
}

// Distinct users and businesses that appear in the ratings.
val ratings_user = ratings.select("user_id").distinct()
val ratings_item = ratings.select("business_id").distinct()

// Give each distinct id a numeric index via generate_index.
val ratings_user_with_index = generate_index(ratings_user, col = "user_id", new_col_name = "user_index")
val ratings_item_with_index = generate_index(ratings_item, col = "business_id", new_col_name = "business_index")

In [15]:
// Look up which user_id was assigned a given user_index.
// Uses the $"..." column syntax (as in the recommendation-flattening cell) instead of
// a symbol literal, which is deprecated in Scala 2.13.
//ratings_user_with_index.filter($"user_index" === 463).show(truncate = false)
//ratings_user_with_index.filter($"user_index" === 471).show(truncate = false)
ratings_user_with_index.filter($"user_index" === 833).show(truncate = false)