本例子来自《Advanced Analytics with Spark》第2版。启动 spark-shell 时需要预先调大 spark.driver.memory（例如 `spark-shell --driver-memory 4g`），默认的 1G 不够用。

In [10]:
// Bind the session to a val and import its implicits, which supply the $"col" syntax,
// toDF and the Dataset encoders used by the cells below.
// NOTE(review): presumably `spark` is not a stable identifier in this kernel, hence the alias — confirm.
val spk = spark
import spk.implicits._

In [11]:
// Load the user-artist data (user, artist and interaction count).
val UserArtistPath = "/Users/flyang/Documents/self-teaching/Data_Resources/profiledata_06-May-2005/user_artist_data.txt"
// Read the file declared just above as a Dataset[String], one element per line.
val rawUserArtistData = spark.read.textFile(UserArtistPath)

In [12]:
// Split each line on spaces and keep the first two fields as integer
// "user" and "artist" columns; any further fields are ignored.
val userArtistDF = {
  val pairs = rawUserArtistData.map { record =>
    record.split(' ') match {
      case Array(userField, artistField, _*) => (userField.toInt, artistField.toInt)
    }
  }
  pairs.toDF("user", "artist")
}

In [13]:
// Load the artist data (id and name).
val artistPath = "/Users/flyang/Documents/self-teaching/Data_Resources/profiledata_06-May-2005/artist_data.txt"
// Read the file declared just above as a Dataset[String], one element per line.
val rawArtistData = spark.read.textFile(artistPath)

In [38]:
// Parse "id<TAB>name" lines into a DataFrame with columns "id" and "name".
// Lines without a name part, or whose id is not an integer, are dropped
// instead of failing the whole job.
val artistByID = rawArtistData.flatMap { line =>
  // span cuts at the first tab; `name` still starts with that tab, which trim removes.
  val (id, name) = line.span(_ != '\t')
  if (name.isEmpty) {
    // No tab on this line, so there is no name to pair with the id.
    None
  } else {
    try {
      Some((id.toInt, name.trim))
    } catch {
      // Only a malformed id is expected here; let any other failure surface.
      case _: NumberFormatException => None
    }
  }
}.toDF("id", "name")

In [24]:
// Load the artist alias data (artist id and alias id).
val artistAliasPath = "/Users/flyang/Documents/self-teaching/Data_Resources/profiledata_06-May-2005/artist_alias.txt"
// Read the alias file line by line.
val rawArtistAlias = spark.read.textFile(artistAliasPath)

In [39]:
// Split each alias line on the tab character and collect the integer
// (artist, alias) pairs into a Map on the driver.
// Lines whose first field is empty are skipped.
val artistAlias = {
  val idPairs = rawArtistAlias.flatMap { record =>
    record.split('\t') match {
      case Array(artist, alias) =>
        if (artist.nonEmpty) Some((artist.toInt, alias.toInt)) else None
    }
  }
  idPairs.collect().toMap
}

In [40]:
// Print the names stored for two specific artist ids.
artistByID.where($"id".isin(1208690, 1003926)).show()

+-------+----------------+
|     id|            name|
+-------+----------------+
|1208690|Collective Souls|
|1003926| Collective Soul|
+-------+----------------+



In [41]:
import org.apache.spark.sql._
import org.apache.spark.broadcast._

/**
 * Parses raw "user artist count" lines into a DataFrame, replacing each artist id
 * with the id the broadcast alias map assigns to it, when such an entry exists.
 *
 * @param rawUserArtistData lines of three space-separated integer fields
 * @param bArtistAlias broadcast map from an artist id to the id that replaces it
 * @return a DataFrame with columns "user", "artist" and "count"
 */
def buildCounts(
    rawUserArtistData: Dataset[String],
    bArtistAlias: Broadcast[Map[Int,Int]]): DataFrame = {
  val triples = rawUserArtistData.map { record =>
    record.split(' ').map(_.toInt) match {
      case Array(userID, artistID, count) =>
        // Fall back to the original id when the map has no entry for it.
        (userID, bArtistAlias.value.getOrElse(artistID, artistID), count)
    }
  }
  triples.toDF("user", "artist", "count")
}
    
// Broadcast the alias map, build the training DataFrame from the raw lines,
// and mark it for caching.
val bArtistAlias = spark.sparkContext.broadcast(artistAlias)
val trainData = buildCounts(rawUserArtistData, bArtistAlias).cache()

trainData.type = [user: int, artist: int ... 1 more field]

In [None]:
// Try out a model: fit ALS with implicit preferences on the training data.
import org.apache.spark.ml.recommendation._
import scala.util.Random
val alsModel = {
  // Configure the estimator first, then fit it; a fresh random seed is drawn on every run.
  val estimator = new ALS()
    .setSeed(Random.nextLong())
    .setImplicitPrefs(true)
    .setRank(10)
    .setRegParam(0.01)
    .setAlpha(1.0)
    .setMaxIter(5)
    .setUserCol("user")
    .setItemCol("artist")
    .setRatingCol("count")
    .setPredictionCol("prediction")
  estimator.fit(trainData)
}

In [None]:
// Inspect the result: the top 10 recommendations per user, one row per recommendation.
// The id column of recommendForAllUsers is named after the model's user column,
// which was set to "user" when the model was fitted.
alsModel.recommendForAllUsers(10)
  .selectExpr("user", "explode(recommendations)").show()