### 协同过滤（Collaborative Filtering）

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession that every cell below relies on.
# The application name is the label shown in the Spark UI and logs, so it is
# named after the ALS example this notebook runs.
spark = (
    SparkSession.builder
    .appName("ALSExample")
    .getOrCreate()
)

#### 1. ALS（alternating least squares）

In [2]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [3]:
# Read the MovieLens sample ratings file as plain text and drop down to the
# underlying RDD; each element is a Row whose `value` field holds one raw line.
lines = spark.read.text("../../data/sample_movielens_ratings.txt").rdd

In [4]:
# Every raw line is "::"-delimited; break it into its list of string fields.
parts = lines.map(lambda record: record.value.split("::"))

In [5]:
# Turn the split string fields into typed Rows: fields 0, 1 and 3 are parsed
# as integers (user id, movie id, timestamp) and field 2 as a float (rating).
ratingsRDD = parts.map(
    lambda fields: Row(
        userId=int(fields[0]),
        movieId=int(fields[1]),
        rating=float(fields[2]),
        timestamp=int(fields[3]),
    )
)

In [6]:
# Build a DataFrame from the RDD of Rows (columns come from the Row fields:
# userId, movieId, rating, timestamp).
ratings = spark.createDataFrame(ratingsRDD)

In [7]:
# Hold out roughly 20% of the ratings for evaluation. The weights are
# approximate proportions, not exact counts. The fixed seed keeps the split,
# and therefore the RMSE computed further down, repeatable across runs for
# the same input data and partitioning.
training, test = ratings.randomSplit([0.8, 0.2], seed=42)

In [8]:
# Configure the ALS estimator against the ratings columns.
# coldStartStrategy="drop" discards prediction rows that would be NaN (users
# or movies absent from the training split) so the evaluation metric computed
# later is not NaN.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=5,
    regParam=0.01,
    coldStartStrategy="drop",
)

In [9]:
# Fit the ALS model on the training split.
model = als.fit(training)

In [10]:
# Score the held-out split; the evaluator below reads the result's
# "prediction" column alongside the true "rating".
predictions = model.transform(test)

In [11]:
# Evaluator comparing the true "rating" column with the model's "prediction"
# column using root-mean-square error.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

In [12]:
# Compute the RMSE of the predictions on the held-out split and report it.
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 1.851825329699475
