In [55]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, IntegerType, StringType, StructField, StructType
import time

# Initialise the Spark session.
# NOTE(review): with master("local[*]") the executors presumably run inside the
# driver JVM, so "spark.executor.memory" may have no effect here and
# "spark.driver.memory" may be what was intended -- confirm before relying on 3g.
spark = (
    SparkSession.builder
    .appName("MovieLens")
    .master("local[*]")
    .config("spark.executor.memory", "3g")
    .getOrCreate()
)


start_time = time.time()

# Explicit schemas: inferSchema=True makes Spark read each CSV an extra time
# just to guess the column types, which is expensive for the 25M-row ratings
# file. The types below mirror the columns shown in the notebook outputs.
ratings_schema = StructType([
    StructField("userId", IntegerType(), True),
    StructField("movieId", IntegerType(), True),
    StructField("rating", DoubleType(), True),
    StructField("timestamp", IntegerType(), True),
])
movies_schema = StructType([
    StructField("movieId", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("genres", StringType(), True),
])
genome_scores_schema = StructType([
    StructField("movieId", IntegerType(), True),
    StructField("tagId", IntegerType(), True),
    StructField("relevance", DoubleType(), True),
])

# Read the MovieLens 25M tables.
ratings_df = spark.read.csv('local-workspace/ml-25m/ratings.csv', header=True, schema=ratings_schema)
movies_df = spark.read.csv('local-workspace/ml-25m/movies.csv', header=True, schema=movies_schema)
genome_scores_df = spark.read.csv('local-workspace/ml-25m/genome-scores.csv', header=True, schema=genome_scores_schema)

                                                                                

In [56]:
from pyspark.sql.functions import avg, count
from pyspark.sql.functions import split, col
# Compute the average rating and the number of ratings per movie in a single
# aggregation; this replaces two separate groupBy passes over ratings_df that
# were then joined back together on movieId.
movie_ratings = ratings_df.groupBy("movieId").agg(
    avg("rating").alias("average_rating"),
    count("userId").alias("rating_count"),
)
# Keep the two single-metric frames available under their original names.
average_ratings = movie_ratings.select("movieId", "average_rating")
rating_counts = movie_ratings.select("movieId", "rating_count")

# Attach the movie titles to the aggregated ratings.
movies_with_titles = movie_ratings.join(movies_df, "movieId").select("movieId", "title", "average_rating", "rating_count")
# Sort by average rating, highest first.
sorted_movie_ratings = movies_with_titles.orderBy("average_rating", ascending=False)
# Show the top five rows.
sorted_movie_ratings.show(5)



+-------+--------------------+--------------+------------+
|movieId|               title|average_rating|rating_count|
+-------+--------------------+--------------+------------+
| 204012|Kick That Habit (...|           5.0|           1|
| 122193|   Kit Carson (1940)|           5.0|           1|
| 202181| Warlock Moon (1973)|           5.0|           1|
| 159471|Evening's Civil T...|           5.0|           1|
| 131628|       Loaded (2014)|           5.0|           1|
+-------+--------------------+--------------+------------+


                                                                                

In [57]:
# Keep only the movies that received more than 10 ratings.
movies_with_more_than_10_ratings = movies_with_titles.where(
    movies_with_titles["rating_count"] > 10
)
# Rank the remaining movies by average rating, best first.
sorted_movie_ratings = movies_with_more_than_10_ratings.sort(
    "average_rating", ascending=False
)
# Print the top five.
sorted_movie_ratings.show(n=5)



+-------+--------------------+-----------------+------------+
|movieId|               title|   average_rating|rating_count|
+-------+--------------------+-----------------+------------+
| 171011|Planet Earth II (...|4.483096085409253|        1124|
| 159817| Planet Earth (2006)|4.464796794504865|        1747|
|    318|Shawshank Redempt...|4.413576004516335|       81482|
| 170705|Band of Brothers ...|4.398598820058997|        1356|
| 158958|    Pollyanna (2003)|4.384615384615385|          13|
+-------+--------------------+-----------------+------------+


                                                                                

In [58]:
# Keep only the movies that received more than 100 ratings.
movies_with_more_than_100_ratings = movies_with_titles.where(
    movies_with_titles["rating_count"] > 100
)
# Rank the remaining movies by average rating, best first.
sorted_movie_ratings = movies_with_more_than_100_ratings.sort(
    "average_rating", ascending=False
)
# Print the top five.
sorted_movie_ratings.show(n=5)



+-------+--------------------+------------------+------------+
|movieId|               title|    average_rating|rating_count|
+-------+--------------------+------------------+------------+
| 171011|Planet Earth II (...| 4.483096085409253|        1124|
| 159817| Planet Earth (2006)| 4.464796794504865|        1747|
|    318|Shawshank Redempt...| 4.413576004516335|       81482|
| 170705|Band of Brothers ...| 4.398598820058997|        1356|
| 171495|              Cosmos|4.3267148014440435|         277|
+-------+--------------------+------------------+------------+


                                                                                

In [59]:
# Keep only the movies that received more than 1000 ratings.
movies_with_more_than_1000_ratings = movies_with_titles.where(
    movies_with_titles["rating_count"] > 1000
)
# Rank the remaining movies by average rating, best first.
sorted_movie_ratings = movies_with_more_than_1000_ratings.sort(
    "average_rating", ascending=False
)
# Print the top five.
sorted_movie_ratings.show(n=5)



+-------+--------------------+-----------------+------------+
|movieId|               title|   average_rating|rating_count|
+-------+--------------------+-----------------+------------+
| 171011|Planet Earth II (...|4.483096085409253|        1124|
| 159817| Planet Earth (2006)|4.464796794504865|        1747|
|    318|Shawshank Redempt...|4.413576004516335|       81482|
| 170705|Band of Brothers ...|4.398598820058997|        1356|
|    858|Godfather, The (1...|4.324336165187245|       52498|
+-------+--------------------+-----------------+------------+


                                                                                

# 1. Splitting the Dataset:
First, import the necessary modules and read your rating data into a Spark DataFrame. Then, split this data into training and testing sets.

In [60]:
!pip install numpy







[0m



In [61]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql.functions import col
from pyspark.ml.regression import LinearRegression

In [62]:
# 70/30 train/test split of the raw ratings, with a fixed seed.
train_data, test_data = ratings_df.randomSplit(weights=[0.7, 0.3], seed=5021)

In [63]:
# Preview the training split (20 rows is show()'s default).
train_data.show(n=20, truncate=True)

[Stage 662:>                (0 + 1) / 1][Stage 671:==>           (37 + 2) / 200]

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    665|   5.0|1147878820|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   2011|   2.5|1147868079|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2573|   4.0|1147878923|
|     1|   2692|   5.0|1147869100|
|     1|   3448|   4.0|1147868480|
|     1|   3569|   5.0|1147879603|
|     1|   3949|   5.0|1147868678|
|     1|   4144|   5.0|1147868898|
|     1|   4325|   5.0|1147878122|
|     1|   4422|   3.0|1147869048|
|     1|   4703|   4.0|1147869223|
|     1|   5269|   0.5|1147879571|
+------+-------+------+----------+


                                                                                

In [64]:
# Summary statistics (count / mean / stddev / min / max) of the test split.
test_data.describe().show(n=20, truncate=True)



+-------+-----------------+------------------+------------------+--------------------+
|summary|           userId|           movieId|            rating|           timestamp|
+-------+-----------------+------------------+------------------+--------------------+
|  count|          7502892|           7502892|           7502892|             7502892|
|   mean|81188.96771058413| 21366.92766642516|3.5339601716244884|1.2155145892096126E9|
| stddev|46789.99952670272|39174.464718786876|1.0608207109360328|2.2683766468352246E8|
|    min|                1|                 1|               0.5|           789652009|
|    max|           162541|            209169|               5.0|          1574327549|
+-------+-----------------+------------------+------------------+--------------------+


                                                                                

# 2. Matrix Factorization with ALS:
ALS (Alternating Least Squares) is a popular matrix factorization algorithm in Spark's MLlib for collaborative filtering.

In [65]:
ALS_start_time = time.time()
# Collaborative filtering with ALS on (userId, movieId, rating).
# coldStartStrategy="drop" removes test rows whose prediction is NaN (users or
# movies unseen during fitting) so that the evaluation metrics stay defined.
# An explicit seed fixes the random factor initialisation so the reported
# metrics can be reproduced across kernel restarts.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=5021,
)
als_model = als.fit(train_data)
# Predictions on the held-out split
predictions = als_model.transform(test_data)

                                                                                

In [66]:
# Score the ALS predictions: RMSE first, then R2 with the same evaluator
# switched to the second metric.
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

evaluator = evaluator.setMetricName("r2")
test_eval = evaluator.evaluate(predictions)
print("R2: {}".format(test_eval))

# Wall-clock time since the ALS cell started (fit + transform + evaluation).
ALS_end_time = time.time()
print("Time used for ALS: {}".format(ALS_end_time - ALS_start_time))

                                                                                

Root-mean-square error = 0.8221968954594174




R2: 0.39913453396002274
Time used for ALS: 147.75905752182007


                                                                                

# 3. Extract Features and Build Another ML Model:
For this part, you need to join your user and movie data with the rating data, then transform these features into a format suitable for machine learning models in Spark.

In [67]:
# Baseline: linear regression on the raw identifiers only. No user or movie
# tables are joined in this cell; the only inputs are the columns of the
# ratings split (userId, movieId, rating).
lr_start_time = time.time()
# handleInvalid="keep" puts user ids that never appear in train_data into one
# extra index bucket instead of making transform() fail on the test split
# (the default, "error", raises on an unseen label).
string_indexer = StringIndexer(inputCol="userId", outputCol="userIdIndex", handleInvalid="keep")
vector_assembler = VectorAssembler(inputCols=["userIdIndex", "movieId"], outputCol="features")

lr = LinearRegression(featuresCol="features", labelCol="rating")

# Pipeline: index users -> assemble feature vector -> fit the regression
pipeline = Pipeline(stages=[string_indexer, vector_assembler, lr])
lr_model = pipeline.fit(train_data)

# Predictions on the held-out split
lr_predictions = lr_model.transform(test_data)

23/12/04 18:04:00 WARN Instrumentation: [5681e7cd] regParam is zero, which might cause numerical instability and overfitting.
23/12/04 18:04:00 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/12/04 18:04:30 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/12/04 18:04:31 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/12/04 18:04:51 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
                                                                                

In [68]:
# Evaluate the baseline linear regression with RMSE and R2, one pass per metric.
for metric_name, template in (("rmse", "RMSE: {}"), ("r2", "R2: {}")):
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName=metric_name)
    test_eval = evaluator.evaluate(lr_predictions)
    print(template.format(test_eval))

# Elapsed wall-clock time since the fitting cell started.
lr_end_time = time.time()
print("Time used for Linear Regression: {}".format(lr_end_time - lr_start_time))

23/12/04 18:05:13 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/12/04 18:05:36 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
                                                                                

RMSE: 1.051967075754656


23/12/04 18:05:37 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
23/12/04 18:06:05 WARN DAGScheduler: Broadcasting large task binary with size 4.2 MiB
[Stage 897:====>                                                 (18 + 1) / 200]

R2: 0.016622260944419875
Time used for Linear Regression: 156.06520175933838


                                                                                

### Movie features
Compute the average rating of each movie.
Build a one-hot (multi-hot) encoding of each movie's genres.

In [69]:

from pyspark.sql.functions import avg, col
data_pre_process_start_time = time.time()
# Per-movie average rating and rating count (already aggregated in
# movie_ratings). Start from movies_df with a LEFT join so that movies without
# any rating are kept; with the previous inner-joined source the null fill
# below could never trigger.
average_rating_df = (
    movies_df.select("movieId")
    .join(movie_ratings, on="movieId", how="left")
    .withColumnRenamed("average_rating", "movie_average_rating")
    .withColumnRenamed("rating_count", "movie_rating_count")
)

# Movies without ratings get the mean of the per-movie averages (avg ignores
# nulls) and a rating count of 0.
overall_average_rating = average_rating_df.select(avg("movie_average_rating")).first()[0]
average_rating_df = average_rating_df.na.fill({
    "movie_average_rating": overall_average_rating,
    "movie_rating_count": 0,
})

# Show the first five rows
average_rating_df.show(5)



+-------+--------------------+------------------+
|movieId|movie_average_rating|movie_rating_count|
+-------+--------------------+------------------+
|    148|   2.908955223880597|               335|
|    463|   2.813008130081301|               369|
|    471|  3.6579813752234034|             10631|
|    496|  3.2767624020887727|               383|
|    833|  2.7182422451994093|              1354|
+-------+--------------------+------------------+


                                                                                

In [70]:
from pyspark.sql.functions import split
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Split the pipe-separated genres string into an array column.
# The pattern is a regex, so the pipe must be escaped; a raw string is used
# because "\|" in a normal Python string is an invalid escape sequence.
movies_df_vectorized = movies_df.withColumn("split_genres", split(col("genres"), r"\|"))

In [71]:
from pyspark.sql.functions import col, split, explode, udf, array_contains
from pyspark.sql.types import ArrayType, IntegerType
# Collect every distinct genre that occurs in the data
all_genres = movies_df_vectorized.select(explode(col("split_genres")).alias("genre")).distinct().collect()
all_genres = [row['genre'] for row in all_genres]


def genre_indicator(genre):
    """Return a function mapping a genres-array column to a 0/1 integer column.

    The returned callable is applied as ``genre_indicator(g)(col("split_genres"))``.
    It uses the built-in ``array_contains`` expression rather than a Python
    UDF, so the membership test is evaluated by Spark itself.
    """
    def indicator(genres_column):
        return array_contains(genres_column, genre).cast(IntegerType())
    return indicator


# Build all indicator columns in one projection, keeping only movieId plus one
# 0/1 column per genre (title, genres and split_genres are not carried over).
movies_df_vectorized = movies_df_vectorized.select(
    "movieId",
    *[genre_indicator(genre)(col("split_genres")).alias(genre) for genre in all_genres],
)

                                                                                

In [72]:
# Preview the per-genre 0/1 indicator columns.
movies_df_vectorized.show(n=5, truncate=True)

+-------+-----+-------+--------+---------+-----+---+-----------+-------+-------+-------+---------+---------+------------------+----+------+-------+------+--------+------+------+
|movieId|Crime|Romance|Thriller|Adventure|Drama|War|Documentary|Fantasy|Mystery|Musical|Animation|Film-Noir|(no genres listed)|IMAX|Horror|Western|Comedy|Children|Action|Sci-Fi|
+-------+-----+-------+--------+---------+-----+---+-----------+-------+-------+-------+---------+---------+------------------+----+------+-------+------+--------+------+------+
|      1|    0|      0|       0|        1|    0|  0|          0|      1|      0|      0|        1|        0|                 0|   0|     0|      0|     1|       1|     0|     0|
|      2|    0|      0|       0|        1|    0|  0|          0|      1|      0|      0|        0|        0|                 0|   0|     0|      0|     0|       1|     0|     0|
|      3|    0|      1|       0|        0|    0|  0|          0|      0|      0|      0|        0|        0|  

                                                                                

In [73]:
# Preview the raw movies table for comparison.
movies_df.show(n=5, truncate=True)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+


### An alternative way to vectorize movie genres


In [74]:
# Number of positional genre columns to create (genre1 .. genre10).
max_array_length = 10
# movies_df['genres'] holds '|'-separated strings; split it into an array.
# Raw string: '\|' in a normal Python string is an invalid escape sequence.
split_genres = split(movies_df['genres'], r'\|')

new_columns = [movies_df['movieId']]  # keep the movieId key
# One column per array position; positions past the end of the array are null.
new_columns += [split_genres.getItem(i).alias(f'genre{i+1}') for i in range(max_array_length)]
# New DataFrame containing only movieId and the positional genre columns
genre_df = movies_df.select(*new_columns)
# Show the result
genre_df.show(5)

+-------+---------+---------+--------+------+-------+------+------+------+------+-------+
|movieId|   genre1|   genre2|  genre3|genre4| genre5|genre6|genre7|genre8|genre9|genre10|
+-------+---------+---------+--------+------+-------+------+------+------+------+-------+
|      1|Adventure|Animation|Children|Comedy|Fantasy|  null|  null|  null|  null|   null|
|      2|Adventure| Children| Fantasy|  null|   null|  null|  null|  null|  null|   null|
|      3|   Comedy|  Romance|    null|  null|   null|  null|  null|  null|  null|   null|
|      4|   Comedy|    Drama| Romance|  null|   null|  null|  null|  null|  null|   null|
|      5|   Comedy|     null|    null|  null|   null|  null|  null|  null|  null|   null|
+-------+---------+---------+--------+------+-------+------+------+------+------+-------+


In [75]:
# One StringIndexer per positional genre column; handleInvalid="keep" is set
# on every indexer.
indexers = []
for position in range(1, max_array_length + 1):
    indexers.append(
        StringIndexer(
            inputCol=f'genre{position}',
            outputCol=f'genre{position}Index',
            handleInvalid="keep",
        )
    )
# Chain the indexers in a Pipeline and apply it to genre_df
pipeline = Pipeline(stages=indexers)
indexed_df = pipeline.fit(genre_df).transform(genre_df)
# Keep the non-genre columns plus the generated *Index columns
selected_columns = [
    name for name in indexed_df.columns
    if name.endswith('Index') or not name.startswith('genre')
]
genre_indexed_df = indexed_df.select(selected_columns)
# Show the result
genre_indexed_df.show()

                                                                                

+-------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+
|movieId|genre1Index|genre2Index|genre3Index|genre4Index|genre5Index|genre6Index|genre7Index|genre8Index|genre9Index|genre10Index|
+-------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+------------+
|      1|        7.0|       12.0|        9.0|        5.0|        3.0|       11.0|        8.0|        3.0|        1.0|         1.0|
|      2|        7.0|       10.0|        5.0|       16.0|       13.0|       11.0|        8.0|        3.0|        1.0|         1.0|
|      3|        1.0|        1.0|       17.0|       16.0|       13.0|       11.0|        8.0|        3.0|        1.0|         1.0|
|      4|        1.0|        0.0|        1.0|       16.0|       13.0|       11.0|        8.0|        3.0|        1.0|         1.0|
|      5|        1.0|       18.0|       17.0|       16.0|       13.0|       11.0|  

## User features
Calculate the average rating given by each user.
Calculate each user's average rating for every movie genre.
Calculate the highest and lowest rating given by each user.

In [76]:
# Mean rating given by each user, computed over the full ratings_df.
user_average_ratings = (
    ratings_df.groupBy("userId")
    .agg({"rating": "avg"})
    .withColumnRenamed("avg(rating)", "user_average_rating")
)
# Show the first five users
user_average_ratings.show(n=5)



+------+-------------------+
|userId|user_average_rating|
+------+-------------------+
| 65408|  3.745614035087719|
| 65478|  4.638888888888889|
| 65867| 3.9887640449438204|
| 66010|  2.519230769230769|
| 66166|  3.742424242424242|
+------+-------------------+


                                                                                

In [77]:
# Attach each user's mean rating to every rating row.
big_df = ratings_df.join(user_average_ratings, on="userId", how="inner")

In [78]:
# Rebuild big_df from its inputs instead of joining onto the previous big_df,
# so re-running this cell cannot add the genre indicator columns twice.
big_df = (
    ratings_df.join(user_average_ratings, on="userId")
    .join(movies_df_vectorized, on="movieId")
)

In [79]:
from pyspark.sql.functions import col, when, avg

# Genre indicator columns: every column of movies_df_vectorized except the key.
# Deriving the list keeps it in sync with the genres actually found in the data
# instead of relying on a hand-maintained copy.
genres = [name for name in movies_df_vectorized.columns if name != "movieId"]

# Per-user mean rating restricted to the movies of each genre; when() without
# otherwise() yields null for other movies, and avg() ignores nulls.
exprs = [avg(when(col(genre) == 1, col("rating"))).alias(genre) for genre in genres]

user_genre_ratings = big_df.groupBy("userId").agg(*exprs)
user_genre_ratings = user_genre_ratings.join(user_average_ratings, "userId")


# A user who never rated a genre gets their overall mean rating for it.
# All genres are filled in one projection; the column order is unchanged.
user_genre_ratings = user_genre_ratings.select(
    "userId",
    *[
        when(col(genre).isNull(), col("user_average_rating")).otherwise(col(genre)).alias(genre)
        for genre in genres
    ],
    "user_average_rating",
)

In [80]:
# Prefix the per-genre columns with "user_" so these features are recognisable
# as user-side features; all other columns keep their names.
user_genre_ratings = user_genre_ratings.toDF(
    *[("user_" + name) if name in genres else name for name in user_genre_ratings.columns]
)

In [81]:
# Preview the per-user, per-genre average ratings.
user_genre_ratings.show(n=5, truncate=True)

[Stage 999:===>                                                  (13 + 2) / 200]

+------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+-----------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+
|userId|        user_Crime|     user_Romance|     user_Thriller|   user_Adventure|        user_Drama|          user_War|  user_Documentary|     user_Fantasy|      user_Mystery|      user_Musical|    user_Animation|    user_Film-Noir|user_(no genres listed)|         user_IMAX|       user_Horror|      user_Western|       user_Comedy|     user_Children|       user_Action|       user_Sci-Fi|user_average_rating|
+------+------------------+-----------------+------------------+-----------------+------------------+------------------+------------------+-----------------+------------------+--

                                                                                

In [82]:
from pyspark.sql.functions import col, when, avg
# Import the Spark aggregates under aliases: importing them as plain `max` /
# `min` replaces the Python builtins for the rest of the kernel session.
from pyspark.sql.functions import max as spark_max, min as spark_min
# Highest and lowest rating given by each user
user_max_min_ratings = big_df.groupBy("userId").agg(
    spark_max("rating").alias("user_max"),
    spark_min("rating").alias("user_min")
)

In [83]:
# Preview each user's highest and lowest rating.
user_max_min_ratings.show(n=5, truncate=True)



+------+--------+--------+
|userId|user_max|user_min|
+------+--------+--------+
|   148|     5.0|     3.0|
|   463|     5.0|     1.0|
|   471|     5.0|     3.0|
|   496|     5.0|     0.5|
|   833|     5.0|     0.5|
+------+--------+--------+


                                                                                

### Join feature matrix and genome_scores by movieId 

In [84]:
# Attach the per-movie average rating and rating count to every rating row.
big_df = big_df.join(average_rating_df, on="movieId", how="inner")

In [85]:
# user_genre_ratings also carries user_average_rating, so drop big_df's copy
# before joining, then add the remaining user features and the movie
# title/genres columns.
big_df = (
    big_df.drop('user_average_rating')
    .join(user_genre_ratings, on="userId")
    .join(user_max_min_ratings, on="userId")
    .join(movies_df, on="movieId")
)

In [86]:
# Preview the assembled feature table.
big_df.show(n=5, truncate=True)

[Stage 1026:>               (0 + 1) / 1][Stage 1035:=====>       (80 + 2) / 200]

+-------+------+------+----------+-----+-------+--------+---------+-----+---+-----------+-------+-------+-------+---------+---------+------------------+----+------+-------+------+--------+------+------+--------------------+------------------+-----------------+-----------------+-----------------+-----------------+----------+--------+----------------+-----------------+-----------------+------------------+------------------+-----------------+-----------------------+------------------+-----------+-----------------+-----------+-----------------+-----------+-----------+-------------------+--------+--------+--------------------+--------------------+
|movieId|userId|rating| timestamp|Crime|Romance|Thriller|Adventure|Drama|War|Documentary|Fantasy|Mystery|Musical|Animation|Film-Noir|(no genres listed)|IMAX|Horror|Western|Comedy|Children|Action|Sci-Fi|movie_average_rating|movie_rating_count|       user_Crime|     user_Romance|    user_Thriller|   user_Adventure|user_Drama|user_War|user_Documentar

                                                                                

In [87]:
# Wall-clock time spent since the pre-processing start timestamp was taken.
data_pre_process_end_time = time.time()
print(
    "Time used for data pre-process: {}".format(
        data_pre_process_end_time - data_pre_process_start_time
    )
)

Time used for data pre-process: 237.48256659507751


In [88]:
# # Data Alignment Using another way of vectorizing
# big_df = big_df.join(genre_df, on="movieId")
# big_df = big_df.join(user_average_ratings, on="userId")
# big_df = big_df.join(average_rating_df, on="movieId")

# Using ML models to predict

### Use Linear Regression to predict

In [89]:
# 70/30 train/test split of the assembled feature table, with a fixed seed.
# NOTE(review): the aggregate features in big_df (movie_average_rating, the
# user_* averages, user_max / user_min) were computed from all ratings before
# this split, so test-row ratings contribute to the features -- confirm whether
# this leakage is acceptable for the comparison.
train_data, test_data = big_df.randomSplit(weights=[0.7, 0.3], seed=5021)

In [90]:
lr_final_start_time = time.time()
# Feature columns = every column of big_df except the label, the identifiers
# and the raw text columns.
to_remove = ["rating", "userId", "movieId", "timestamp", "title", "genres"]
# to_remove = ["rating", "timestamp", "title", "genres"]
# Built with a comprehension: a `for col in ...` loop at cell level would
# rebind the global `col` (the PySpark column function) to a plain string.
input_cols = [name for name in big_df.columns if name not in to_remove]
input_cols

['Crime',
 'Romance',
 'Thriller',
 'Adventure',
 'Drama',
 'War',
 'Documentary',
 'Fantasy',
 'Mystery',
 'Musical',
 'Animation',
 'Film-Noir',
 '(no genres listed)',
 'IMAX',
 'Horror',
 'Western',
 'Comedy',
 'Children',
 'Action',
 'Sci-Fi',
 'movie_average_rating',
 'movie_rating_count',
 'user_Crime',
 'user_Romance',
 'user_Thriller',
 'user_Adventure',
 'user_Drama',
 'user_War',
 'user_Documentary',
 'user_Fantasy',
 'user_Mystery',
 'user_Musical',
 'user_Animation',
 'user_Film-Noir',
 'user_(no genres listed)',
 'user_IMAX',
 'user_Horror',
 'user_Western',
 'user_Comedy',
 'user_Children',
 'user_Action',
 'user_Sci-Fi',
 'user_average_rating',
 'user_max',
 'user_min']

In [91]:
# Assemble the engineered feature columns into one vector column.
vector_assembler = VectorAssembler(outputCol="features", inputCols=input_cols)

# Linear regression on the assembled features, with regParam=0.1.
lr = LinearRegression(labelCol="rating", featuresCol="features", regParam=0.1)

# Two-stage pipeline: assemble, then fit
pipeline = Pipeline(stages=[vector_assembler, lr])
lr_model = pipeline.fit(train_data)

# Predictions on the held-out split
lr_predictions = lr_model.transform(test_data)

                                                                                

In [92]:
# Evaluate the feature-based linear regression with RMSE and R2.
for metric_name, template in (("rmse", "RMSE: {}"), ("r2", "R2: {}")):
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName=metric_name)
    test_eval = evaluator.evaluate(lr_predictions)
    print(template.format(test_eval))

# Elapsed wall-clock time since the feature-selection cell started.
lr_final_end_time = time.time()
print("Time used for Linear Regression: {}".format(lr_final_end_time - lr_final_start_time))

                                                                                

RMSE: 0.865145953599929




R2: 0.33508251480258344
Time used for Linear Regression: 625.677747964859


                                                                                

### Use random forest to predict

In [93]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor


rf_start_time = time.time()
assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
# Random forests sample rows and features at random; an explicit seed makes
# the fitted model and the reported metrics reproducible across runs.
rf = RandomForestRegressor(featuresCol="features", labelCol="rating", seed=5021)

# Pipeline: assemble the feature vector, then fit the forest
pipeline = Pipeline(stages=[assembler, rf])

# Train the model
model = pipeline.fit(train_data)
# Predict on the held-out split
predictions = model.transform(test_data)

                                                                                

In [94]:
# Evaluate the random-forest predictions: RMSE, then R2 with the same
# evaluator switched to the second metric.
evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")
test_eval = evaluator.evaluate(predictions)
print("RMSE: {}".format(test_eval))

evaluator = evaluator.setMetricName("r2")
test_eval = evaluator.evaluate(predictions)
print("R2: {}".format(test_eval))

# Elapsed wall-clock time since the random-forest cell started.
rf_end_time = time.time()
print("Time used for Random Forest: {}".format(rf_end_time - rf_start_time))

                                                                                

RMSE: 0.8783868647840285




R2: 0.31457386870496185
Time used for Random Forest: 593.2066202163696


                                                                                

### Use GBT to predict

In [95]:
from pyspark.ml.regression import GBTRegressor

GBT_start_time = time.time()
# Gradient-boosted trees with 10 boosting iterations, reusing the `assembler`
# stage created in the random-forest cell.
gbt = GBTRegressor(maxIter=10, labelCol="rating", featuresCol="features")
pipeline = Pipeline(stages=[assembler, gbt])
# Fit on the training split, then score the held-out split
model = pipeline.fit(train_data)
predictions = model.transform(test_data)

                                                                                

In [96]:
# Evaluate the GBT predictions with RMSE and R2.
for metric_name, template in (("rmse", "RMSE: {}"), ("r2", "R2: {}")):
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName=metric_name)
    test_eval = evaluator.evaluate(predictions)
    print(template.format(test_eval))

# Elapsed wall-clock time since the GBT cell started.
GBT_end_time = time.time()
print("Time used for GBT: {}".format(GBT_end_time - GBT_start_time))

                                                                                

RMSE: 0.8606252044691601




R2: 0.34201330266161756
Time used for GBT: 914.3847703933716


                                                                                

### Use ALS to predict again (with genre and title)

In [97]:
# als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
# als_model = als.fit(train_data)
# # Predictions
# predictions = als_model.transform(test_data)

In [98]:
# # evaluations
# evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="rmse")
# test_eval = evaluator.evaluate(predictions)
# print("RMSE: {}".format(test_eval))
# evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="r2")
# test_eval = evaluator.evaluate(predictions)
# print("R2: {}".format(test_eval))