In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


# 创建SparkSession
spark = SparkSession.builder.appName("MovieRecommend")\
    .config("spark.executor.memory", "8G")\
    .config("spark.executor.cores", "4")\
    .config("spark.executor.instances", "4")\
    .config("spark.driver.memory", "8G")\
    .getOrCreate()


# 加载数据
ratings_df = spark.read.csv("dataset/ratings.csv", inferSchema=True, header=True)

# Read additional data
tags_df = spark.read.csv("dataset/tags.csv", header=True, inferSchema=True)
movies_df = spark.read.csv("dataset/movies.csv", header=True, inferSchema=True)
genome_scores_df = spark.read.csv("dataset/genome-scores.csv", header=True, inferSchema=True)
genome_tags_df = spark.read.csv("dataset/genome-tags.csv", header=True, inferSchema=True)

23/11/27 20:41:33 WARN Utils: Your hostname, localhost.localdomain resolves to a loopback address: 127.0.0.1; using 192.168.102.123 instead (on interface ens192)
23/11/27 20:41:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/27 20:41:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [2]:
# Join DataFrames
# complete_data = movies_df.join(tags_df, "movieId") \
#                         .join(genome_scores_df, "movieId") \
#                         .join(genome_tags_df, "tagId") \
#                         .join(ratings_df, ["userId", "movieId"])

# genome_data=genome_scores_df.join(genome_tags_df,"tagId")

# complete_data=ratings_df.join(movies_df,"movieId")\
# .join(tags_df,"movieId")\
# .join(genome_data,["movieId","tag"])

# complete_data.printSchema()
# # complete_data.show(5)
# complete_data.count()

In [3]:
from pyspark.sql import functions as F
from pyspark.sql import DataFrame

def add_sample_label(ratings_df):
    # Show the first 5 rows and print the schema before modification
    ratings_df.show(5, truncate=False)
    ratings_df.printSchema()

    # Add a new column 'label' based on the condition
    ratings_df = ratings_df.withColumn('label', F.when(F.col('rating') >= 3.5, 2).otherwise(3))

    return ratings_df


In [4]:
import re
# Extract movie year
def parse_year(title):
    pattern = r"\((\d{4})\)"  # 正则表达式模式，匹配括号内的四位数字
    match = re.search(pattern, title)
    if match:
        year_str = match.group(1)
        return int(year_str)
    else:
        return 1900

In [5]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import os
import pickle
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline

# # spark = SparkSession.builder.appName("PrepareData").getOrCreate()
# # 创建SparkSession
# spark = SparkSession.builder.appName("MovieRecommend")\
#     .config("spark.executor.memory", "8G")\
#     .config("spark.executor.cores", "4")\
#     .config("spark.executor.instances", "4")\
#     .config("spark.driver.memory", "8G")\
#     .getOrCreate()


# Read movie data
data_path = "dataset/"
df_movie = spark.read.csv(
    "dataset/movies.csv", header=True,inferSchema=True)

# Extract movie year
parse_year_udf = udf(parse_year, IntegerType())
df_movie = df_movie.withColumn("movie_year", parse_year_udf(df_movie['title']))
df_movie=df_movie.drop('title')

# # Join genome data to df
# df = df.join(genome_scores_df, "movieId")
# df = df.join(genome_tags_df, "tagId")

# One-hot encoding genres
genres = df_movie.select("genres").distinct().rdd.flatMap(lambda x: x).collect()
genres_unique = list(set([i for sublist in genres for i in sublist.split("|")]))

for genres_name in genres_unique:
    col = "genres_" + genres_name
    df_movie = df_movie.withColumn(col, 
                                   (df_movie.genres.contains(genres_name)).cast(IntegerType()))

df_movie = df_movie.drop('genres')
genres_col_names = ["genres_" + x for x in genres_unique]

# Read rating data

df_rating = spark.read.csv(
    "dataset/ratings.csv",
    header=True,
    inferSchema=True
)

df_rating = add_sample_label(df_rating)
df_rating = df_rating.withColumn('timestamp', (df_rating['timestamp'] / (365 * 24 * 3600)).cast(IntegerType()))

# Data merge
df = df_rating.join(df_movie, 'movieId')


# df_X = df.select(["userId", "movieId"] + genres_col_names + ["timestamp"])
# df_y = df.select(["rating"])
# print("Data read completed")

df.printSchema()
df.show(20)
# df_X.show(10)
# df_y.show(10)


                                                                                

+------+-------+------+----------+
|userId|movieId|rating|timestamp |
+------+-------+------+----------+
|1     |296    |5.0   |1147880044|
|1     |306    |3.5   |1147868817|
|1     |307    |5.0   |1147868828|
|1     |665    |5.0   |1147878820|
|1     |899    |3.5   |1147868510|
+------+-------+------+----------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- label: integer (nullable = false)
 |-- movie_year: integer (nullable = true)
 |-- genres_(no genres listed): integer (nullable = true)
 |-- genres_Action: integer (nullable = true)
 |-- genres_Mystery: integer (nullable = true)
 |-- genres_Documentary: integer (nullable = true)
 |-- genres_Adventure: integer (nullable = true

23/11/27 20:41:58 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+------+------+---------+-----+----------+-------------------------+-------------+--------------+------------------+----------------+----------+--------------+---------------+----------------+-------------+-------------+------------+----------------+---------------+-------------+--------------+------------+-----------+--------------+--------------+
|movieId|userId|rating|timestamp|label|movie_year|genres_(no genres listed)|genres_Action|genres_Mystery|genres_Documentary|genres_Adventure|genres_War|genres_Fantasy|genres_Children|genres_Animation|genres_Comedy|genres_Horror|genres_Crime|genres_Film-Noir|genres_Thriller|genres_Sci-Fi|genres_Western|genres_Drama|genres_IMAX|genres_Musical|genres_Romance|
+-------+------+------+---------+-----+----------+-------------------------+-------------+--------------+------------------+----------------+----------+--------------+---------------+----------------+-------------+-------------+------------+----------------+---------------+--------

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
import time

# 定义评估器
rf = RandomForestRegressor(featuresCol="features", labelCol="rating")

# 定义参数范围
param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 30]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .build()

# 定义评估器和评估方法
evaluator = RegressionEvaluator(labelCol="rating", metricName="rmse")

# 定义训练验证拆分
tvs = TrainValidationSplit(estimator=rf,
                           estimatorParamMaps=param_grid,
                           evaluator=evaluator,
                           trainRatio=0.7)  # 训练比例

# 记录每次训练的时间和性能
results = []

for param_map in param_grid:
    start_time = time.time()
    model = rf.fit(training_data, param_map)
    end_time = time.time()

    predictions = model.transform(test_data)
    rmse = evaluator.evaluate(predictions)

    training_time = end_time - start_time

    results.append({
        "params": param_map,
        "rmse": rmse,
        "training_time": training_time
    })

# 输出每次训练的时间和性能
for result in results:
    print("Parameters:", result["params"])
    print("RMSE:", result["rmse"])
    print("Training time:", result["training_time"])
    print("\n")


In [6]:
from pyspark.ml.feature import VectorAssembler

# 使用 VectorAssembler 进行特征组合
assembler = VectorAssembler(
    inputCols=["userId", "movieId", "timestamp","movie_year","label"] + genres_col_names ,
    outputCol="features"
)

# 对数据框进行转换
df = assembler.transform(df)

# 打印数据读取完成的消息
print("Data read completed")


Data read completed


In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator



# 选择特征和目标列
df = df.select("features", "rating")

# 划分数据集为训练集和测试集
(training_data, test_data) = df.randomSplit([0.7, 0.3],seed=123)

rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="rating",
    numTrees=10,
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=1
)

# 训练模型
model_rf = rf.fit(training_data)

# Predicting on test data
predictions_rf = model_rf.transform(test_data)


# Evaluate the model
evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions_rf)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

23/11/27 20:43:12 WARN MemoryStore: Not enough space to cache rdd_117_3 in memory! (computed 156.0 MiB so far)
23/11/27 20:43:12 WARN BlockManager: Persisting block rdd_117_3 to disk instead.
23/11/27 20:43:12 WARN MemoryStore: Not enough space to cache rdd_117_10 in memory! (computed 156.0 MiB so far)
23/11/27 20:43:12 WARN BlockManager: Persisting block rdd_117_10 to disk instead.
23/11/27 20:43:12 WARN MemoryStore: Not enough space to cache rdd_117_9 in memory! (computed 156.0 MiB so far)
23/11/27 20:43:12 WARN BlockManager: Persisting block rdd_117_9 to disk instead.
23/11/27 20:43:12 WARN MemoryStore: Not enough space to cache rdd_117_5 in memory! (computed 156.0 MiB so far)
23/11/27 20:43:12 WARN BlockManager: Persisting block rdd_117_5 to disk instead.
23/11/27 20:43:13 WARN MemoryStore: Not enough space to cache rdd_117_4 in memory! (computed 156.0 MiB so far)
23/11/27 20:43:13 WARN BlockManager: Persisting block rdd_117_4 to disk instead.
23/11/27 20:43:13 WARN MemoryStore: No

Root Mean Squared Error (RMSE) on test data = 0.620166


                                                                                