In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


# Build (or reuse) the Spark session with explicit executor/driver sizing.
spark = (
    SparkSession.builder
    .appName("MovieRecommend")
    .config("spark.executor.memory", "8G")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "4")
    .config("spark.driver.memory", "8G")
    .getOrCreate()
)

# Every CSV below has a header row and relies on Spark's schema inference.
csv_options = {"header": True, "inferSchema": True}

# Load the ratings data
ratings_df = spark.read.csv("dataset/ratings.csv", **csv_options)

# Load the additional tables (tags, movies, genome scores and genome tags)
tags_df = spark.read.csv("dataset/tags.csv", **csv_options)
movies_df = spark.read.csv("dataset/movies.csv", **csv_options)
genome_scores_df = spark.read.csv("dataset/genome-scores.csv", **csv_options)
genome_tags_df = spark.read.csv("dataset/genome-tags.csv", **csv_options)

In [None]:
# Join DataFrames
# complete_data = movies_df.join(tags_df, "movieId") \
#                         .join(genome_scores_df, "movieId") \
#                         .join(genome_tags_df, "tagId") \
#                         .join(ratings_df, ["userId", "movieId"])

# genome_data=genome_scores_df.join(genome_tags_df,"tagId")

# complete_data=ratings_df.join(movies_df,"movieId")\
# .join(tags_df,"movieId")\
# .join(genome_data,["movieId","tag"])

# complete_data.printSchema()
# # complete_data.show(5)
# complete_data.count()

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import DataFrame

def add_sample_label(ratings_df):
    """Preview the ratings frame and return it with an integer 'label' column.

    Side effects: prints the first 5 rows (untruncated) and the schema of the
    incoming frame before any modification.

    Args:
        ratings_df: Spark DataFrame that has a 'rating' column.

    Returns:
        A new DataFrame with an extra 'label' column: 2 where rating >= 3.5,
        3 otherwise.
    """
    # Preview of the input, before the new column is attached.
    ratings_df.show(5, truncate=False)
    ratings_df.printSchema()

    meets_threshold = F.col('rating') >= 3.5
    label_expr = F.when(meets_threshold, 2).otherwise(3)
    return ratings_df.withColumn('label', label_expr)


In [None]:
import re
# Extract movie year
def parse_year(title):
    """Extract the release year embedded in a movie title.

    Looks for the first four-digit number wrapped in parentheses,
    e.g. "Toy Story (1995)" -> 1995.

    Args:
        title: Movie title string, or None (a null cell reaches a Spark UDF
            as None).

    Returns:
        The year as an int, or the fallback 1900 when the title is missing
        or contains no parenthesised four-digit number.
    """
    # A null title would make re.search raise TypeError and fail the whole
    # Spark job, so treat it like a title without a year.
    if title is None:
        return 1900

    # Regex: four digits enclosed in parentheses.
    match = re.search(r"\((\d{4})\)", title)
    if match is None:
        return 1900
    return int(match.group(1))

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import os
import pickle
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline

# # spark = SparkSession.builder.appName("PrepareData").getOrCreate()
# # 创建SparkSession
# spark = SparkSession.builder.appName("MovieRecommend")\
#     .config("spark.executor.memory", "8G")\
#     .config("spark.executor.cores", "4")\
#     .config("spark.executor.instances", "4")\
#     .config("spark.driver.memory", "8G")\
#     .getOrCreate()


# Read movie data
data_path = "dataset/"
df_movie = spark.read.csv(
    "dataset/movies.csv", header=True, inferSchema=True)

# Extract the release year from the title, then drop the raw title column
parse_year_udf = udf(parse_year, IntegerType())
df_movie = df_movie.withColumn("movie_year", parse_year_udf(df_movie['title']))
df_movie = df_movie.drop('title')

# # Join genome data to df
# df = df.join(genome_scores_df, "movieId")
# df = df.join(genome_tags_df, "tagId")

# One-hot encoding genres.
# Collect the distinct pipe-separated genre strings and split them into the
# individual genre names. The names are sorted so that the generated columns
# (and therefore the assembled feature vector) have the same order on every
# run; a plain set would order them by string hash, which varies per process.
genres = df_movie.select("genres").distinct().rdd.flatMap(lambda x: x).collect()
genres_unique = sorted(
    {name for combo in genres if combo for name in combo.split("|")}
)

for genres_name in genres_unique:
    # Named `genre_col` (not `col`) so the loop does not shadow
    # pyspark.sql.functions.col imported at the top of the notebook.
    genre_col = "genres_" + genres_name
    # NOTE(review): contains() is a substring match; this assumes no genre
    # name is a substring of another genre name - confirm against the data.
    df_movie = df_movie.withColumn(
        genre_col,
        df_movie.genres.contains(genres_name).cast(IntegerType()))

df_movie = df_movie.drop('genres')
genres_col_names = ["genres_" + x for x in genres_unique]

# Read rating data

df_rating = spark.read.csv(
    "dataset/ratings.csv",
    header=True,
    inferSchema=True
)

df_rating = add_sample_label(df_rating)
# Convert the timestamp from seconds into whole 365-day years (truncated by the cast)
df_rating = df_rating.withColumn('timestamp', (df_rating['timestamp'] / (365 * 24 * 3600)).cast(IntegerType()))

# Data merge
df = df_rating.join(df_movie, 'movieId')


# df_X = df.select(["userId", "movieId"] + genres_col_names + ["timestamp"])
# df_y = df.select(["rating"])
# print("Data read completed")

df.printSchema()
df.show(20)
# df_X.show(10)
# df_y.show(10)


In [None]:
from pyspark.ml.feature import VectorAssembler

# Combine the individual numeric columns into a single feature vector.
# 'label' is intentionally NOT part of the features: add_sample_label derives
# it directly from 'rating', and 'rating' is the regression target of the
# models trained below, so feeding it in would leak the target into the inputs.
feature_cols = ["userId", "movieId", "timestamp", "movie_year"] + genres_col_names
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

# Apply the assembler to the merged data frame
df = assembler.transform(df)

# Report that data preparation has finished
print("Data read completed")


In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from datetime import datetime
import pandas as pd

# # Create a Spark session
# spark = SparkSession.builder.appName("RandomForestRegressorExample").getOrCreate()


# Select features and target column
df = df.select("features", "rating")

# Split the data into training and test sets (fixed seed for repeatability)
(training_data, test_data) = df.randomSplit([0.7, 0.3], seed=123)

# Create a RandomForestRegressor
rf = RandomForestRegressor(featuresCol="features", labelCol="rating")

# Define the parameter grid to search over.
# 3 x 3 x 3 x 3 = 81 parameter combinations, each fitted once per fold.
param_grid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [10, 20, 30]) \
    .addGrid(rf.maxDepth, [5, 10, 15]) \
    .addGrid(rf.maxBins, [16, 32, 64]) \
    .addGrid(rf.minInstancesPerNode, [1, 5, 10]) \
    .build()


# Define multiple evaluators
rmse_evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
mae_evaluator = RegressionEvaluator(metricName="mae", labelCol="rating", predictionCol="prediction")

# Create the CrossValidator; model selection is driven by RMSE
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=param_grid,
                          evaluator=rmse_evaluator,
                          numFolds=3)  # Number of folds for cross-validation

# Record the start time
start_time = datetime.now()

# Run cross-validation, and choose the best set of parameters
cv_model = crossval.fit(training_data)

# Record the end time
end_time = datetime.now()

# Calculate the training time
training_time = end_time - start_time
print(f"Training Time: {training_time}")

# Make predictions on test data
predictions_rf_cv = cv_model.transform(test_data)

# Evaluate the model
rmse_cv = rmse_evaluator.evaluate(predictions_rf_cv)
mae_cv = mae_evaluator.evaluate(predictions_rf_cv)
print(f"Cross-validated Root Mean Square Error (RMSE): {rmse_cv}")
print(f"Cross-validated Mean Absolute Error (MAE): {mae_cv}")

# Print the best parameters found during cross-validation
best_params = cv_model.bestModel.extractParamMap()
print("Best Parameters:")
for param, value in best_params.items():
    print(f"{param.name}: {value}")

# The param map is keyed by Param objects; store it keyed by parameter name
# so the CSV holds readable "name: value" pairs instead of Param reprs.
best_params_by_name = {param.name: value for param, value in best_params.items()}

# Create a pandas DataFrame with the results
results_dict = {
    "Training Time": [str(training_time)],
    "RMSE": [rmse_cv],
    "MAE": [mae_cv],
    "Best Parameters": [best_params_by_name]
}

results_df = pd.DataFrame(results_dict)

# Save the DataFrame to a CSV file.
# Create the target directory first so that a missing folder does not throw
# away the outcome of the whole grid search at the very last step.
csv_path = "/root/CineSpark-Insights/results/rf_model_performance.csv"
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
results_df.to_csv(csv_path, index=False)

print(f"Results saved to: {csv_path}")


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator



# Select the feature vector and the target column
df = df.select("features", "rating")

# Split the data set into training and test sets (fixed seed for repeatability)
(training_data, test_data) = df.randomSplit([0.7, 0.3], seed=123)

# Single random forest with fixed hyper-parameters (no grid search)
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="rating",
    numTrees=10,
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=1
)

# Train the model
model_rf = rf.fit(training_data)

# Predicting on test data
predictions_rf = model_rf.transform(test_data)

# Define multiple evaluators
rmse_evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
mae_evaluator = RegressionEvaluator(metricName="mae", labelCol="rating", predictionCol="prediction")

# Evaluate the predictions produced above. The frame is named
# `predictions_rf`; the name `predictions` is never defined in this notebook
# and would raise NameError on a fresh kernel.
rmse = rmse_evaluator.evaluate(predictions_rf)
mae = mae_evaluator.evaluate(predictions_rf)
print(f"Root Mean Square Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")