In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


# 创建SparkSession
spark = SparkSession.builder.appName("MovieRecommend")\
    .config("spark.executor.memory", "8G")\
    .config("spark.executor.cores", "4")\
    .config("spark.executor.instances", "4")\
    .config("spark.driver.memory", "8G")\
    .getOrCreate()


# 加载数据
ratings_df = spark.read.csv("dataset/ratings.csv", inferSchema=True, header=True)

# Read additional data
tags_df = spark.read.csv("dataset/tags.csv", header=True, inferSchema=True)
movies_df = spark.read.csv("dataset/movies.csv", header=True, inferSchema=True)
genome_scores_df = spark.read.csv("dataset/genome-scores.csv", header=True, inferSchema=True)
genome_tags_df = spark.read.csv("dataset/genome-tags.csv", header=True, inferSchema=True)

23/11/27 07:49:38 WARN Utils: Your hostname, localhost.localdomain resolves to a loopback address: 127.0.0.1; using 192.168.102.123 instead (on interface ens192)
23/11/27 07:49:38 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/27 07:49:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/27 07:49:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

In [2]:
# Join DataFrames
# complete_data = movies_df.join(tags_df, "movieId") \
#                         .join(genome_scores_df, "movieId") \
#                         .join(genome_tags_df, "tagId") \
#                         .join(ratings_df, ["userId", "movieId"])

# genome_data=genome_scores_df.join(genome_tags_df,"tagId")

# complete_data=ratings_df.join(movies_df,"movieId")\
# .join(tags_df,"movieId")\
# .join(genome_data,["movieId","tag"])

# complete_data.printSchema()
# # complete_data.show(5)
# complete_data.count()

In [3]:
from pyspark.sql import functions as F
from pyspark.sql import DataFrame

def add_sample_label(ratings_df):
    # Show the first 5 rows and print the schema before modification
    ratings_df.show(5, truncate=False)
    ratings_df.printSchema()

    # Add a new column 'label' based on the condition
    ratings_df = ratings_df.withColumn('label', F.when(F.col('rating') >= 3.5, 1).otherwise(0))

    return ratings_df


In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import os
import pickle
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.types import IntegerType
from pyspark.ml import Pipeline

# spark = SparkSession.builder.appName("PrepareData").getOrCreate()
# 创建SparkSession
spark = SparkSession.builder.appName('movie_lens').\
  config('spark.driver.memory', '8g') \
  .config('spark.executor.memory', '8g') \
  .getOrCreate()


# Read movie data
data_path = "dataset/"
df_movie = spark.read.csv(
    "dataset/movies.csv", header=True,inferSchema=True)

# Extract movie year
def parse_year(x):
    year = x.strip()[-5:-1]
    if year.startswith("19"):
        return year
    else:
        return "1900"

parse_year_udf = udf(parse_year, StringType())
df_movie = df_movie.withColumn("movie_year", parse_year_udf(df_movie['title'])).drop('title')

df_movie=df_movie.drop('title')

# One-hot encoding genres
genres = df_movie.select("genres").distinct().rdd.flatMap(lambda x: x).collect()
genres_unique = list(set([i for sublist in genres for i in sublist.split("|")]))

for genres_name in genres_unique:
    col = "genres_" + genres_name
    df_movie = df_movie.withColumn(col, 
                                   (df_movie.genres.contains(genres_name)).cast(IntegerType()))

df_movie = df_movie.drop('genres')
genres_col_names = ["genres_" + x for x in genres_unique]

# Read rating data

df_rating = spark.read.csv(
    "dataset/ratings.csv",
    header=True,
    inferSchema=True
)

df_rating = add_sample_label(df_rating)
df_rating = df_rating.withColumn('timestamp', (df_rating['timestamp'] / (365 * 24 * 3600)).cast(IntegerType()))

# Data merge
df = df_rating.join(df_movie, 'movieId')

# Join genome data to df
df = df.join(genome_scores_df, "movieId")
df = df.join(genome_tags_df, "tagId")

df_X = df.select(["userId", "movieId"] + genres_col_names + ["timestamp"])
df_y = df.select(["rating"])
print("Data read completed")

df.printSchema()
df.show(200)


23/11/27 07:49:56 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

+------+-------+------+----------+
|userId|movieId|rating|timestamp |
+------+-------+------+----------+
|1     |296    |5.0   |1147880044|
|1     |306    |3.5   |1147868817|
|1     |307    |5.0   |1147868828|
|1     |665    |5.0   |1147878820|
|1     |899    |3.5   |1147868510|
+------+-------+------+----------+
only showing top 5 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

Data read completed
root
 |-- tagId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- label: integer (nullable = false)
 |-- movie_year: string (nullable = true)
 |-- genres_Mystery: integer (nullable = true)
 |-- genres_Musical: integer (nullable = true)
 |-- genres_Western: integer (nullable = true)
 |-- genres_Drama: integer (nullable = true)
 |-- g

23/11/27 07:50:04 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 24:>                                                         (0 + 1) / 1]

+-----+-------+------+------+---------+-----+----------+--------------+--------------+--------------+------------+----------------+--------------+-------------+-------------------------+---------------+-------------+------------------+------------+-------------+-----------+----------------+-------------+---------------+--------------+----------+----------------+--------------------+--------------------+
|tagId|movieId|userId|rating|timestamp|label|movie_year|genres_Mystery|genres_Musical|genres_Western|genres_Drama|genres_Animation|genres_Fantasy|genres_Comedy|genres_(no genres listed)|genres_Thriller|genres_Action|genres_Documentary|genres_Crime|genres_Horror|genres_IMAX|genres_Adventure|genres_Sci-Fi|genres_Children|genres_Romance|genres_War|genres_Film-Noir|           relevance|                 tag|
+-----+-------+------+------+---------+-----+----------+--------------+--------------+--------------+------------+----------------+--------------+-------------+-------------------------+

                                                                                

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator



assembler = VectorAssembler(
    inputCols=df_X.columns,
    outputCol="features"
)

df = assembler.transform(df)

# 选择特征和目标列
df = df.select("features", "rating")
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(df)

# Split the data into training and test sets (30% held out for testing)
(training, test) = data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(training)

# Make predictions.
predictions = model.transform(test)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)


# Evaluate the model
evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions_rf)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

[Stage 31:>                                                       (0 + 16) / 17]

In [None]:
# model_rf.save("model_rf_1")

from pyspark.ml.regression import RandomForestRegressionModel
loaded_model = RandomForestRegressionModel.load("model_rf_1")

predictions_1 = loaded_model.transform(test_data)
rmse = evaluator.evaluate(predictions_1)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# 创建SparkSession
spark = SparkSession.builder.appName('movie_lens').\
  config('spark.driver.memory', '8g') \
  .config('spark.executor.memory', '8g') \
  .getOrCreate()

# 加载数据
ratings_df = spark.read.csv("dataset/ratings.csv", inferSchema=True, header=True)
#划分数据集
(training, test) = ratings_df.randomSplit([0.7, 0.3])



In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS


# Create ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop", nonnegative=True)

# Define a grid of parameters to search
param_grid = {
    "rank": [10, 20, 30],
    "maxIter": [5, 10, 20],
    "regParam": [0.01, 0.1, 1.0]
}

# Initialize evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

best_model = None
best_error = float('inf')
best_params = None

# Grid search through parameters
for rank in param_grid["rank"]:
    for max_iter in param_grid["maxIter"]:
        for reg_param in param_grid["regParam"]:
            als.setParams(rank=rank, maxIter=max_iter, regParam=reg_param)
            
            # Fit ALS model on training data
            model = als.fit(training)
            
            # Evaluate on test data
            predictions = model.transform(test)
            error = evaluator.evaluate(predictions)
            
            # Update best parameters if current model is better
            if error < best_error:
                best_error = error
                best_model = model
                best_params = {"rank": rank, "maxIter": max_iter, "regParam": reg_param}

# Output the best parameters and error
print("Best Parameters:", best_params)
print("Best RMSE:", best_error)

# Make predictions using the best model
test_predictions = best_model.transform(test)
