# Part 1: Basic Data Manipulation & Simple Recommendation



## 1. Read in the rating file and create an RDD consisting of parsed lines, then count the number of ratings.

In [6]:
from pyspark import SparkContext

# Obtain the SparkContext (an already-running one is reused)
sc = SparkContext.getOrCreate()

# Load the raw CSV as lines of text; the first line is the column header
raw_lines = sc.textFile("dataset/ratings.csv")
header = raw_lines.first()


def parse_rating(line):
    """Split one CSV line into a (userId, movieId, rating, timestamp) tuple.

    Only the rating is converted to float; the other three fields are kept
    as the strings read from the file.
    """
    fields = line.split(',')
    return (fields[0], fields[1], float(fields[2]), fields[3])


# Drop every line equal to the header, then parse what remains
data_lines = raw_lines.filter(lambda line: line != header)
ratings_rdd = data_lines.map(parse_rating)

num_ratings = ratings_rdd.count()
print(num_ratings)



25000095


                                                                                

## 2. Recommend 5 movies with the highest average rating.

In [7]:
from pyspark.sql import SparkSession

# NOTE(review): a SparkContext was already created earlier in this notebook, so
# getOrCreate() presumably reuses it -- confirm that the memory settings below
# actually take effect on the running driver/executors.
spark = (
    SparkSession.builder
    .appName("MovieLens25M")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Turn the parsed RDD of tuples into a DataFrame with named columns
column_names = ["userId", "movieId", "rating", "timestamp"]
ratings_df = spark.createDataFrame(ratings_rdd, column_names)

# Average rating per movie; the aggregate column is named "avg(rating)"
avg_ratings_df = ratings_df.groupBy("movieId").agg({"rating": "avg"})

# Recommend 5 movies with the highest average rating
ranked_movies = avg_ratings_df.sort("avg(rating)", ascending=False)
top_movies = ranked_movies.limit(5)

top_movies.show()



+-------+-----------+
|movieId|avg(rating)|
+-------+-----------+
| 124903|        5.0|
| 137964|        5.0|
| 184903|        5.0|
| 192351|        5.0|
| 133297|        5.0|
+-------+-----------+



                                                                                

## 3. Other operations to enrich your data analysis.

## 4. Try to create visualizations to convey the insights.

In [None]:
import matplotlib.pyplot as plt

# Bring the top-movies result to the driver as a pandas DataFrame
top_movies_pd = top_movies.toPandas()
movie_ids = top_movies_pd['movieId']
mean_ratings = top_movies_pd['avg(rating)']

# Bar chart: one bar per movie, bar height = average rating
plt.figure(figsize=(10, 6))
plt.bar(movie_ids, mean_ratings)
plt.title('Top 5 Movies by Average Rating')
plt.xlabel('Movie ID')
plt.ylabel('Average Rating')
plt.show()


# Part 2: Rating Prediction

## 1. Prepare the Dataset: split rating data into 70% training set and 30% testing set.

In [None]:
from pyspark.sql import Row

# ratings_df holds userId/movieId as strings (they were never converted when
# the CSV lines were parsed), so convert them to integers here and keep only
# the three columns the rating model needs.
ratings_ml = ratings_df.rdd.map(lambda r: Row(userId=int(r[0]), movieId=int(r[1]), rating=float(r[2])))
ratings_ml_df = spark.createDataFrame(ratings_ml)

# Split the dataset: 70% training / 30% testing, as the task requires.
# (The weights used to be [0.1, 0.9], i.e. only 10% of the data for training.)
# A fixed seed keeps the split reproducible between runs.
(training, test) = ratings_ml_df.randomSplit([0.7, 0.3], seed=42)

## 2.  Choose one matrix factorization algorithm to predict the rating score based on the rating data file only.

## 2.1 Train an ALS model

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS


# Configure the ALS estimator on the (userId, movieId, rating) columns.
# coldStartStrategy="drop" makes the fitted model drop rows it cannot score
# (users/items unseen in training) instead of emitting NaN predictions.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=5,
    regParam=0.01,
    nonnegative=True,
    coldStartStrategy="drop",
)

# Fit the factorization on the training split
model_1 = als.fit(training)

## 2.2 Make Recommendations with the ALSModel

In [None]:
# Generate top 3 recommendations for all users with the ALSModel.
# model_1 is the fitted ALS model from section 2.1; model_2 is the
# random-forest pipeline model built later, which has no
# recommendForAllUsers method.
recommendations = model_1.recommendForAllUsers(3)
recommendations.show(truncate=False)

## 2.3 Evaluate the model performance


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out test split with the fitted ALS model. The model is bound
# to `model_1` in section 2.1 (no variable named `ALSmodel` is ever defined).
predictions = model_1.transform(test)

# RMSE between the true `rating` and the model's `prediction` column
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")


## 3. Extract features from movies and users (join movie and user data and do some feature transformation), then build another machine learning model to predict rating scores for the testing set.

## 3.1 Read and integrate additional data

In [8]:
# Read additional data
tags_df = spark.read.csv("dataset/tags.csv", header=True, inferSchema=True)
movies_df = spark.read.csv("dataset/movies.csv", header=True, inferSchema=True)
genome_scores_df = spark.read.csv("dataset/genome-scores.csv", header=True, inferSchema=True)
genome_tags_df = spark.read.csv("dataset/genome-tags.csv", header=True, inferSchema=True)

# tags.csv and the ratings both carry a `timestamp` column; rename the tag one
# so the joined table does not end up with two columns of the same name.
user_tags_df = tags_df.withColumnRenamed("timestamp", "tagTimestamp")

# ratings_df stores userId/movieId as strings (they come straight from the
# split CSV line), whereas inferSchema presumably reads them as integers in
# the other files; cast them so both sides of the join use the same key type.
ratings_keyed_df = ratings_df.select(
    ratings_df["userId"].cast("int").alias("userId"),
    ratings_df["movieId"].cast("int").alias("movieId"),
    "rating",
    "timestamp",
)

# Join DataFrames.
# The genome tables are attached through the tag a user actually applied:
#   tag -> tagId (genome-tags), then (movieId, tagId) -> relevance (genome-scores).
# Joining genome-scores on movieId alone, as before, pairs every user-tag row
# with every genome score of that movie, and joining genome-tags on tagId adds
# a second `tag` column that makes later references to `tag` ambiguous.
# Left joins keep user tags that have no genome entry (tagId/relevance null).
# Assumes genome-tags.csv has columns (tagId, tag) as in the MovieLens README.
complete_data = (
    movies_df.join(user_tags_df, "movieId")
    .join(genome_tags_df, "tag", "left")
    .join(genome_scores_df, ["movieId", "tagId"], "left")
    .join(ratings_keyed_df, ["userId", "movieId"])
)


                                                                                

## 3.2 Feature Transformation: One-Hot Encoding

In [9]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# One-hot encode genres.
# StringIndexer maps each distinct `genres` string to a numeric index.
stringIndexer_genres = StringIndexer(inputCol="genres", outputCol="genresIndex")
model_genres = stringIndexer_genres.fit(complete_data)
indexed_genres = model_genres.transform(complete_data)

# In Spark 3.x OneHotEncoder is an Estimator: it has to be fitted (to learn
# the number of categories) and the resulting model does the transform.
encoder_genres = OneHotEncoder(inputCol="genresIndex", outputCol="genresVec")
complete_data = encoder_genres.fit(indexed_genres).transform(indexed_genres)

# One-hot encode tags (same index-then-encode steps as for genres)
stringIndexer_tags = StringIndexer(inputCol="tag", outputCol="tagIndex")
model_tags = stringIndexer_tags.fit(complete_data)
indexed_tags = model_tags.transform(complete_data)

encoder_tags = OneHotEncoder(inputCol="tagIndex", outputCol="tagVec")
complete_data = encoder_tags.fit(indexed_tags).transform(indexed_tags)

# Merge genres and tags features into a single feature vector
assembler = VectorAssembler(inputCols=["genresVec", "tagVec"], outputCol="features")
complete_data = assembler.transform(complete_data)


23/11/24 10:33:13 ERROR Executor: Exception in task 13.0 in stage 26.0 (TID 163)
java.lang.OutOfMemoryError: Java heap space
	at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57)
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:335)
	at org.apache.spark.io.ReadAheadInputStream.<init>(ReadAheadInputStream.java:105)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillReader.<init>(UnsafeSorterSpillReader.java:74)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillWriter.getReader(UnsafeSorterSpillWriter.java:159)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.getSortedIterator(UnsafeExternalSorter.java:555)
	at org.apache.spark.sql.execution.UnsafeExternalRowSorter.sort(UnsafeExternalRowSorter.java:172)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage9.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.sp

ConnectionRefusedError: [Errno 111] Connection refused

## 3.3 Build and train machine learning models

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

# complete_data may already carry a `features` column from the feature
# transformation step; the assembler stage below would then fail because its
# output column already exists. Drop it first (a no-op when it is absent) so
# the pipeline always builds `features` itself.
model_input = complete_data.drop("features")

# Assemble the one-hot genre and tag vectors into the model's feature vector
assembler = VectorAssembler(inputCols=["genresVec", "tagVec"], outputCol="features")

# Define the Random Forest Regressor model
rf = RandomForestRegressor(featuresCol="features", labelCol="rating")

# Create a pipeline
pipeline = Pipeline(stages=[assembler, rf])

# Split the data into training and test sets: 70% / 30%, matching the split
# required for the rating data (the weights used to be [0.1, 0.9]).
(training_data, test_data) = model_input.randomSplit([0.7, 0.3], seed=42)

# Train the model
model_2 = pipeline.fit(training_data)

# Make predictions on the test set. Stored as `predictions_rf` so the ALS
# `predictions` DataFrame is not overwritten and both stay available for the
# comparison plots.
predictions_rf = model_2.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")
rmse_rf = evaluator.evaluate(predictions_rf)

# Print the Root Mean Squared Error (RMSE)
print(f"Root Mean Squared Error (RMSE): {rmse_rf}")

## 4. Compare the pros and cons of these two models and report it.

ALS模型:
+ 优点：适合大规模数据集，能有效处理稀疏性问题，常用于推荐系统。
+ 缺点：需要调整多个参数，对冷启动问题敏感。

随机森林模型:
+ 优点：处理非线性关系效果好，不太容易过拟合。
+ 缺点：需要大量特征工程，计算成本较高。

## 5. Try to create visualizations to convey the insights.

In [None]:
import matplotlib.pyplot as plt

# Compute the prediction errors.
# Re-score each model on its own test split rather than relying on a shared
# `predictions` variable: the random-forest cell reuses that name, and no
# `predictions_rf` existed before this point.
predictions_als = model_1.transform(test)
predictions_rf = model_2.transform(test_data)

# Only the label and the prediction are needed for the histograms, so select
# them before collecting to pandas instead of pulling every column (including
# the feature vectors) onto the driver.
predictions_pd = predictions_als.select("rating", "prediction").toPandas()
predictions_rf_pd = predictions_rf.select("rating", "prediction").toPandas()

plt.figure(figsize=(12, 6))

# Left panel: distribution of (true rating - ALS prediction)
plt.subplot(1, 2, 1)
plt.hist(predictions_pd['rating'] - predictions_pd['prediction'], bins=20, color='blue', alpha=0.7)
plt.title('ALS Prediction Error')

# Right panel: distribution of (true rating - random-forest prediction)
plt.subplot(1, 2, 2)
plt.hist(predictions_rf_pd['rating'] - predictions_rf_pd['prediction'], bins=20, color='green', alpha=0.7)
plt.title('Random Forest Prediction Error')
plt.show()