# Part 1: Basic Data Manipulation & Simple Recommendation



## 1.Read in the rating file and create an RDD consisting of parsed lines, then count the number of ratings.

In [1]:
from pyspark import SparkContext

# Create a Spark session
sc = SparkContext .getOrCreate()

ratings_rdd = sc.textFile("dataset/ratings.csv")
header = ratings_rdd.first()

# Remove the header and then parse each line
ratings_rdd = ratings_rdd.filter(lambda line: line != header) \
    .map(lambda line: line.split(',')) \
    .map(lambda tokens: (tokens[0], tokens[1], float(tokens[2]), tokens[3]))

num_ratings = ratings_rdd.count()
print(num_ratings)

23/11/23 09:34:32 WARN Utils: Your hostname, localhost.localdomain resolves to a loopback address: 127.0.0.1; using 192.168.102.123 instead (on interface ens192)
23/11/23 09:34:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/23 09:34:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/23 09:34:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/11/23 09:34:34 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.

25000095


                                                                                

## 2. Recommend 5 movies with the highest average rating.

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("MovieLens").config("spark.driver.memory", "4G").\
  config("spark.executor.memory", "1.5G").getOrCreate()
ratings_df = spark.createDataFrame(ratings_rdd, ["userId", "movieId", "rating", "timestamp"])

#  Count the number of ratings
avg_ratings_df = ratings_df.groupBy("movieId").avg("rating")

# Recommend 5 movies with the highest average rating
top_movies = avg_ratings_df.orderBy("avg(rating)", ascending=False).limit(5)

top_movies.show()



+-------+-----------+
|movieId|avg(rating)|
+-------+-----------+
| 169820|        5.0|
| 182345|        5.0|
| 195641|        5.0|
| 193529|        5.0|
| 140014|        5.0|
+-------+-----------+



                                                                                

## 3. Other operations to enrich your data analysis.

## 4. Try to create visualizations to convey the insights.

# Part2: Rating Prediction

## 1. First split rating data into 70% training set and 30% testing set.

In [3]:
from pyspark import SparkContext

# Create a Spark session
sc = SparkContext .getOrCreate()

ratings_rdd = sc.textFile("dataset/ratings.csv")
header = ratings_rdd.first()

# Remove the header and then parse each line
ratings_rdd = ratings_rdd.filter(lambda line: line != header) \
    .map(lambda line: line.split(',')) \
    .map(lambda tokens: (tokens[0], tokens[1], float(tokens[2]), tokens[3]))


spark = SparkSession.builder.appName("MovieLens").config("spark.driver.memory", "4G").\
  config("spark.executor.memory", "1.5G").getOrCreate()
ratings_df = spark.createDataFrame(ratings_rdd, ["userId", "movieId", "rating", "timestamp"])



from pyspark.sql import Row

ratings_ml = ratings_df.rdd.map(lambda r: Row(userId=int(r[0]), movieId=int(r[1]), rating=float(r[2])))
ratings_ml_df = spark.createDataFrame(ratings_ml)

# Split the dataset
(training, test) = ratings_ml_df.randomSplit([0.7, 0.3])

23/11/23 09:35:57 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 6 (TID 52): Attempting to kill Python Worker
                                                                                

## 2. Choose one matrix factorization algorithm to predict the rating score based on the rating data file only.

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS


# Create ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop", nonnegative=True)

# Define a grid of parameters to search
param_grid = {
    "rank": [10, 20, 30],
    "maxIter": [5, 10, 20],
    "regParam": [0.01, 0.1, 1.0]
}

# Initialize evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

best_model = None
best_error = float('inf')
best_params = None

# Grid search through parameters
for rank in param_grid["rank"]:
    for max_iter in param_grid["maxIter"]:
        for reg_param in param_grid["regParam"]:
            als.setParams(rank=rank, maxIter=max_iter, regParam=reg_param)
            
            # Fit ALS model on training data
            model = als.fit(training)
            
            # Evaluate on test data
            predictions = model.transform(test)
            error = evaluator.evaluate(predictions)
            
            # Update best parameters if current model is better
            if error < best_error:
                best_error = error
                best_model = model
                best_params = {"rank": rank, "maxIter": max_iter, "regParam": reg_param}

# Output the best parameters and error
print("Best Parameters:", best_params)
print("Best RMSE:", best_error)

# Make predictions using the best model
test_predictions = best_model.transform(test)

# Generate top 3 recommendations for all users using the best model
recommendations = best_model.recommendForAllUsers(3)
recommendations.show(truncate=False)

23/11/23 09:37:38 WARN BlockManager: Block rdd_38_5 could not be removed as it was not found on disk or in memory
23/11/23 09:37:38 WARN BlockManager: Block rdd_39_5 could not be removed as it was not found on disk or in memory
23/11/23 09:37:38 ERROR Executor: Exception in task 5.0 in stage 10.0 (TID 101)
java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.ml.recommendation.ALS$UncompressedInBlockSort.allocate(ALS.scala:1596)
	at org.apache.spark.ml.recommendation.ALS$UncompressedInBlockSort.allocate(ALS.scala:1545)
	at org.apache.spark.util.collection.TimSort$SortState.ensureCapacity(TimSort.java:943)
	at org.apache.spark.util.collection.TimSort$SortState.mergeLo(TimSort.java:691)
	at org.apache.spark.util.collection.TimSort$SortState.mergeAt(TimSort.java:517)
	at org.apache.spark.util.collection.TimSort$SortState.mergeCollapse(TimSort.java:445)
	at org.apache.spark.util.collection.TimSort$SortState.access$200(TimSort.java:308)
	at org.apache.spark.util.collection.TimSor

ConnectionRefusedError: [Errno 111] Connection refused

# Evaluate the model performance


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on the test data = {rmse}")

## 3. Extract features from movies and users (join movie and user data and do some feature transformation), then build another machine learning model to predict rating scores for the testing set.

## 3.1 Read and integrate additional data

In [None]:
# Read additional data
tags_df = spark.read.csv("dataset/tags.csv", header=True, inferSchema=True)
movies_df = spark.read.csv("dataset/movies.csv", header=True, inferSchema=True)
genome_scores_df = spark.read.csv("dataset/genome-scores.csv", header=True, inferSchema=True)
genome_tags_df = spark.read.csv("dataset/genome-tags.csv", header=True, inferSchema=True)

# Potential feature transformations
# For example, perform one-hot encoding on movie genres
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Type conversion
stringIndexer = StringIndexer(inputCol="genres", outputCol="genresIndex")
model = stringIndexer.fit(movies_df)
indexed = model.transform(movies_df)

encoder = OneHotEncoder(inputCol="genresIndex", outputCol="genresVec")
movies_encoded = encoder.transform(indexed)


## 3.2 Feature engineering

In [None]:
# 合并标签和评分数据
# 这里是一个简化的例子，具体实现可能更复杂
tag_features_df = tags_df.join(genome_scores_df, "movieId").join(genome_tags_df, "tagId")

# 将电影信息和标签特征合并
movie_features_df = movies_encoded.join(tag_features_df, "movieId")

# 合并用户评分和电影特征
complete_data_df = ratings_ml_df.join(movie_features_df, "movieId")

## 3.3 Build and train machine learning models

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor

# 特征向量化
assembler = VectorAssembler(inputCols=["genresVec", "tagFeatures"], outputCol="features")
data_ready = assembler.transform(complete_data_df)

# 划分数据集
(training_features, test_features) = data_ready.randomSplit([0.7, 0.3])

# 使用随机森林模型
rf = RandomForestRegressor(featuresCol="features", labelCol="rating")
rf_model = rf.fit(training_features)

# 在测试集上进行预测
predictions_rf = rf_model.transform(test_features)

## 4. Compare the pros and cons of these two models and report it.

ALS模型:
+ 优点：适合大规模数据集，能有效处理稀疏性问题，常用于推荐系统。
+ 缺点：需要调整多个参数，对冷启动问题敏感。

随机森林模型:
+ 优点：处理非线性关系效果好，不太容易过拟合。
+ 缺点：需要大量特征工程，计算成本较高。

## 5. Try to create visualizations to convey the insights.

In [None]:
import matplotlib.pyplot as plt

# 计算误差
predictions_pd = predictions.toPandas()
predictions_rf_pd = predictions_rf.toPandas()

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.hist(predictions_pd['rating'] - predictions_pd['prediction'], bins=20, color='blue', alpha=0.7)
plt.title('ALS Prediction Error')

plt.subplot(1, 2, 2)
plt.hist(predictions_rf_pd['rating'] - predictions_rf_pd['prediction'], bins=20, color='green', alpha=0.7)
plt.title('Random Forest Prediction Error')
plt.show()