# Part 1

## 1. Read in the rating file and create an RDD consisting of parsed lines, then count the number of ratings.

In [5]:
from pyspark import SparkContext

# Reuse the active context if this cell is re-run; a bare SparkContext()
# raises when a context already exists in the kernel.
sc = SparkContext.getOrCreate()
raw_lines_rdd = sc.textFile("../dataset/ratings.csv")

# The first line of the CSV is the column header, not a rating.
header = raw_lines_rdd.first()

# Drop the header, then parse each line into
# (userId, movieId, rating, timestamp); only the rating is converted to float,
# the other fields stay as strings.
ratings_rdd = raw_lines_rdd.filter(lambda line: line != header) \
    .map(lambda line: line.split(',')) \
    .map(lambda tokens: (tokens[0], tokens[1], float(tokens[2]), tokens[3]))

num_ratings = ratings_rdd.count()
print(num_ratings)

# Do NOT stop the SparkContext here: ratings_rdd is lazy and is consumed again
# by the DataFrame cells below, which fail on a stopped context.
# Call sc.stop() once, at the very end of the notebook.



25000095


                                                                                

## 2. Recommend 5 movies with the highest average rating.

In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("MovieLens").getOrCreate()

# Name the fields of the parsed rating tuples so they can be queried as a DataFrame.
rating_columns = ["userId", "movieId", "rating", "timestamp"]
ratings_df = spark.createDataFrame(ratings_rdd, rating_columns)

# Mean rating per movie; the aggregate column is named "avg(rating)".
avg_ratings_df = ratings_df.groupBy("movieId").agg({"rating": "avg"})

# Highest averages first, keeping only the top five.
top_movies = avg_ratings_df.sort("avg(rating)", ascending=False).limit(5)

top_movies.show()

                                                                                

+-------+-----------+
|movieId|avg(rating)|
+-------+-----------+
| 195549|        5.0|
| 140014|        5.0|
| 133297|        5.0|
| 196547|        5.0|
| 182345|        5.0|
+-------+-----------+



## 3. Try to create visualizations to convey the insights.

In [8]:
import matplotlib.pyplot as plt

# Collect the five-row result to the driver as a pandas DataFrame.
top_movies_pd = top_movies.toPandas()

# Bar chart: one bar per movie ID, bar height is that movie's average rating.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(top_movies_pd['movieId'], top_movies_pd['avg(rating)'])
ax.set_xlabel('Movie ID')
ax.set_ylabel('Average Rating')
ax.set_title('Top 5 Movies by Average Rating')
plt.show()

# Part 2

## 6. Split the data into a training set and a test set

In [None]:
# First, convert the ratings into the numeric format needed for machine learning.
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer

# Fixed seed so the train/test split can be reproduced across runs.
SPLIT_SEED = 42

# The IDs were parsed as strings, so cast them to int; the timestamp column is dropped.
ratings_ml = ratings_df.rdd.map(lambda r: Row(userId=int(r[0]), movieId=int(r[1]), rating=float(r[2])))
ratings_ml_df = spark.createDataFrame(ratings_ml)

# Split roughly 70% / 30% into training and test sets. Without a seed,
# randomSplit draws a different partition on every run, so any metric computed
# on the test set would not be repeatable.
(training, test) = ratings_ml_df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)