In [1]:
import json
import pandas as pd
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode


spark = SparkSession.builder.appName("ProcessingDataset")\
    .config("spark.kryoserializer.buffer.max", "512m") \
    .getOrCreate()
spark.conf.set('spark.sql.caseSensitive', True)

In [2]:
user_reviews= spark.read.parquet("Dataset/user_reviews.parquet")

In [3]:
user_reviews.show(5, False)

+------------+--------------------------------------------------------+------+------------+----------------------------+----------+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+-----------------+-------------------+
|review_id   |ti

In [4]:
user_reviews_demo = user_reviews.limit(100000)

In [5]:
user_reviews_demo.show(5, False)

+------------+--------------------------------------------------------+------+------------+----------------------------+----------+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+-----------------+-------------------+
|review_id   |ti

In [6]:
metadata=spark.read.parquet("Dataset/metadata.parquet")
metadata.show(5, False)

+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+-------------+-------------------------+-------------------------------------------------------------------+-----+-----------+
|parent_asin|title                                                                                                                                                                                                   |average_rating|rating_number|main_category            |categories                                                         |price|store      |
+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+-------------+-------------------------+-------------------

In [7]:
full_data = user_reviews_demo.join(metadata, user_reviews_demo.parent_asin == metadata.parent_asin, "inner")

In [8]:
full_data_filter = full_data.select(
	"review_id",
	metadata["title"].alias("title"),
	"rating",
	"average_rating",
	"rating_number",
	user_reviews_demo["asin"].alias("asin"),
	metadata["parent_asin"].alias("parent_asin"),
	"user_id",
	"helpful_vote",
	"categories"
)
full_data_filter.show(5, False)

+------------+------------------------------------------------------------------------------------------------------------------------------+------+--------------+-------------+----------+-----------+----------------------------+------------+------------------------------------------------------------+
|review_id   |title                                                                                                                         |rating|average_rating|rating_number|asin      |parent_asin|user_id                     |helpful_vote|categories                                                  |
+------------+------------------------------------------------------------------------------------------------------------------------------+------+--------------+-------------+----------+-----------+----------------------------+------------+------------------------------------------------------------+
|103079215104|The First Templar - Xbox 360                                              

In [9]:
full_data_filter.printSchema()

root
 |-- review_id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- rating_number: long (nullable = true)
 |-- asin: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)



## Simple recommendation model


In [10]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [19]:
ratings = full_data_filter.select("user_id", "parent_asin", "rating", "rating_number", "helpful_vote")

In [20]:
ratings.show(5, False)

+----------------------------+-----------+------+-------------+------------+
|user_id                     |parent_asin|rating|rating_number|helpful_vote|
+----------------------------+-----------+------+-------------+------------+
|AHP4ABT4AOUOKHKXCUT3JCFU623A|B004HD55V0 |5.0   |155          |0           |
|AHP4ABT4AOUOKHKXCUT3JCFU623A|B00AR5BNYU |5.0   |31           |1           |
|AHP4ABT4AOUOKHKXCUT3JCFU623A|B003NE6BQW |1.0   |41           |4           |
|AHP4ABT4AOUOKHKXCUT3JCFU623A|B003L77ZHK |4.0   |14           |3           |
|AEHINTI4PJFF3EQ7OL3Q2C3NYP7Q|B07WRDYD2N |5.0   |1702         |0           |
+----------------------------+-----------+------+-------------+------------+
only showing top 5 rows



In [None]:
(train, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [26]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS

# Chuyển đổi user_id và parent_asin sang dạng số
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_index")
item_indexer = StringIndexer(inputCol="parent_asin", outputCol="parent_asin_index")

# Fit và transform trên tập train và test
train_indexed = user_indexer.fit(train).transform(train)
train_indexed = item_indexer.fit(train_indexed).transform(train_indexed)
test_indexed = user_indexer.fit(test).transform(test)
test_indexed = item_indexer.fit(test_indexed).transform(test_indexed)

# Khởi tạo mô hình ALS với các cột đã được index
als = ALS(
    userCol="user_id_index",
    itemCol="parent_asin_index",
    ratingCol="weighted_rating",
    nonnegative=True,  # Đảm bảo giá trị rating không âm
    coldStartStrategy="drop",  # Xử lý người dùng/sản phẩm mới
    implicitPrefs=False,  # Sử dụng explicit feedback (rating)
    rank=10,  # Số chiều latent factor
    regParam=0.01,  # Param regularization
    maxIter=10  # Số lần lặp
)

# Huấn luyện mô hình
model = als.fit(train_indexed)

In [27]:
from pyspark.ml.evaluation import RegressionEvaluator

# Dự đoán trên tập test
predictions = model.transform(test_indexed)

# Đánh giá mô hình bằng RMSE
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error (RMSE) = {rmse}")

Root-mean-square error (RMSE) = 8.32067381526845


In [28]:
# Lấy top-N gợi ý cho người dùng
user_recs = model.recommendForAllUsers(numItems=10)
user_recs.show(truncate=False)

+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id_index|recommendations                                                                                                                                                                                  |
+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|28           |[{3973, 28.289854}, {2017, 26.465809}, {4182, 26.12461}, {4371, 25.730793}, {20660, 25.359581}, {20662, 25.32248}, {2032, 25.282751}, {1706, 25.219788}, {3946, 25.212965}, {3050, 25.092312}]   |
|31           |[{5553, 14.505441}, {22759, 14.482898}, {9618, 13.807688}, {5077, 13.478688}, {7539, 13.425328}, {8354, 13.224518}, {1784, 13.038552}, {4014, 12.