In [1]:
import json
import pandas as pd
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode


spark = SparkSession.builder.appName("ProcessingDataset")\
    .config("spark.kryoserializer.buffer.max", "512m") \
    .getOrCreate()
spark.conf.set('spark.sql.caseSensitive', True)

In [2]:
user_reviews= spark.read.parquet("Dataset/user_reviews.parquet")

In [3]:
user_reviews.show(5, False)

+------------+--------------------------------------------------------+------+------------+----------------------------+----------+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+-----------------+-------------------+
|review_id   |ti

In [4]:
user_reviews_demo = user_reviews.limit(50000)

In [5]:
user_reviews_demo.show(5, False)

+------------+--------------------------------------------------------+------+------------+----------------------------+----------+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+-----------------+-------------------+
|review_id   |ti

In [5]:
metadata=spark.read.parquet("Dataset/metadata.parquet")
metadata.show(5, False)

+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+-------------+-------------------------+-------------------------------------------------------------------+-----+-----------+
|parent_asin|title                                                                                                                                                                                                   |average_rating|rating_number|main_category            |categories                                                         |price|store      |
+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+-------------+-------------------------+-------------------

In [6]:
full_data = user_reviews_demo.join(metadata, user_reviews_demo.parent_asin == metadata.parent_asin, "inner")

In [8]:
full_data_filter = full_data.select(
	"review_id",
	user_reviews_demo["title"].alias("title"),
	"rating",
	"average_rating",
	"rating_number",
	user_reviews_demo["asin"].alias("asin"),
	metadata["parent_asin"].alias("parent_asin"),
	"user_id",
	"helpful_vote",
	"categories"
)
full_data_filter.show(5, False)

+------------+--------------------------------------------------------+------+--------------+-------------+----------+-----------+----------------------------+------------+------------------------------------------------------------+
|review_id   |title                                                   |rating|average_rating|rating_number|asin      |parent_asin|user_id                     |helpful_vote|categories                                                  |
+------------+--------------------------------------------------------+------+--------------+-------------+----------+-----------+----------------------------+------------+------------------------------------------------------------+
|103079215104|Great game for those wanting historical info of Templars|5.0   |4.4           |155          |B004H0J5QY|B004HD55V0 |AHP4ABT4AOUOKHKXCUT3JCFU623A|0           |[Video Games, Legacy Systems, Xbox Systems, Xbox 360, Games]|
|103079215105|Great value,for the amount of game that you get   

In [9]:
full_data_filter.printSchema()

root
 |-- review_id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- rating_number: long (nullable = true)
 |-- asin: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)



## Simple recommendation model


In [11]:
from pyspark.sql import functions as F
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec, VectorAssembler, StringIndexer
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline

# 1. Tiền xử lý dữ liệu
cleaned_data = full_data_filter.filter(
    (col("rating").isNotNull()) &
    (col("user_id").isNotNull()) &
    (col("asin").isNotNull())
)

# 2. Xử lý văn bản
tokenizer = Tokenizer(inputCol="title", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
word2vec = Word2Vec(vectorSize=100, minCount=5, inputCol="filtered_words", outputCol="title_vector")

# 3. Mã hóa user_id và asin
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_index")
item_indexer = StringIndexer(inputCol="asin", outputCol="item_index")

# 4. Kết hợp các đặc trưng
assembler = VectorAssembler(
    inputCols=["title_vector", "rating_number", "helpful_vote"],
    outputCol="features"
)

# 5. Chuẩn bị dữ liệu
pipeline = Pipeline(stages=[
    tokenizer,
    remover,
    word2vec,
    user_indexer,
    item_indexer,
    assembler
])
processed_data = pipeline.fit(cleaned_data).transform(cleaned_data)

# 6. Chia tập train/test
(train, test) = processed_data.randomSplit([0.8, 0.2])

# 7. Xây dựng mô hình
gbt = GBTRegressor(featuresCol="features", labelCol="rating", maxIter=10)
model = gbt.fit(train)

# 8. Dự đoán và đánh giá
predictions = model.transform(test)
evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")
print(f"RMSE = {evaluator.evaluate(predictions)}")

RMSE = 1.2358194096099213


In [12]:
# Hiển thị 10 dòng đầu tiên của DataFrame predictions
predictions.select("user_id", "asin", "rating", "prediction").show(10)

+--------------------+----------+------+------------------+
|             user_id|      asin|rating|        prediction|
+--------------------+----------+------+------------------+
|AHP4ABT4AOUOKHKXC...|B00AR5BNYU|   5.0| 4.543541260479821|
|AFVOXOECS5WI3ZWDO...|B00BT9DTC2|   5.0| 4.952730037246921|
|AFR75TQBIL7AJJV3O...|B004YCRITQ|   5.0| 3.632893100678571|
|AFJZGWC6JKRAK7GO6...|B009NX3FPW|   4.0|   4.0056641527055|
|AHYIZZXDFYHBM7JNR...|B01GOK34SO|   3.0|4.1863578862768644|
|AGJQ3FGQOPUBKJLJQ...|B005EZ5GW8|   5.0| 4.701062296723748|
|AGGWCWNXE7KBRWQL4...|B000FETCF0|   5.0| 4.989848269364911|
|AFHMXYAANNO3ESFWR...|B002I0J51U|   4.0| 3.906573838056802|
|AHGH5577R3CXYIINS...|B000NB7PUY|   3.0|3.8580413291048834|
|AEJYH6V7ETPBM4JW7...|0375869026|   5.0| 4.989848269364911|
+--------------------+----------+------+------------------+
only showing top 10 rows



## Another model


In [13]:
from pyspark.sql.window import Window

window_spec = Window.partitionBy("asin")
data_with_label = full_data_filter.withColumn(
    "label", F.avg("rating").over(window_spec)
)

In [14]:
cleaned_data = data_with_label.dropna(subset=["asin", "title", "label"])

In [15]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

tokenizer = Tokenizer(inputCol="title", outputCol="words")
words_df = tokenizer.transform(cleaned_data)

remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_df = remover.transform(words_df)

In [16]:
from pyspark.ml.feature import Word2Vec

word2vec = Word2Vec(vectorSize=100, inputCol="filtered_words", outputCol="title_vector")
w2v_model = word2vec.fit(filtered_df)
title_vectors_df = w2v_model.transform(filtered_df)

In [17]:
from pyspark.ml.feature import StringIndexer

asin_indexer = StringIndexer(inputCol="asin", outputCol="asin_index", handleInvalid="keep")
indexed_df = asin_indexer.fit(title_vectors_df).transform(title_vectors_df)

In [18]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["asin_index", "title_vector"],
    outputCol="features"
)
final_df = assembler.transform(indexed_df).select("features", "label", "asin")

In [19]:
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

In [21]:
# Re-assemble features to exclude asin_index (categorical with too many values)
assembler = VectorAssembler(
	inputCols=["title_vector"],
	outputCol="features"
)
final_df_no_cat = assembler.transform(title_vectors_df).select("features", "label", "asin")

# Split again for training/testing
train_df_no_cat, test_df_no_cat = final_df_no_cat.randomSplit([0.8, 0.2], seed=42)

gbt = GBTRegressor(featuresCol="features", labelCol="label", maxIter=10)
model = gbt.fit(train_df_no_cat)

In [22]:
predictions = model.transform(test_df)

# Đánh giá bằng RMSE
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"RMSE = {rmse}")

RMSE = 1.4348452796290567


In [23]:
# Giữ lại average_rating gốc từ dữ liệu ban đầu
test_with_avg = predictions.join(
    cleaned_data.select("asin", "average_rating").dropDuplicates(["asin"]),
    on="asin",
    how="inner"
)

# Hiển thị kết quả
test_with_avg.select("asin", "prediction", "average_rating").show(10)

+----------+------------------+--------------+
|      asin|        prediction|average_rating|
+----------+------------------+--------------+
|0143130838|3.9676121749323325|           4.6|
|0345339258| 4.080485964866758|           4.8|
|0375869026|3.6698861928055995|           4.7|
|0375869026| 4.113153827606006|           4.7|
|0375869026| 3.246148724858587|           4.7|
|0375869026| 4.207403931572312|           4.7|
|0399588175|3.2336419633113738|           4.7|
|0547978847|2.6220789529068442|           4.7|
|0700026657|2.4076677465541634|           3.2|
|0761174427|3.3306437764436767|           4.6|
+----------+------------------+--------------+
only showing top 10 rows



In [None]:
# # Fit the asin_indexer if not already fitted (use title_vectors_df as in cell 17)
# asin_indexer_model = asin_indexer.fit(title_vectors_df)

# def predict_rating(asin_input):
#     # Lấy title từ metadata
#     title_row = metadata.filter(metadata.asin == asin_input).select("title").first()
#     if title_row is None:
#         raise ValueError(f"ASIN {asin_input} not found in metadata.")
#     title = title_row["title"]
#     # Tạo DataFrame với asin và title
#     df = spark.createDataFrame([(asin_input, title)], ["asin", "title"])
#     df = asin_indexer_model.transform(df)
#     df = tokenizer.transform(df)
#     df = remover.transform(df)
#     df = w2v_model.transform(df)
#     df = assembler.transform(df)
#     result = model.transform(df)
#     return result.select("prediction").first()[0]

# # Dùng thử
# predicted_rating = predict_rating("B004HD55V0")
# print(f"Dự đoán rating cho B004HD55V0: {predicted_rating}")