In [1]:
import json
import pandas as pd
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode


spark = SparkSession.builder.appName("ProcessingDataset")\
    .config("spark.kryoserializer.buffer.max", "512m") \
    .getOrCreate()
spark.conf.set('spark.sql.caseSensitive', True)

In [2]:
user_reviews= spark.read.parquet("Dataset/user_reviews.parquet")

In [3]:
user_reviews.show(5, False)

+------------+--------------------------------------------------------+------+------------+----------------------------+----------+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+-----------------+-------------------+
|review_id   |ti

In [4]:
user_reviews_demo = user_reviews.limit(50000)

In [5]:
user_reviews_demo.show(5, False)

+------------+--------------------------------------------------------+------+------------+----------------------------+----------+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+-----------------+-------------------+
|review_id   |ti

In [6]:
metadata=spark.read.parquet("Dataset/metadata.parquet")
metadata.show(5, False)

+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+-------------+-------------------------+-------------------------------------------------------------------+-----+-----------+
|parent_asin|title                                                                                                                                                                                                   |average_rating|rating_number|main_category            |categories                                                         |price|store      |
+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+-------------+-------------------------+-------------------

In [7]:
full_data = user_reviews_demo.join(metadata, user_reviews_demo.parent_asin == metadata.parent_asin, "inner")

In [8]:
full_data_filter = full_data.select(
	"review_id",
	user_reviews_demo["title"].alias("title"),
	metadata["title"].alias("product_title"),
	"rating",
	"average_rating",
	"rating_number",
	user_reviews_demo["asin"].alias("asin"),
	metadata["parent_asin"].alias("parent_asin"),
	"user_id",
	"helpful_vote",
	"categories"
)
full_data_filter.show(5, False)

+------------+----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+--------------+-------------+----------+-----------+----------------------------+------------+---------------------------------------------------------------------+
|review_id   |title                                   |product_title                                                                                                                                                                                           |rating|average_rating|rating_number|asin      |parent_asin|user_id                     |helpful_vote|categories                                                           |
+------------+----------------------------------------+-----------------------------------------------------------------------------------------

In [9]:
full_data_filter.printSchema()

root
 |-- review_id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- rating_number: long (nullable = true)
 |-- asin: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)



## Simple recommendation model


In [10]:
from pyspark.sql import functions as F
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec, VectorAssembler, StringIndexer
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline

# 1. Tiền xử lý dữ liệu
cleaned_data = full_data_filter.filter(
    (col("rating").isNotNull()) &
    (col("user_id").isNotNull()) &
    (col("asin").isNotNull())
)

# 2. Xử lý văn bản
tokenizer = Tokenizer(inputCol="title", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
word2vec = Word2Vec(vectorSize=100, minCount=5, inputCol="filtered_words", outputCol="title_vector")

# 3. Mã hóa user_id và asin
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_index")
item_indexer = StringIndexer(inputCol="asin", outputCol="item_index")

# 4. Kết hợp các đặc trưng
assembler = VectorAssembler(
    inputCols=["title_vector", "rating_number", "helpful_vote"],
    outputCol="features"
)

# 5. Chuẩn bị dữ liệu
pipeline = Pipeline(stages=[
    tokenizer,
    remover,
    word2vec,
    user_indexer,
    item_indexer,
    assembler
])
processed_data = pipeline.fit(cleaned_data).transform(cleaned_data)

# 6. Chia tập train/test
(train, test) = processed_data.randomSplit([0.8, 0.2])

# 7. Xây dựng mô hình
gbt = GBTRegressor(featuresCol="features", labelCol="rating", maxIter=10)
model = gbt.fit(train)

# 8. Dự đoán và đánh giá
predictions = model.transform(test)
evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")
print(f"RMSE = {evaluator.evaluate(predictions)}")

RMSE = 1.2309190488855302


In [11]:
# Hiển thị 10 dòng đầu tiên của DataFrame predictions
predictions.select("user_id", "asin", "rating", "prediction").show(10)

+--------------------+----------+------+------------------+
|             user_id|      asin|rating|        prediction|
+--------------------+----------+------+------------------+
|AFM7AIBN4HWKS5AGW...|B07GHWHFR5|   5.0| 4.990097059163044|
|AHWN2QPRGWKGU4LQJ...|B0038MTE7C|   4.0|3.5108793489073546|
|AHUJDDNT2Q2W6UCQI...|B0788L9WJ1|   5.0|  4.79146265093197|
|AEKUB3IPTTU6XZIEK...|B08XB5QX5M|   2.0| 4.576241042385885|
|AFPNUS6ZEVBGIJVTQ...|B0056JPS84|   1.0| 4.016090537623408|
|AF7MBS6TESBWUHHGJ...|B01A8HWLWE|   5.0|3.5307112337387627|
|AGECWTU7YASGQPDS4...|B077WYPYC7|   5.0| 4.286001595668527|
|AH45BPEN63SXGJJVK...|B00HZWEB74|   5.0| 4.062079699336152|
|AFM7J3OCFMHI4V23S...|B07RCXNW3K|   1.0|3.7774125824062867|
|AE2ZIKSCUNJIB7HEX...|B07XHMFCJ2|   2.0|3.6240134159474686|
+--------------------+----------+------+------------------+
only showing top 10 rows



## Another model


In [12]:
from pyspark.sql.window import Window

window_spec = Window.partitionBy("asin")
data_with_label = full_data_filter.withColumn(
    "label", F.avg("rating").over(window_spec)
)

In [13]:
cleaned_data = data_with_label.dropna(subset=["asin", "title", "label","product_title"])

In [14]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

tokenizer = Tokenizer(inputCol="title", outputCol="words")
words_df = tokenizer.transform(cleaned_data)

remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_df = remover.transform(words_df)

In [15]:
from pyspark.ml.feature import Word2Vec

word2vec = Word2Vec(vectorSize=100, inputCol="filtered_words", outputCol="title_vector")
w2v_model = word2vec.fit(filtered_df)
title_vectors_df = w2v_model.transform(filtered_df)

In [16]:
from pyspark.ml.feature import StringIndexer

asin_indexer = StringIndexer(inputCol="asin", outputCol="asin_index", handleInvalid="keep")
indexed_df = asin_indexer.fit(title_vectors_df).transform(title_vectors_df)

In [17]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["asin_index", "title_vector"],
    outputCol="features"
)
final_df = assembler.transform(indexed_df).select("features", "label", "asin")

In [18]:
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

In [19]:
# Re-assemble features to exclude asin_index (categorical with too many values)
assembler = VectorAssembler(
	inputCols=["title_vector"],
	outputCol="features"
)
final_df_no_cat = assembler.transform(title_vectors_df).select("features", "label", "asin")

# Split again for training/testing
train_df_no_cat, test_df_no_cat = final_df_no_cat.randomSplit([0.8, 0.2], seed=42)

gbt = GBTRegressor(featuresCol="features", labelCol="label", maxIter=10)
model = gbt.fit(train_df_no_cat)

In [20]:
predictions = model.transform(test_df)

# Đánh giá bằng RMSE
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"RMSE = {rmse}")

RMSE = 1.3813362763468389


In [24]:
# Giữ lại average_rating và title gốc từ dữ liệu ban đầu
test_with_avg = predictions.join(
    cleaned_data.select("asin", "average_rating", "product_title").dropDuplicates(["asin"]),
    on="asin",
    how="inner"
)

# Hiển thị kết quả
test_with_avg.select("asin", "product_title", "prediction", "average_rating").show(10)

+----------+--------------------+------------------+--------------+
|      asin|       product_title|        prediction|average_rating|
+----------+--------------------+------------------+--------------+
|B0136JP9OS|Xbox One 1TB Cons...| 3.747078295604036|           4.5|
|B00886WO7A|Dead Island Ripti...| 3.747078295604036|           4.1|
|B012DFI02O|VersionTECH. G200...| 3.666605169849606|           4.2|
|B012DFI02O|VersionTECH. G200...| 3.666605169849606|           4.2|
|B012DFI02O|VersionTECH. G200...|3.2628304737308027|           4.2|
|B012DFI02O|VersionTECH. G200...| 2.737113415400969|           4.2|
|B012DFI02O|VersionTECH. G200...|2.8748260123482363|           4.2|
|B012DFI02O|VersionTECH. G200...|2.6346214253974325|           4.2|
|B012DFI02O|VersionTECH. G200...| 2.653466897508133|           4.2|
|B012DFI02O|VersionTECH. G200...|3.3315377277625386|           4.2|
+----------+--------------------+------------------+--------------+
only showing top 10 rows



In [22]:
metadata.printSchema()

root
 |-- parent_asin: string (nullable = true)
 |-- title: string (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- rating_number: long (nullable = true)
 |-- main_category: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- price: string (nullable = true)
 |-- store: string (nullable = true)



In [23]:
# Fit the asin_indexer if not already fitted (use title_vectors_df as in cell 17)
asin_indexer_model = asin_indexer.fit(title_vectors_df)

def predict_rating(asin_input):
    # Lấy title từ metadata
    title_row = metadata.filter(metadata.parent_asin == asin_input).select("title").first()
    if title_row is None:
        raise ValueError(f"ASIN {asin_input} not found in metadata.")
    title = title_row["title"]
    # Tạo DataFrame với asin và title
    df = spark.createDataFrame([(asin_input, title)], ["parent_asin", "title"])
    df = asin_indexer_model.transform(df)
    df = tokenizer.transform(df)
    df = remover.transform(df)
    df = w2v_model.transform(df)
    df = assembler.transform(df)
    result = model.transform(df)
    return result.select("prediction").first()[0]

# Dùng thử
predicted_rating = predict_rating("B004HD55V0")
print(f"Dự đoán rating cho B004HD55V0: {predicted_rating}")

Dự đoán rating cho B004HD55V0: 4.1005248275288615
