In [1]:
import json
import pandas as pd
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode


# Create (or reuse) the Spark session used by the whole notebook.
# NOTE(review): only the Kryo buffer ceiling is configured here; spark.serializer
# is not set in this cell — confirm Kryo is actually enabled elsewhere.
spark = (
    SparkSession.builder
    .appName("ProcessingDataset")
    .config("spark.kryoserializer.buffer.max", "512m")
    .getOrCreate()
)
# Resolve column names case-sensitively in Spark SQL.
spark.conf.set("spark.sql.caseSensitive", True)

In [2]:
# Load the user reviews table from the local Parquet dataset.
user_reviews = spark.read.parquet("Dataset/user_reviews.parquet")

In [3]:
# Preview the first 5 reviews without truncating long cell values.
user_reviews.show(n=5, truncate=False)

+------------+--------------------------------------------------------+------+------------+----------------------------+----------+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+-----------------+-------------------+
|review_id   |ti

In [4]:
# Work on a 100,000-row subset of the reviews to keep the demo tractable.
# NOTE(review): no ordering is applied before limit(), so which rows are kept
# may differ between runs — confirm this is acceptable.
user_reviews_demo = user_reviews.limit(100_000)

In [5]:
# Preview the first 5 rows of the demo subset without truncating long cell values.
user_reviews_demo.show(n=5, truncate=False)

+------------+--------------------------------------------------------+------+------------+----------------------------+----------+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+------------+-----------------+-------------------+
|review_id   |ti

In [6]:
# Load the product metadata table and preview its first 5 rows untruncated.
metadata = spark.read.parquet("Dataset/metadata.parquet")
metadata.show(n=5, truncate=False)

+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+-------------+-------------------------+-------------------------------------------------------------------+-----+-----------+
|parent_asin|title                                                                                                                                                                                                   |average_rating|rating_number|main_category            |categories                                                         |price|store      |
+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+-------------+-------------------------+-------------------

In [7]:
# Inner-join every sampled review with its product metadata on parent_asin.
# The join uses a column expression, so columns from both sides remain in the result.
full_data = user_reviews_demo.join(
    metadata,
    on=user_reviews_demo["parent_asin"] == metadata["parent_asin"],
    how="inner",
)

In [10]:
# Keep only the columns used downstream. Columns referenced through a source
# DataFrame (metadata[...] / user_reviews_demo[...]) are picked explicitly
# from that side of the join.
full_data_filter = full_data.select(
    [
        "review_id",
        metadata["title"].alias("title"),
        "rating",
        "average_rating",
        "rating_number",
        user_reviews_demo["asin"].alias("asin"),
        metadata["parent_asin"].alias("parent_asin"),
        "user_id",
        "helpful_vote",
        "categories",
    ]
)
# Preview the first 5 joined rows without truncating long cell values.
full_data_filter.show(n=5, truncate=False)

+------------+------------------------------------------------------------------------------------------------------------------------------+------+--------------+-------------+----------+-----------+----------------------------+------------+------------------------------------------------------------+
|review_id   |title                                                                                                                         |rating|average_rating|rating_number|asin      |parent_asin|user_id                     |helpful_vote|categories                                                  |
+------------+------------------------------------------------------------------------------------------------------------------------------+------+--------------+-------------+----------+-----------+----------------------------+------------+------------------------------------------------------------+
|103079215104|The First Templar - Xbox 360                                              

In [11]:
# Print the column names and types of the joined, trimmed DataFrame.
full_data_filter.printSchema()

root
 |-- review_id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- average_rating: double (nullable = true)
 |-- rating_number: long (nullable = true)
 |-- asin: string (nullable = true)
 |-- parent_asin: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- helpful_vote: long (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)



## Simple recommendation model


In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [12]:
from pyspark.sql.functions import col, explode, array_join

# Ratings table: keep only the needed columns (user, item, rating) and
# drop every row that has a null in any of them.
ratings_df = (
    full_data_filter
    .select("user_id", col("asin").alias("item_id"), "rating")
    .na.drop()
)

# Product table holding the content information (title, categories),
# reduced to a single row per item_id.
products_df = (
    full_data_filter
    .select(col("asin").alias("item_id"), "title", "categories")
    .dropDuplicates(["item_id"])
)

In [13]:
from pyspark.ml.feature import StringIndexer

# One StringIndexer per string id column, mapping the ids to numeric indices.
user_indexer = StringIndexer(
    inputCol="user_id",
    outputCol="user_id_index",
    handleInvalid="keep",
)
item_indexer = StringIndexer(
    inputCol="item_id",
    outputCol="item_id_index",
    handleInvalid="keep",
)

# Fit each indexer on the ratings and append its index column: users first, then items.
ratings_with_user_index = user_indexer.fit(ratings_df).transform(ratings_df)
indexed_ratings = item_indexer.fit(ratings_with_user_index).transform(ratings_with_user_index)

In [14]:
from pyspark.ml.recommendation import ALS

# Collaborative-filtering model (ALS) trained on the indexed explicit ratings.
als = ALS(
    rank=10,                     # number of latent factors
    maxIter=10,
    regParam=0.1,
    userCol="user_id_index",
    itemCol="item_id_index",
    ratingCol="rating",
    coldStartStrategy="drop",    # drop rows whose prediction would be NaN
    nonnegative=True,            # constrain the factors to be non-negative
    seed=42,                     # fixed seed: factor initialisation is random, so pin it for repeatable runs
)

# Fit the model on every indexed rating (no train/test split is made in this cell).
model_als = als.fit(indexed_ratings)

In [15]:
from pyspark.sql.functions import explode, when

# Top-10 recommended items for every user known to the model.
userRecs = model_als.recommendForAllUsers(10)

# One row per (user, recommended item): explode the recommendations array in place.
userRecs_exploded = userRecs.withColumn("recommendations", explode("recommendations"))

# Flatten the recommendation struct into plain columns. The former trailing
# withColumn("predicted_rating", col("predicted_rating")) only rewrote the
# column with itself, so it has been removed.
final_recommendations = userRecs_exploded.select(
    col("user_id_index"),
    col("recommendations")["item_id_index"].alias("item_id_index"),
    col("recommendations")["rating"].alias("predicted_rating"),
)

In [17]:
from pyspark.sql.functions import array_join, array_remove, col, concat_ws, split
from pyspark.ml.feature import CountVectorizer

# Combine title + categories into a single feature string per product.
products_df = products_df.withColumn(
    "features_text",
    concat_ws(" ", col("title"), array_join(col("categories"), " ")),
)

# Tokenize the feature string into array<string>. Splitting on runs of
# whitespace and removing "" keeps empty tokens (from repeated or leading
# spaces, or from rows with no text at all) out of the vocabulary.
products_df = products_df.withColumn(
    "features_text",
    array_remove(split(col("features_text"), r"\s+"), ""),
)

# Vectorize with CountVectorizer: at most 1000 vocabulary terms, each of which
# must appear in at least 2 products.
cv = CountVectorizer(inputCol="features_text", outputCol="features", vocabSize=1000, minDF=2.0)
cv_model = cv.fit(products_df)
content_features_df = cv_model.transform(products_df)

In [24]:
from pyspark.sql.functions import col, lit
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from pyspark.ml.linalg import Vector
import numpy as np

# 1. Cosine similarity between two feature vectors
def cosine_similarity(vec1: "Vector", vec2: "Vector") -> float:
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector; must expose ``toArray()``.
        vec2: Second vector; must expose ``toArray()`` and have the same length.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)`` as a Python float, or ``0.0`` when
        either vector has zero norm. Without that guard the division is 0/0
        and yields NaN, which Spark sorts above every real number, so a
        descending sort on similarity would surface only NaN pairs.
    """
    a = np.asarray(vec1.toArray(), dtype=float)
    b = np.asarray(vec2.toArray(), dtype=float)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0.0:
        # An all-zero vector shares no terms with anything.
        return 0.0
    return float(np.dot(a, b) / denominator)

# Expose the Python similarity function to Spark as a float-returning UDF.
cos_sim_udf = udf(cosine_similarity, FloatType())

# 2. Alias the feature table as "left"; the same table aliased as "right"
#    forms the other side of the self cross join.
content_features_df = content_features_df.alias("left")

# Cartesian product of the feature table with itself ("left" x "right").
all_pairs = content_features_df.crossJoin(content_features_df.alias("right"))

# 3. Cosine similarity for every pair of products, excluding the pairs that
#    compare a product with itself.
similarities_df = (
    all_pairs
    .select(
        col("left.item_id").alias("item_id_1"),
        col("right.item_id").alias("item_id_2"),
        cos_sim_udf(col("left.features"), col("right.features")).alias("similarity"),
    )
    .where(col("item_id_1") != col("item_id_2"))
)

# 4. Show the 10 most similar product pairs.
similarities_df.orderBy("similarity", ascending=False).show(10)

: 