# Hệ thống Gợi ý Sản phẩm - Gold Layer

Notebook này chứa implementation của hệ thống gợi ý sản phẩm sử dụng:
- Content-Based Filtering
- Collaborative Filtering
- Hybrid Approach

## 1. Khởi tạo Spark và Load dữ liệu từ Gold Layer

In [None]:
from pyspark.sql import SparkSession
import pyspark

# Connection settings for the object store and the Nessie catalog.
AWS_ACCESS_KEY = "minioadmin"
AWS_SECRET_KEY = "minioadmin"
AWS_S3_ENDPOINT = "http://minio_server:9000"
WAREHOUSE = "s3a://gold/"
NESSIE_URI = "http://nessie:19120/api/v1"

# Maven coordinates resolved by Spark at start-up (comma-separated).
_SPARK_PACKAGES = ",".join([
    "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.1",
    "org.projectnessie.nessie-integrations:nessie-spark-extensions-3.3_2.12:0.67.0",
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.300",
])

# Every Spark property, applied in this order.
_SPARK_SETTINGS = {
    "spark.jars.packages": _SPARK_PACKAGES,
    # Iceberg catalog named "nessie", backed by the Nessie server.
    "spark.sql.catalog.nessie": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.nessie.uri": NESSIE_URI,
    "spark.sql.catalog.nessie.ref": "main",
    "spark.sql.catalog.nessie.authentication.type": "NONE",
    "spark.sql.catalog.nessie.catalog-impl": "org.apache.iceberg.nessie.NessieCatalog",
    "spark.sql.catalog.nessie.warehouse": WAREHOUSE,
    "spark.sql.catalog.nessie.io-impl": "org.apache.iceberg.hadoop.HadoopFileIO",
    "spark.sql.catalog.nessie.s3.endpoint": AWS_S3_ENDPOINT,
    "spark.sql.catalog.nessie.s3.access-key": AWS_ACCESS_KEY,
    "spark.sql.catalog.nessie.s3.secret-key": AWS_SECRET_KEY,
    # Hadoop s3a filesystem settings (same credential values as above).
    "spark.hadoop.fs.s3a.access.key": AWS_ACCESS_KEY,
    "spark.hadoop.fs.s3a.secret.key": AWS_SECRET_KEY,
    # NOTE(review): this host differs from AWS_S3_ENDPOINT ("minio_server");
    # confirm which hostname is the intended one.
    "spark.hadoop.fs.s3a.endpoint": "http://minio:9000",
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

conf = pyspark.SparkConf().setAppName("Recommendation-System-GOLD")
for _key, _value in _SPARK_SETTINGS.items():
    conf.set(_key, _value)

spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Repeat the path-style flag on the live Hadoop configuration of the session.
_hadoop_conf = spark._jsc.hadoopConfiguration()
_hadoop_conf.set("fs.s3a.path.style.access", "true")

In [None]:
# Load the Gold Layer tables from the "nessie" catalog, keyed by table name.
_gold_tables = {
    _table: spark.table(f"nessie.{_table}")
    for _table in (
        "dim_customer",
        "dim_product",
        "dim_time",
        "dim_location",
        "fact_order",
    )
}

# Expose each table under the module-level name the later cells use.
dim_customer = _gold_tables["dim_customer"]
dim_product = _gold_tables["dim_product"]
dim_time = _gold_tables["dim_time"]
dim_location = _gold_tables["dim_location"]
fact_order = _gold_tables["fact_order"]

# Report the row count of every table (one Spark count per table).
print("Đã load các bảng từ Gold Layer:")
for _table_name, _table_df in _gold_tables.items():
    print(f"- {_table_name}: {_table_df.count()} records")

## 2. Content-Based Filtering

Gợi ý dựa trên đặc điểm của sản phẩm (category, title)


In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import DenseVector
# `F` is the conventional alias for the functions module; it is not a name
# exported by pyspark.sql.functions, so it must be imported this way.
from pyspark.sql import functions as F
from pyspark.sql.functions import udf, col
from pyspark.sql.types import DoubleType
import numpy as np

# Input for content-based filtering: products that have a title.
product_features = dim_product.select(
    "product_id",
    "product_title",
    "product_category"
).filter(col("product_title").isNotNull())

# Split product_title into word tokens.
tokenizer = Tokenizer(inputCol="product_title", outputCol="words")
product_features = tokenizer.transform(product_features)

# Drop stop words from the tokens.
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
product_features = remover.transform(product_features)

# Bag-of-words counts over the title tokens (at most 1000 terms, each
# appearing in at least 2 titles).
cv = CountVectorizer(inputCol="filtered_words", outputCol="title_features", vocabSize=1000, minDF=2.0)
cv_model = cv.fit(product_features)
product_features = cv_model.transform(product_features)

# Map each category string to a numeric index.
# NOTE(review): the index is appended below as a single numeric feature, not
# one-hot encoded — confirm that this is intended.
category_indexer = StringIndexer(inputCol="product_category", outputCol="category_index")
category_model = category_indexer.fit(product_features)
product_features = category_model.transform(product_features)

# Concatenate the title counts and the category index into one vector.
assembler = VectorAssembler(
    inputCols=["title_features", "category_index"],
    outputCol="product_features"
)
product_features = assembler.transform(product_features)

# Scale every feature vector to unit L2 norm.
normalizer = Normalizer(inputCol="product_features", outputCol="normalized_features", p=2.0)
product_features = normalizer.transform(product_features)

# Cosine similarity helper
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Each argument may be any object exposing ``toArray()`` (for example a
    Spark ML dense or sparse vector) or any numeric array-like such as a list
    or a numpy array.

    Args:
        vec1: First vector.
        vec2: Second vector.

    Returns:
        float: ``dot(vec1, vec2) / (|vec1| * |vec2|)``. Returns 0.0 when an
        argument is None, cannot be converted to numbers, or has zero norm.

    Raises:
        ValueError: If both vectors are non-zero but differ in length.
    """
    def _as_array(vec):
        # Vectors with toArray() are densified; anything else goes to numpy.
        to_array = getattr(vec, "toArray", None)
        values = to_array() if callable(to_array) else vec
        return np.asarray(values, dtype=float).ravel()

    if vec1 is None or vec2 is None:
        return 0.0

    try:
        arr1 = _as_array(vec1)
        arr2 = _as_array(vec2)
    except (TypeError, ValueError):
        # Non-numeric input keeps the previous "unsupported input scores 0.0"
        # behaviour instead of raising.
        return 0.0

    norm1 = float(np.linalg.norm(arr1))
    norm2 = float(np.linalg.norm(arr2))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return float(np.dot(arr1, arr2)) / (norm1 * norm2)

# Keep only the columns the recommenders need and cache that projection,
# since it is read again on every recommendation call.
_cached_columns = [
    "product_id",
    "product_title",
    "product_category",
    "normalized_features",
]
product_features_cached = product_features.select(*_cached_columns).cache()

# The count below is the first action on the cached DataFrame.
_product_total = product_features_cached.count()
print(f"Số lượng sản phẩm: {_product_total}")

# Content-based product recommendations
def content_based_recommend(user_id, n_recommendations=10):
    """Recommend products whose features resemble what the user already bought.

    The user profile is the mean of the ``normalized_features`` vectors of the
    products the user has purchased. Every product not yet purchased is scored
    by cosine similarity against that profile.

    Args:
        user_id: Value matched against ``fact_order.customer_id``.
        n_recommendations: Maximum number of products to return.

    Returns:
        A Spark DataFrame with columns ``product_id``, ``product_title``,
        ``product_category`` and ``similarity`` holding the highest-scoring
        products, or None when the user has no purchases, none of the
        purchased products has features, or nothing is left to recommend.
    """
    # Collect the purchased ids once; the list drives both the profile and
    # the exclusion of already-bought products.
    purchased_rows = (
        fact_order.filter(col("customer_id") == user_id)
        .select("product_id")
        .distinct()
        .collect()
    )
    user_purchased_list = [row[0] for row in purchased_rows]

    if not user_purchased_list:
        print(f"Không tìm thấy lịch sử mua hàng cho user {user_id}")
        return None

    all_products_pd = product_features_cached.select(
        "product_id", "product_title", "product_category", "normalized_features"
    ).toPandas()

    # Feature vectors of the purchased products, taken from the local frame.
    purchased_mask = all_products_pd["product_id"].isin(user_purchased_list)
    purchased_vectors = [
        vec.toArray()
        for vec in all_products_pd.loc[purchased_mask, "normalized_features"]
    ]
    if not purchased_vectors:
        return None

    user_profile = np.mean(purchased_vectors, axis=0)
    user_profile_vec = DenseVector(user_profile.tolist())

    # Only products the user has not bought yet are candidates.
    candidates_pd = all_products_pd[~purchased_mask].copy()
    if candidates_pd.empty:
        return None

    candidates_pd["similarity"] = [
        cosine_similarity(user_profile_vec, DenseVector(vec.toArray()))
        for vec in candidates_pd["normalized_features"]
    ]

    recommendations_pd = candidates_pd.nlargest(n_recommendations, "similarity")[
        ["product_id", "product_title", "product_category", "similarity"]
    ]
    if recommendations_pd.empty:
        # Guard before createDataFrame, which is not called on an empty frame.
        return None

    return spark.createDataFrame(recommendations_pd)



In [None]:
# Smoke test of the content-based recommender for a single customer.
test_user = "R_01vNIayewjIIKMF"
_cb_header = f"\n=== Content-Based Recommendations cho user: {test_user} ==="
print(_cb_header)

cb_recommendations = content_based_recommend(test_user, n_recommendations=10)
# The recommender returns None when it has nothing to show.
if cb_recommendations is not None:
    cb_recommendations.show(truncate=False)

In [None]:
from functools import reduce

from pyspark.sql import functions as F

# Distinct customer ids present in the order fact table.
user_list = [
    row[0] for row in fact_order.select("customer_id").distinct().collect()
]

# One recommendation DataFrame per user that produced a result.
all_recommendations = []

for user_id in user_list:
    recs = content_based_recommend(user_id, n_recommendations=10)
    if recs is not None:
        # Tag each row with the customer the recommendation belongs to.
        all_recommendations.append(
            recs.withColumn("customer_id", F.lit(user_id))
        )

# Merge the per-user results into a single DataFrame.
if all_recommendations:
    final_recommendations = reduce(
        lambda merged, rec_df: merged.union(rec_df),
        all_recommendations,
    )

    print(f"Tổng số gợi ý được tạo ra: {final_recommendations.count()}")
    final_recommendations.show(20, truncate=False)
else:
    print("Không có gợi ý nào được tạo ra.")


## 3. Collaborative Filtering

Gợi ý dựa trên hành vi mua hàng của người dùng tương tự

In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
# `F` is the conventional alias for the functions module; it is not a name
# exported by pyspark.sql.functions, so it must be imported this way.
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lit

# Aggregate the order facts to one row per (customer, product) pair.
user_item_data = fact_order.groupBy("customer_id", "product_id") \
    .agg(
        F.count("*").alias("purchase_count"),
        F.sum("total_price").alias("total_spent"),
        F.avg("purchase_price_per_unit").alias("avg_price")
    )

# Derive a rating from purchase frequency and spend:
# (purchase_count * 2 + 0.5 * log(total_spent + 1)) / 2, clamped to [0, 5].
user_item_data = user_item_data.withColumn(
    "rating",
    F.least(
        F.lit(5.0),
        F.greatest(
            F.lit(0.0),
            (col("purchase_count") * 2.0 + F.log(col("total_spent") + 1) * 0.5) / 2.0
        )
    )
)

# ALS needs numeric ids, so index the customer and product ids.
user_indexer = StringIndexer(inputCol="customer_id", outputCol="user_index")
user_model = user_indexer.fit(user_item_data)
user_item_data = user_model.transform(user_item_data)

product_indexer = StringIndexer(inputCol="product_id", outputCol="product_index")
product_model = product_indexer.fit(user_item_data)
user_item_data = product_model.transform(user_item_data)

# Training frame: integer indices, float rating, positive ratings only.
cf_data = user_item_data.select(
    col("user_index").cast("int"),
    col("product_index").cast("int"),
    col("rating").cast("float")
).filter(col("rating") > 0)

print(f"Số lượng interactions: {cf_data.count()}")
print(f"Số lượng users: {cf_data.select('user_index').distinct().count()}")
print(f"Số lượng products: {cf_data.select('product_index').distinct().count()}")
cf_data.show(10)


In [None]:
# Hold out 20% of the interactions for evaluation (fixed seed).
train_data, test_data = cf_data.randomSplit([0.8, 0.2], seed=42)

# ALS estimator on the indexed (user, product, rating) triples.
als = ALS(
    userCol="user_index",
    itemCol="product_index",
    ratingCol="rating",
    implicitPrefs=False,
    coldStartStrategy="drop",
    regParam=0.1,
    maxIter=10,
)

print("Đang huấn luyện mô hình Collaborative Filtering...")
als_model = als.fit(train_data)

# Score the held-out interactions and report the RMSE against the ratings.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
predictions = als_model.transform(test_data)
rmse = evaluator.evaluate(predictions)
print(f"\nRMSE của mô hình: {rmse:.4f}")

# Lookup tables between the original ids and the indices used by ALS.
product_to_index_map = user_item_data.select("product_id", "product_index").distinct()
user_to_index_map = user_item_data.select("customer_id", "user_index").distinct()

print("\nĐã hoàn thành huấn luyện mô hình Collaborative Filtering!")


In [None]:
def collaborative_filtering_recommend(user_id, n_recommendations=10):
    """Recommend products for one user with the trained ALS model.

    Every indexed product is scored for the user, products the user already
    bought are removed, and the highest predictions are kept.

    Args:
        user_id: Value matched against ``customer_id`` in the index map.
        n_recommendations: Maximum number of products to keep.

    Returns:
        A Spark DataFrame with columns ``product_id``, ``product_title``,
        ``product_category`` and ``rating_score``, sorted by ``rating_score``
        descending, or None when the user is absent from the index map.
    """
    # Single lookup: first() returns None when no row matches.
    user_row = (
        user_to_index_map.filter(col("customer_id") == user_id)
        .select("user_index")
        .first()
    )

    if user_row is None:
        print(f"User {user_id} không có trong hệ thống")
        return None

    user_idx = user_row[0]

    # Pair this user with every indexed product and score the pairs.
    all_products = product_to_index_map.select("product_index").distinct()
    user_products = all_products.withColumn("user_index", lit(int(user_idx)))
    recommendations = als_model.transform(user_products)

    # Indices of the products this user has already bought.
    user_purchased = fact_order.filter(col("customer_id") == user_id) \
                               .select("product_id") \
                               .distinct()

    user_purchased_indices = user_purchased.join(
        product_to_index_map,
        on="product_index" if False else "product_id",
        how="inner"
    ).select("product_index")

    # Drop purchased products and missing predictions, keep the top scores.
    recommendations = recommendations.join(
        user_purchased_indices,
        on="product_index",
        how="left_anti"
    ).filter(col("prediction").isNotNull()) \
     .orderBy(col("prediction").desc()) \
     .limit(n_recommendations)

    # Attach product ids and descriptive columns. Joins do not guarantee row
    # order, so the result is sorted again at the end.
    recommendations = recommendations.join(
        product_to_index_map,
        on="product_index",
        how="inner"
    ).join(
        dim_product,
        on="product_id",
        how="inner"
    ).select(
        "product_id",
        "product_title",
        "product_category",
        col("prediction").alias("rating_score")
    ).orderBy(col("rating_score").desc())

    return recommendations


In [None]:
# Smoke test of the collaborative-filtering recommender for a single customer.
test_user = "R_01vNIayewjIIKMF"
_cf_header = f"\n=== Collaborative Filtering Recommendations cho user: {test_user} ==="
print(_cf_header)

cf_recommendations = collaborative_filtering_recommend(test_user, n_recommendations=10)
# The recommender returns None for a user missing from the index map.
if cf_recommendations is not None:
    cf_recommendations.show(truncate=False)

In [None]:
# Top-10 ALS recommendations for every user, as one array column per user.
_top10_per_user = als_model.recommendForAllUsers(10)

# Restore the original customer id and emit one row per recommended product.
_exploded = (
    _top10_per_user
    .join(user_to_index_map, on="user_index", how="inner")
    .withColumn("recommendations", F.explode("recommendations"))
)
_scored_pairs = _exploded.select(
    "customer_id",
    F.col("recommendations.product_index").alias("product_index"),
    F.col("recommendations.rating").alias("rating_score"),
)

# Map the product index back to its id and attach the descriptive columns.
_with_product_ids = _scored_pairs.join(product_to_index_map, on="product_index", how="inner")
_with_details = _with_product_ids.join(dim_product, on="product_id", how="inner")
all_recommendations = _with_details.select(
    "customer_id", "product_id", "product_title", "product_category", "rating_score"
)

all_recommendations.show(20, truncate=False)


## 4. Hybrid Approach

Kết hợp Content-Based và Collaborative Filtering


In [None]:
def hybrid_recommend(user_id, n_recommendations=10, cb_weight=0.4, cf_weight=0.6):
    """Recommend products by blending content-based and collaborative scores.

    Both recommenders are asked for ``2 * n_recommendations`` candidates. Each
    score set is min-max scaled to [0, 1] and the two are combined as
    ``cb_score * cb_weight + cf_score * cf_weight``. A product missing from
    one side contributes 0.0 for that side. When only one recommender yields
    rows, its scaled score is used unweighted and the other score is null.

    Args:
        user_id: Customer id passed to both recommenders.
        n_recommendations: Maximum number of products to return.
        cb_weight: Weight of the content-based score (default 0.4).
        cf_weight: Weight of the collaborative score (default 0.6).

    Returns:
        A Spark DataFrame with columns ``product_id``, ``product_title``,
        ``product_category``, ``recommendation_score``,
        ``content_based_score`` and ``collaborative_score``, sorted by
        ``recommendation_score`` descending, or None when neither recommender
        produced any row.
    """
    cb_recs = content_based_recommend(user_id, n_recommendations=n_recommendations * 2)
    cf_recs = collaborative_filtering_recommend(user_id, n_recommendations=n_recommendations * 2)

    def normalize_score(df, score_col, out_col):
        """Min-max scale ``score_col`` into ``out_col``; None if ``df`` has no rows."""
        if df is None:
            return None

        # Row count and score range in a single Spark job.
        stats = df.agg(
            F.count("*").alias("n"),
            F.min(score_col).alias("min"),
            F.max(score_col).alias("max"),
        ).first()
        if stats["n"] == 0:
            return None

        min_score = stats["min"]
        max_score = stats["max"]
        if min_score is None or max_score is None or max_score == min_score:
            # No spread to scale by: every row gets the midpoint.
            scaled = lit(0.5)
        else:
            scaled = (col(score_col) - lit(min_score)) / (lit(max_score) - lit(min_score))

        return df.select("product_id", scaled.alias(out_col))

    cb_scores = normalize_score(cb_recs, "similarity", "cb_score")
    cf_scores = normalize_score(cf_recs, "rating_score", "cf_score")

    # Covers both "recommender returned None" and "returned an empty frame".
    if cb_scores is None and cf_scores is None:
        print(f"Không thể tạo recommendations cho user {user_id}")
        return None

    if cb_scores is not None and cf_scores is not None:
        hybrid_recs = cb_scores.join(cf_scores, on="product_id", how="full_outer")
        hybrid_recs = hybrid_recs.fillna(0.0, subset=["cb_score", "cf_score"])
        hybrid_recs = hybrid_recs.withColumn(
            "hybrid_score",
            col("cb_score") * cb_weight + col("cf_score") * cf_weight
        )
    elif cb_scores is not None:
        # Add the missing score column so the final select always resolves.
        hybrid_recs = cb_scores \
            .withColumn("cf_score", lit(None).cast("double")) \
            .withColumn("hybrid_score", col("cb_score"))
    else:
        hybrid_recs = cf_scores \
            .withColumn("cb_score", lit(None).cast("double")) \
            .withColumn("hybrid_score", col("cf_score"))

    hybrid_recs = hybrid_recs.join(
        dim_product,
        on="product_id",
        how="inner"
    ).select(
        "product_id",
        "product_title",
        "product_category",
        col("hybrid_score").alias("recommendation_score"),
        col("cb_score").alias("content_based_score"),
        col("cf_score").alias("collaborative_score")
    ).orderBy(col("recommendation_score").desc()) \
     .limit(n_recommendations)

    return hybrid_recs


In [None]:
# Smoke test of the hybrid recommender for a single customer.
test_user = "R_01vNIayewjIIKMF"
_hybrid_header = f"\n=== Hybrid Recommendations cho user: {test_user} ==="
print(_hybrid_header)

hybrid_recs = hybrid_recommend(test_user, n_recommendations=10)
# The recommender returns None when neither method produced a result.
if hybrid_recs is not None:
    hybrid_recs.show(truncate=False)

## 5. Test và So sánh các phương pháp


In [None]:
# Run all three recommenders for the same customer and print them in turn.
test_user = "R_01vNIayewjIIKMF"

_banner = "=" * 80
print(_banner)
print(f"TEST RECOMMENDATIONS CHO USER: {test_user}")
print(_banner)

# 1) Content-based
print("\n1. CONTENT-BASED FILTERING:")
cb_recommendations = content_based_recommend(test_user, n_recommendations=10)
if cb_recommendations is not None:
    cb_recommendations.show(truncate=50)

# 2) Collaborative filtering
print("\n2. COLLABORATIVE FILTERING:")
cf_recommendations = collaborative_filtering_recommend(test_user, n_recommendations=10)
if cf_recommendations is not None:
    cf_recommendations.show(truncate=50)

# 3) Hybrid of the two
print("\n3. HYBRID APPROACH:")
hybrid_recs = hybrid_recommend(test_user, n_recommendations=10)
if hybrid_recs is not None:
    hybrid_recs.show(truncate=50)
