In [1]:
import os

import pyspark
from pyspark.sql import SparkSession

# ---------------------------------------------------------------------------
# Connection settings
# ---------------------------------------------------------------------------
# Credentials default to the local MinIO development values but can be
# overridden through the environment, so real secrets never have to be typed
# into (or committed with) the notebook.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "minioadmin")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "minioadmin")

# Endpoint passed to the Iceberg catalog's `s3.*` properties.
AWS_S3_ENDPOINT = "http://minio_server:9000"
# Endpoint used by the Hadoop S3A filesystem (`spark.hadoop.fs.s3a.endpoint`).
# NOTE(review): this host differs from AWS_S3_ENDPOINT. With io-impl set to
# HadoopFileIO the S3A value is presumably the one that is actually used for
# reads/writes -- confirm which host is correct and collapse to one constant.
S3A_ENDPOINT = "http://minio:9000"

WAREHOUSE = "s3a://gold/"
NESSIE_URI = "http://nessie:19120/api/v1"

conf = (
    pyspark.SparkConf()
    .setAppName("Lakehouse-Iceberg-TrainModel")
    # Iceberg + Nessie + S3A jars are resolved at session start-up.
    .set('spark.jars.packages',
         'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.1,'
         'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.3_2.12:0.67.0,'
         'org.apache.hadoop:hadoop-aws:3.3.4,'
         'com.amazonaws:aws-java-sdk-bundle:1.12.300')
    # Iceberg catalog named `nessie`, backed by the Nessie server on branch `main`.
    .set("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .set("spark.sql.catalog.nessie.uri", NESSIE_URI)
    .set("spark.sql.catalog.nessie.ref", "main")
    .set("spark.sql.catalog.nessie.authentication.type", "NONE")
    .set("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    .set("spark.sql.catalog.nessie.warehouse", WAREHOUSE)
    .set("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO")
    .set("spark.sql.catalog.nessie.s3.endpoint", AWS_S3_ENDPOINT)
    .set("spark.sql.catalog.nessie.s3.access-key", AWS_ACCESS_KEY)
    .set("spark.sql.catalog.nessie.s3.secret-key", AWS_SECRET_KEY)
    # Hadoop S3A filesystem settings; reuse the constants instead of repeating literals.
    .set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
    .set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
    .set("spark.hadoop.fs.s3a.endpoint", S3A_ENDPOINT)
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
)

spark = (
    SparkSession.builder
    .config(conf=conf)
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# getOrCreate() can return an already-running session, in which case the
# `spark.hadoop.*` entries above are not re-applied; set path-style access on
# the live Hadoop configuration as well.
spark._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")


In [2]:
# Lazy DataFrame handles for the fact table and its four dimensions in the
# `nessie` catalog; nothing is read until an action is triggered.
df_fact, df_customer, df_product, df_time, df_location = (
    spark.table(f"nessie.{table_name}")
    for table_name in (
        "fact_order",
        "dim_customer",
        "dim_product",
        "dim_time",
        "dim_location",
    )
)


In [3]:
# Denormalise the star schema: one row per fact_order record, enriched with
# the time, customer, product and location attributes. LEFT JOINs keep every
# fact row even when a dimension key has no match (those attributes are NULL).
query = """
SELECT  
    f.time_id,
    f.customer_id,
    f.product_id,
    f.location_id,
    f.purchase_price_per_unit,
    f.quantity,
    f.total_price,

    -- Dim_time
    t.order_date,
    t.year,
    t.month,
    t.day,
    t.quarter,
    t.weekday_name,

    -- Dim_customer
    c.age_group,
    c.gender,
    c.education,
    c.income,
    c.race,
    c.state,

    -- Dim_product
    p.product_title,
    p.product_category,

    -- Dim_location
    l.state_code,
    l.state_name,
    l.region

FROM nessie.fact_order AS f
LEFT JOIN nessie.dim_time AS t ON f.time_id = t.time_id
LEFT JOIN nessie.dim_customer AS c ON f.customer_id = c.customer_id
LEFT JOIN nessie.dim_product AS p ON f.product_id = p.product_id
LEFT JOIN nessie.dim_location AS l ON f.location_id = l.location_id
"""

In [4]:
# Build the (lazy) joined DataFrame used by every later cell.
df_fact_full = spark.sql(query)
# Bring only ten rows to the driver as a pandas frame for a quick visual check.
df_fact_full.limit(10).toPandas()


Unnamed: 0,time_id,customer_id,product_id,location_id,purchase_price_per_unit,quantity,total_price,order_date,year,month,...,gender,education,income,race,state,product_title,product_category,state_code,state_name,region
0,439,R_1jZkLNE1JdtyVpH,000217653X,44,29.99,1.0,29.99,2020-09-16,2020,9,...,Female,High school diploma or GED,"Less than $25,000",White or Caucasian,Florida,THE DINAH'S CUPBOARD COOK BOOK: Recipes and Me...,ABIS_BOOK,FL,Unknown,Unknown
1,439,R_1jZkLNE1JdtyVpH,000217653X,39,13.55,1.0,13.55,2020-09-16,2020,9,...,Female,High school diploma or GED,"Less than $25,000",White or Caucasian,Florida,THE DINAH'S CUPBOARD COOK BOOK: Recipes and Me...,ABIS_BOOK,TX,Texas,South
2,444,R_3qIPMah81MezsJn,0007137508,33,19.95,1.0,19.95,2022-12-05,2022,12,...,Male,Bachelor's degree,"$50,000 - $74,999",White or Caucasian,Tennessee,Wellington: The Iron Duke,ABIS_BOOK,TN,Unknown,Unknown
3,428,R_vD2O13NgdnWBXMt,0007302622,4,13.25,1.0,13.25,2019-08-10,2019,8,...,Female,"Graduate or professional degree (MA, MS, MBA, ...","$50,000 - $74,999",White or Caucasian,New Jersey,Duck in the Truck,ABIS_BOOK,NJ,New Jersey,Northeast
4,1573,R_1QsZS0nI2sw5gl5,000745287X,41,14.96,1.0,14.96,2022-06-27,2022,6,...,Male,"Graduate or professional degree (MA, MS, MBA, ...","$150,000 or more",White or Caucasian,Georgia,Sharpe's Regiment: Richard Sharpe and the Inva...,ABIS_BOOK,GA,Unknown,Unknown
5,1815,R_2aldwxmUZox7Yfd,0007483791,16,10.84,1.0,10.84,2018-03-21,2018,3,...,Male,"Graduate or professional degree (MA, MS, MBA, ...","$150,000 or more",White or Caucasian,California,Deep Time,ABIS_BOOK,CA,California,West
6,23,R_3GD1CL4OyjglmbZ,0007510837,36,24.04,1.0,24.04,2020-01-21,2020,1,...,Female,High school diploma or GED,"$25,000 - $49,999",White or Caucasian,Pennsylvania,Collins German Dictionary Complete and Unabrid...,ABIS_BOOK,PA,Unknown,Unknown
7,328,R_3Pc1ZZfNy58AvgE,0007544790,16,18.72,1.0,18.72,2019-07-23,2019,7,...,Male,Bachelor's degree,"$100,000 - $149,999",Other,California,My Virgin Kitchen: Delicious recipes you can m...,ABIS_BOOK,CA,California,West
8,354,R_27Nf8ImFlWu3J9O,000756032X,4,9.99,1.0,9.99,2018-06-20,2018,6,...,Male,High school diploma or GED,"Less than $25,000",White or Caucasian,California,Born into the Children of God: My life in a re...,ABIS_BOOK,NJ,New Jersey,Northeast
9,1201,R_3Pp1HTLxoglta9u,0008100713,32,23.08,1.0,23.08,2022-06-08,2022,6,...,Male,"Graduate or professional degree (MA, MS, MBA, ...","$75,000 - $99,999",White or Caucasian,Ohio,Well Gardened Mind,ABIS_BOOK,OH,Unknown,Unknown


In [5]:
# Show the schema of the joined fact/dimension DataFrame
# (no cleaning step has been applied to it at this point).
df_fact_full.printSchema()

root
 |-- time_id: long (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- location_id: long (nullable = true)
 |-- purchase_price_per_unit: double (nullable = true)
 |-- quantity: double (nullable = true)
 |-- total_price: double (nullable = true)
 |-- order_date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- weekday_name: string (nullable = true)
 |-- age_group: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- education: string (nullable = true)
 |-- income: string (nullable = true)
 |-- race: string (nullable = true)
 |-- state: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- state_code: string (nullable = true)
 |-- state_name: string (nullable = true)
 |-- region: string (nullable = true)



In [6]:
# Dataset shape: row count (triggers a full Spark job) and column count
# (read from the schema, no job needed).
num_rows, num_cols = df_fact_full.count(), len(df_fact_full.columns)
print(f"\nKích thước dữ liệu: ({num_rows}, {num_cols})")


Kích thước dữ liệu: (1675015, 24)


In [7]:
# =====================================================
# Per-customer feature engineering
# =====================================================
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Line-item value: unit price times quantity.
df_fact_full = df_fact_full.withColumn('amount', F.col('purchase_price_per_unit') * F.col('quantity'))

# Most frequent product_category per customer.
df_cat = df_fact_full.groupBy("customer_id", "product_category").agg(F.count("*").alias("cnt"))
# Ties on `cnt` are broken alphabetically (NULL categories last) so that
# row_number() selects the same category on every run; ordering by `cnt`
# alone leaves the winner among tied categories up to Spark's partitioning.
w_cat = Window.partitionBy("customer_id").orderBy(
    F.desc("cnt"), F.asc_nulls_last("product_category")
)
df_cat_rank = df_cat.withColumn("rank", F.row_number().over(w_cat))
df_most_freq_category = df_cat_rank.filter(F.col("rank")==1)\
                                  .select("customer_id", F.col("product_category").alias("most_freq_category"))

# Numeric features, aggregated over every row of a customer.
#   n_orders     counts rows (line items) with a non-null amount
#   total_orders counts distinct time_id values
# years_active is last_year - first_year + 1; since max >= min it is always
# >= 1 (or NULL when the customer has no year at all), so no clamp is needed.
df_numeric = df_fact_full.groupBy("customer_id").agg(
    F.sum("amount").alias("total_spend"),
    F.count("amount").alias("n_orders"),
    F.avg("amount").alias("avg_order_value"),
    F.stddev("amount").alias("std_order_value"),
    (F.sum("quantity") / F.countDistinct("time_id")).alias("avg_items_per_order"),
    F.countDistinct("time_id").alias("total_orders"),
    F.min("year").alias("first_year"),
    F.max("year").alias("last_year")
).withColumn("years_active", (F.col("last_year") - F.col("first_year") + 1))\
 .fillna({'std_order_value': 0})  # stddev is NULL for a single-row customer

# Merge features + demographics.
# dropDuplicates keeps one arbitrary row per customer.
# NOTE(review): assumes these demographic columns are constant per
# customer_id -- confirm against dim_customer.
demographics_cols = ['gender','education','income','state']
df_demo = df_fact_full.select('customer_id', *demographics_cols).dropDuplicates(['customer_id'])

df_features = df_numeric.join(df_most_freq_category, on='customer_id', how='left')\
                        .join(df_demo, on='customer_id', how='left')

# =====================================================
# Target: next_year_orders
# =====================================================
# Per customer and year, count rows; lead() pairs each year with the count of
# the customer's next recorded year; the per-customer target is the MAX of
# those lead values (0 when the customer appears in a single year only).
# NOTE(review): the features above are aggregated over all years, including
# the years whose counts make up this target, so the label is reachable from
# the features (leakage). Consider building features from years before a
# cut-off and the target from the cut-off year.
df_target = df_fact_full.groupBy('customer_id','year').agg(F.count("*").alias("orders_per_year"))
w = Window.partitionBy("customer_id").orderBy("year")
df_target = df_target.withColumn("next_year_orders", F.lead("orders_per_year",1).over(w))
df_target_max = df_target.groupBy("customer_id").agg(F.max("next_year_orders").alias("next_year_orders"))

# Cached because this small per-customer frame is re-used by both counts
# below and by every model fit/evaluation; without the cache each of those
# actions re-runs the joins and aggregations over the full fact table.
df_final = (
    df_features.join(df_target_max, on='customer_id', how='left')
    .fillna({'next_year_orders':0})
    .cache()
)

# =====================================================
# Feature / target column lists
# =====================================================
numeric_features = ['total_spend','n_orders','avg_order_value','std_order_value',
                    'avg_items_per_order','total_orders','years_active']
categorical_features = ['most_freq_category','gender','education','income','state']
target = 'next_year_orders'

# StringIndexer + OneHotEncoder (handleInvalid="keep" gives unseen labels
# their own extra index instead of failing at transform time).
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer, OneHotEncoder

indexers = [StringIndexer(inputCol=c, outputCol=c+"_idx", handleInvalid="keep") for c in categorical_features]
encoders = [OneHotEncoder(inputCol=c+"_idx", outputCol=c+"_ohe") for c in categorical_features]

# VectorAssembler + StandardScaler
assembler = VectorAssembler(inputCols=numeric_features + [c+"_ohe" for c in categorical_features],
                            outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withMean=True, withStd=True)

# =====================================================
# Train/test split
# =====================================================
# Split by the last year in which a customer has any row.
# NOTE(review): the saved output shows Train: 167 | Test: 2689, i.e. the
# model is fitted on far fewer customers than it is evaluated on -- confirm
# this split is intended.
train_df = df_final.filter(F.col('last_year') <= 2021)
test_df  = df_final.filter(F.col('last_year') == 2022)
print(f"Train: {train_df.count()} | Test: {test_df.count()}")


Train: 167 | Test: 2689


In [8]:
# Preview the first 10 training rows; truncate=False prints full cell values.
print("=== Train sample ===")
train_df.show(n=10, truncate=False)

=== Train sample ===
+-----------------+------------------+--------+------------------+------------------+-------------------+------------+----------+---------+------------+-------------------------+------+--------------------------------------------------------------------+-----------------+-------------+----------------+
|customer_id      |total_spend       |n_orders|avg_order_value   |std_order_value   |avg_items_per_order|total_orders|first_year|last_year|years_active|most_freq_category       |gender|education                                                           |income           |state        |next_year_orders|
+-----------------+------------------+--------+------------------+------------------+-------------------+------------+----------+---------+------------+-------------------------+------+--------------------------------------------------------------------+-----------------+-------------+----------------+
|R_2qEEh58vzt5peUq|2900.7            |108     |26.85833333333333 |5

## Random Forest regressor

In [9]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline

# --------------------------
# Random Forest model pipeline
# --------------------------
rf = RandomForestRegressor(
    labelCol=target,
    featuresCol="scaledFeatures",
    seed=42
)

# Preprocessing stages followed by the regressor, fitted as one unit.
rf_pipeline = Pipeline(stages=indexers + encoders + [assembler, scaler, rf])

# --------------------------
# Hyper-parameter grid to search (2 x 2 = 4 combinations)
# --------------------------
param_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [50, 100])
    .addGrid(rf.maxDepth, [10, 12])
    .build()
)

# --------------------------
# TrainValidationSplit
# --------------------------
# Candidates are ranked by R^2 on the held-out 20% of train_df.
evaluator = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="r2")

# seed fixes the random train/validation partition; without it the split, and
# therefore the selected hyper-parameters, can change between runs.
tvs = TrainValidationSplit(
    estimator=rf_pipeline,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    trainRatio=0.8,
    parallelism=4,
    seed=42
)

# --------------------------
# Fit the search
# --------------------------
rf_tvs_model = tvs.fit(train_df)

# --------------------------
# Report the best model
# --------------------------
# The regressor is the last stage of the best fitted pipeline.
best_rf = rf_tvs_model.bestModel.stages[-1]
print("Best Random Forest Model:")
# getNumTrees is accessed without parentheses here (the saved output shows a
# number), whereas getMaxDepth is called as a method.
print(" - numTrees:", best_rf.getNumTrees)
print(" - maxDepth:", best_rf.getMaxDepth())


Best Random Forest Model:
 - numTrees: 50
 - maxDepth: 10


In [10]:
# --------------------------
# Evaluate the best Random Forest pipeline
# --------------------------
# Score both splits with the pipeline selected by the search.
train_pred, test_pred = (
    rf_tvs_model.bestModel.transform(split_df) for split_df in (train_df, test_df)
)

# Report each regression metric for train and test side by side.
metrics = ['r2', 'mae', 'rmse']
for metric in metrics:
    evaluator = RegressionEvaluator(
        predictionCol="prediction",
        labelCol=target,
        metricName=metric,
    )
    train_score = round(evaluator.evaluate(train_pred), 4)
    print(f"{metric.upper()} Train:", train_score)
    test_score = round(evaluator.evaluate(test_pred), 4)
    print(f"{metric.upper()} Test :", test_score)
    print("----------")
    print("----------")


R2 Train: 0.9539
R2 Test : 0.2481
----------
MAE Train: 3.0962
MAE Test : 48.1052
----------
RMSE Train: 6.2239
RMSE Test : 105.6282
----------


## XGBoost regressor

In [11]:
from pyspark.ml import Pipeline
from xgboost.spark import SparkXGBRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# --------------------------
# XGBoost on Spark
# --------------------------
# NOTE(review): the saved output of this cell is an IllegalArgumentException
# ("most_freq_category_ohe does not exist"). The cause is not evident from
# this cell alone -- re-run on a fresh kernel and confirm whether it persists.
xgb = SparkXGBRegressor(
    features_col="scaledFeatures",
    label_col=target,
    num_workers=1,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    objective="reg:squarederror"
)

# --------------------------
# Pipeline: preprocessing stages followed by the regressor
# --------------------------
xgb_pipeline = Pipeline(stages=indexers + encoders + [assembler, scaler, xgb])

# --------------------------
# Grid search via TrainValidationSplit (2^4 = 16 combinations)
# --------------------------
param_grid = (
    ParamGridBuilder()
    .addGrid(xgb.max_depth, [4, 6])
    .addGrid(xgb.n_estimators, [50, 100])
    .addGrid(xgb.learning_rate, [0.05, 0.1])
    .addGrid(xgb.subsample, [0.7, 0.8])
    .build()
)

# Candidates are ranked by R^2 on the held-out 20% of train_df.
evaluator = RegressionEvaluator(
    labelCol=target,
    predictionCol="prediction",
    metricName="r2"
)

tvs = TrainValidationSplit(
    estimator=xgb_pipeline,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    trainRatio=0.8,
    parallelism=4,
    seed=42
)

# --------------------------
# Fit the search
# --------------------------
xgb_tvs_model = tvs.fit(train_df)

# --------------------------
# Report the best model
# --------------------------
# The regressor is the last stage of the best fitted pipeline.
best_xgb = xgb_tvs_model.bestModel.stages[-1]
print("Best XGBoost Model:")
# xgboost.spark exposes its hyper-parameters as Spark ML Params named after
# the XGBoost arguments; it does not generate getMaxDepth()-style getters, so
# the values are read with getOrDefault(<param name>).
print(" - maxDepth:", best_xgb.getOrDefault("max_depth"))
print(" - n_estimators:", best_xgb.getOrDefault("n_estimators"))
print(" - learning_rate:", best_xgb.getOrDefault("learning_rate"))
print(" - subsample:", best_xgb.getOrDefault("subsample"))


IllegalArgumentException: most_freq_category_ohe does not exist. Available: customer_id, total_spend, n_orders, avg_order_value, std_order_value, avg_items_per_order, total_orders, first_year, last_year, years_active, most_freq_category, gender, education, income, state, next_year_orders, TrainValidationSplit_cc65fb4a4e7a_rand

In [None]:
# --------------------------
# Predict & evaluate with the best XGBoost pipeline
# --------------------------
# Score both splits with the pipeline selected by the search.
train_pred, test_pred = (
    xgb_tvs_model.bestModel.transform(split_df) for split_df in (train_df, test_df)
)

# Report each regression metric for train and test side by side.
metrics = ['r2', 'mae', 'rmse']
for metric in metrics:
    evaluator = RegressionEvaluator(
        predictionCol="prediction",
        labelCol=target,
        metricName=metric,
    )
    train_score = round(evaluator.evaluate(train_pred), 4)
    print(f"{metric.upper()} Train:", train_score)
    test_score = round(evaluator.evaluate(test_pred), 4)
    print(f"{metric.upper()} Test :", test_score)
    print("----------")
