In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, isnan, when
import os
import pyspark

# --- Connection settings ---------------------------------------------------
# Each value below can be overridden through an environment variable; the
# fallbacks are the values this notebook was originally written with, so the
# behaviour is unchanged when no variable is set.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "minioadmin")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "minioadmin")
AWS_S3_ENDPOINT = os.environ.get("AWS_S3_ENDPOINT", "http://minio_server:9000")
# NOTE(review): the original cell hard-coded this second host for the Hadoop
# S3A connector while AWS_S3_ENDPOINT points at "minio_server". Both defaults
# are preserved here; confirm which host is correct and unify them.
S3A_ENDPOINT = os.environ.get("S3A_ENDPOINT", "http://minio:9000")
WAREHOUSE = "s3a://silver/"
NESSIE_URI = "http://nessie:19120/api/v1"


conf = (
    pyspark.SparkConf()
    .setAppName("Lakehouse-Iceberg-ETL")
    # Jars resolved at session start: Iceberg runtime, Nessie extensions,
    # and the Hadoop/AWS SDK pair used for s3a:// access.
    .set('spark.jars.packages',
         'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.1,'
         'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.3_2.12:0.67.0,'
         'org.apache.hadoop:hadoop-aws:3.3.4,'
         'com.amazonaws:aws-java-sdk-bundle:1.12.300')
    # Iceberg catalog named "nessie", backed by the Nessie server on branch "main".
    .set("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .set("spark.sql.catalog.nessie.uri", NESSIE_URI)
    .set("spark.sql.catalog.nessie.ref", "main")
    .set("spark.sql.catalog.nessie.authentication.type", "NONE")
    .set("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    .set("spark.sql.catalog.nessie.warehouse", WAREHOUSE)
    .set("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO")
    # NOTE(review): the s3.* catalog properties look like S3FileIO settings;
    # with HadoopFileIO selected above, object-store access presumably goes
    # through the fs.s3a.* options below -- confirm whether these are needed.
    .set("spark.sql.catalog.nessie.s3.endpoint", AWS_S3_ENDPOINT)
    .set("spark.sql.catalog.nessie.s3.access-key", AWS_ACCESS_KEY)
    .set("spark.sql.catalog.nessie.s3.secret-key", AWS_SECRET_KEY)
    # Hadoop S3A connector: reuse the same credentials instead of repeating literals.
    .set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
    .set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
    .set("spark.hadoop.fs.s3a.endpoint", S3A_ENDPOINT)
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
)

# Create (or reuse) the SparkSession with the configuration above
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Enable path-style access for MinIO on the live Hadoop configuration as well.
# This repeats the spark.hadoop.fs.s3a.path.style.access setting from `conf`
# and is applied after the session exists.
spark._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")

## Đọc dữ liệu

In [2]:
# Amazon purchases: header row, inferred column types, double-quote as both
# quote and escape character, and multi-line record support enabled.
amazon_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("quote", "\"")
    .option("escape", "\"")
    .option("multiLine", "true")
)
df_amazon = amazon_reader.csv("s3a://bronze/ecommerse/amazon-purchases.csv")


In [2]:
# Survey answers and the field dictionary: header row, default (string) types.
# df_amazon is deliberately NOT re-read here. The previous cell already loads
# it with inferSchema/quote/escape/multiLine; re-reading it with only the
# header option would replace that frame with an all-string one, so that
# "Purchase Price Per Unit" and "Quantity" would no longer be doubles.
df_survey = spark.read.option("header", "true").csv("s3a://bronze/ecommerse/survey.csv")
df_fields = spark.read.option("header", "true").csv("s3a://bronze/ecommerse/fields.csv")


### Thông tin dữ liệu

#### 1. Amazon-purchase

In [4]:
import pandas as pd
import numpy as np

# Preview the head of the raw Amazon frame. The label and the limit share one
# value so the printed text always matches the number of rows displayed.
n_preview = 10
print(f"{n_preview} dòng đầu tiên:")
display(df_amazon.limit(n_preview).toPandas())


5 dòng đầu tiên:


Unnamed: 0,Order Date,Purchase Price Per Unit,Quantity,Shipping Address State,Title,ASIN/ISBN (Product Code),Category,Survey ResponseID
0,2018-12-04,7.98,1.0,NJ,SanDisk Ultra 16GB Class 10 SDHC UHS-I Memory ...,B0143RTB1E,FLASH_MEMORY,R_01vNIayewjIIKMF
1,2018-12-22,13.99,1.0,NJ,Betron BS10 Earphones Wired Headphones in Ear ...,B01MA1MJ6H,HEADPHONES,R_01vNIayewjIIKMF
2,2018-12-24,8.99,1.0,NJ,,B078JZTFN3,,R_01vNIayewjIIKMF
3,2018-12-25,10.45,1.0,NJ,Perfecto Stainless Steel Shaving Bowl. Durable...,B06XWF9HML,DISHWARE_BOWL,R_01vNIayewjIIKMF
4,2018-12-25,10.0,1.0,NJ,Proraso Shaving Cream for Men,B00837ZOI0,SHAVING_AGENT,R_01vNIayewjIIKMF
5,2019-02-18,10.99,1.0,NJ,Micro USB Cable Android Charger - Syncwire [2-...,B01GFB2E9M,COMPUTER_PROCESSOR,R_01vNIayewjIIKMF
6,2019-02-18,4.99,1.0,NJ,Amazon Basics USB 2.0 Charger Cable - A-Male t...,B00NH13S44,COMPUTER_ADD_ON,R_01vNIayewjIIKMF
7,2019-03-15,124.99,1.0,NJ,"Fire HD 8 Tablet (8"" HD Display, 32 GB, withou...",B077H6L7T9,AMAZON_TABLET,R_01vNIayewjIIKMF
8,2019-04-23,12.99,1.0,NJ,"Men's Leather Belt, Ratchet Dress Belt with Au...",B07L84ZZXC,APPAREL_BELT,R_01vNIayewjIIKMF
9,2019-04-23,24.69,1.0,NJ,,B06XKNWJN2,,R_01vNIayewjIIKMF


In [5]:
# Shape of the raw Amazon frame: the row count runs a Spark count,
# the column count is the length of the column list.
num_rows, num_cols = df_amazon.count(), len(df_amazon.columns)

print(f"\nKích thước dữ liệu: ({num_rows}, {num_cols})")



Kích thước dữ liệu: (1850717, 8)


In [6]:
from pyspark.sql import functions as F

# Per-column null tally for the raw Amazon frame, reshaped into a long
# "column | null_count" table.
amazon_null_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df_amazon.columns
]
amazon_null_wide = df_amazon.select(amazon_null_exprs).toPandas()  # to pandas
null_counts = (
    amazon_null_wide
    .melt(var_name="column", value_name="null_count")  # wide -> long
    .sort_values(by="null_count", ascending=False)     # most nulls first
)

display(null_counts)


Unnamed: 0,column,null_count
4,Title,89740
6,Category,89458
3,Shipping Address State,87812
5,ASIN/ISBN (Product Code),973
0,Order Date,0
1,Purchase Price Per Unit,0
2,Quantity,0
7,Survey ResponseID,0


#### 2. Survey

In [6]:

# Preview the first two survey responses as a pandas frame
print("2 dòng đầu tiên:")
survey_head = df_survey.limit(2).toPandas()
survey_head


2 dòng đầu tiên:


Unnamed: 0,Survey ResponseID,Q-demos-age,Q-demos-hispanic,Q-demos-race,Q-demos-education,Q-demos-income,Q-demos-gender,Q-sexual-orientation,Q-demos-state,Q-amazon-use-howmany,...,Q-substance-use-marijuana,Q-substance-use-alcohol,Q-personal-diabetes,Q-personal-wheelchair,Q-life-changes,Q-sell-YOUR-data,Q-sell-consumer-data,Q-small-biz-use,Q-census-use,Q-research-society
0,R_1ou69fj4DQGsVcp,35 - 44 years,No,Black or African American,High school diploma or GED,"$25,000 - $49,999",Female,heterosexual (straight),Iowa,2,...,No,Yes,No,No,Lost a job,No,No,No,No,No
1,R_2UbJL30HRjK1sdD,45 - 54 years,No,White or Caucasian,High school diploma or GED,"$100,000 - $149,999",Male,heterosexual (straight),Ohio,2,...,No,No,No,No,,No,No,No,No,Yes


In [7]:
# Shape of the raw survey frame: (row count, number of columns)
num_rows, num_cols = df_survey.count(), len(df_survey.columns)

print(f"\nKích thước dữ liệu: ({num_rows}, {num_cols})")



Kích thước dữ liệu: (5027, 23)


In [8]:
from pyspark.sql import functions as F

# Per-column null tally for the raw survey frame, reshaped into a long
# "column | null_count" table.
survey_null_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df_survey.columns
]
survey_null_wide = df_survey.select(survey_null_exprs).toPandas()  # to pandas
null_counts = (
    survey_null_wide
    .melt(var_name="column", value_name="null_count")  # wide -> long
    .sort_values(by="null_count", ascending=False)     # most nulls first
)

display(null_counts)


Unnamed: 0,column,null_count
17,Q-life-changes,3384
0,Survey ResponseID,0
12,Q-substance-use-cigarettes,0
21,Q-census-use,0
20,Q-small-biz-use,0
19,Q-sell-consumer-data,0
18,Q-sell-YOUR-data,0
16,Q-personal-wheelchair,0
15,Q-personal-diabetes,0
14,Q-substance-use-alcohol,0


#### 3. Fields

In [9]:

# Preview the first two rows of the field dictionary as a pandas frame
print("2 dòng đầu tiên:")
fields_head = df_fields.limit(2).toPandas()
fields_head


2 dòng đầu tiên:


Unnamed: 0,_c0,fields
0,Survey ResponseID,Response ID
1,Q-demos-age,What is your age group?


In [10]:
# Shape of the raw fields frame: (row count, number of columns)
num_rows, num_cols = df_fields.count(), len(df_fields.columns)

print(f"\nKích thước dữ liệu: ({num_rows}, {num_cols})")



Kích thước dữ liệu: (29, 2)


In [24]:
from pyspark.sql import functions as F

# Per-column null tally for the raw fields frame, reshaped into a long
# "column | null_count" table.
fields_null_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df_fields.columns
]
fields_null_wide = df_fields.select(fields_null_exprs).toPandas()  # to pandas
null_counts = (
    fields_null_wide
    .melt(var_name="column", value_name="null_count")  # wide -> long
    .sort_values(by="null_count", ascending=False)     # most nulls first
)

display(null_counts)


Unnamed: 0,column,null_count
1,fields,6
0,_c0,0


## Tiền xử lý dữ liệu

### 1. Amazon-purchase

In [7]:
# Convert the order-date column using the "yyyy-MM-dd" pattern
from pyspark.sql.functions import to_date
order_date_parsed = to_date(col("Order Date"), "yyyy-MM-dd")
df_amazon = df_amazon.withColumn("Order Date", order_date_parsed)

# Drop every row that contains a null in any column
df_amazon_clean = df_amazon.dropna(how="any")

In [8]:
# Count the rows that remain after dropping rows with nulls
clean_count = df_amazon_clean.count()
print(f"\nSố dòng còn lại sau khi loại bỏ null: {clean_count}")


Số dòng còn lại sau khi loại bỏ null: 1675015


In [9]:
from pyspark.sql import functions as F

# Re-check nulls on the cleaned Amazon frame, reshaped into a long
# "column | null_count" table.
clean_null_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df_amazon_clean.columns
]
clean_null_wide = df_amazon_clean.select(clean_null_exprs).toPandas()  # to pandas
null_counts = (
    clean_null_wide
    .melt(var_name="column", value_name="null_count")  # wide -> long
    .sort_values(by="null_count", ascending=False)     # most nulls first
)

display(null_counts)


Unnamed: 0,column,null_count
0,Order Date,0
1,Purchase Price Per Unit,0
2,Quantity,0
3,Shipping Address State,0
4,Title,0
5,ASIN/ISBN (Product Code),0
6,Category,0
7,Survey ResponseID,0


In [10]:
# Print the schema of the cleaned Amazon frame
df_amazon_clean.printSchema()

root
 |-- Order Date: date (nullable = true)
 |-- Purchase Price Per Unit: double (nullable = true)
 |-- Quantity: double (nullable = true)
 |-- Shipping Address State: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- ASIN/ISBN (Product Code): string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Survey ResponseID: string (nullable = true)



In [12]:
# (Optional) preview the first 10 rows after cleaning
df_amazon_clean.limit(10).toPandas()

Unnamed: 0,Order Date,Purchase Price Per Unit,Quantity,Shipping Address State,Title,ASIN/ISBN (Product Code),Category,Survey ResponseID
0,2018-12-04,7.98,1.0,NJ,SanDisk Ultra 16GB Class 10 SDHC UHS-I Memory ...,B0143RTB1E,FLASH_MEMORY,R_01vNIayewjIIKMF
1,2018-12-22,13.99,1.0,NJ,Betron BS10 Earphones Wired Headphones in Ear ...,B01MA1MJ6H,HEADPHONES,R_01vNIayewjIIKMF
2,2018-12-25,10.45,1.0,NJ,Perfecto Stainless Steel Shaving Bowl. Durable...,B06XWF9HML,DISHWARE_BOWL,R_01vNIayewjIIKMF
3,2018-12-25,10.0,1.0,NJ,Proraso Shaving Cream for Men,B00837ZOI0,SHAVING_AGENT,R_01vNIayewjIIKMF
4,2019-02-18,10.99,1.0,NJ,Micro USB Cable Android Charger - Syncwire [2-...,B01GFB2E9M,COMPUTER_PROCESSOR,R_01vNIayewjIIKMF
5,2019-02-18,4.99,1.0,NJ,Amazon Basics USB 2.0 Charger Cable - A-Male t...,B00NH13S44,COMPUTER_ADD_ON,R_01vNIayewjIIKMF
6,2019-03-15,124.99,1.0,NJ,"Fire HD 8 Tablet (8"" HD Display, 32 GB, withou...",B077H6L7T9,AMAZON_TABLET,R_01vNIayewjIIKMF
7,2019-04-23,12.99,1.0,NJ,"Men's Leather Belt, Ratchet Dress Belt with Au...",B07L84ZZXC,APPAREL_BELT,R_01vNIayewjIIKMF
8,2019-05-02,9.99,1.0,NJ,UGREEN Tablet Stand Holder Adjustable Portable...,B07CG71KQ1,PORTABLE_ELECTRONIC_DEVICE_STAND,R_01vNIayewjIIKMF
9,2019-05-02,12.79,1.0,NJ,Betron B25 in-Ear Headphones Earphones with Mi...,B079GFF4HZ,HEADPHONES,R_01vNIayewjIIKMF


### 2. Survey

In [18]:
from pyspark.sql.functions import col, when, trim, lit
# Clean the "Q-life-changes" column: a null or whitespace-only answer is
# replaced with the literal "No"; every other value is kept as-is.
life_changes = col("Q-life-changes")
is_blank = life_changes.isNull() | (trim(life_changes) == "")
df_survey_clean = df_survey.withColumn(
    "Q-life-changes",
    when(is_blank, lit("No")).otherwise(life_changes),
)


In [19]:
# Show the first 10 values of the cleaned column without truncation
print("\nSau khi làm sạch cột 'Q-life-changes':")
life_changes_only = df_survey_clean.select("Q-life-changes")
life_changes_only.show(10, truncate=False)


Sau khi làm sạch cột 'Q-life-changes':
+------------------------+
|Q-life-changes          |
+------------------------+
|Lost a job              |
|No                      |
|No                      |
|No                      |
|No                      |
|No                      |
|No                      |
|No                      |
|No                      |
|Moved place of residence|
+------------------------+
only showing top 10 rows



In [22]:
# Re-check nulls on the cleaned survey frame, reshaped into a long
# "column | null_count" table.
survey_clean_null_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df_survey_clean.columns
]
survey_clean_null_wide = df_survey_clean.select(survey_clean_null_exprs).toPandas()  # to pandas
null_counts = (
    survey_clean_null_wide
    .melt(var_name="column", value_name="null_count")  # wide -> long
    .sort_values(by="null_count", ascending=False)     # most nulls first
)

display(null_counts)

Unnamed: 0,column,null_count
0,Survey ResponseID,0
12,Q-substance-use-cigarettes,0
21,Q-census-use,0
20,Q-small-biz-use,0
19,Q-sell-consumer-data,0
18,Q-sell-YOUR-data,0
17,Q-life-changes,0
16,Q-personal-wheelchair,0
15,Q-personal-diabetes,0
14,Q-substance-use-alcohol,0


In [24]:
# Preview the first two rows of the cleaned survey frame
df_survey_clean.limit(2).toPandas()


Unnamed: 0,Survey ResponseID,Q-demos-age,Q-demos-hispanic,Q-demos-race,Q-demos-education,Q-demos-income,Q-demos-gender,Q-sexual-orientation,Q-demos-state,Q-amazon-use-howmany,...,Q-substance-use-marijuana,Q-substance-use-alcohol,Q-personal-diabetes,Q-personal-wheelchair,Q-life-changes,Q-sell-YOUR-data,Q-sell-consumer-data,Q-small-biz-use,Q-census-use,Q-research-society
0,R_1ou69fj4DQGsVcp,35 - 44 years,No,Black or African American,High school diploma or GED,"$25,000 - $49,999",Female,heterosexual (straight),Iowa,2,...,No,Yes,No,No,Lost a job,No,No,No,No,No
1,R_2UbJL30HRjK1sdD,45 - 54 years,No,White or Caucasian,High school diploma or GED,"$100,000 - $149,999",Male,heterosexual (straight),Ohio,2,...,No,No,No,No,No,No,No,No,No,Yes


### 3. Fields

In [25]:
# Drop every row that contains a null in any column.
# NOTE(review): df_fields is read without the multiLine option; if fields.csv
# has quoted descriptions spanning several lines, the rows dropped here may be
# continuation fragments rather than genuinely empty entries -- confirm.
df_fields_clean = df_fields.dropna(how="any")

In [26]:
# Count the rows that remain after dropping rows with nulls
clean_count = df_fields_clean.count()
print(f"\nSố dòng còn lại sau khi loại bỏ null: {clean_count}")


Số dòng còn lại sau khi loại bỏ null: 23


In [27]:
# Re-check nulls on the cleaned fields frame, reshaped into a long
# "column | null_count" table.
fields_clean_null_exprs = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df_fields_clean.columns
]
fields_clean_null_wide = df_fields_clean.select(fields_clean_null_exprs).toPandas()  # to pandas
null_counts = (
    fields_clean_null_wide
    .melt(var_name="column", value_name="null_count")  # wide -> long
    .sort_values(by="null_count", ascending=False)     # most nulls first
)

display(null_counts)


Unnamed: 0,column,null_count
0,_c0,0
1,fields,0


In [28]:
# Preview the first two rows of the cleaned fields frame
df_fields_clean.limit(2).toPandas()

Unnamed: 0,_c0,fields
0,Survey ResponseID,Response ID
1,Q-demos-age,What is your age group?


## Lưu 3 DataFrame vào Silver

In [13]:
# Save the cleaned Amazon purchases as table nessie.amazon_purchase,
# creating it or replacing an existing table of that name.
amazon_writer = df_amazon_clean.writeTo("nessie.amazon_purchase")
amazon_writer.createOrReplace()

In [14]:
# Read the table back and show its first 10 rows
spark.sql("SELECT * FROM nessie.amazon_purchase").show(10)

+----------+-----------------------+--------+----------------------+--------------------+------------------------+--------------------+-----------------+
|Order Date|Purchase Price Per Unit|Quantity|Shipping Address State|               Title|ASIN/ISBN (Product Code)|            Category|Survey ResponseID|
+----------+-----------------------+--------+----------------------+--------------------+------------------------+--------------------+-----------------+
|2018-12-04|                   7.98|     1.0|                    NJ|SanDisk Ultra 16G...|              B0143RTB1E|        FLASH_MEMORY|R_01vNIayewjIIKMF|
|2018-12-22|                  13.99|     1.0|                    NJ|Betron BS10 Earph...|              B01MA1MJ6H|          HEADPHONES|R_01vNIayewjIIKMF|
|2018-12-25|                  10.45|     1.0|                    NJ|Perfecto Stainles...|              B06XWF9HML|       DISHWARE_BOWL|R_01vNIayewjIIKMF|
|2018-12-25|                   10.0|     1.0|                    NJ|Proraso 

In [32]:
# List the tables in the nessie catalog
spark.sql("SHOW TABLES IN nessie").show()

+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|         |  amazon_purchase|      false|
|         |silver_bostontest|      false|
+---------+-----------------+-----------+



In [34]:
# Save the cleaned survey frame as table nessie.survey,
# creating it or replacing an existing table of that name.
survey_writer = df_survey_clean.writeTo("nessie.survey")
survey_writer.createOrReplace()

In [35]:
# List the tables in the nessie catalog after writing the survey table
spark.sql("SHOW TABLES IN nessie").show()

+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|         |  amazon_purchase|      false|
|         |silver_bostontest|      false|
|         |           survey|      false|
+---------+-----------------+-----------+



In [40]:
# Read the survey table back and preview five rows as a pandas frame
spark.sql("SELECT * FROM nessie.survey").limit(5).toPandas()

Unnamed: 0,Survey ResponseID,Q-demos-age,Q-demos-hispanic,Q-demos-race,Q-demos-education,Q-demos-income,Q-demos-gender,Q-sexual-orientation,Q-demos-state,Q-amazon-use-howmany,...,Q-substance-use-marijuana,Q-substance-use-alcohol,Q-personal-diabetes,Q-personal-wheelchair,Q-life-changes,Q-sell-YOUR-data,Q-sell-consumer-data,Q-small-biz-use,Q-census-use,Q-research-society
0,R_1ou69fj4DQGsVcp,35 - 44 years,No,Black or African American,High school diploma or GED,"$25,000 - $49,999",Female,heterosexual (straight),Iowa,2,...,No,Yes,No,No,Lost a job,No,No,No,No,No
1,R_2UbJL30HRjK1sdD,45 - 54 years,No,White or Caucasian,High school diploma or GED,"$100,000 - $149,999",Male,heterosexual (straight),Ohio,2,...,No,No,No,No,No,No,No,No,No,Yes
2,R_UPXamGKtmf4RVIZ,25 - 34 years,No,White or Caucasian,High school diploma or GED,"$25,000 - $49,999",Male,heterosexual (straight),Arkansas,1 (just me!),...,No,No,Yes,No,No,No,No,No,No,Yes
3,R_2dYk5auG9Fv5Qve,35 - 44 years,Yes,White or Caucasian,"Graduate or professional degree (MA, MS, MBA, ...","$50,000 - $74,999",Male,heterosexual (straight),Tennessee,1 (just me!),...,No,No,No,No,No,No,No,No,No,No
4,R_2aP0GyIR66gSTiR,25 - 34 years,No,White or Caucasian,High school diploma or GED,"$50,000 - $74,999",Male,heterosexual (straight),Virginia,2,...,No,Yes,No,No,No,No,Yes if consumers get part of the profit,I don't know,No,No


In [41]:
# Save the cleaned fields frame as table nessie.fields,
# creating it or replacing an existing table of that name.
fields_writer = df_fields_clean.writeTo("nessie.fields")
fields_writer.createOrReplace()

In [42]:
# List the tables in the nessie catalog after writing the fields table
spark.sql("SHOW TABLES IN nessie").show()

+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|         |  amazon_purchase|      false|
|         |           fields|      false|
|         |silver_bostontest|      false|
|         |           survey|      false|
+---------+-----------------+-----------+



In [43]:
# Read the fields table back and preview five rows as a pandas frame
spark.sql("SELECT * FROM nessie.fields").limit(5).toPandas()

Unnamed: 0,_c0,fields
0,Survey ResponseID,Response ID
1,Q-demos-age,What is your age group?
2,Q-demos-hispanic,"Are you of Spanish, Hispanic, or Latino origin?"
3,Q-demos-race,Choose one or more races that you consider you...
4,Q-demos-education,What is the highest level of education you hav...


In [15]:
spark.stop()  # Stop the current SparkSession