In [2]:
from pyspark.sql import SparkSession
import os
import pyspark

# --- Connection settings ------------------------------------------------------
# Credentials and the object-store endpoint can be overridden through
# environment variables; the fallbacks are the local MinIO development defaults.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "minioadmin")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "minioadmin")
# Single source of truth for the MinIO endpoint. Previously this constant said
# "minio_server" while fs.s3a.endpoint was hardcoded to "minio"; the default
# below is the value the S3A filesystem was actually configured with.
AWS_S3_ENDPOINT = os.environ.get("AWS_S3_ENDPOINT", "http://minio:9000")
WAREHOUSE = "s3a://silver/"
NESSIE_URI = "http://nessie:19120/api/v1"


conf = (
    pyspark.SparkConf()
    .setAppName("Lakehouse-Iceberg-ETL")
    # Runtime jars: Iceberg, Nessie Spark extensions, and the S3A filesystem.
    .set('spark.jars.packages',
         'org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:1.3.1,'
         'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.3_2.12:0.67.0,'
         'org.apache.hadoop:hadoop-aws:3.3.4,'
         'com.amazonaws:aws-java-sdk-bundle:1.12.300')
    # Iceberg catalog named "nessie", backed by the Nessie server on branch "main".
    .set("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .set("spark.sql.catalog.nessie.uri", NESSIE_URI)
    .set("spark.sql.catalog.nessie.ref", "main")
    .set("spark.sql.catalog.nessie.authentication.type", "NONE")
    .set("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    .set("spark.sql.catalog.nessie.warehouse", WAREHOUSE)
    .set("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO")
    # NOTE(review): the s3.* catalog properties are presumably only consumed by
    # Iceberg's S3FileIO, not by HadoopFileIO - confirm whether they are needed.
    .set("spark.sql.catalog.nessie.s3.endpoint", AWS_S3_ENDPOINT)
    .set("spark.sql.catalog.nessie.s3.access-key", AWS_ACCESS_KEY)
    .set("spark.sql.catalog.nessie.s3.secret-key", AWS_SECRET_KEY)
    # Hadoop S3A filesystem settings, driven by the same constants as above so
    # the catalog and the filesystem can never point at different stores.
    .set("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY)
    .set("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_KEY)
    .set("spark.hadoop.fs.s3a.endpoint", AWS_S3_ENDPOINT)
    .set("spark.hadoop.fs.s3a.path.style.access", "true")
)

# Create (or reuse) the SparkSession with the configuration above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Enable path-style access for MinIO on the live Hadoop configuration as well.
# This repeats the spark.hadoop.* setting above; it is kept so the flag is also
# applied when getOrCreate() hands back an already-running session.
spark._jsc.hadoopConfiguration().set("fs.s3a.path.style.access", "true")


In [6]:
# ---------------- 1️⃣ READ Bronze CSV ----------------
# Load the raw Boston CSV from the `brozen` bucket: the first line is treated
# as the header and Spark infers the column types.
bronze_path = "s3a://brozen/Boston.csv"
df_bronze = spark.read.csv(bronze_path, header=True, inferSchema=True)
# Preview only the first row.
df_bronze.show(1)

+-------+----+-----+----+-----+-----+----+----+---+---+-------+-----+-----+----+
|   crim|  zn|indus|chas|  nox|   rm| age| dis|rad|tax|ptratio|black|lstat|medv|
+-------+----+-----+----+-----+-----+----+----+---+---+-------+-----+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575|65.2|4.09|  1|296|   15.3|396.9| 4.98|24.0|
+-------+----+-----+----+-----+-----+----+----+---+---+-------+-----+-----+----+
only showing top 1 row



In [7]:
# Silver stage: drop every row that contains a null value.
df_bronze_complete = df_bronze.dropna()
# Cast the crime-rate column to double. The result column is written as "CRIM";
# the table preview further down shows it replacing the lowercase `crim` column
# rather than being added next to it.
crim_as_double = df_bronze_complete["CRIM"].cast("double")
df_silver = df_bronze_complete.withColumn("CRIM", crim_as_double)


In [8]:
# Persist the silver DataFrame as a table in the `nessie` catalog, creating it
# or replacing an existing table of the same name.
silver_table_writer = df_silver.writeTo("nessie.silver_bostontest")
silver_table_writer.createOrReplace()


In [9]:
# Read the table back through the catalog and preview the first five rows.
silver_preview = spark.sql("SELECT * FROM nessie.silver_bostontest")
silver_preview.show(5)


+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|   CRIM|  zn|indus|chas|  nox|   rm| age|   dis|rad|tax|ptratio| black|lstat|medv|
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575|65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421|78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185|61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18|   0|0.458|6.998|45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147|54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
+-------+----+-----+----+-----+-----+----+------+---+---+-------+------+-----+----+
only showing top 5 rows



In [3]:
# List the tables registered in the `nessie` catalog.
nessie_tables = spark.sql("SHOW TABLES IN nessie")
nessie_tables.show()


+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|         |silver_bostontest|      false|
+---------+-----------------+-----------+

