In [11]:
from pyspark.sql import SparkSession
import os

# AWS_REGION = os.getenv("AWS_REGION")
# AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
# AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here avoids
            building a credential string such as "None:None", which would only
            surface later as an opaque authentication error.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} must be set before starting Spark"
        )
    return value


# Polaris principal credentials; both are mandatory.
POLARIS_USERNAME = _require_env("POLARIS_USERNAME")
POLARIS_PASSWORD = _require_env("POLARIS_PASSWORD")
# Name of the Polaris catalog, passed to Iceberg as the "warehouse" property.
POLARIS_CATALOG_NAME = os.getenv("POLARIS_CATALOG_NAME", 'demo_catalog')

# Maven coordinates resolved by Spark at session start-up.
packages = ",".join(
    [
        "org.apache.iceberg:iceberg-aws-bundle:1.10.0",
        "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.0",
    ]
)

# Polaris + Iceberg with MinIO
# NOTE: getOrCreate() returns an already-running session if one exists, in
# which case start-up-only settings such as spark.jars.packages are not
# re-applied. Restart the kernel after changing them.
spark = (
    SparkSession.builder
    .appName("Polaris-Iceberg-MinIO")
    .config("spark.jars.packages", packages)
    .config('spark.sql.iceberg.vectorization.enabled', 'false')
    # Register an Iceberg REST catalog under the name "polaris".
    .config("spark.sql.catalog.polaris", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.polaris.type", "rest")
    .config("spark.sql.catalog.polaris.uri", "http://polaris:8181/api/catalog")
    .config("spark.sql.catalog.polaris.warehouse", POLARIS_CATALOG_NAME)
    # Authentication: "<username>:<password>" credential plus requested scope.
    .config("spark.sql.catalog.polaris.credential", f"{POLARIS_USERNAME}:{POLARIS_PASSWORD}")
    .config("spark.sql.catalog.polaris.scope", "PRINCIPAL_ROLE:ALL")
    .config("spark.sql.catalog.polaris.token-refresh-enabled", "true")
    # Storage access.
    .config("spark.sql.catalog.polaris.io-impl", "org.apache.iceberg.io.ResolvingFileIO")
    .config("spark.sql.catalog.polaris.s3.region", "us-west-1")
    .getOrCreate()
)

print("✓ Spark session created!")

# Building the session only records the catalog settings; the REST endpoint is
# not contacted until the catalog is first used. Run a cheap read-only query so
# that bad credentials or an unreachable server fail here, and the message
# below is only printed once the catalog has actually answered.
spark.sql("SHOW NAMESPACES IN polaris").collect()
print("✓ Connected to Polaris REST catalog")

✓ Spark session created!
✓ Connected to Polaris REST catalog


In [12]:
# Make "polaris" the current catalog for the rest of the session, then list
# the namespaces it contains.
spark.sql("USE polaris")
namespaces_df = spark.sql("SHOW NAMESPACES")
namespaces_df.show()

=== Current Namespaces ===
+------------+
|   namespace|
+------------+
|          db|
|COLLADO_TEST|
+------------+



In [14]:
# Create the parent namespace first, then the nested one beneath it.
# IF NOT EXISTS makes both statements safe to re-run.
for namespace in ("COLLADO_TEST", "COLLADO_TEST.PUBLIC"):
    spark.sql(f"CREATE NAMESPACE IF NOT EXISTS {namespace}")

# List the children of the parent namespace.
child_namespaces_df = spark.sql("SHOW NAMESPACES IN COLLADO_TEST")
child_namespaces_df.show()

+-------------------+
|          namespace|
+-------------------+
|COLLADO_TEST.PUBLIC|
+-------------------+



In [15]:
# Select the nested namespace so the unqualified name TEST_TABLE resolves
# inside COLLADO_TEST.PUBLIC in this and the following cells.
spark.sql("USE NAMESPACE COLLADO_TEST.PUBLIC")

# Two-column Iceberg table; an existing table is kept as-is.
create_table_sql = """CREATE TABLE IF NOT EXISTS TEST_TABLE (
    id bigint NOT NULL COMMENT 'unique id',
    data string)
USING iceberg;
"""
spark.sql(create_table_sql)

DataFrame[]

In [16]:
# Show the table's current rows. The table is empty only right after it is
# first created; CREATE TABLE IF NOT EXISTS keeps an existing table, so on any
# later run this displays the rows inserted previously.
spark.sql("SELECT * FROM TEST_TABLE").show()

+---+-------------+
| id|         data|
+---+-------------+
|  1|    some data|
|  2|    more data|
|  3|yet more data|
+---+-------------+



In [10]:
# Insert the demo records, skipping any id that is already in the table.
# The table persists in the catalog between runs, so a plain INSERT would
# append the same three rows again on every re-run and duplicate the
# "unique id" column. The anti join keeps only incoming rows whose id has no
# match in TEST_TABLE, which makes this cell safe to execute repeatedly.
new_rows = "(1, 'some data'), (2, 'more data'), (3, 'yet more data')"
spark.sql(f"""
    INSERT INTO TEST_TABLE
    SELECT incoming.id, incoming.data
    FROM VALUES {new_rows} AS incoming(id, data)
    LEFT ANTI JOIN TEST_TABLE AS existing
        ON incoming.id = existing.id
""")
spark.sql("SELECT * FROM TEST_TABLE").show()

+---+-------------+
| id|         data|
+---+-------------+
|  1|    some data|
|  2|    more data|
|  3|yet more data|
+---+-------------+



In [17]:
# Report how many rows the table holds, in a single column named "total".
row_count_df = spark.sql("SELECT COUNT(*) as total FROM TEST_TABLE")
row_count_df.show()

+-----+
|total|
+-----+
|    3|
+-----+



In [18]:
# Print the table's columns and extended metadata: up to 100 rows, with
# long values shown in full rather than truncated.
table_details_df = spark.sql("DESCRIBE EXTENDED TEST_TABLE")
table_details_df.show(n=100, truncate=False)

+-----------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|col_name                     |data_type                                                                                                                                                       |comment  |
+-----------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+
|id                           |bigint                                                                                                                                                          |unique id|
|data                         |string                                                                                                                                                       

In [23]:
# Query the table's "history" metadata table (used for Iceberg time travel)
# and print every column untruncated.
history_df = spark.sql("SELECT * FROM COLLADO_TEST.PUBLIC.TEST_TABLE.history")
history_df.show(truncate=False)


+-----------------------+-------------------+---------+-------------------+
|made_current_at        |snapshot_id        |parent_id|is_current_ancestor|
+-----------------------+-------------------+---------+-------------------+
|2025-12-23 14:59:18.469|6153580418667096964|NULL     |true               |
+-----------------------+-------------------+---------+-------------------+



In [22]:
# Query the table's "snapshots" metadata table and print every column
# untruncated.
snapshots_df = spark.sql("SELECT * FROM COLLADO_TEST.PUBLIC.TEST_TABLE.snapshots")
snapshots_df.show(truncate=False)

+-----------------------+-------------------+---------+---------+---------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|committed_at           |snapshot_id        |parent_id|operation|manifest_list                                                                                                              |summary                                                                                                                                                           