In [11]:
from pyspark.sql import SparkSession
import os

# Set AWS credentials and region for SDK v2
os.environ['AWS_REGION'] = 'us-east-1'
os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'

packages = ",".join(
    [
        "org.apache.hadoop:hadoop-aws:3.3.4",
        "com.amazonaws:aws-java-sdk-bundle:1.12.262",
        "org.apache.iceberg:iceberg-aws-bundle:1.4.3",
        "org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.4.3",
    ]
)

# Polaris + Iceberg with MinIO
spark = SparkSession.builder \
    .appName("Polaris-Iceberg-MinIO") \
    .config("spark.jars.packages", packages) \
    .config("spark.sql.catalog.polaris", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.polaris.catalog-impl", "org.apache.iceberg.rest.RESTCatalog") \
    .config("spark.sql.catalog.polaris.uri", "http://host.docker.internal:8181/api/catalog") \
    .config("spark.sql.catalog.polaris.credential", "admin:admin") \
    .config("spark.sql.catalog.polaris.warehouse", "demo_catalog") \
    .config("spark.sql.catalog.polaris.scope", "PRINCIPAL_ROLE:ALL") \
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "minio") \
    .config("spark.hadoop.fs.s3a.secret.key", "minio123") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .getOrCreate()

print("✓ Spark session created!")
print("✓ Connected to Polaris REST catalog")

✓ Spark session created!
✓ Connected to Polaris REST catalog


In [12]:
# Show namespaces
print("=== Current Namespaces ===")
spark.sql("SHOW NAMESPACES IN polaris").show()

=== Current Namespaces ===
+---------+
|namespace|
+---------+
+---------+



In [13]:
# Create namespace
spark.sql("CREATE NAMESPACE IF NOT EXISTS polaris.test_db")
print("✓ Namespace 'test_db' created!")

✓ Namespace 'test_db' created!


In [33]:
# Create table
spark.sql("""
    CREATE TABLE IF NOT EXISTS polaris.test_db.test_table (
        id INT,
        name STRING
    ) USING iceberg
""")
print("✓ Table 'test_table' created!")

# Insert data
spark.sql("INSERT INTO polaris.test_db.test_table VALUES (1, 'Hello'), (2, 'World')")
print("✓ Data inserted!")

✓ Table 'test_table' created!
✓ Data inserted!


In [34]:
# Query data
print("=== Query Results ===")
spark.sql("SELECT * FROM polaris.test_db.test_table").show()

=== Query Results ===
+---+-----+
| id| name|
+---+-----+
|  1|Hello|
|  2|World|
|  1|Hello|
|  2|World|
+---+-----+



In [17]:
# Count rows
spark.sql("SELECT COUNT(*) as total FROM polaris.test_db.test_table").show()

+-----+
|total|
+-----+
|    2|
+-----+



In [18]:
# Show table metadata
spark.sql("DESCRIBE EXTENDED polaris.test_db.test_table").show(100, truncate=False)

+----------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                                                                                       |comment|
+----------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|id                          |int                                                                                                                                                             |NULL   |
|name                        |string                                                                                                                                                          |NULL   |


In [19]:
# Show table history (Iceberg time travel feature)
spark.sql("SELECT * FROM polaris.test_db.test_table.history").show(truncate=False)


+-----------------------+-------------------+---------+-------------------+
|made_current_at        |snapshot_id        |parent_id|is_current_ancestor|
+-----------------------+-------------------+---------+-------------------+
|2025-12-11 13:40:27.611|5276107973295498253|NULL     |true               |
+-----------------------+-------------------+---------+-------------------+



In [35]:
# Show snapshots
spark.sql("SELECT * FROM polaris.test_db.test_table.snapshots").show(truncate=False)

+-----------------------+-------------------+-------------------+---------+---------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|committed_at           |snapshot_id        |parent_id          |operation|manifest_list                                                                                                  |summary                                                                                                                                                                                                                                                                                         |
+-----------------------+-------------------+-