In [7]:
from pyspark.sql import SparkSession

# Jars handed to Spark through `spark.jars`, as absolute paths inside the
# notebook container.
# NOTE(review): nessie-client is 0.99.0 while nessie-spark-extensions is
# 0.105.7 -- confirm these versions are meant to be used together.
session_jars = [
    "/home/jovyan/jars/iceberg-spark-runtime-3.4_2.12-1.5.2.jar",
    "/home/jovyan/jars/iceberg-nessie-1.5.2.jar",
    "/home/jovyan/jars/nessie-client-0.99.0.jar",
    "/home/jovyan/jars/nessie-spark-extensions-3.4_2.12-0.105.7.jar",
    "/home/jovyan/jars/iceberg-aws-bundle-1.5.2.jar",
    "/home/jovyan/jars/hadoop-aws-3.3.4.jar",
    "/home/jovyan/jars/aws-java-sdk-bundle-1.12.772.jar",
]

# Session options, applied to the builder in the order listed here.
session_conf = {
    "spark.jars": ",".join(session_jars),
    # NOTE(review): only the Iceberg extension class is registered, although a
    # nessie-spark-extensions jar is also loaded above -- confirm intended.
    "spark.sql.extensions":
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    # Make `nessie` the default catalog and back it with Iceberg's
    # SparkCatalog using the NessieCatalog implementation.
    "spark.sql.defaultCatalog": "nessie",
    "spark.sql.catalog.nessie": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.nessie.catalog-impl":
        "org.apache.iceberg.nessie.NessieCatalog",
    "spark.sql.catalog.nessie.uri": "http://nessie:19120/api/v2",
    "spark.sql.catalog.nessie.ref": "main",
    "spark.sql.catalog.nessie.warehouse": "s3a://promotionengine-search",
    # No credentials are hard-coded; lookup is delegated to the named
    # provider-chain class.
    "spark.hadoop.fs.s3a.aws.credentials.provider":
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
}

session_builder = SparkSession.builder.appName("Query_Iceberg")
for conf_key, conf_value in session_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

# NOTE(review): getOrCreate() may hand back a session that already exists in
# this kernel; presumably a changed jar list then needs a kernel restart to
# take effect -- confirm before relying on edits to the options above.
spark = session_builder.getOrCreate()


In [8]:
# Echo the session object as the cell's last expression so the notebook
# renders it as the cell result.
spark

In [9]:
# Sanity check: list the catalogs this session can see.
catalog_listing = spark.sql("SHOW CATALOGS")
catalog_listing.show()

+-------------+
|      catalog|
+-------------+
|       nessie|
|spark_catalog|
+-------------+



In [10]:
# List the tables registered under the `sales` namespace of the `nessie`
# catalog; truncate=False asks show() not to shorten cell values.
sales_tables = spark.sql("SHOW TABLES IN nessie.sales")
sales_tables.show(truncate=False)

+---------+------------+-----------+
|namespace|tableName   |isTemporary|
+---------+------------+-----------+
|sales    |mongo_orders|false      |
+---------+------------+-----------+



In [11]:
# Preview up to ten rows of the orders table, printed without truncating
# the nested `items` / `shipping_address` values.
preview_query = """
SELECT *
FROM nessie.sales.mongo_orders
LIMIT 10
"""
spark.sql(preview_query).show(truncate=False)

[Stage 4:>                                                          (0 + 1) / 1]

+------------------------+-----------+--------+-------------------------------------------------------------------+-------------------+--------------+-------------------------------------------+----------+------------+
|_id                     |customer_id|discount|items                                                              |order_date         |order_id      |shipping_address                           |status    |total_amount|
+------------------------+-----------+--------+-------------------------------------------------------------------+-------------------+--------------+-------------------------------------------+----------+------------+
|694a6bcd0b38f64d236d68ee|CUST-101   |null    |[{P001, Gaming Laptop, 1, 1200.5}, {P005, Wireless Mouse, 1, 25.0}]|2024-02-01T08:30:00|ORD-2024-001  |{Hyderabad, null, Telangana, 500081}       |DELIVERED |1225.5      |
|694a6bcd0b38f64d236d68ef|CUST-102   |null    |[{P003, Mechanical Keyboard, 2, 45.0}]                             |2024-02-0

                                                                                

In [14]:
# Keep a DataFrame handle for the ten-row preview query.
# NOTE(review): `a` is a terse name, kept because cells outside this view may
# refer to it.
a = spark.sql(
    """
SELECT *
FROM nessie.sales.mongo_orders
LIMIT 10
"""
)

# print() emits the DataFrame's repr (column names and types), not its rows.
print(a)

DataFrame[_id: string, customer_id: string, discount: int, items: array<struct<product_id:string,product_name:string,quantity:int,unit_price:double>>, order_date: string, order_id: string, shipping_address: struct<city:string,landmark:string,state:string,zip:string>, status: string, total_amount: double]
