# Iceberg SQL-Only Transform + Aggregate

This notebook uses Spark SQL only (no DataFrame transformation API) to:
- read from Iceberg table `local.bronze.marquez_raw_combined_v1`
- apply modifications and aggregations
- write results to a new Iceberg table


In [1]:
from pyspark.sql import SparkSession


In [3]:
# Iceberg settings
WAREHOUSE_PATH = "/home/jovyan/work/data/lakehouse/warehouse"
SOURCE_TABLE = "local.bronze.marquez_raw_combined_v1"
TARGET_TABLE = "local.silver.marquez_sql_only_agg_v1"

spark = (
    SparkSession.builder
    .appName("iceberg-sql-only-transform-aggregate")
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", WAREHOUSE_PATH)
    .getOrCreate()
)

spark.version


'3.5.3'

In [4]:
# Validate source table exists
spark.sql(f"SELECT COUNT(*) AS source_row_count FROM {SOURCE_TABLE}").show(truncate=False)


+----------------+
|source_row_count|
+----------------+
|172             |
+----------------+



In [9]:
spark.table(SOURCE_TABLE).printSchema()


root
 |-- event_time: timestamp (nullable = true)
 |-- event: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- job_name: string (nullable = true)
 |-- job_namespace: string (nullable = true)
 |-- producer: string (nullable = true)
 |-- run_uuid: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- _event_type: string (nullable = true)
 |-- ingestion_ts: timestamp (nullable = true)



In [17]:
spark.sql(f"""
INSERT INTO {TARGET_TABLE}
SELECT * FROM
(WITH base AS (
    SELECT
        CAST(ingestion_ts AS TIMESTAMP) AS ingestion_ts,
        TO_DATE(CAST(ingestion_ts AS TIMESTAMP)) AS ingestion_date,
        CAST(COALESCE(event_type, event, job_name, job_namespace, producer) AS STRING) AS entity_value
    FROM {SOURCE_TABLE}
),
cleaned AS (
    SELECT
        ingestion_ts,
        ingestion_date,
        UPPER(TRIM(entity_value)) AS entity_value_norm,
        CASE
            WHEN UPPER(TRIM(entity_value)) LIKE 'COMPLETE%' THEN 'COMPLETE'
            WHEN UPPER(TRIM(entity_value)) LIKE 'START%' THEN 'START'
            WHEN UPPER(TRIM(entity_value)) LIKE 'FAIL%' THEN 'FAIL'
            ELSE 'OTHER'
        END AS entity_bucket
    FROM base
    WHERE entity_value IS NOT NULL
),
agg AS (
    SELECT
        ingestion_date,
        entity_bucket,
        entity_value_norm,
        COUNT(*) AS row_count,
        MIN(ingestion_ts) AS first_seen_ts,
        MAX(ingestion_ts) AS last_seen_ts
    FROM cleaned
    GROUP BY ingestion_date, entity_bucket, entity_value_norm
)
SELECT *
FROM agg) A
""")


DataFrame[]

In [13]:
# SQL-only transformation + aggregation and save to Iceberg
spark.sql("CREATE NAMESPACE IF NOT EXISTS local.silver")

spark.sql(f"DROP TABLE IF EXISTS {TARGET_TABLE}")

# spark.sql(f"""
# CREATE TABLE {TARGET_TABLE}
# USING iceberg
# TBLPROPERTIES ('format-version'='2')
# AS
# WITH base AS (
#     SELECT
#         CAST(ingestion_ts AS TIMESTAMP) AS ingestion_ts,
#         TO_DATE(CAST(ingestion_ts AS TIMESTAMP)) AS ingestion_date,
#         CAST(COALESCE(event_type, event, job_name, job_namespace, producer) AS STRING) AS entity_value
#     FROM {SOURCE_TABLE}
# ),
# cleaned AS (
#     SELECT
#         ingestion_ts,
#         ingestion_date,
#         UPPER(TRIM(entity_value)) AS entity_value_norm,
#         CASE
#             WHEN UPPER(TRIM(entity_value)) LIKE 'COMPLETE%' THEN 'COMPLETE'
#             WHEN UPPER(TRIM(entity_value)) LIKE 'START%' THEN 'START'
#             WHEN UPPER(TRIM(entity_value)) LIKE 'FAIL%' THEN 'FAIL'
#             ELSE 'OTHER'
#         END AS entity_bucket
#     FROM base
#     WHERE entity_value IS NOT NULL
# ),
# agg AS (
#     SELECT
#         ingestion_date,
#         entity_bucket,
#         entity_value_norm,
#         COUNT(*) AS row_count,
#         MIN(ingestion_ts) AS first_seen_ts,
#         MAX(ingestion_ts) AS last_seen_ts
#     FROM cleaned
#     GROUP BY ingestion_date, entity_bucket, entity_value_norm
# )
# SELECT *
# FROM agg
# """)


DataFrame[]

In [14]:
# Preview written Iceberg table
spark.sql(f"SELECT * FROM {TARGET_TABLE} ORDER BY row_count DESC, ingestion_date, entity_value_norm LIMIT 100").show(truncate=False)
spark.sql(f"SELECT COUNT(*) AS target_row_count FROM {TARGET_TABLE}").show(truncate=False)


+--------------+-------------+-----------------+---------+--------------------------+--------------------------+
|ingestion_date|entity_bucket|entity_value_norm|row_count|first_seen_ts             |last_seen_ts              |
+--------------+-------------+-----------------+---------+--------------------------+--------------------------+
|2026-02-24    |OTHER        |RUNNING          |73       |2026-02-24 10:44:17.572374|2026-02-24 10:44:17.572374|
|2026-02-24    |COMPLETE     |COMPLETE         |64       |2026-02-24 10:44:17.572374|2026-02-24 10:44:17.572374|
|2026-02-24    |START        |START            |35       |2026-02-24 10:44:17.572374|2026-02-24 10:44:17.572374|
+--------------+-------------+-----------------+---------+--------------------------+--------------------------+

+----------------+
|target_row_count|
+----------------+
|3               |
+----------------+



## Notes

- SQL operations are done through `spark.sql(...)` only.
- Target table is unique and does not overwrite previous notebook outputs.
