# Bronze to Silver (Iceberg -> Parquet + ORC)

This notebook:
- reads the Bronze Iceberg table produced by the source-to-bronze notebook
- applies simple transformations (filter, rename, cast) and writes Parquet
- applies simple aggregations (group by) and writes ORC


In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import os


In [8]:
# Iceberg + output settings.
#
# The two filesystem locations default to the original hard-coded paths but can
# be overridden through environment variables, so the notebook does not have to
# be edited to run on a machine with a different directory layout. An unset or
# empty variable falls back to the default.
WAREHOUSE_PATH = (
    os.environ.get("MARQUEZ_WAREHOUSE_PATH")
    or "/home/jovyan/work/data/lakehouse/warehouse"
)
BRONZE_TABLE = "local.bronze.marquez_raw"

SILVER_BASE_DIR = (
    os.environ.get("MARQUEZ_SILVER_BASE_DIR")
    or "/home/jovyan/work/data/silver"
)
SILVER_TRANSFORMED_PATH = os.path.join(SILVER_BASE_DIR, "marquez_transformed_parquet")
SILVER_AGG_PATH = os.path.join(SILVER_BASE_DIR, "marquez_aggregated_orc")

# Spark session with an Iceberg catalog named "local": a hadoop-type
# SparkCatalog whose warehouse directory is WAREHOUSE_PATH. BRONZE_TABLE is
# resolved through this catalog (its first name part is "local").
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists in the kernel; if the catalog settings do not seem to apply, restart
# the kernel before re-running this cell.
spark = (
    SparkSession.builder
    .appName("bronze-to-silver-marquez")
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", WAREHOUSE_PATH)
    .getOrCreate()
)

# Displayed as the cell result so the reader can see which Spark version is running.
spark.version


'3.5.3'

In [9]:
# Load the Bronze table and take a first look: row count, schema, sample rows.
bronze_df = spark.table(BRONZE_TABLE)

bronze_row_count = bronze_df.count()
print("Bronze rows:", bronze_row_count)

bronze_df.printSchema()
bronze_df.show(n=20, truncate=False)


Bronze rows: 153
root
 |-- event_time: timestamp (nullable = true)
 |-- event: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- job_name: string (nullable = true)
 |-- job_namespace: string (nullable = true)
 |-- producer: string (nullable = true)
 |-- run_uuid: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- _event_type: string (nullable = true)
 |-- ingestion_ts: timestamp (nullable = true)

+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [10]:
# Pick the column that carries the business value for the Silver layer.
columns = bronze_df.columns

# Candidate names in priority order; the first one present in Bronze wins.
preferred_business_cols = ["event_type", "name", "namespace_name", "namespace", "type", "job_name"]

candidate_cols = [name for name in preferred_business_cols if name in columns]
if not candidate_cols:
    # None of the preferred names exist: fall back to any column other than
    # the ingestion timestamp, keeping the table's own column order.
    candidate_cols = [name for name in columns if name != "ingestion_ts"]

if not candidate_cols:
    raise RuntimeError("No business column found to transform")

business_col = candidate_cols[0]

print("Business column selected:", business_col)


Business column selected: event_type


In [12]:
# 2.1 Silver transform -> Parquet
# Keep rows that have a business value, expose that value as the string column
# entity_value (replacing the original column), normalise ingestion_ts to a
# timestamp and derive ingestion_date from it.
business_value = F.col(business_col)

silver_transformed_df = (
    bronze_df
    .where(business_value.isNotNull())
    .withColumn("entity_value", business_value.cast("string"))
    .withColumn("ingestion_ts", F.col("ingestion_ts").cast("timestamp"))
    .withColumn("ingestion_date", F.to_date(F.col("ingestion_ts")))
    .drop(business_col)
)

silver_transformed_df.show(n=20, truncate=False)

# Make sure the Silver root directory exists before writing below it.
os.makedirs(SILVER_BASE_DIR, exist_ok=True)
silver_transformed_df.write.parquet(SILVER_TRANSFORMED_PATH, mode="overwrite")

print("Transformed Parquet saved to:", SILVER_TRANSFORMED_PATH)


+-----------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
# 2.2 Silver aggregate -> ORC
# Count rows per (ingestion_date, entity_value); largest groups first, ties
# broken by date and then by value.
grouping_keys = ["ingestion_date", "entity_value"]

silver_agg_df = (
    silver_transformed_df
    .groupBy(*grouping_keys)
    .agg(F.count(F.lit(1)).alias("row_count"))
    .sort(F.desc("row_count"), F.asc("ingestion_date"), F.asc("entity_value"))
)

silver_agg_df.show(n=50, truncate=False)
silver_agg_df.write.orc(SILVER_AGG_PATH, mode="overwrite")

print("Aggregated ORC saved to:", SILVER_AGG_PATH)


+--------------+------------+---------+
|ingestion_date|entity_value|row_count|
+--------------+------------+---------+
|2026-02-24    |RUNNING     |49       |
|2026-02-24    |COMPLETE    |43       |
|2026-02-24    |START       |21       |
+--------------+------------+---------+

Aggregated ORC saved to: /home/jovyan/work/data/silver/marquez_aggregated_orc


## Notes

- This notebook expects `local.bronze.marquez_raw` to already exist.
- If you changed the Bronze table name, update `BRONZE_TABLE` before running.
