In [0]:
# Validation: load the Bronze orders table and render it for a visual check

BRONZE_ORDERS_TABLE = "bronze.orders"

bronze_orders_df = spark.read.table(BRONZE_ORDERS_TABLE)
bronze_orders_df.display()

order_id,customer_id,order_date,amount,ingestion_timestamp,source_system
1,101,2024-01-01,250.0,2026-01-15T11:03:37.120062Z,oracle
2,102,2024-01-02,300.0,2026-01-15T11:03:37.120062Z,oracle


In [0]:
# Inspect Bronze Schema
# Print the column names, types and nullability of bronze_orders_df so the
# Silver transformation can be written against the actual stored types.

bronze_orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)
 |-- source_system: string (nullable = true)



In [0]:
# Silver transformation: validate, type and de-duplicate the Bronze orders.

from pyspark.sql.functions import col, to_date, row_number
from pyspark.sql.window import Window


# Read Bronze
bronze_orders_df = spark.read.table("bronze.orders")

# Deduplication window: within each order_id, newest ingestion first.
# ingestion_timestamp alone is not a unique sort key (the displayed Bronze
# rows share a single timestamp), so row_number() would choose arbitrarily
# among tied duplicates. The remaining columns are appended as tie-breakers;
# their order is arbitrary but makes the surviving row the same on every run.
window_spec = Window.partitionBy("order_id").orderBy(
    col("ingestion_timestamp").desc(),
    col("source_system"),
    col("customer_id"),
    col("order_date"),
    col("amount"),
)

silver_orders_df = (
    bronze_orders_df
    # Data quality filters: keys must be present and the amount positive.
    # A null amount is dropped as well, because `null > 0` is not true.
    .filter(col("order_id").isNotNull())
    .filter(col("customer_id").isNotNull())
    .filter(col("amount") > 0)

    # Convert order_date to a date. Bronze stores it as an ISO string
    # (yyyy-MM-dd), which to_date parses without an explicit format;
    # the cast to string is a no-op while the column is already a string.
    .withColumn(
        "order_date",
        to_date(col("order_date").cast("string"))
    )

    # Deduplication: keep only the first-ranked row per order_id
    .withColumn("row_num", row_number().over(window_spec))
    .filter(col("row_num") == 1)
    .drop("row_num")
)

In [0]:
# Render the transformed Silver rows for a visual check before they are written
silver_orders_df.display()

order_id,customer_id,order_date,amount,ingestion_timestamp,source_system
1,101,2024-01-01,250.0,2026-01-15T11:03:37.120062Z,oracle
2,102,2024-01-02,300.0,2026-01-15T11:03:37.120062Z,oracle


In [0]:
# Persist silver_orders_df as table silver.orders, replacing existing contents

silver_orders_df.write.saveAsTable("silver.orders", mode="overwrite")

In [0]:
# Read the table back and print the schema that was actually persisted
silver_orders_table_df = spark.read.table("silver.orders")
silver_orders_table_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- amount: double (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)
 |-- source_system: string (nullable = true)

