Data Quality Issues to Fix:
- Duplicate order_ids
- Timestamp format inconsistencies
- Invalid order statuses
- Logical inconsistencies: Delivered orders without delivery dates

Requirements:

- Deduplicate on order_id (keep first occurrence)
- Convert all timestamp columns to ISO 8601 datetime format:
  - order_purchase_timestamp
  - order_approved_at
  - order_delivered_carrier_date
  - order_delivered_customer_date
  - order_estimated_delivery_date

- Validate order_status is one of: ['delivered', 'shipped', 'canceled', 'unavailable', 'invoiced', 'processing', 'created', 'approved']
- Business rule: If order_status = 'delivered', then order_delivered_customer_date must NOT be null
- Business rule: order_delivered_customer_date >= order_purchase_timestamp (if not null)
- Validate: No null values in order_id, customer_id, order_status, order_purchase_timestamp

In [0]:
from pyspark.sql import functions as F


In [0]:
# Load the raw (bronze) orders table and print a preview of its rows.
bronze_table_name = "golden_360.bronze.orders"
bronze_orders = spark.read.table(bronze_table_name)
bronze_orders.show()

In [0]:
# Step 1 of the silver pipeline: deduplicate, then render every timestamp
# column as an ISO 8601 string.
timestamp_columns = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
]

# NOTE(review): 'Z' is written as a literal, so the output is only genuinely
# UTC if the Spark session time zone is UTC -- confirm.
iso_8601_pattern = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"

# Requirement: one row per order_id. This step was missing from the pipeline.
# NOTE(review): dropDuplicates retains a single row per key; which row counts
# as the "first" depends on Spark's row order -- add an explicit ordering if
# a specific occurrence must win.
silver_orders_1 = bronze_orders.dropDuplicates(["order_id"])

for ts_col in timestamp_columns:
    silver_orders_1 = silver_orders_1.withColumn(
        ts_col, F.date_format(ts_col, iso_8601_pattern)
    )

silver_orders_1.show()

In [0]:
# Allowed values for order_status; rows carrying any other status are dropped.
valid_statuses = [
    'delivered',
    'shipped',
    'canceled',
    'unavailable',
    'invoiced',
    'processing',
    'created',
    'approved',
]

status_is_valid = F.col("order_status").isin(valid_statuses)
silver_orders_2 = silver_orders_1.where(status_is_valid)
silver_orders_2.show()

In [0]:
# Business rule: IF order_status = 'delivered' THEN the customer delivery date
# must be present. This is an implication, so it is expressed as
# "not delivered OR has a delivery date"; requiring status == 'delivered'
# outright would throw away every shipped/canceled/... order.
# NOTE: the variable keeps its existing spelling because the following cell
# reads it under this name.
is_not_delivered = F.col("order_status") != "delivered"
has_delivery_date = F.col("order_delivered_customer_date").isNotNull()

silver_orderrs_3 = silver_orders_2.where(is_not_delivered | has_delivery_date)
silver_orderrs_3.show()

In [0]:
# Business rule: the customer delivery date may not precede the purchase
# timestamp -- but only when a delivery date exists. A bare `>=` evaluates to
# NULL for a missing delivery date and the row would be filtered out, so the
# null case is accepted explicitly.
delivered_at = F.col("order_delivered_customer_date")
purchased_at = F.col("order_purchase_timestamp")

silver_orders_4 = silver_orderrs_3.where(
    delivered_at.isNull() | (delivered_at >= purchased_at)
)
silver_orders_4.show()

In [0]:
# Keep only rows where every mandatory column is populated. Filters applied
# one after another combine as a logical AND.
required_columns = [
    "order_id",
    "customer_id",
    "order_status",
    "order_purchase_timestamp",
]

silver_final_order = silver_orders_4
for required_col in required_columns:
    silver_final_order = silver_final_order.where(
        F.col(required_col).isNotNull()
    )

In [0]:
# Persist the cleaned orders as the silver Delta table, replacing whatever the
# table held before.
silver_writer = silver_final_order.write.format("delta").mode("overwrite")
silver_writer.saveAsTable("golden_360.silver.orders")

In [0]:
from pyspark.sql import functions as F

# Consolidated bronze -> silver cleaning of the orders table.

# 1. Load and deduplicate on order_id.
# NOTE(review): dropDuplicates retains a single row per key; which row counts
# as the "first" depends on Spark's row order -- add an explicit ordering if
# a specific occurrence must win.
bronze_orders = spark.read.table("golden_360.bronze.orders")
df_deduped = bronze_orders.dropDuplicates(["order_id"])

timestamp_cols = [
    "order_purchase_timestamp", "order_approved_at",
    "order_delivered_carrier_date", "order_delivered_customer_date",
    "order_estimated_delivery_date"
]

# 2. Parse the timestamp columns BEFORE validating. date_format (step 5)
# casts its input to timestamp anyway; doing the cast here means the null
# check and the date comparison below see the parsed values rather than raw
# bronze values, so a purchase timestamp that fails to parse is rejected
# instead of surfacing as NULL in the silver table.
# NOTE(review): values the cast cannot parse become NULL -- if bronze holds
# several string formats, add explicit parsing patterns here.
df_typed = df_deduped
for c in timestamp_cols:
    df_typed = df_typed.withColumn(c, F.col(c).cast("timestamp"))

# 3. Mandatory columns must be populated.
df_valid_nulls = df_typed.dropna(subset=[
    "order_id", "customer_id", "order_status", "order_purchase_timestamp"
])

valid_statuses = ['delivered', 'shipped', 'canceled', 'unavailable', 'invoiced', 'processing', 'created', 'approved']

# 4. Status whitelist and business rules.
silver_filtered = df_valid_nulls.where(
    # Rule: Status must be valid
    (F.col("order_status").isin(valid_statuses)) &
    # Rule: If delivered, must have a date. If not delivered, it's fine.
    ((F.col("order_status") != "delivered") | (F.col("order_delivered_customer_date").isNotNull())) &
    # Rule: Delivered date >= Purchase date (if delivered date exists)
    ((F.col("order_delivered_customer_date").isNull()) |
     (F.col("order_delivered_customer_date") >= F.col("order_purchase_timestamp")))
)

# 5. Final formatting to ISO 8601 strings.
# NOTE(review): 'Z' is written as a literal, so the output is only genuinely
# UTC if the Spark session time zone is UTC -- confirm.
silver_final = silver_filtered
for c in timestamp_cols:
    silver_final = silver_final.withColumn(c, F.date_format(F.col(c), "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))

# 6. Save, replacing the previous contents of the silver table.
silver_final.write.format("delta").mode("overwrite").saveAsTable("golden_360.silver.orders")
