In [0]:
# Load the silver-layer orders table and render it in the notebook.
silver_orders_df = spark.table("silver.orders")
display(silver_orders_df)

order_id,customer_id,order_date,amount,ingestion_timestamp,source_system
1,101,2024-01-01,250.0,2026-01-15T11:03:37.120062Z,oracle
2,102,2024-01-02,300.0,2026-01-15T11:03:37.120062Z,oracle


In [0]:
# Business question: how much revenue do we generate per day?
# Total the order amounts for each order_date and list the days in
# ascending date order.

# NOTE: this import rebinds the name `sum` to Spark's aggregate function,
# hiding Python's built-in sum() in the notebook namespace from here on.
from pyspark.sql.functions import sum

gold_daily_sales_df = (
    silver_orders_df.groupBy("order_date")
    .agg(sum("amount").alias("daily_sales"))
    .sort("order_date")
)

In [0]:
# Preview the per-day revenue aggregate.
display(gold_daily_sales_df)

order_date,daily_sales
2024-01-01,250.0
2024-01-02,300.0


In [0]:
# Persist the daily aggregate as gold.daily_sales, replacing any existing table contents.
gold_daily_sales_df.write.saveAsTable("gold.daily_sales", mode="overwrite")

In [0]:
%sql
-- Read back the persisted gold table to confirm the write succeeded.
-- Rows in a table have no guaranteed order, so sort explicitly to keep
-- the displayed result deterministic between runs.
SELECT order_date, daily_sales
FROM gold.daily_sales
ORDER BY order_date;

order_date,daily_sales
2024-01-01,250.0
2024-01-02,300.0


In [0]:
# Revenue per customer: total order amount for each customer_id,
# highest-spending customers first.

# NOTE: rebinds `sum` to Spark's aggregate function (hides Python's built-in sum()).
from pyspark.sql.functions import sum

gold_revenue_per_customer_df = (
    silver_orders_df.groupBy("customer_id")
    .agg(sum("amount").alias("total_revenue"))
    .sort("total_revenue", ascending=False)
)

In [0]:
# Preview the per-customer revenue aggregate.
display(gold_revenue_per_customer_df)

customer_id,total_revenue
102,300.0
101,250.0


In [0]:
# Persist the per-customer aggregate as gold.revenue_per_customer, replacing any existing table contents.
gold_revenue_per_customer_df.write.saveAsTable("gold.revenue_per_customer", mode="overwrite")

In [0]:
# Enriched gold aggregation: load both silver inputs needed for the
# orders-to-customers join (orders is re-read under the same name).
silver_orders_df = spark.table("silver.orders")
silver_customers_df = spark.table("silver.customers")

In [0]:
# Daily revenue per customer country.
#
# Orders are joined to customers on customer_id, then amounts are totalled
# per (order_date, country) and sorted by date and country.
#
# The Spark functions module is imported here so this cell does not depend on
# an earlier cell having rebound the bare name `sum`; with Python's built-in
# sum() in scope, `sum("amount")` would raise a TypeError instead.
from pyspark.sql import functions as F

gold_daily_sales_country_df = (
    silver_orders_df.alias("o")
    .join(
        silver_customers_df.alias("c"),
        on="customer_id",
        # Inner join: orders without a matching customer row are excluded
        # from the totals.
        how="inner",
    )
    # Qualify columns through the aliases so each one is resolved against a
    # specific side of the join; the output columns are still named
    # order_date / country.
    .groupBy("o.order_date", "c.country")
    .agg(F.sum("o.amount").alias("daily_sales"))
    .orderBy("order_date", "country")
)

In [0]:
# Preview the per-day, per-country revenue aggregate.
display(gold_daily_sales_country_df)

order_date,country,daily_sales
2024-01-01,India,250.0
2024-01-02,USA,300.0
